In [12]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, MiniBatchKMeans, SpectralClustering
import pickle

In [13]:
def load_language_dataset(language:str, prepath='data', template='_dedupe_definitions_v2.pkl'):
    """Unpickle and return the definitions file for a single language.

    The file opened is ``<prepath>/<language><template>``. Which languages
    are valid depends on which zips you have already unpickled
    (java, javascript, go, python).

    Args:
        language: Language prefix of the pickle file name, e.g. ``'python'``.
        prepath: Directory that contains the pickle files.
        template: File-name suffix appended directly to ``language``.

    Returns:
        Whatever object was stored in the pickle file.

    Raises:
        FileNotFoundError: If no pickle exists for ``language`` in ``prepath``.
    """
    file_name = language + template
    pickle_path = os.path.join(prepath, file_name)
    # pickle executes arbitrary code while loading; only use on trusted files.
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)

# Per-language filter: the NOISE, LOGIC and SYNTAX token collections from `tr`,
# combined with `|`.
# NOTE(review): `tr` is not imported in any visible cell, so this raises
# NameError on a fresh kernel -- confirm which module it refers to and import it.
token_filter = {
    lang: (
        getattr(tr, 'NOISE_TOKEN_' + suffix)
        | getattr(tr, 'LOGIC_TOKEN_' + suffix)
        | getattr(tr, 'SYNTAX_TOKEN_' + suffix)
    )
    for lang, suffix in (
        ('python', 'PYTHON'),
        ('go', 'GO'),
        ('java', 'JAVA'),
        ('javascript', 'JS'),
    )
}

In [19]:
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is reproducible across platforms and runs.
sorted(os.listdir('../data'))

['go',
 'go.zip',
 'go_dedupe_definitions_v2.pkl',
 'go_licenses.pkl',
 'python',
 'python.zip',
 'python_dedupe_definitions_v2.pkl',
 'python_licenses.pkl',
 'ruby.zip']

In [None]:
# Build the data directory with os.path.join: the previous '..\\data' literal
# only resolves on Windows (on POSIX the backslash is part of the file name).
# NOTE(review): the directory listing shown earlier contains only the go and
# python pickles, so the java/javascript loads raise FileNotFoundError until
# those archives have been unpickled too.
DATA_DIR = os.path.join(os.pardir, 'data')

p = load_language_dataset('python', prepath=DATA_DIR)
g = load_language_dataset('go', prepath=DATA_DIR)
j = load_language_dataset('java', prepath=DATA_DIR)
js = load_language_dataset('javascript', prepath=DATA_DIR)

In [None]:
# Column lists taken from
# https://github.com/github/CodeSearchNet/blob/master/notebooks/ExploreData.ipynb

# Every column of interest.
columns_long_list = [
    'repo',
    'path',
    'url',
    'code',
    'code_tokens',
    'docstring',
    'docstring_tokens',
    'language',
    'partition',
]

# Reduced selection: token columns plus language and partition.
columns_short_list = [
    'code_tokens',
    'docstring_tokens',
    'language',
    'partition',
]

In [None]:
# Wrap the unpickled python definitions in a DataFrame.
python_data = pd.DataFrame(data=p)

In [None]:
# Wrap the remaining unpickled datasets (go, java, javascript) in DataFrames.
g_data = pd.DataFrame(data=g)
j_data = pd.DataFrame(data=j)
js_data = pd.DataFrame(data=js)

In [None]:
# Stack the per-language frames into one. pandas has no `concatenate`
# function (that is the NumPy name); the pandas API is `pd.concat`.
all_df = pd.concat([python_data, g_data, j_data, js_data])

In [None]:
# Only the final expression of a cell is rendered automatically, so the first
# table is shown explicitly; otherwise its result is silently discarded.
display(all_df.language.value_counts())
all_df.groupby(['partition', 'language'])['code_tokens'].count()


In [None]:
# Store the length of each row's code_tokens and docstring_tokens as new columns.
all_df['code_len'] = all_df['code_tokens'].apply(len)
all_df['query_len'] = all_df['docstring_tokens'].apply(len)

In [None]:
# Per-language quantiles (50/70/80/90/95 %) of the code_len column.
code_len_summary = (
    all_df
    .groupby('language')['code_len']
    .quantile([.5, .7, .8, .9, .95])
)
display(code_len_summary.to_frame())