In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, MiniBatchKMeans, SpectralClustering
import pickle

In [3]:
def load_language_dataset(language:str, prepath='data', template='_dedupe_definitions_v2.pkl'):
    '''
    Load the pickled definitions for one language and return them unchanged.

    The file that is read is ``<prepath>/<language><template>``. Which
    languages are valid depends on which archives have been unpacked into
    ``prepath`` (e.g. java, javascript, go, python).

    Parameters:
        language: prefix of the pickle file name.
        prepath: directory that contains the pickle files.
        template: file-name suffix appended to ``language``.

    Returns:
        Whatever object ``pickle.load`` yields for that file.

    NOTE: ``pickle.load`` can execute arbitrary code while deserialising;
    only point this at files from a trusted source.
    '''
    pickle_path = os.path.join(prepath, language + template)
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)

# Maps each supported language to the combination (via `|`) of the NOISE,
# LOGIC and SYNTAX token collections exposed by `tr` for that language.
# Presumably these are sets and `|` is a set union — TODO confirm.
# NOTE(review): `tr` is not imported in any cell visible here, so this cell
# raises NameError on a fresh kernel; confirm which module `tr` refers to and
# import it with the other imports.
token_filter = {
    'python': tr.NOISE_TOKEN_PYTHON | tr.LOGIC_TOKEN_PYTHON | tr.SYNTAX_TOKEN_PYTHON,
    'go': tr.NOISE_TOKEN_GO | tr.LOGIC_TOKEN_GO | tr.SYNTAX_TOKEN_GO,
    'java': tr.NOISE_TOKEN_JAVA | tr.LOGIC_TOKEN_JAVA | tr.SYNTAX_TOKEN_JAVA,
    'javascript': tr.NOISE_TOKEN_JS | tr.LOGIC_TOKEN_JS | tr.SYNTAX_TOKEN_JS
}

In [None]:
# Column names taken from
# https://github.com/github/CodeSearchNet/blob/master/notebooks/ExploreData.ipynb

# Full list of column names.
columns_long_list = [
    'repo',
    'path',
    'url',
    'code',
    'code_tokens',
    'docstring',
    'docstring_tokens',
    'language',
    'partition',
]

# Shorter list: the two token columns plus language and partition.
columns_short_list = [
    'code_tokens',
    'docstring_tokens',
    'language',
    'partition',
]

# Python Language Statistics

In [None]:
# Read the pickled Python definitions from ../data, wrap them in a DataFrame
# and report its (rows, columns) shape.
p = load_language_dataset(language='python', prepath='../data')
p_data = pd.DataFrame(data=p)
print('n =', p_data.shape)

In [None]:
# Length (number of tokens) of every function body and every docstring.
python_fts = p_data['function_tokens'].apply(len)
python_dcs = p_data['docstring_tokens'].apply(len)

# Median / upper quantiles and mean of the function-token lengths.
print(python_fts.quantile([.5, .75, .9, .95]))
print('mean = ', python_fts.mean())
# Same summary for the docstring-token lengths.
print(python_dcs.quantile([.5, .75, .9, .95]))
print('mean = ', python_dcs.mean())

# One set of lower-cased tokens per function.
a = p_data['function_tokens'].apply(lambda tokens: {token.lower() for token in tokens})
# NOTE: this sums the sizes of the per-function sets, so a token repeated
# inside one function is counted once for that function.
print('total number of tokens = ', a.apply(len).sum())
# Size of the union of all per-function sets.
print('number of unique tokens =', len(set().union(*a)))

In [1]:
# Show the 'function' field of the row labelled 0.
# NOTE(review): the saved output of this cell is a NameError because `p_data`
# did not exist when it was last run; run the cell that builds `p_data` first.
p_data['function'][0]

NameError: name 'p_data' is not defined

In [None]:
# Show the 'function_tokens' field of the row labelled 0.
p_data['function_tokens'][0]

# Go Language Statistics

In [4]:
# Read the pickled Go definitions from ../data, wrap them in a DataFrame
# and report its (rows, columns) shape.
g = load_language_dataset(language='go', prepath='../data')
g_data = pd.DataFrame(data=g)
print('n =', g_data.shape)

n = (726768, 15)


In [5]:
# Length (number of tokens) of every function body and every docstring.
go_fts = g_data['function_tokens'].apply(len)
go_dcs = g_data['docstring_tokens'].apply(len)

# Median / upper quantiles and mean of the function-token lengths.
print(go_fts.quantile([.5, .75, .9, .95]))
print('mean = ', go_fts.mean())
# Same summary for the docstring-token lengths.
print(go_dcs.quantile([.5, .75, .9, .95]))
print('mean = ', go_dcs.mean())

# One set of lower-cased tokens per function.
a = g_data['function_tokens'].apply(lambda tokens: {token.lower() for token in tokens})
# NOTE: this sums the sizes of the per-function sets, so a token repeated
# inside one function is counted once for that function.
print('total number of tokens = ', a.apply(len).sum())
# Size of the union of all per-function sets.
print('number of unique tokens =', len(set().union(*a)))

0.50     64.0
0.75    125.0
0.90    234.0
0.95    348.0
Name: function_tokens, dtype: float64
mean =  116.62444136230545
0.50     0.0
0.75    11.0
0.90    27.0
0.95    47.0
Name: docstring_tokens, dtype: float64
mean =  11.785700526165158
total number of tokens =  24831834
number of unique tokens = 1336504


# Java Language Statistics

In [3]:
# Read the pickled Java definitions from ../data, wrap them in a DataFrame
# and report its (rows, columns) shape.
j = load_language_dataset(language='java', prepath='../data')
j_data = pd.DataFrame(data=j)
print('n =', j_data.shape)

n = (1569889, 15)


In [4]:
# Length (number of tokens) of every function body and every docstring.
j_fts = j_data['function_tokens'].apply(len)
j_dcs = j_data['docstring_tokens'].apply(len)

# Median / upper quantiles and mean of the function-token lengths.
print(j_fts.quantile([.5, .75, .9, .95]))
print('mean = ', j_fts.mean())
# Same summary for the docstring-token lengths.
print(j_dcs.quantile([.5, .75, .9, .95]))
print('mean = ', j_dcs.mean())

# One set of lower-cased tokens per function.
a = j_data['function_tokens'].apply(lambda tokens: {token.lower() for token in tokens})
# NOTE: this sums the sizes of the per-function sets, so a token repeated
# inside one function is counted once for that function.
print('total number of tokens = ', a.apply(len).sum())
# Size of the union of all per-function sets.
print('number of unique tokens =', len(set().union(*a)))

0.50     65.0
0.75    119.0
0.90    218.0
0.95    319.0
Name: function_tokens, dtype: float64
mean =  110.23698745580101
0.50     0.0
0.75     6.0
0.90    18.0
0.95    29.0
Name: docstring_tokens, dtype: float64
mean =  6.14824614988703
total number of tokens =  56004908
number of unique tokens = 4178498


# JavaScript Language Statistics

In [6]:
# Read the pickled JavaScript definitions from ../data, wrap them in a
# DataFrame and report its (rows, columns) shape.
js = load_language_dataset(language='javascript', prepath='../data')
js_data = pd.DataFrame(data=js)
print('n =', js_data.shape)

n = (1857835, 15)


In [None]:
# Length (number of tokens) of every function body and every docstring.
js_fts = js_data['function_tokens'].apply(len)
js_dcs = js_data['docstring_tokens'].apply(len)

# Median / upper quantiles and mean of the function-token lengths.
print(js_fts.quantile([.5, .75, .9, .95]))
print('mean = ', js_fts.mean())
# Same summary for the docstring-token lengths.
print(js_dcs.quantile([.5, .75, .9, .95]))
print('mean = ', js_dcs.mean())

# One set of lower-cased tokens per function.
a = js_data['function_tokens'].apply(lambda tokens: {token.lower() for token in tokens})
# NOTE: this sums the sizes of the per-function sets, so a token repeated
# inside one function is counted once for that function.
print('total number of tokens = ', a.apply(len).sum())
# Size of the union of all per-function sets.
print('number of unique tokens =', len(set().union(*a)))

0.50     83.0
0.75    162.0
0.90    347.0
0.95    603.0
Name: function_tokens, dtype: float64
mean =  261.9426671367479
0.50    0.0
0.75    0.0
0.90    0.0
0.95    7.0
Name: docstring_tokens, dtype: float64
mean =  1.1774500964832721


In [None]:
# Row counts per language, followed by row counts per (partition, language).
# A notebook cell only renders its last expression, so the first result is
# passed to display() explicitly; previously it was computed and discarded.
# NOTE(review): `all_df` is not defined in any cell visible here — confirm it
# is built before this cell runs.
display(all_df.language.value_counts())
all_df.groupby(['partition', 'language'])['code_tokens'].count()


In [None]:
# Add two length columns to all_df (in place): the number of entries in each
# row's code_tokens and docstring_tokens.
all_df['code_len'] = all_df['code_tokens'].apply(len)
all_df['query_len'] = all_df['docstring_tokens'].apply(len)

In [None]:
# Quantiles of code_len within each language, rendered as a one-column table.
code_len_summary = (
    all_df
    .groupby('language')['code_len']
    .quantile([.5, .7, .8, .9, .95])
)
display(code_len_summary.to_frame())