In [1]:
import os
import pickle
import json
import gzip

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import train as tr # library for TF-IDF

In [2]:
def train_and_save_model(corpus: list, model_path: str, vs=5, wdw=2, mc=1, epch=10):
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(corpus)]
    # train model
    model = Doc2Vec(documents, vector_size=vs, window=wdw, min_count=mc, epochs=epch, workers=28)
    model.save(model_path)
    return model

def load_model(model_path):
    return Doc2Vec.load(model_path)

def get_top_n_query_similarities(doc2vec_model, query, top_n=10):
    sdlq = query.split(' ')
    print(sdlq)
    query_tfidf = doc2vec_model.infer_vector(sdlq)
    print(query_tfidf)
    raise NotImplementedError
    
    print(doc2vec_model.vc.shape)
    print(doc2vec_model.vc)

    return

In [3]:
token_filter = {
    'python': tr.NOISE_TOKEN_PYTHON | tr.LOGIC_TOKEN_PYTHON | tr.SYNTAX_TOKEN_PYTHON,
    'go': tr.NOISE_TOKEN_GO | tr.LOGIC_TOKEN_GO | tr.SYNTAX_TOKEN_GO,
    'java': tr.NOISE_TOKEN_JAVA | tr.LOGIC_TOKEN_JAVA | tr.SYNTAX_TOKEN_JAVA,
    'javascript': tr.NOISE_TOKEN_JS | tr.LOGIC_TOKEN_JS | tr.SYNTAX_TOKEN_JS
}

In [4]:
raw_dataset = tr.load_language_dataset('python')
cleaned_data = [tr.create_doc(x, tfilter=token_filter['python']) for x in raw_dataset]

In [None]:
# model = train_and_save_model(cleaned_data, 'doc2vec_models/gensim_model')
# model2 = train_and_save_model(cleaned_data, 'doc2vec_models/gensim_model_vs300,wdw5', vs=300, wdw=5)
# model3 = train_and_save_model(cleaned_data, 'doc2vec_models/gensim_model_vs500_wdw15', vs=500, wdw=20)
# model4 = train_and_save_model(cleaned_data, 'doc2vec_models/gensim_model_vs2000_wdw30_epoch30', vs=2000, wdw=30, epch=30)
# model5 = train_and_save_model(cleaned_data, 'doc2vec_models/gensim_model_vs3000_wdw30_epoch30', vs=3000, wdw=30, epch=30)
# do not have enough memory to run below
# model6 = train_and_save_model(cleaned_data, 'doc2vec_models/gensim_model_vs5000_wdw30_epoch30', vs=5000, wdw=30, epch=40)

In [5]:
os.listdir('doc2vec_models')

['gensim_model_vs3000_wdw30_epoch30',
 'gensim_model_vs300,wdw5',
 'gensim_model_vs2000_wdw30_epoch30.wv.vectors.npy',
 'gensim_model',
 'gensim_model_vs3000_wdw30_epoch30.wv.vectors.npy',
 'gensim_model_vs2000_wdw30_epoch30.dv.vectors.npy',
 'gensim_model_vs2000_wdw30_epoch30.syn1neg.npy',
 'gensim_model_vs2000_wdw30_epoch30',
 'gensim_model_vs3000_wdw30_epoch30.syn1neg.npy',
 'gensim_model_vs300,wdw5.dv.vectors.npy',
 'gensim_model_vs500_wdw15',
 'gensim_model_vs3000_wdw30_epoch30.dv.vectors.npy',
 'gensim_model_vs500_wdw15.dv.vectors.npy']

# Vectorspace output = 2000; wordwindow = 30 

In [None]:
v2000wdw30 = load_model('doc2vec_models/gensim_model_vs2000_wdw30_epoch30')

In [21]:
query_vector = v2000wdw30.infer_vector("convert int to string".split(' '), epochs=100)
ms = v2000wdw30.dv.most_similar([query_vector])

In [19]:
for x, sim in ms:
    print(sim, raw_dataset[x]['path'])

0.08777601271867752 src/python/pants/goal/run_tracker.py
0.08639977872371674 python/spark_sklearn/base_search.py
0.08386426419019699 js2py/legecy_translators/nparser.py
0.07874813675880432 openquake/hazardlib/calc/stochastic.py
0.07862395793199539 scanpy/preprocessing/_qc.py
0.07692356407642365 historical/vpc/differ.py
0.07614123076200485 drogher/package/ontrac.py
0.0755046159029007 clear/database.py
0.07528843730688095 samples/hello.py
0.07510476559400558 satpy/composites/viirs.py


In [28]:
v2000wdw30.wv.items()

AttributeError: 'KeyedVectors' object has no attribute 'items'

# Vectorspace output = 500; wordwindow = 15

In [None]:
v3000wdw30 = load_model('doc2vec_models/gensim_model_vs3000_wdw30_epoch30')
query_vector = v500wdw15.infer_vector("convert int to string".split(' '))
print(v500wdw15.dv.most_similar([query_vector]))

# Vectorspace output = 3000; wordwindow = 30

In [None]:
v500wdw15 = load_model('doc2vec_models/gensim_model_vs500_wdw15')
query_vector = v3000wdw30.infer_vector("convert int to string".split(' '))
print(v3000wdw30.dv.most_similar([query_vector]))

In [None]:
query_vector = v2000wdw30.infer_vector("convert int to string".split(' '))
print(v2000wdw30.dv.most_similar([query_vector]))

In [None]:
m1.infer_vector("convert in to string".split(' '))

In [None]:
qv = m1.infer_vector("convert int into string".split(' '))
m1.dv.most_similar([qv])

In [None]:
m1.dv[555998]

In [None]:
raw_dataset[555998]

In [None]:
qv = m2.infer_vector("convert int into string".split(' '))
res = m2.dv.most_similar([qv], topn=20)
for ind, sim in res:
    print(sim, raw_dataset[ind]['function'])

In [None]:
qv = model3.infer_vector("convert int to string".split(' '))
res = model3.dv.most_similar([qv], topn=20)
for ind, sim in res:
    print(sim, raw_dataset[ind]['function'])

In [None]:
os.listdir('doc2vec_models/')

In [None]:
def load_queries_and_evaluate_doc2vec(output, 
                                    queries_path='evaluation_results/queries.csv', 
                                    language='python', 
                                    top_n=5, 
                                    tfilter=tr.NOISE_TOKEN_PYTHON):
    print('---------preprocessing dataset---------')
    raw_dataset = tr.load_language_dataset(language)
    cleaned_data = [tr.create_doc(x, tfilter=tfilter) for x in raw_dataset]
    
    # train gensim
    print('---------training model---------')
    vectorizer = TfidfVectorizer(min_df=mindf, max_df=maxdf)
    X = vectorizer.fit_transform(cleaned_data)
    
    # generate file mapping query to result - why is this the most complicated part...
    print('---------running queries---------')
    queries = list(pd.read_csv(queries_path)['query'])
    num_queries = len(queries)
    
    # avoid excessive appending
    # langs = [language] * top_n * num_queries #TODO: DELETE
    qs = [None] * top_n * num_queries
    m_urls = [None] * top_n * num_queries
    snippets = [None] * top_n * num_queries

    for ind, query in enumerate(queries):
        print('{}: executing query: {}'.format(ind, query))
        top_n_indices = get_top_n_query_similarities(vectorizer, X, query, top_n=top_n)
        
        for ind2, close_ind in enumerate(top_n_indices):
            li = (ind * top_n) + ind2
            m_urls[li] = raw_dataset[close_ind]['url']
            snippets[li] = raw_dataset[close_ind]['function']
            qs[li] = query

    pd.DataFrame.from_dict({'model_name': 'tfidf_{}_{}_{}'.format(language, mindf, maxdf), 
                            'query': qs, 
                            'language': language,
                            'function': snippets,
                            'url': m_urls}).to_csv(output, index=False)
    print('done')
    return

In [None]:
for lang in ['python', 'go', 'java', 'javascript']:
    empty_filter_fname = 'evaluation_results/doc2vec/gensim_{}_nofilter.csv'.format(lang)
    print("generating {}".format(empty_filter_fname))
    if os.path.exists(empty_filter_fname):
        print('skipping because {} already exists - skipping'.format(empty_filter_fname))
        continue
    
    load_queries_and_evaluate_word2vec(empty_filter_fname, language=lang, top_n=100, tfilter=set())