In [1]:
from google.cloud import bigquery
import pandas as pd
import ast
from tools import glove_helper
import tensorflow as tf
import numpy as np
import scipy

from itertools import groupby
from os.path import basename, splitext
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

  from ._conv import register_converters as _register_converters


Before running this notebook, open a command prompt (CMD) and authenticate with:

'gcloud auth application-default login'


In [2]:
# Create the BigQuery client bound to project 'manifest-frame-203601'.
# Per the note above, authenticate first with
# `gcloud auth application-default login`.
client = bigquery.Client(project='manifest-frame-203601')



In [3]:
# Fetch every distinct (repo_path, c_content) pair from the final_20k table.
QUERY = (
    """
    select distinct repo_path,c_content from w266_final.final_20k
    """)
query_job = client.query(QUERY)  # issues the API request
rows = query_job.result()        # blocks until the query has finished

# Collect the result rows as [repo_path, c_content] pairs.
df = [[row.repo_path, row.c_content] for row in rows]

In [4]:
# Build the working DataFrame from the fetched [repo_path, c_content] pairs.
# The column names are passed to the constructor instead of being assigned to
# df.columns afterwards: an empty query result then yields an empty two-column
# frame rather than a length-mismatch error, and re-running this cell on an
# already-built frame simply re-selects the same two columns.
df = pd.DataFrame(df, columns=['repo_path', 'content'])
df.shape

(172413, 2)

In [5]:
def cleanup(docstring_list):

    """Tokenize a list of docstrings and return all tokens as one space-joined string.

    Each docstring is split with Keras' text_to_word_sequence; the tokens of all
    docstrings are concatenated in order and joined with single spaces. An empty
    input list gives the empty string.
    """

    words = []
    for text in docstring_list:
        words.extend(tf.keras.preprocessing.text.text_to_word_sequence(text))

    return " ".join(words)

def get_docstrings(source):

    """Collect every docstring in a Python source string and return them cleaned.

    The source is parsed with `ast`. The docstrings of the module and of every
    class, function/method and async function reached by `ast.walk` are
    gathered (nodes without a docstring are skipped) and handed to `cleanup`.

    Args:
        source: Python source code to parse.

    Returns:
        Whatever `cleanup` returns for the collected docstrings, or the
        single-space string " " when `source` cannot be parsed.
    """

    # Node kinds accepted by ast.get_docstring.
    NODE_TYPES = (
        ast.Module,
        ast.ClassDef,
        ast.FunctionDef,
        ast.AsyncFunctionDef,
    )

    try:
        tree = ast.parse(source)
    except Exception:
        # Best effort: any source that fails to parse is treated as having no
        # docstrings. `Exception` (not a bare except) is caught so that
        # KeyboardInterrupt/SystemExit still stop a long-running cell.
        return " "

    docstrings = []
    for node in ast.walk(tree):
        if isinstance(node, NODE_TYPES):
            docstring = ast.get_docstring(node)
            if docstring is not None:
                docstrings.append(docstring)

    return cleanup(docstrings)

In [6]:
# Extract the cleaned docstring text from each file's source code.
df['docstrings'] = [get_docstrings(source) for source in df['content']]

In [7]:
# Load the word vectors through the project-local glove_helper. ndim=100 is the
# same length as the zero-vector fallbacks (np.zeros(100)) in words_to_embed.
hands = glove_helper.Hands(ndim=100)

Loading vectors from data/glove/glove.6B.zip
Parsing file: data/glove/glove.6B.zip:glove.6B.100d.txt
Found 400,000 words.
Parsing vectors... Done! (W.shape = (400003, 100))


In [8]:
# Corpus for the count vectorizer: one cleaned docstring string per file.
corpus = df['docstrings'].tolist()

# Learn the vocabulary, then compute the raw term counts used for the tf-idf calculations.
count_vect = CountVectorizer().fit(corpus)
freq_term_matrix = count_vect.transform(corpus)

# Word -> column index, used to look up a word's column in the count/tf-idf matrices.
vocab = count_vect.vocabulary_

# Holder for the new df column.
embeddings_df = []

In [9]:
# Cache for the corpus-level tf-idf model: the matrix it was fitted on and the
# fitted transformer, so the fit is not repeated on every words_to_embed call.
_TFIDF_CACHE = {}


def _fitted_tfidf():
    """Return a TfidfTransformer fitted on the current `freq_term_matrix`, fitting once per matrix object."""
    if 'model' not in _TFIDF_CACHE or _TFIDF_CACHE['matrix'] is not freq_term_matrix:
        _TFIDF_CACHE['model'] = TfidfTransformer(norm="l2").fit(freq_term_matrix)
        # Keep a reference to the matrix so a rebuilt matrix triggers a refit.
        _TFIDF_CACHE['matrix'] = freq_term_matrix
    return _TFIDF_CACHE['model']


def words_to_embed(words):
    """Embed a text as the mean of its tf-idf weighted word vectors.

    Reads the module-level `count_vect`, `freq_term_matrix`, `vocab` and `hands`.
    The text is tokenized with `count_vect`'s own analyzer so the tokens line up
    with `vocab` and with the tf-idf row computed for the text. Tokens that are
    not in `vocab` are ignored; repeated tokens count once per occurrence.

    Args:
        words: the text to embed (a cleaned docstring or a search query).

    Returns:
        A length-100 numpy array: the mean over in-vocabulary tokens of
        (word vector * tf-idf weight), or all zeros when `words` is empty or
        none of its tokens are in `vocab`.
    """

    # verify there are docstrings available
    if len(words) == 0:
        return np.zeros(100)

    # tf-idf weights for this single document, using the corpus-level idf
    doc_freq_term = count_vect.transform([words])
    weights = _fitted_tfidf().transform(doc_freq_term)

    # Tokenize exactly as the vectorizer does (e.g. lower-casing), otherwise
    # words like "Sudoku" never match the vocabulary keys.
    tokens = count_vect.build_analyzer()(words)

    embeddings = []
    for token in tokens:
        if token in vocab:
            embed = hands.get_vector(token, strict=False)
            embeddings.append(np.multiply(embed, weights[0, vocab[token]]))

    # Average only after every token has been visited.
    if len(embeddings) == 0:
        return np.zeros(100)

    return np.mean(np.asarray(embeddings), axis=0)
    
def find_nn(words, embeddings):
    """Rank stored embeddings by cosine distance to the embedding of `words`.

    `words` is embedded with `words_to_embed`; the cosine distance to each
    entry of `embeddings` is computed and the positions are returned in
    np.argsort order of those distances (smallest distance first).
    """
    query_vec = words_to_embed(words)
    dists = np.asarray(
        [scipy.spatial.distance.cosine(query_vec, vec) for vec in embeddings]
    )
    return np.argsort(dists)

In [10]:
# Embed every file's cleaned docstring text with words_to_embed.
df['embeddings'] = [words_to_embed(text) for text in df['docstrings']]

In [11]:
def top_n_code(search_terms, docstrings, embeddings, n):
    """Return the source of the `n` files nearest to a search query.

    Args:
        search_terms: query text; ranked against `embeddings` by `find_nn`.
        docstrings: unused; kept so existing callers keep working.
        embeddings: one embedding per row of the module-level `df`, in row order.
        n: number of results to return.

    Returns:
        A list with the `df['content']` values of the `n` best-ranked rows,
        best first.
    """
    top_n = find_nn(search_terms, embeddings)[0:n]

    # find_nn yields positions within `embeddings`, so rows are selected by
    # position (.iloc). Plain df['content'][i] is a label lookup and is only
    # correct while df keeps its default 0..N-1 index.
    return df['content'].iloc[top_n].tolist()

# Plain-list copies of the two derived columns, in row order.
doc_strings = df['docstrings'].tolist()
embed_vecs = df['embeddings'].tolist()

def make_query_file(query, results, filename):
    """Write the results of one search to a text file.

    For every item in `results` the file receives a "Query: <query>" line, a
    "NEXT RESULT" separator line and then the item itself. An existing file is
    overwritten.

    Args:
        query: the search text, repeated in the header of every result.
        results: iterable of result items (source-code strings).
        filename: path of the file to write; its directory must already exist.

    Returns:
        None.
    """
    # `with` guarantees the file is flushed and closed even if a write fails.
    # UTF-8 is set explicitly so non-ASCII characters in the results do not
    # depend on the platform's default encoding.
    with open(filename, 'w', encoding='utf-8') as output:
        for item in results:
            output.write("Query: "+query+'\n')
            output.write("\n************************** NEXT RESULT **************************************\n")
            output.write("%s\n" % item)

    return

In [12]:
# Canned search queries. Each one gets a distinct name: previously `search3`
# was assigned twice, so 'remove duplicates from sorted array' was silently
# overwritten and never searched.
search1 = "function that calculates distance"
search2 = 'merge two lists'
search3 = 'remove duplicates from sorted array'
search4 = 'determine if a Sudoku is valid'
search5 = 'unique binary search tree'
search6 = 'voice recognition function'
search7 = 'LSTM model for semantic search'

searches = [search1, search2, search3, search4, search5, search6, search7]

In [14]:
# Run every canned search and write its 10 best matches to
# model_1_queries/query<k>.txt, with k counting from 1.
for i, search_terms in enumerate(searches):
    x = i + 1
    query = top_n_code(search_terms, doc_strings, embed_vecs, 10)
    filename = 'model_1_queries/query'+str(x)+'.txt'
    make_query_file(search_terms, query, filename)

  dist = 1.0 - uv / np.sqrt(uu * vv)
