In [1]:
from google.cloud import bigquery
import pandas as pd
import ast
from tools import glove_helper
import tensorflow as tf
import numpy as np
import scipy

from itertools import groupby
from os.path import basename, splitext
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

  from ._conv import register_converters as _register_converters


Before running the script, you will need to CMD and authenticate with 

'gcloud auth application-default login'


In [2]:
client = bigquery.Client(project='manifest-frame-203601')



In [3]:
QUERY = (
    """
    select distinct repo_path,c_content from w266_final.final_20k
    """)
query_job = client.query(QUERY)  # API request
rows = query_job.result()  # Waits for query to finish

df = []
for row in rows:
    df.append([row.repo_path,row.c_content])

In [18]:
df = pd.DataFrame(df)
df.columns = ['repo_path','content']
df.shape

172413

In [5]:
def cleanup(docstring_list):
    
    """takes a list of doc strings and converts to a single flat list of tokens"""
    
    tokens = [tf.keras.preprocessing.text.text_to_word_sequence(i) for i in docstring_list]
    flat_tokens = [item for sublist in tokens for item in sublist]
    flat_string = " ".join(flat_tokens)
    
    return flat_string

def get_docstrings(source):
    
    """function to walk through parse tree and return list of docstrings"""
    
    NODE_TYPES = {
    ast.ClassDef: 'Class',
    ast.FunctionDef: 'Function/Method',
    ast.Module: 'Module'
    }
    
    docstrings = []
    
    try:
        tree = ast.parse(source)
    except:
        return " "
       
    for node in ast.walk(tree):
        if isinstance(node, tuple(NODE_TYPES)):
            docstring = ast.get_docstring(node)
            docstrings.append(docstring)
    
    docstrings =  [x for x in docstrings if x is not None]
    clean_string = cleanup(docstrings)
            
    return clean_string

In [6]:
df['docstrings'] = [get_docstrings(x) for x in list(df['content'])]

In [7]:
hands = glove_helper.Hands(ndim=100)

Loading vectors from data/glove/glove.6B.zip
Parsing file: data/glove/glove.6B.zip:glove.6B.100d.txt
Found 400,000 words.
Parsing vectors... Done! (W.shape = (400003, 100))


In [8]:
#Set up corpus for count vectorizer
corpus = list(df['docstrings'])

#count values for tfidf calculations
count_vect = CountVectorizer()
count_vect = count_vect.fit(corpus)
freq_term_matrix = count_vect.transform(corpus)

#to grab columns for words
vocab = count_vect.vocabulary_

#create a holder for the new df column
embeddings_df = []

In [9]:
def words_to_embed(words):
    
    global count_vect, freq_term_matrix, vocab
    
    #verify there are docstrings available
    if len(words)==0:
        return np.zeros(100)
         
    #create tfidf for each document
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    doc_freq_term = count_vect.transform([words])
    idfs = tfidf.transform(doc_freq_term)

    #split the docstrings to individual words for average
    sent_list = words.split(" ")
    embeddings = []

    #cycle through list of words in docstring
    for i in range(len(sent_list)):

        if sent_list[i] in vocab:

            col = vocab[sent_list[i]]
            embed = hands.get_vector(sent_list[i], strict=False)
            tfidf = idfs[0, col]
            embeddings.append(np.multiply(embed, tfidf))

        embed_array = np.asarray(embeddings)
        
        if len(embed_array)==0:
            return np.zeros(100)

        return np.mean(embed_array, axis=0)
    
def find_nn(words, embeddings):
    
    search = words_to_embed(words)
    distances = [scipy.spatial.distance.cosine(search, i) for i in embeddings]
    nn = np.argsort(np.asarray(distances))
    
    return nn

In [12]:
#df['embeddings'] = [words_to_embed(x) for x in list(df['docstrings'])]

embeddings = []
i=0
for x in list(df['docstrings']):
    i+=1
    embeddings.append(words_to_embed(x))
    if i%1000==0:
        print(i/172413)
        
df['embeddings'] = embeddings
del embeddings

0.005800026680122729
0.011600053360245458
0.017400080040368186
0.023200106720490915
0.029000133400613644
0.03480016008073637
0.0406001867608591
0.04640021344098183
0.05220024012110456
0.05800026680122729
0.06380029348135001
0.06960032016147275
0.07540034684159547
0.0812003735217182
0.08700040020184092
0.09280042688196366
0.09860045356208638
0.10440048024220912
0.11020050692233184
0.11600053360245458
0.1218005602825773
0.12760058696270002
0.13340061364282277
0.1392006403229455
0.1450006670030682
0.15080069368319093
0.15660072036331368
0.1624007470434364
0.16820077372355913
0.17400080040368185
0.1798008270838046
0.18560085376392732
0.19140088044405004
0.19720090712417276
0.2030009338042955
0.20880096048441824
0.21460098716454096
0.22040101384466368
0.2262010405247864
0.23200106720490915
0.23780109388503187
0.2436011205651546
0.24940114724527732
0.25520117392540004
0.2610012006055228
0.26680122728564554
0.27260125396576823
0.278401280645891
0.2842013073260137
0.2900013340061364
0.29580136

In [13]:
df.head()

Unnamed: 0,repo_path,content,docstrings,embeddings
0,beavyHQ/beavy beavy_modules/private_messaging/...,"""""""empty message\n\nRevision ID: 47de1903b00\n...",empty message revision id 47de1903b00 revises ...,"[-0.05100566, 0.11278218, 0.043621335, -0.0586..."
1,Antergos/Cnchi src/pages/dialogs/partition_bas...,#!/usr/bin/env python\n# -*- coding: utf-8 -*-...,base class for create edit partition dialogs c...,"[-0.2872601, 0.15455353, 0.062218327, 0.038352..."
2,wolkstein/OpenDroneMap-GCP_LIST.TXT-generator ...,import numpy as np\nimport cv2\nimport argpars...,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,dstndstn/astrometry.net doc/UCAC3_guide/build-...,#Script for automatically running astrometry.n...,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,ClusterLabs/booth test/assertions.py,import re\n\nclass BoothAssertions:\n def c...,fail the test unless the text matches the regu...,"[-0.11991147, 0.037409756, 0.09766561, -0.0026..."


In [19]:
def top_n_code(search_terms, docstrings, embeddings, n):
    
    top_n = find_nn(search_terms, embeddings)[0:n]
    code = [df['content'][i] for i in top_n]
    
    return code

doc_strings = list(df['docstrings'])
embed_vecs = list(df['embeddings'])

def make_query_file(query, results, filename):
    
    output = open(filename, 'w')
    for item in results:
        output.write("Query: "+query+'\n')
        output.write("\n************************** NEXT RESULT **************************************\n")
        output.write("%s\n" % item)
        
    return 

def keyword_search(word, corpus, content):
    
    word = word.lower()
    query_results = []
    for i in range(len(corpus)):
        if word in corpus[i]:
            query_results.append(content[i])
            
    return query_results

In [20]:
search1 = "function that calculates distance"
search2 = 'merge two lists'
search3 = 'remove duplicates from sorted array'
search3 = 'determine if a Sudoku is valid'
search4 = 'unique binary search tree'
search5 = 'voice recognition function'
search6 = 'LSTM model for semantic search'

searches = [search1, search2, search3, search4, search5, search6]

In [14]:
for i in range(len(searches)):
    query = top_n_code(searches[i], doc_strings, embed_vecs, 10)
    x=i+1
    filename = 'model_1_queries/query'+str(x)+'.txt'
    make_query_file(searches[i], query, filename)

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [23]:
#Search 3 Backcheck
content = df['content']
q3_check = keyword_search('sudoku', doc_strings, content)
print(q3_check[0])

"""
Chapter 6: Arrays
"""
import random
import sys
from datetime import datetime

import attr
from attr.validators import instance_of


def swap(arr, idx, idy):
    tmp = arr[idx]
    arr[idx] = arr[idy]
    arr[idy] = tmp


def dutch_national_partition(idx, arr):
    """
    Problem 6.1
    Partition such that all elements less than
    arr[idx] come first, then all elements equal
    to arr[idx], then all elements greater than
    arr[idx]
    """
    pivot = arr[idx]

    smaller = 0
    idy = smaller
    while idy < len(arr):
        # group all elements less than pivot at
        # the bottom
        if arr[idy] < pivot:
            swap(arr, smaller, idy)
            smaller += 1
        idy += 1

    larger = len(arr) - 1
    idy = larger
    while idy >= 0:
        # group all elements greater than pivot
        # at the bottom
        if arr[idy] > pivot:
            swap(arr, larger, idy)
            larger -= 1
        idy -= 1


def dutch_partition_better(idx, arr):
    """

In [32]:
q6_check = keyword_search('LSTM', doc_strings, content)
print(q6_check[2])

import numpy
import six

import chainer
from chainer.backends import cuda
from chainer.backends import intel64
from chainer import function
from chainer import function_node
from chainer.utils import type_check


def _extract_gates(x):
    r = x.reshape((len(x), x.shape[1] // 4, 4) + x.shape[2:])
    return [r[:, :, i] for i in six.moves.range(4)]


def _sigmoid(x, xp=numpy):
    half = x.dtype.type(0.5)
    return xp.tanh(x * half) * half + half


def _grad_sigmoid(x):
    return x * (1 - x)


def _grad_grad_sigmoid(x):
    return x * (1 - x) * (1 - 2 * x)


def _grad_tanh(x):
    return 1 - x * x


def _grad_grad_tanh(x, gx):
    return -2 * x * gx


_preamble = '''
template <typename T> __device__ T sigmoid(T x) {
    const T half = 0.5;
    return tanh(x * half) * half + half;
}
template <typename T> __device__ T grad_sigmoid(T y) { return y * (1 - y); }
template <typename T> __device__ T grad_tanh(T y) { return 1 - y * y; }

#define COMMON_ROUTINE \
    T aa = tanh(a); \
    T ai 

In [30]:
q5_check = keyword_search('voice', doc_strings, content)
print(q5_check[0])

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Trains a neural network for singing voice detection.

For usage information, call with --help.

Author: Jan Schlüter
"""

from __future__ import print_function

import sys
import os
import io
from argparse import ArgumentParser

import numpy as np
import theano
import theano.tensor as T
floatX = theano.config.floatX
import lasagne

from progress import progress
from simplecache import cached
import audio
import znorm
from labels import create_aligned_targets
import model
import augment

def opts_parser():
    descr = "Trains a neural network for singing voice detection."
    parser = ArgumentParser(description=descr)
    parser.add_argument('modelfile', metavar='MODELFILE',
            type=str,
            help='File to save the learned weights to (.npz format)')
    parser.add_argument('--dataset',
            type=str, default='jamendo',
            help='Name of the dataset to use (default: %(default)s)')
    parser.add_argument('-