In [28]:
from google.cloud import bigquery
import pandas as pd
import ast
import glove_helper
import tensorflow as tf
import numpy as np
import scipy

from itertools import groupby
from os.path import basename, splitext
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

Before running the script, you will need to CMD and authenticate with 

'gcloud auth application-default login'


In [29]:
client = bigquery.Client(project='manifest-frame-203601')



In [30]:
QUERY = (
    """
    select * from w266_final.final_20k
    LIMIT 10000""")
query_job = client.query(QUERY)  # API request
rows = query_job.result()  # Waits for query to finish

df = []
for row in rows:
    df.append([row.repo_path,row.c_content])

In [31]:
df = pd.DataFrame(df)
df.columns = ['repo_path','content']
df

Unnamed: 0,repo_path,content
0,watchdogpolska/feder feder/records/types.py,"from abc import abstractmethod, ABCMeta\n\n\nc..."
1,softappeal/yass py2/test/contract_test.py,import unittest\nfrom typing import Any\n\nimp...
2,gcarq/freqtrade freqtrade/tests/test_fiat_conv...,"# pragma pylint: disable=missing-docstring, to..."
3,devilry/devilry-django devilry/devilry_compres...,# -*- coding: utf-8 -*-\n# Generated by Django...
4,erigones/esdc-ce api/dc/storage/serializers.py,from api import serializers as s\nfrom vms.mod...
5,moddevices/mod-ui mod/settings.py,# -*- coding: utf-8 -*-\n\n# Copyright 2012-20...
6,ms-iot/python cpython/Tools/unicode/gencodec.py,""""""" Unicode Mapping Parser and Codec Generator..."
7,flammified/terrabot terrabot/packets/packet39.py,import struct\n\n\nclass Packet39Parser(object...
8,lukasmonk/lucaschess Code/QT/PantallaConfig.py,from PyQt4 import QtCore\n\nfrom Code import D...
9,MetaMetricsInc/django-static-version example/e...,"""""""\nDjango settings for example project.\n\nG..."


In [32]:
def cleanup(docstring_list):
    
    """takes a list of doc strings and converts to a single flat list of tokens"""
    
    tokens = [tf.keras.preprocessing.text.text_to_word_sequence(i) for i in docstring_list]
    flat_tokens = [item for sublist in tokens for item in sublist]
    flat_string = " ".join(flat_tokens)
    
    return flat_string

def get_docstrings(source):
    
    """function to walk through parse tree and return list of docstrings"""
    
    NODE_TYPES = {
    ast.ClassDef: 'Class',
    ast.FunctionDef: 'Function/Method',
    ast.Module: 'Module'
    }
    
    docstrings = []
    
    try:
        tree = ast.parse(source)
    except:
        return " "
       
    for node in ast.walk(tree):
        if isinstance(node, tuple(NODE_TYPES)):
            docstring = ast.get_docstring(node)
            docstrings.append(docstring)
    
    docstrings =  [x for x in docstrings if x is not None]
    clean_string = cleanup(docstrings)
            
    return clean_string

In [33]:
df['docstrings'] = [get_docstrings(x) for x in list(df['content'])]

In [7]:
hands = glove_helper.Hands(ndim=100)

Loading vectors from data/glove/glove.6B.zip
Parsing file: data/glove/glove.6B.zip:glove.6B.100d.txt
Found 400,000 words.
Parsing vectors... Done! (W.shape = (400003, 100))


In [34]:
#Set up corpus for count vectorizer
corpus = list(df['docstrings'])

#count values for tfidf calculations
count_vect = CountVectorizer()
count_vect = count_vect.fit(corpus)
freq_term_matrix = count_vect.transform(corpus)

#to grab columns for words
vocab = count_vect.vocabulary_

#create a holder for the new df column
embeddings_df = []

In [35]:
def words_to_embed(words):
    
    global count_vect, freq_term_matrix, vocab
    
    #verify there are docstrings available
    if len(words)==0:
        return np.zeros(100)
         
    #create tfidf for each document
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    doc_freq_term = count_vect.transform([words])
    idfs = tfidf.transform(doc_freq_term)

    #split the docstrings to individual words for average
    sent_list = words.split(" ")
    embeddings = []

    #cycle through list of words in docstring
    for i in range(len(sent_list)):

        if sent_list[i] in vocab:

            col = vocab[sent_list[i]]
            embed = hands.get_vector(sent_list[i], strict=False)
            tfidf = idfs[0, col]
            embeddings.append(np.multiply(embed, tfidf))

        embed_array = np.asarray(embeddings)
        
        if len(embed_array)==0:
            return np.zeros(100)

        return np.mean(embed_array, axis=0)
    
def find_nn(words, embeddings):
    
    search = words_to_embed(words)
    distances = [scipy.spatial.distance.cosine(search, i) for i in embeddings]
    nn = np.argsort(np.asarray(distances))
    
    return nn

In [36]:
df['embeddings'] = [words_to_embed(x) for x in list(df['docstrings'])]

In [37]:
def top_n_code(search_terms, docstrings, embeddings, n):
    
    top_n = find_nn(search_terms, embeddings)[0:n]
    code = [df['content'][i] for i in top_n]
    
    return code

doc_strings = list(df['docstrings'])
embed_vecs = list(df['embeddings'])

In [38]:
search1 = "function that calculates distance"
query1 = top_n_code(search1, doc_strings, embed_vecs, 10)
print(query1[1])

  dist = 1.0 - uv / np.sqrt(uu * vv)


#!/usr/bin/python
# -*- coding: utf-8 -*-

import dateutil.parser, datetime
from bson.objectid import ObjectId
from gluon.custom_import import track_changes
from oauth import OAuth2
from oauth.storage import web2pyStorage as storage  # change to MongoStorage if you aren't using DAL
from oauth.exceptions import *
track_changes(True)

CODES = {'ok': 200}
MESSAGES = {'ok': 'success'}

def validate_access_token(f):
    """
    Function decorator which validates an access token.
    """

    from oauth.storage import web2pyStorage as storage  # change to MongoStorage if you aren't using DAL
    storage = storage()
    storage.connect()
    oauth = OAuth2(storage)
    
    response.headers['Content-Type'] = json_headers()
    response.view = json_service()

    header = request.env['http_authorization']
    token = oauth.validate_access_params(request.get_vars, request.post_vars,
                                         header)
                                    
    return f  # what does f

In [39]:
search2 = "code to merge two files"
query2 = top_n_code(search2, doc_strings, embed_vecs, 10)
print(query2[0])

  dist = 1.0 - uv / np.sqrt(uu * vv)


# Copyright (c) 2017, Henrique Miranda
# All rights reserved.
#
# This file is part of the phononwebsite project
#
""" Code the dictionary in json format """
import json
import numpy as np

class JsonEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, (np.ndarray,np.number)):
            if np.iscomplexobj(obj):
                return [obj.real, obj.imag]
            else:
                return obj.tolist()
        return(json.JSONEncoder.default(self, obj))





In [40]:
search3 = "train a neural network for image reconition"
query3 = top_n_code(search3, doc_strings, embed_vecs, 10)
print(query3[0])

  dist = 1.0 - uv / np.sqrt(uu * vv)


import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import cPickle as pickle
import copy
import json
from tqdm import tqdm

from utils.nn import NN
from utils.coco.coco import COCO
from utils.coco.pycocoevalcap.eval import COCOEvalCap
from utils.misc import ImageLoader, CaptionData, TopN

class BaseModel(object):
    def __init__(self, config):
        self.config = config
        self.is_train = True if config.phase == 'train' else False
        self.train_cnn = self.is_train and config.train_cnn
        self.image_loader = ImageLoader('./utils/ilsvrc_2012_mean.npy')
        self.image_shape = [224, 224, 3]
        self.nn = NN(config)
        self.global_step = tf.Variable(0,
                                       name = 'global_step',
                                       trainable = False)
        self.build()

    def build(self):
        raise NotImplementedError()

    def train(self, sess, train_data):
        """ Train the 

In [41]:
search4 = "list the first 100 Fibonacci Numbers"
query4 = top_n_code(search4, doc_strings, embed_vecs, 10)
print(query4[0])

  dist = 1.0 - uv / np.sqrt(uu * vv)


# Rekall Memory Forensics
# Copyright (C) 2007-2011 Volatile Systems
# Copyright 2013 Google Inc. All Rights Reserved.
#
# Additional Authors:
# Michael Cohen <scudette@users.sourceforge.net>
# Mike Auty <mike.auty@gmail.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#

# pylint: disable=protected-access

from future import standard_library


In [42]:
search5 = "semantic search tool for text"
query5 = top_n_code(search5, doc_strings, embed_vecs, 10)
print(query5[0])

  dist = 1.0 - uv / np.sqrt(uu * vv)


#     Copyright 2018, Kay Hayen, mailto:kay.hayen@gmail.com
#
#     Part of "Nuitka", an optimizing Python compiler that is compatible and
#     integrates with CPython, but also works on its own.
#
#     Licensed under the Apache License, Version 2.0 (the "License");
#     you may not use this file except in compliance with the License.
#     You may obtain a copy of the License at
#
#        http://www.apache.org/licenses/LICENSE-2.0
#
#     Unless required by applicable law or agreed to in writing, software
#     distributed under the License is distributed on an "AS IS" BASIS,
#     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#     See the License for the specific language governing permissions and
#     limitations under the License.
#
""" Syntax highlighting for Python.

Inspired/copied from by http://diotavelli.net/PyQtWiki/Python%20syntax%20highlighting
"""

from PyQt5.QtCore import (
    QRegExp  # @UnresolvedImport pylint: disable=I0021,import-err