In [1]:
import pandas as pd
import numpy as np
import pickle
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset
from datasets import load_dataset
import datasets
import torch
from collections import Counter

# import string library function  
import string

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# dataset = load_dataset("code_search_net", "all")

dataset_dict = datasets.load_from_disk("./Dataset/CodeSearchCorpus/")

In [3]:
print(torch.backends.cudnn.enabled)
print(torch.cuda.is_available()) #We have GPU on deck and ready
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

True
True
CUDA device: NVIDIA GeForce RTX 3060 Laptop GPU


In [4]:
print(len(dataset_dict["train"]))
print(len(dataset_dict["validation"]))
print(len(dataset_dict["test"]))

1880853
89154
100529


In [5]:
train_dataset = dataset_dict["train"]
train_dataset

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 1880853
})

In [6]:
test_dataset = dataset_dict["test"]
test_dataset

# Yeah, 1.8M is too much. For week 5 at least, we've decided to train on a random sample of 10k from the training, 1k validation and 1k test

# Column for semantic search: func_documentation_string
# Column for tfidf: func_code_tokens


Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 100529
})

In [7]:
num_rows = 10000

In [8]:
np.random.seed(1)
train_subset_indices = np.random.choice(len(train_dataset), num_rows, replace = False)
train_dataset_subset = train_dataset.select(train_subset_indices)

len(train_dataset_subset)

10000

In [9]:
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device

#Following code from: https://huggingface.co/learn/nlp-course/chapter5/6?fw=pt

In [10]:
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [11]:
device = torch.device("cuda")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [12]:
#From Hugging Face Tutorials
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [13]:
# Credit: https://huggingface.co/docs/datasets/use_with_pytorch
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [14]:
#Trained embeddings for semantic search portion

In [15]:
#Train embeddings

#REMEMBER TO KEEP THE FILENAMES THE SAME 0_0
try:
    with open('./pickleObjects/train_subset_embeddings_dataset.pkl', 'rb') as f:  # open a text file
        train_subset_embeddings_dataset = pickle.load(f) # serialize the list
        f.close()
except:
    train_subset_embeddings_dataset = train_dataset_subset.map(
        lambda x: {"embeddings": get_embeddings(x["func_documentation_string"]).detach().cpu().numpy()[0]}
    )

    with open('./pickleObjects/train_subset_embeddings_dataset.pkl', 'wb') as f:  # open a text file
        pickle.dump(train_subset_embeddings_dataset, f) # serialize the list
        f.close()



Map: 100%|██████████| 10000/10000 [04:35<00:00, 36.36 examples/s]


In [None]:

# with open('./pickleObjects/train_subset_embeddings_dataset.pkl', 'wb') as f:  # open a text file
#     pickle.dump(train_subset_embeddings_dataset, f) # serialize the list
#     f.close()


In [16]:
train_subset_embeddings_dataset

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url', 'embeddings'],
    num_rows: 10000
})

In [None]:
#Creating dictionary for tf-idf

In [17]:
train_dataset_subset[0]["func_code_tokens"]

['def',
 'load_config',
 '(',
 'path',
 ')',
 ':',
 'with',
 'path',
 '.',
 'open',
 '(',
 "'rb'",
 ')',
 'as',
 'fi',
 ':',
 'file_bytes',
 '=',
 'fi',
 '.',
 'read',
 '(',
 ')',
 'config',
 '=',
 'yaml',
 '.',
 'load',
 '(',
 'file_bytes',
 '.',
 'decode',
 '(',
 "'utf-8'",
 ')',
 ')',
 'return',
 'config']

In [18]:
tsed_DF = train_subset_embeddings_dataset.to_pandas() #train-subset-embeddings-dataset_DF

In [19]:
def clean_code_tokens(lst):
    result = string.punctuation 
    new_lst = [] 
    for character in lst:
        if character in result:
            continue
        else:
            new_lst.append(character)
    return new_lst


# # Creating inverted index based off this article: https://www.geeksforgeeks.org/inverted-index/
# def make_documents(data, col_name):
#     documents = data[col_name].dropna().apply(process_text).to_dict()
#     return documents

# def make_inverted_index(documents):
#     word_array = np.array(list(documents.values()))
#     all_words = []
#     for words in word_array:
#         all_words +=  words.split(" ")
# #     terms = dict(zip( range(len(set(all_words))),set(all_words)))
# #     return terms
#     all_words = set(all_words)
#     inverted_index = {}
    
#     for word in all_words:
#         if word != "":
#             lst_docs = []
#             for i, doc in documents.items():
#                 if word in doc.split():
#                     lst_docs.append(i)
        
#             inverted_index[word] = lst_docs
#     return inverted_index

In [20]:
# torch.cuda.empty_cache()

#Cleaned func_code_tokens and set to "clean_code_tokens"
tsed_DF["clean_code_tokens"] =  tsed_DF["func_code_tokens"].apply(clean_code_tokens)

# list(tsed_DF["clean_code_tokens"].to_dict().values())

### Much of this code was based off of William Scott's implementation of TF-IDF: https://github.com/williamscott701/Information-Retrieval/blob/master/2.%20TF-IDF%20Ranking%20-%20Cosine%20Similarity%2C%20Matching%20Score/TF-IDF.ipynb


In [22]:
documents = tsed_DF["clean_code_tokens"].to_dict()

all_words = []
for i in list(tsed_DF["clean_code_tokens"].to_dict().values()):
    all_words += i

all_words = list(set(all_words)) #Get rid of all repeats
all_words

try:
     with open('./pickleObjects/inverted_index.pkl', 'rb') as f:
        inverted_index = pickle.load(f) # deserialize using load()
        f.close()
except:
    inverted_index = {}

    for word in all_words:
            if word != "":
                lst_docs = []
                for i, doc in documents.items():
                    if word in doc:
                        lst_docs.append(i)
            
                inverted_index[word] = lst_docs
    
    #Pickle afterwards
    with open('./pickleObjects/inverted_index.pkl', 'wb') as f:  # open a text file
        pickle.dump(inverted_index, f) # serialize the list
        f.close()

In [24]:
len(all_words) == len(inverted_index)

True

In [25]:
#pickle inverted indices
inverted_index

{'# Try next leader or partition': [7286],
 'GuacamoleUpstreamNotFoundException': [4376],
 "'601'": [8705],
 'GetVpnSitesConfigurationRequest': [2685],
 '// Create SystemConfiguration and start': [8110],
 '/** @var $charset string */': [1899],
 'arguments': [132,
  263,
  264,
  553,
  579,
  879,
  896,
  1022,
  1195,
  1422,
  1461,
  1639,
  2005,
  2022,
  2226,
  2474,
  2555,
  2574,
  2587,
  2633,
  2707,
  2896,
  2975,
  3167,
  3180,
  3622,
  3625,
  3632,
  3672,
  3938,
  3963,
  4067,
  4070,
  4359,
  4503,
  4747,
  5074,
  5101,
  5180,
  6023,
  6215,
  6262,
  6348,
  7021,
  7185,
  7444,
  7519,
  7619,
  7689,
  7834,
  7929,
  8031,
  8168,
  8504,
  8714,
  8844,
  8873,
  8881,
  9377,
  9563,
  9613,
  9647,
  9675,
  9969,
  9996],
 'file_index': [980],
 'FrontendController': [4545],
 'FileDescriptorProto': [7895],
 'usePath': [9088],
 'otherTable': [7134],
 'neighbourTile': [6729],
 'chrom': [9469],
 '// Remove unreferenced children': [1391],
 'oldSchema':

In [26]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [28]:
def doc_freq(word):
    c = 0
    try:
        c = inverted_index[word]
    except:
        pass

    if type(c) == list:
        return len(c)
    else:
        return 0



#number of rows sampled
tf_idf = {}
for i in range(num_rows):
    # print(i)
    tokens = tsed_DF["clean_code_tokens"].iloc[i]
    counter = Counter(tokens)
    words_count = len(tokens)

    for token in np.unique(tokens):
        tf = counter[token] / words_count
        df = doc_freq(token)
        idf = np.log((num_rows + 1) / (df + 1))

        tf_idf[i, token] = tf * idf


In [29]:
tf_idf

{(0, "'rb'"): 0.3406272547089073,
 (0, "'utf-8'"): 0.2888226581245685,
 (0, 'as'): 0.12951835802230802,
 (0, 'config'): 0.3657480782054899,
 (0, 'decode'): 0.26295983241975446,
 (0, 'def'): 0.0701668538425268,
 (0, 'fi'): 0.6812545094178146,
 (0, 'file_bytes'): 0.8517293186416572,
 (0, 'load'): 0.23780465353615043,
 (0, 'load_config'): 0.4258646593208286,
 (0, 'open'): 0.23178646942365666,
 (0, 'path'): 0.3156001020707146,
 (0, 'read'): 0.23076602584209632,
 (0, 'return'): 0.013320602085796086,
 (0, 'with'): 0.22114743120972352,
 (0, 'yaml'): 0.34539276369912353,
 (1, 'Appendable'): 0.4506571154615781,
 (1, 'ControlFlowGraph'): 0.4731829548009206,
 (1, 'DotFormatter'): 0.4731829548009206,
 (1, 'IOException'): 0.2062928967601841,
 (1, 'Node'): 0.5648117757701582,
 (1, 'appendDot'): 0.4731829548009206,
 (1, 'builder'): 0.5595614552312088,
 (1, 'false'): 0.12308835450796916,
 (1, 'inCFG'): 0.9463659096018412,
 (1, 'n'): 0.46444471617600896,
 (1, 'new'): 0.08409851585833636,
 (1, 'static')

In [27]:
# def make_tfidf_DF(documents, inverted_index, total_vocab):
#     tf_idf = {}
#     df = pd.DataFrame()
#     for i, doc in documents.items():
#         term_lst = []
#         for term in total_vocab:
#             # doc_lst = doc.split()
#             tf = doc.count(term) / len(doc)

#             idf = np.log(len(documents) / len(inverted_index[term]))
#     #         if tf*idf > 0:
#     #             print(tf*idf)
#     #             print(term)
#             term_lst.append(tf*idf)
#             tf_idf[i, term] = tf*idf
#         df[i] = term_lst
#     return df


In [30]:
all_words_dict = dict(zip(all_words, range(len(all_words))))


tf_idf_array = np.zeros((num_rows, len(all_words)))

for i in tf_idf:
    try:
        ind = all_words_dict[i[1]]
        tf_idf_array[i[0]][ind] = tf_idf[i]
    except:
        pass

In [36]:
def gen_vector(s):
    # This is where we'd do more processing of the query
    tokens = s.split()

    q_vector = np.zeros((len(all_words)))
    
    counter = Counter(tokens)
    words_count = len(tokens)

    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((num_rows+1)/(df+1))

        try:
            ind = all_words_dict[token]
            q_vector[ind] = tf*idf
        except:
            pass
    return q_vector

In [48]:
gen_vector("pandas how to select first 10 rows").shape

(98114,)

In [50]:
tf_idf_array[0].shape

(98114,)

In [37]:
# #Process query. Make it into a vector of tf-idfs
# def process_query(s, inverted_index, total_vocab, documents):
    
# #     print(processed_s)
#     lst_words = s.split()
# #     print(lst_words)
#     q = np.zeros(len(total_vocab))
# #     print(len(q))
#     counter = Counter(lst_words)
#     for word in np.unique(lst_words):
#         if word in inverted_index:
#             tf = counter[word] / len(lst_words)
#             df = len(inverted_index[word])
#             idf = np.log(len(documents) / df)
#             q[total_vocab.index(word)] = tf*idf
    
#     return q


In [38]:
# process_query("flatten nested loop python", inverted_index, total_vocab, documents)

In [39]:
#Got from William Scott https://github.com/williamscott701/Information-Retrieval/blob/master/2.%20TF-IDF%20Ranking%20-%20Cosine%20Similarity%2C%20Matching%20Score/TF-IDF.ipynb
def cosine_sim(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

In [40]:
tsed_DF

Unnamed: 0,repository_name,func_path_in_repository,func_name,whole_func_string,language,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens,split_name,func_code_url,embeddings,clean_code_tokens
0,dillonhicks/rekt,rekt/utils.py,load_config,"def load_config(path):\n """"""\n Loads a yam...",python,"def load_config(path):\n """"""\n Loads a yam...","[def, load_config, (, path, ), :, with, path, ...",Loads a yaml configuration.\n\n :param path:...,"[Loads, a, yaml, configuration, .]",train,https://github.com/dillonhicks/rekt/blob/3848b...,"[-0.057537895, -0.52463245, -0.03275, -0.05936...","[def, load_config, path, with, path, open, 'rb..."
1,google/closure-compiler,src/com/google/javascript/jscomp/DotFormatter....,DotFormatter.appendDot,"static void appendDot(Node n, ControlFlowGraph...",java,"static void appendDot(Node n, ControlFlowGraph...","[static, void, appendDot, (, Node, n, ,, Contr...",Converts an AST to dot representation and appe...,"[Converts, an, AST, to, dot, representation, a...",train,https://github.com/google/closure-compiler/blo...,"[-0.13995743, -0.6146675, -0.11137302, 0.05012...","[static, void, appendDot, Node, n, ControlFlow..."
2,OpenLiberty/open-liberty,dev/com.ibm.ws.messaging.common/src/com/ibm/ws...,ControlHighestGeneratedTickImpl.getTraceSummar...,public void getTraceSummaryLine(StringBuilder ...,java,public void getTraceSummaryLine(StringBuilder ...,"[public, void, getTraceSummaryLine, (, StringB...",/*\nGet summary trace line for this message\n\...,"[/, *, Get, summary, trace, line, for, this, m...",train,https://github.com/OpenLiberty/open-liberty/bl...,"[-0.17273049, -0.3561717, -0.11823353, 0.12623...","[public, void, getTraceSummaryLine, StringBuil..."
3,keybase/client,go/kbfs/libdokan/dir.go,CanDeleteDirectory,func (d *Dir) CanDeleteDirectory(ctx context.C...,go,func (d *Dir) CanDeleteDirectory(ctx context.C...,"[func, (, d, *, Dir, ), CanDeleteDirectory, (,...",// CanDeleteDirectory - return just nil\n// TO...,"[CanDeleteDirectory, -, return, just, nil, TOD...",train,https://github.com/keybase/client/blob/b352622...,"[0.12481375, 0.041516956, -0.22884864, -0.0181...","[func, d, Dir, CanDeleteDirectory, ctx, contex..."
4,jblas-project/jblas,src/main/java/org/jblas/Solve.java,Solve.pinv,public static DoubleMatrix pinv(DoubleMatrix A...,java,public static DoubleMatrix pinv(DoubleMatrix A...,"[public, static, DoubleMatrix, pinv, (, Double...","Computes the pseudo-inverse.\n\nNote, this fun...","[Computes, the, pseudo, -, inverse, .]",train,https://github.com/jblas-project/jblas/blob/28...,"[0.101087354, -0.43265706, -0.16817348, -0.188...","[public, static, DoubleMatrix, pinv, DoubleMat..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,tsnaomi/finnsyll,finnsyll/prev/v01.py,apply_T2,def apply_T2(word):\n '''There is a syllabl...,python,def apply_T2(word):\n '''There is a syllabl...,"[def, apply_T2, (, word, ), :, WORD, =, _split...",There is a syllable boundary within a sequence...,"[There, is, a, syllable, boundary, within, a, ...",train,https://github.com/tsnaomi/finnsyll/blob/6a427...,"[0.3334481, -0.9179637, -0.19431245, -0.049634...","[def, apply_T2, word, WORD, _split_consonants_..."
9996,novemberborn/legendary,lib/concurrent.js,sequence,function sequence(arrayOfTasks) {\n return se...,javascript,function sequence(arrayOfTasks) {\n return se...,"[function, sequence, (, arrayOfTasks, ), {, re...",**Thenables returned by tasks are not assimila...,"[**, Thenables, returned, by, tasks, are, not,...",train,https://github.com/novemberborn/legendary/blob...,"[-0.11193894, -0.4076155, -0.21008785, 0.01438...","[function, sequence, arrayOfTasks, return, ser..."
9997,chippyash/Currency,src/Chippyash/Currency/Factory.php,Factory.create,"public static function create($code, $value = ...",php,"public static function create($code, $value = ...","[public, static, function, create, (, $, code,...",Create a currency\n\n@param string $code Curr...,"[Create, a, currency]",train,https://github.com/chippyash/Currency/blob/d2c...,"[-0.27575833, -0.28741768, -0.1733198, 0.36597...","[public, static, function, create, code, value..."
9998,vesln/r...e,lib/range.js,Range,"function Range(min, max, step) {\n var scope ...",javascript,"function Range(min, max, step) {\n var scope ...","[function, Range, (, min, ,, max, ,, step, ), ...",Range constructor.\n\n@param {Number|String} m...,"[Range, constructor, .]",train,https://github.com/vesln/r...e/blob/e252b3b634...,"[-0.33972165, -0.54678905, -0.29340428, -0.123...","[function, Range, min, max, step, var, scope, ..."


In [51]:
def find_best_matches(query, k, alpha = 0.5):
    q_vector = gen_vector(query)
    q_embedding_vector = get_embeddings([query]).cpu().detach().numpy()[0]
    
    print(q_vector)
    print(q_vector.shape)
#     print(len(q_vector_space))
#     print(len(q_vector_title))
    
    cosine_lst = [[x,0] for x in range(num_rows)]

#     print(len(cosine_lst_title))
#     print(len(cosine_lst_space))
    
    for i, x in enumerate(tf_idf_array):
        # col = tfidf_DF[x].to_numpy()
        # Tensor.cpu()
        embedding = tsed_DF.iloc[i]["embeddings"]

        cosine_lst[i] = [i, (alpha) * cosine_sim(q_vector, x) + (1 - alpha) * cosine_sim(q_embedding_vector, embedding)]
    
    
    cosine_lst.sort(reverse = True, key = lambda x: x[1])
    return cosine_lst[:k]

In [52]:
for lst in find_best_matches("How to split string by newline PYTHON", 10, 0.2):
    print(tsed_DF.iloc[lst[0]]["func_name"])
    print(tsed_DF.iloc[lst[0]]["func_documentation_string"])
    print(f"SCORE: {lst[1]}")
    print("-" * 100)

[0. 0. 0. ... 0. 0. 0.]
(98114,)
Split.apply
/*
split(input, delimiter = ' ')

Split a string on a matching pattern

E.g. {{ "a~b" | split:'~' | first }} #=> 'a'
SCORE: 0.5274941957273278
----------------------------------------------------------------------------------------------------
cut
Split a string into a list of N characters each.

    .. code:: python

        reusables.cut("abcdefghi")
        # ['ab', 'cd', 'ef', 'gh', 'i']

    trailing gives you the following options:

    * normal: leaves remaining characters in their own last position
    * remove: return the list without the remainder characters
    * combine: add the remainder characters to the previous set
    * error: raise an IndexError if there are remaining characters

    .. code:: python

        reusables.cut("abcdefghi", 2, "error")
        # Traceback (most recent call last):
        #     ...
        # IndexError: String of length 9 not divisible by 2 to splice

        reusables.cut("abcdefghi", 2, "remove

In [None]:
"can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first."
