In [1]:
import pandas as pd
import numpy as np
import pickle
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset
from datasets import load_dataset
import datasets
import torch
from collections import Counter

# import string library function  
import string

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# dataset = load_dataset("code_search_net", "all")

dataset_dict = datasets.load_from_disk("./Dataset/CodeSearchCorpus/")

In [3]:
print(torch.backends.cudnn.enabled)
print(torch.cuda.is_available()) #We have GPU on deck and ready
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

True
True
CUDA device: NVIDIA GeForce RTX 3060 Laptop GPU


In [4]:
print(len(dataset_dict["train"]))
print(len(dataset_dict["validation"]))
print(len(dataset_dict["test"]))

1880853
89154
100529


In [5]:
train_dataset = dataset_dict["train"]
train_dataset

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 1880853
})

In [6]:
test_dataset = dataset_dict["test"]
test_dataset

# Yeah, 1.8M is too much. For week 5 at least, we've decided to train on a random sample of 10k from the training, 1k validation and 1k test

# Column for semantic search: func_documentation_string
# Column for tfidf: func_code_tokens


Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 100529
})

In [7]:
np.random.seed(1)
train_subset_indices = np.random.choice(len(train_dataset), 1000, replace = False)
train_dataset_subset = train_dataset.select(train_subset_indices)

len(train_dataset_subset)

1000

In [8]:
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device

#Following code from: https://huggingface.co/learn/nlp-course/chapter5/6?fw=pt

In [9]:
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [10]:
device = torch.device("cuda")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [11]:
#From Hugging Face Tutorials
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [12]:
# Credit: https://huggingface.co/docs/datasets/use_with_pytorch
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [13]:
#Trained embeddings for semantic search portion

In [14]:
train_subset_embeddings_dataset = train_dataset_subset.map(
    lambda x: {"embeddings": get_embeddings(x["func_documentation_string"]).detach().cpu().numpy()[0]}
)

Map: 100%|██████████| 1000/1000 [00:28<00:00, 35.68 examples/s]


In [15]:
with open('./pickleObjects/train_subset_embeddings_dataset_1000.pkl', 'wb') as f:  # open a text file
    pickle.dump(train_subset_embeddings_dataset, f) # serialize the list
    f.close()

# with open('./pickleObjects/train_subset_embeddings_dataset.pkl', 'wb') as f:  # open a text file
#     pickle.dump(train_subset_embeddings_dataset, f) # serialize the list
#     f.close()


In [16]:
train_subset_embeddings_dataset

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url', 'embeddings'],
    num_rows: 1000
})

In [None]:
#Creating dictionary for tf-idf

In [17]:
train_dataset_subset[0]["func_code_tokens"]

['def',
 'load_config',
 '(',
 'path',
 ')',
 ':',
 'with',
 'path',
 '.',
 'open',
 '(',
 "'rb'",
 ')',
 'as',
 'fi',
 ':',
 'file_bytes',
 '=',
 'fi',
 '.',
 'read',
 '(',
 ')',
 'config',
 '=',
 'yaml',
 '.',
 'load',
 '(',
 'file_bytes',
 '.',
 'decode',
 '(',
 "'utf-8'",
 ')',
 ')',
 'return',
 'config']

In [18]:
tsed_DF = train_subset_embeddings_dataset.to_pandas() #train-subset-embeddings-dataset_DF

In [19]:
def clean_code_tokens(lst):
    result = string.punctuation 
    new_lst = [] 
    for character in lst:
        if character in result:
            continue
        else:
            new_lst.append(character)
    return new_lst


# # Creating inverted index based off this article: https://www.geeksforgeeks.org/inverted-index/
# def make_documents(data, col_name):
#     documents = data[col_name].dropna().apply(process_text).to_dict()
#     return documents

# def make_inverted_index(documents):
#     word_array = np.array(list(documents.values()))
#     all_words = []
#     for words in word_array:
#         all_words +=  words.split(" ")
# #     terms = dict(zip( range(len(set(all_words))),set(all_words)))
# #     return terms
#     all_words = set(all_words)
#     inverted_index = {}
    
#     for word in all_words:
#         if word != "":
#             lst_docs = []
#             for i, doc in documents.items():
#                 if word in doc.split():
#                     lst_docs.append(i)
        
#             inverted_index[word] = lst_docs
#     return inverted_index

In [20]:
# torch.cuda.empty_cache()

#Cleaned func_code_tokens and set to "clean_code_tokens"
tsed_DF["clean_code_tokens"] =  tsed_DF["func_code_tokens"].apply(clean_code_tokens)

# list(tsed_DF["clean_code_tokens"].to_dict().values())

In [21]:
documents = tsed_DF["clean_code_tokens"].to_dict()

all_words = []
for i in list(tsed_DF["clean_code_tokens"].to_dict().values()):
    all_words += i

all_words = set(all_words)
all_words

inverted_index = {}

for word in all_words:
        if word != "":
            lst_docs = []
            for i, doc in documents.items():
                if word in doc:
                    lst_docs.append(i)
        
            inverted_index[word] = lst_docs


In [22]:
#pickle inverted indices
inverted_index


# with open('./pickleObjects/inverted_index.pkl', 'wb') as f:  # open a text file
#     pickle.dump(inverted_index, f) # serialize the list
#     f.close()

with open('./pickleObjects/inverted_index_1000.pkl', 'wb') as f:  # open a text file
    pickle.dump(inverted_index, f) # serialize the list
    f.close()

In [None]:
string.punctuation

In [26]:
def make_tfidf_DF(documents, inverted_index, total_vocab):
    tf_idf = {}
    df = pd.DataFrame()
    for i, doc in documents.items():
        term_lst = []
        for term in total_vocab:
            # doc_lst = doc.split()
            tf = doc.count(term) / len(doc)

            idf = np.log(len(documents) / len(inverted_index[term]))
    #         if tf*idf > 0:
    #             print(tf*idf)
    #             print(term)
            term_lst.append(tf*idf)
            tf_idf[i, term] = tf*idf
        df[i] = term_lst
    return df

In [27]:
total_vocab = [x for x in inverted_index]

tfidf_DF = make_tfidf_DF(documents, inverted_index, total_vocab)

  df[i] = term_lst


In [25]:
len(total_vocab) * 1000


12330000

In [34]:
tfidf_DF

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
#Process query. Make it into a vector of tf-idfs
def process_query(s, inverted_index, total_vocab, documents):
    
#     print(processed_s)
    lst_words = s.split()
#     print(lst_words)
    q = np.zeros(len(total_vocab))
#     print(len(q))
    counter = Counter(lst_words)
    for word in np.unique(lst_words):
        if word in inverted_index:
            tf = counter[word] / len(lst_words)
            df = len(inverted_index[word])
            idf = np.log(len(documents) / df)
            q[total_vocab.index(word)] = tf*idf
    
    return q


In [39]:
process_query("flatten nested loop python", inverted_index, total_vocab, documents)

array([0., 0., 0., ..., 0., 0., 0.])

In [40]:
#Got from William Scott https://github.com/williamscott701/Information-Retrieval/blob/master/2.%20TF-IDF%20Ranking%20-%20Cosine%20Similarity%2C%20Matching%20Score/TF-IDF.ipynb
def cosine_sim(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

In [52]:
tsed_DF

Unnamed: 0,repository_name,func_path_in_repository,func_name,whole_func_string,language,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens,split_name,func_code_url,embeddings,clean_code_tokens
0,dillonhicks/rekt,rekt/utils.py,load_config,"def load_config(path):\n """"""\n Loads a yam...",python,"def load_config(path):\n """"""\n Loads a yam...","[def, load_config, (, path, ), :, with, path, ...",Loads a yaml configuration.\n\n :param path:...,"[Loads, a, yaml, configuration, .]",train,https://github.com/dillonhicks/rekt/blob/3848b...,"[-0.057537895, -0.52463245, -0.03275, -0.05936...","[def, load_config, path, with, path, open, 'rb..."
1,google/closure-compiler,src/com/google/javascript/jscomp/DotFormatter....,DotFormatter.appendDot,"static void appendDot(Node n, ControlFlowGraph...",java,"static void appendDot(Node n, ControlFlowGraph...","[static, void, appendDot, (, Node, n, ,, Contr...",Converts an AST to dot representation and appe...,"[Converts, an, AST, to, dot, representation, a...",train,https://github.com/google/closure-compiler/blo...,"[-0.13995743, -0.6146675, -0.11137302, 0.05012...","[static, void, appendDot, Node, n, ControlFlow..."
2,OpenLiberty/open-liberty,dev/com.ibm.ws.messaging.common/src/com/ibm/ws...,ControlHighestGeneratedTickImpl.getTraceSummar...,public void getTraceSummaryLine(StringBuilder ...,java,public void getTraceSummaryLine(StringBuilder ...,"[public, void, getTraceSummaryLine, (, StringB...",/*\nGet summary trace line for this message\n\...,"[/, *, Get, summary, trace, line, for, this, m...",train,https://github.com/OpenLiberty/open-liberty/bl...,"[-0.17273049, -0.3561717, -0.11823353, 0.12623...","[public, void, getTraceSummaryLine, StringBuil..."
3,keybase/client,go/kbfs/libdokan/dir.go,CanDeleteDirectory,func (d *Dir) CanDeleteDirectory(ctx context.C...,go,func (d *Dir) CanDeleteDirectory(ctx context.C...,"[func, (, d, *, Dir, ), CanDeleteDirectory, (,...",// CanDeleteDirectory - return just nil\n// TO...,"[CanDeleteDirectory, -, return, just, nil, TOD...",train,https://github.com/keybase/client/blob/b352622...,"[0.12481375, 0.041516956, -0.22884864, -0.0181...","[func, d, Dir, CanDeleteDirectory, ctx, contex..."
4,jblas-project/jblas,src/main/java/org/jblas/Solve.java,Solve.pinv,public static DoubleMatrix pinv(DoubleMatrix A...,java,public static DoubleMatrix pinv(DoubleMatrix A...,"[public, static, DoubleMatrix, pinv, (, Double...","Computes the pseudo-inverse.\n\nNote, this fun...","[Computes, the, pseudo, -, inverse, .]",train,https://github.com/jblas-project/jblas/blob/28...,"[0.101087354, -0.43265706, -0.16817348, -0.188...","[public, static, DoubleMatrix, pinv, DoubleMat..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,brocade/pynos,pynos/versions/ver_6/ver_6_0_1/yang/brocade_nt...,brocade_ntp.show_ntp_output_node_active_server...,def show_ntp_output_node_active_server_rbridge...,python,def show_ntp_output_node_active_server_rbridge...,"[def, show_ntp_output_node_active_server_rbrid...",Auto Generated Code,"[Auto, Generated, Code]",train,https://github.com/brocade/pynos/blob/bd8a34e9...,"[-0.02366774, -0.3845483, -0.25812605, 0.00860...","[def, show_ntp_output_node_active_server_rbrid..."
996,sixty-north/cosmic-ray,src/cosmic_ray/cli.py,handle_exec,"def handle_exec(args):\n """"""usage: cosmic-r...",python,"def handle_exec(args):\n """"""usage: cosmic-r...","[def, handle_exec, (, args, ), :, session_file...",usage: cosmic-ray exec <session-file>\n\n P...,"[usage, :, cosmic, -, ray, exec, <session, -, ...",train,https://github.com/sixty-north/cosmic-ray/blob...,"[-0.1944713, -0.41359708, -0.14166526, -0.3504...","[def, handle_exec, args, session_file, get_db_..."
997,wonambi-python/wonambi,wonambi/attr/annotations.py,Annotations.set_stage_for_epoch,"def set_stage_for_epoch(self, epoch_start, nam...",python,"def set_stage_for_epoch(self, epoch_start, nam...","[def, set_stage_for_epoch, (, self, ,, epoch_s...",Change the stage for one specific epoch.\n\n ...,"[Change, the, stage, for, one, specific, epoch...",train,https://github.com/wonambi-python/wonambi/blob...,"[-0.5645278, 0.013677543, -0.056194477, -0.112...","[def, set_stage_for_epoch, self, epoch_start, ..."
998,titon/toolkit,dist/toolkit.js,,"function(element, options) {\n element ...",javascript,"function(element, options) {\n element ...","[function, (, element, ,, options, ), {, eleme...",Initialize the pin.\n\n@param {jQuery} element...,"[Initialize, the, pin, .]",train,https://github.com/titon/toolkit/blob/f0ed36d1...,"[-0.092111215, -0.49511465, -0.1434023, -0.178...","[function, element, options, element, this, se..."


In [82]:
def find_best_matches(query, k, alpha = 0.5):
    q_vector = process_query(query, inverted_index, total_vocab, documents)
    q_embedding_vector = get_embeddings([query]).cpu().detach().numpy()[0]
    
  
#     print(len(q_vector_space))
#     print(len(q_vector_title))
    
    cosine_lst = [[x,0] for x in range(len(tfidf_DF.columns.to_list()))]

#     print(len(cosine_lst_title))
#     print(len(cosine_lst_space))
    
    for x in tfidf_DF.columns.to_list():
        col = tfidf_DF[x].to_numpy()
        # Tensor.cpu()
        embedding = tsed_DF.iloc[x]["embeddings"]

        cosine_lst[x] = [x, (alpha) * cosine_sim(q_vector, col) + (1 - alpha) * cosine_sim(q_embedding_vector, embedding)]
    
    
    cosine_lst.sort(reverse = True, key = lambda x: x[1])
    return cosine_lst[:k]

In [83]:
for lst in find_best_matches("How to split string by newline PYTHON", 10, 0.2):
    print(tsed_DF.iloc[lst[0]]["func_name"])
    print(tsed_DF.iloc[lst[0]]["func_documentation_string"])
    print(f"SCORE: {lst[1]}")
    print("-" * 100)

Split.apply
/*
split(input, delimiter = ' ')

Split a string on a matching pattern

E.g. {{ "a~b" | split:'~' | first }} #=> 'a'
SCORE: 0.5327710579078077
----------------------------------------------------------------------------------------------------
explicit_line_join
r"""Avoid explicit line join between brackets.

    The preferred way of wrapping long lines is by using Python's implied line
    continuation inside parentheses, brackets and braces.  Long lines can be
    broken over multiple lines by wrapping expressions in parentheses.  These
    should be used in preference to using a backslash for line continuation.

    E502: aaa = [123, \\n       123]
    E502: aaa = ("bbb " \\n       "ccc")

    Okay: aaa = [123,\n       123]
    Okay: aaa = ("bbb "\n       "ccc")
    Okay: aaa = "bbb " \\n    "ccc"
    Okay: aaa = 123  # \\
SCORE: 0.4863744735717774
----------------------------------------------------------------------------------------------------
StringCharacterIterator

In [70]:
"can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first."


"can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first."