In [2]:
import pandas as pd
import numpy as np
import pickle
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset
from datasets import load_dataset
import datasets
import torch
from collections import Counter
import string

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# dataset = load_dataset("code_search_net", "all")

dataset_dict = datasets.load_from_disk("./Dataset/CodeSearchCorpus/")

In [4]:
# Testing if the pytorch GPU functions work
print(torch.backends.cudnn.enabled)
print(torch.cuda.is_available()) #We have GPU on deck and ready
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

True
True
CUDA device: NVIDIA GeForce RTX 3060 Laptop GPU


In [5]:
# Seeing the size of the CodeSearchNet database
print(len(dataset_dict["train"]))
print(len(dataset_dict["validation"]))
print(len(dataset_dict["test"]))

1880853
89154
100529


In [6]:
# Taking only the training dataset
train_dataset = dataset_dict["train"]
train_dataset

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 1880853
})

In [7]:
# Seeing the test_dataset
test_dataset = dataset_dict["test"]
test_dataset

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 100529
})

Yeah, 1.8M is too much. For week 5 at least, we've decided to train on a random sample of 10k from the training, 1k validation and 1k test

Column for semantic search: func_documentation_string
Column for tfidf: func_code_tokens

In [8]:
# Seeing what one sample row of the training dataset is like
train_dataset[0]

{'repository_name': 'ageitgey/face_recognition',
 'func_path_in_repository': 'examples/face_recognition_knn.py',
 'func_name': 'train',
 'whole_func_string': 'def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo=\'ball_tree\', verbose=False):\n    """\n    Trains a k-nearest neighbors classifier for face recognition.\n\n    :param train_dir: directory that contains a sub-directory for each known person, with its name.\n\n     (View in source code to see train_dir example tree structure)\n\n     Structure:\n        <train_dir>/\n        ├── <person1>/\n        │   ├── <somename1>.jpeg\n        │   ├── <somename2>.jpeg\n        │   ├── ...\n        ├── <person2>/\n        │   ├── <somename1>.jpeg\n        │   └── <somename2>.jpeg\n        └── ...\n\n    :param model_save_path: (optional) path to save model on disk\n    :param n_neighbors: (optional) number of neighbors to weigh in classification. Chosen automatically if not specified\n    :param knn_algo: (optional) unde

In [9]:
# Decide number of rows, the filepath to where to store the pickle files
# The pickled objects are are the inverted index and embeddings dataset

num_rows = 20000
filepath_pkl_obj = "./PickleObjects/"
inverted_index_name = f"inverted_index_{num_rows}.pkl"
tsed_name = f"train_subset_embeddings_dataset_{num_rows}.pkl"

print(inverted_index_name, tsed_name)

inverted_index_20000.pkl train_subset_embeddings_dataset_20000.pkl


In [10]:
# Taking a sample of the training dataset
# There are SO MANY PROBLEMS WHEN WE DO THIS THO, need to ask colin what to do i suppose?

np.random.seed(1)
train_subset_indices = np.random.choice(len(train_dataset), num_rows, replace = False)
train_dataset_subset = train_dataset.select(train_subset_indices)

len(train_dataset_subset)

20000

### Semantic Embeddings Portion

In [11]:
#Following code from: https://huggingface.co/learn/nlp-course/chapter5/6?fw=pt
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [12]:
# Load the model to the GPU. Mine is a 3060
device = torch.device("cuda")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [13]:
#From Hugging Face Tutorials
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [13]:
#Train embeddings
#If the filename exists, load the pickle object. If not, train it and then save it as a pickle object
#REMEMBER TO KEEP THE FILENAMES THE SAME 0_0
try:
    with open(f'{filepath_pkl_obj}{tsed_name}', 'rb') as f:  # open a text file
        train_subset_embeddings_dataset = pickle.load(f) # serialize the list
        f.close()
except:
    train_subset_embeddings_dataset = train_dataset_subset.map(
        lambda x: {"embeddings": get_embeddings(x["func_documentation_string"]).detach().cpu().numpy()[0]}
    )

    with open(f'{filepath_pkl_obj}{tsed_name}', 'wb') as f:  # open a text file
        pickle.dump(train_subset_embeddings_dataset, f) # serialize the list
        f.close()



In [19]:
train_subset_embeddings_dataset

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url', 'embeddings'],
    num_rows: 20000
})

### TF-IDF Portion

In [14]:
train_dataset_subset[0]["func_code_tokens"]

['def',
 'load_config',
 '(',
 'path',
 ')',
 ':',
 'with',
 'path',
 '.',
 'open',
 '(',
 "'rb'",
 ')',
 'as',
 'fi',
 ':',
 'file_bytes',
 '=',
 'fi',
 '.',
 'read',
 '(',
 ')',
 'config',
 '=',
 'yaml',
 '.',
 'load',
 '(',
 'file_bytes',
 '.',
 'decode',
 '(',
 "'utf-8'",
 ')',
 ')',
 'return',
 'config']

In [15]:
# Convert the embeddings to a pandas dataframe
tsed_DF = train_subset_embeddings_dataset.to_pandas()

In [16]:
# function to clean the code tokens. Super rudimentary, 
# as of right now, we're just taking rid of the single punctuation
def clean_code_tokens(lst):
    result = string.punctuation 
    new_lst = [] 
    for character in lst:
        if character in result:
            continue
        else:
            new_lst.append(character)
    return new_lst

In [17]:
# creating a column of "clean" code tokens
# There's many many issues with this strategy
tsed_DF["clean_code_tokens"] =  tsed_DF["func_code_tokens"].apply(clean_code_tokens)

##### Much of this code was based off of William Scott's implementation of TF-IDF: https://github.com/williamscott701/Information-Retrieval/blob/master/2.%20TF-IDF%20Ranking%20-%20Cosine%20Similarity%2C%20Matching%20Score/TF-IDF.ipynb


In [19]:
# Creates s list of documents
documents = tsed_DF["clean_code_tokens"].to_dict()

# Compiles a list of the words 
all_words = []
for i in list(tsed_DF["clean_code_tokens"].to_dict().values()):
    all_words += i

#convert all words to a set, eliminates, duplicates
all_words = list(set(all_words)) #Get rid of all repeats
all_words

# Create the inverted index if its not in a pickle file (and save it)
try:
     with open(f'{filepath_pkl_obj}{inverted_index_name}', 'rb') as f:
        inverted_index = pickle.load(f) # deserialize using load()
        f.close()
except:
    inverted_index = {}

    for word in all_words:
            if word != "":
                lst_docs = []
                for i, doc in documents.items():
                    if word in doc:
                        lst_docs.append(i)
            
                inverted_index[word] = lst_docs
    
    #Pickle afterwards
    with open(f'{filepath_pkl_obj}{inverted_index_name}', 'wb') as f:  # open a text file
        pickle.dump(inverted_index, f) # serialize the list
        f.close()

In [20]:
# sanity check that the inverted index is right
len(all_words) == len(inverted_index)

True

In [21]:
#pickle inverted indices
# Note: this is bad, that's an absurd amount of unique tokens. For code, 
# There needs to be much better tokenization or look into pre-trained embeddings
len(inverted_index)

175692

In [22]:
# Function to get the document frequency of a word/token
def doc_freq(word):
    c = 0
    try:
        c = inverted_index[word]
    except:
        pass

    if type(c) == list:
        return len(c)
    else:
        return 0



# Creating a tf_idf object. WILL TURN THIS INTO A FUNCTION LATER
tf_idf = {}
for i in range(num_rows):
    # print(i)
    tokens = tsed_DF["clean_code_tokens"].iloc[i]
    counter = Counter(tokens)
    words_count = len(tokens)

    for token in np.unique(tokens):
        tf = counter[token] / words_count
        df = doc_freq(token)
        idf = np.log((num_rows + 1) / (df + 1))

        tf_idf[i, token] = tf * idf


In [34]:
# Turning the tf_idf object into a numpy array for much faster calculations
all_words_dict = dict(zip(all_words, range(len(all_words))))


tf_idf_array = np.zeros((num_rows, len(all_words)))

for i in tf_idf:
    try:
        ind = all_words_dict[i[1]]
        tf_idf_array[i[0]][ind] = tf_idf[i]
    except:
        pass

In [23]:
#Function which given a query, returns in a tf_idf vector
def gen_vector(s):
    # This is where we'd do more processing of the query
    tokens = s.split()

    q_vector = np.zeros((len(all_words)))
    
    counter = Counter(tokens)
    words_count = len(tokens)

    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((num_rows+1)/(df+1))

        try:
            ind = all_words_dict[token]
            q_vector[ind] = tf*idf
        except:
            pass
    return q_vector

In [25]:
#Making sure the method returns a vector of (len(all_words),) dimensions
gen_vector("pandas how to select first 10 rows").shape

(175692,)

In [26]:
#Function for cosine_similarity
def cosine_sim(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

In [27]:
# Method to find the best match
# query param: the string query
# k param: the k number of results to return
# alpha: the value which determines the linear split alpha * tfidf portion + (1-alpha)*semantic search portion
def find_best_matches(query, k, alpha = 0.5):
    q_vector = gen_vector(query)
    q_embedding_vector = get_embeddings([query]).cpu().detach().numpy()[0]
    
    # print(q_vector)
    # print(q_vector.shape)
#     print(len(q_vector_space))
#     print(len(q_vector_title))
    
    cosine_lst = [[x,0] for x in range(num_rows)]

#     print(len(cosine_lst_title))
#     print(len(cosine_lst_space))
    
    for i, x in enumerate(tf_idf_array):
        # col = tfidf_DF[x].to_numpy()
        # Tensor.cpu()
        embedding = tsed_DF.iloc[i]["embeddings"]

        cosine_lst[i] = [i, (alpha) * cosine_sim(q_vector, x) + (1 - alpha) * cosine_sim(q_embedding_vector, embedding)]
    
    
    cosine_lst.sort(reverse = True, key = lambda x: x[1])
    return cosine_lst[:k]

In [None]:
# Function which runs all 99 queries, and returns a pd df of the results
def create_results(query_filepath, results_per_query = 100):
    queries = pd.read_csv(query_filepath)
    # display(queries)
    q_lst = queries["query"].to_list()
    # print(q_lst)

    lang_lst = []
    func_code_url_lst = []
    query_lst = []

    for i, query in enumerate(q_lst):
        print(i)
        fbm_lst = find_best_matches(query, results_per_query, 0.2)
        query_lst += [query for j in range(len(fbm_lst))]
        
        for lst in fbm_lst:
            # print(tsed_DF.iloc[lst[0]]["language"])
            # print(tsed_DF.iloc[lst[0]]["func_name"])
            # print(tsed_DF.iloc[lst[0]]["func_code_url"])
            # print(f"SCORE: {lst[1]}")
            # print("-" * 100)

            lang_lst.append(tsed_DF.iloc[lst[0]]["language"])
            func_code_url_lst.append(tsed_DF.iloc[lst[0]]["func_code_url"])
        
        # break

    # print(lang_lst)
    # print(func_code_url_lst)
    # print(query_lst)
    prediction_df = pd.DataFrame({'language' : lang_lst, 'url': func_code_url_lst, "query" : query_lst})
    return prediction_df
        
    

In [None]:
res_df = create_results("./Dataset/Testing/queries.csv", results_per_query=50)

In [None]:
res_df.to_csv("./csv_output/baseline_20k.csv")