In [1]:
import pandas as pd
import numpy as np
import pickle
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset
from datasets import load_dataset
import datasets
import torch
from collections import Counter
import string
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity
import faiss

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# dataset = load_dataset("code_search_net", "all")

dataset_dict = datasets.load_from_disk("./Dataset/CodeSearchCorpus/")

In [3]:
# Testing if the pytorch GPU functions work
print(torch.backends.cudnn.enabled)
print(torch.cuda.is_available()) #We have GPU on deck and ready
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

True
True
CUDA device: NVIDIA GeForce RTX 3060 Laptop GPU


In [4]:
# Seeing the size of the CodeSearchNet database
print(len(dataset_dict["train"]))
print(len(dataset_dict["validation"]))
print(len(dataset_dict["test"]))

1880853
89154
100529


In [5]:
# Taking only the training dataset
train_dataset = dataset_dict["train"]
train_dataset

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 1880853
})

In [6]:
# Seeing the test_dataset
# test_dataset = dataset_dict["test"]
# test_dataset

Yeah, 1.8M is too much. For week 5 at least, we've decided to train on a random sample of 10k from the training, 1k validation and 1k test

Column for semantic search: func_documentation_string
Column for tfidf: func_code_tokens

In [7]:
# Seeing what one sample row of the training dataset is like
train_dataset[0]

{'repository_name': 'ageitgey/face_recognition',
 'func_path_in_repository': 'examples/face_recognition_knn.py',
 'func_name': 'train',
 'whole_func_string': 'def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo=\'ball_tree\', verbose=False):\n    """\n    Trains a k-nearest neighbors classifier for face recognition.\n\n    :param train_dir: directory that contains a sub-directory for each known person, with its name.\n\n     (View in source code to see train_dir example tree structure)\n\n     Structure:\n        <train_dir>/\n        ├── <person1>/\n        │   ├── <somename1>.jpeg\n        │   ├── <somename2>.jpeg\n        │   ├── ...\n        ├── <person2>/\n        │   ├── <somename1>.jpeg\n        │   └── <somename2>.jpeg\n        └── ...\n\n    :param model_save_path: (optional) path to save model on disk\n    :param n_neighbors: (optional) number of neighbors to weigh in classification. Chosen automatically if not specified\n    :param knn_algo: (optional) unde

In [8]:
# Decide number of rows, the filepath to where to store the pickle files
# The pickled objects are are the inverted index and embeddings dataset

num_rows = 50000
filepath_pkl_obj = "./PickleObjects/"
inverted_index_name = f"inverted_index_{num_rows}.pkl"
tsed_name = f"train_subset_embeddings_dataset_{num_rows}.pkl"

print(inverted_index_name, tsed_name)

inverted_index_50000.pkl train_subset_embeddings_dataset_50000.pkl


In [9]:
# Taking a sample of the training dataset
# There are SO MANY PROBLEMS WHEN WE DO THIS THO, need to ask colin what to do i suppose?

np.random.seed(1)
train_subset_indices = np.random.choice(len(train_dataset), num_rows, replace = False)
train_dataset_subset = train_dataset.select(train_subset_indices)

len(train_dataset_subset)

50000

### Semantic Embeddings Portion

In [10]:
#Following code from: https://huggingface.co/learn/nlp-course/chapter5/6?fw=pt
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1" #Can/Should test different models
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [11]:
# Load the model to the GPU. Mine is a 3060
device = torch.device("cuda")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [12]:
#From Hugging Face Tutorials
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [13]:
#Train embeddings
#If the filename exists, load the pickle object. If not, train it and then save it as a pickle object
#REMEMBER TO KEEP THE FILENAMES THE SAME 0_0
try:
    with open(f'{filepath_pkl_obj}{tsed_name}', 'rb') as f:  # open a text file
        train_subset_embeddings_dataset = pickle.load(f) # serialize the list
        f.close()
except:
    train_subset_embeddings_dataset = train_dataset_subset.map(
        lambda x: {"embeddings": get_embeddings(x["func_documentation_string"]).detach().cpu().numpy()[0]}
    )

    train_subset_embeddings_dataset.add_faiss_index(column="embeddings")

    with open(f'{filepath_pkl_obj}{tsed_name}', 'wb') as f:  # open a text file
        pickle.dump(train_subset_embeddings_dataset, f) # serialize the list
        f.close()



In [14]:
train_subset_embeddings_dataset

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url', 'embeddings'],
    num_rows: 50000
})

### TF-IDF Portion

In [15]:
train_subset_embeddings_dataset

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url', 'embeddings'],
    num_rows: 50000
})

In [16]:
# Convert the embeddings to a pandas dataframe
tsed_DF = train_subset_embeddings_dataset.to_pandas()

In [17]:
# function to clean the code tokens. Super rudimentary, 
# as of right now, we're just taking rid of the single punctuation
def clean_code_tokens(lst):
    result = string.punctuation 
    new_lst = [] 
    for character in lst:
        if character in result:
            continue
        else:
            new_lst.append(character)
    return new_lst

In [18]:
# creating a column of "clean" code tokens
# There's many many issues with this strategy
tsed_DF["clean_code_tokens"] =  tsed_DF["func_code_tokens"].apply(clean_code_tokens)

##### Much of this code was based off of William Scott's implementation of TF-IDF: https://github.com/williamscott701/Information-Retrieval/blob/master/2.%20TF-IDF%20Ranking%20-%20Cosine%20Similarity%2C%20Matching%20Score/TF-IDF.ipynb


In [19]:
# Creates s list of documents
documents = tsed_DF["clean_code_tokens"].to_dict()

# Compiles a list of the words 
all_words = []
for i in list(tsed_DF["clean_code_tokens"].to_dict().values()):
    all_words += i

#convert all words to a set, eliminates, duplicates
all_words = list(set(all_words)) #Get rid of all repeats
# all_words

In [71]:
f'{filepath_pkl_obj}{inverted_index_name}'

'./PickleObjects/inverted_index_50000.pkl'

In [75]:
# inverted_index tf_idf
tf_idf = create_tfidf(num_rows, tsed_DF)

Looking into much more efficient method of querying results

In [66]:
def tf_idf_query(query_string, inverted_index, tf_idf, k = 10):
    query_tokens = query_string.split()

    rel_indices = []
    
    for token in query_tokens:
        if token in inverted_index:
            rel_indices += list(inverted_index[token].keys())
    
    rel_indices = set(rel_indices)

    result_lst = []
    for i in rel_indices:
        for token in query_tokens:
            score = 0
            try:
                score += (tf_idf[(i, token)])
            except: continue
        result_lst.append([i, score])
    
    result_lst.sort(reverse=True, key = lambda x: x[1])
    return result_lst[:k]

In [67]:
# Create the inverted index if its not in a pickle file (and save it)
def create_inverted_index(filepath_to_search):
    try:
        with open(filepath_to_search, 'rb') as f:
            inverted_index = pickle.load(f) # deserialize using load()
            f.close()
    except:
        inverted_index = {}
        for i in range(num_rows):
            token_counter = Counter(tsed_DF.iloc[i]["clean_code_tokens"])

            for token in token_counter:
                if token not in inverted_index:
                    inverted_index[token] = {}
                inverted_index[token][i] = token_counter[token]
        
        #Pickle afterwards
        with open(filepath_to_search, 'wb') as f:  # open a text file
            pickle.dump(inverted_index, f) # serialize the list
            f.close()
    return inverted_index

In [22]:
inverted_index = create_inverted_index(f'{filepath_pkl_obj}{inverted_index_name}')

In [23]:
# Creating a tf_idf object. WILL TURN THIS INTO A FUNCTION LATER
def create_tfidf(num_rows, tsed_DF):
    tf_idf = {}
    for i in range(num_rows):
        # print(i)
        tokens = tsed_DF["clean_code_tokens"].iloc[i]
        counter = Counter(tokens)
        words_count = len(tokens)

        for token in np.unique(tokens):
            tf = counter[token] / words_count
            df = len(inverted_index[token])
            idf = np.log((num_rows + 1) / (df + 1))

            tf_idf[i, token] = tf * idf
    return tf_idf

In [24]:
# inverted_index tf_idf
tf_idf = create_tfidf(num_rows, tsed_DF)

In [25]:
# tf_idf

Looking into much more efficient method of querying results

In [48]:
#Function for cosine_similarity. #Look into np.cos Annoy FAISS. look into applying and vectorizing
def cosine_sim(a, b):
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

In [49]:
def query_results(query_string, inverted_index, tf_idf, k = 10):
    query_tokens = query_string.split()

    rel_indices = []
    
    for token in query_tokens:
        if token in inverted_index:
            rel_indices += list(inverted_index[token].keys())
    
    rel_indices = set(rel_indices)

    query_embedding = get_embeddings(["string to date"]).cpu().detach().numpy()
    # len(query_embedding[0])
    # len(tsed_DF["embeddings"][0])
    

    result_lst = []
    for i in rel_indices:
        for token in query_tokens:
            tf_score = 0
            try:
                tf_score += (tf_idf[(i, token)])
            except: continue #this is bad, make sure this isn't the play
        # print(i)

        result_lst.append([i, tf_score, cosine_sim(tsed_DF["embeddings"][i], query_embedding[0])])
    
    result_lst.sort(reverse=True, key = lambda x: 0.5 * x[1] + 0.5*x[2])
    return result_lst[:k]

In [50]:
test_query_results = query_results("string to date", inverted_index, tf_idf, 10)

In [51]:
test_query_results

[[19040, 1.913807353640272, 0.5304062],
 [8903, 1.6193754530802302, 0.67891794],
 [24467, 1.6745814344352379, 0.43565905],
 [2617, 1.4034587260028661, 0.4528301],
 [30965, 1.315742555627687, 0.50129217],
 [18983, 1.315742555627687, 0.48587227],
 [24729, 1.2383459347084111, 0.5489989],
 [34231, 1.3581858638737414, 0.41593793],
 [48278, 1.0525940445021496, 0.66434515],
 [28433, 1.1512747361742262, 0.51342237]]

In [58]:
# for tqr in test_query_results:
#     display(tsed_DF.iloc[tqr[0]][["func_name", "language",  "func_code_string", "func_documentation_string"]])

In [64]:
# Function which runs all 99 queries, and returns a pd df of the results
def create_results(query_filepath, results_per_query = 100):
    queries = pd.read_csv(query_filepath)
    # display(queries)
    q_lst = queries["query"].to_list()
    # print(q_lst)

    lang_lst = []
    func_code_url_lst = []
    query_lst = []

    for i, query in enumerate(q_lst):
        # print(i)
        fbm_lst = query_results(query, inverted_index, tf_idf, results_per_query)
        query_lst += [query for j in range(len(fbm_lst))]
        
        for lst in fbm_lst:
            # print(tsed_DF.iloc[lst[0]]["language"])
            # print(tsed_DF.iloc[lst[0]]["func_name"])
            # print(tsed_DF.iloc[lst[0]]["func_code_url"])
            # print(f"SCORE: {lst[1]}")
            # print("-" * 100)

            lang_lst.append(tsed_DF.iloc[lst[0]]["language"])
            func_code_url_lst.append(tsed_DF.iloc[lst[0]]["func_code_url"])
        
        # break

    # print(lang_lst)
    # print(func_code_url_lst)
    # print(query_lst)
    prediction_df = pd.DataFrame({'language' : lang_lst, 'url': func_code_url_lst, "query" : query_lst})
    return prediction_df
        


In [65]:
res_df = create_results("./Dataset/Testing/queries.csv", results_per_query=50)
res_df.to_csv("./csv_output/baseline_50k.csv")

In [28]:
# for res in test_query_results:
#     # print(len(tsed_DF.iloc[res[0]]["embeddings"]))
#     print("-" * 100)

# query_embedding = get_embeddings(["string to date"]).cpu().detach().numpy()
    
    
# desc_scores, desc_results = train_subset_embeddings_dataset.get_nearest_examples("embeddings", query_embedding, 1000)

In [29]:
# desc_scores
# desc_results.keys()
# test_df = pd.DataFrame(desc_results)


In [53]:
# tsed_DF.columns

Index(['repository_name', 'func_path_in_repository', 'func_name',
       'whole_func_string', 'language', 'func_code_string', 'func_code_tokens',
       'func_documentation_string', 'func_documentation_tokens', 'split_name',
       'func_code_url', 'embeddings', 'clean_code_tokens'],
      dtype='object')

In [47]:
# query_embedding = get_embeddings(["string to date"]).cpu().detach().numpy()
# len(query_embedding[0])

In [44]:
# query_embedding = get_embeddings(["string to date"]).cpu().detach().numpy()
# len(query_embedding[0])
# len(tsed_DF["embeddings"][0])
# cosine_sim(tsed_DF["embeddings"][0], query_embedding[0])

768

In [None]:
# test_query = "string to date"
# test_query_tokens = test_query.split()
# rel_indices = []
# for token in test_query_tokens:
#     # print(inverted_index[token])
#     # print(token in inverted_index)
    
#     if token in inverted_index:
#         print(len(list(inverted_index[token].keys())))
#         rel_indices += list(inverted_index[token].keys())
#     #     num_docs_with_term = (len(inverted_index[token]))
#     #     for i in inverted_index[token].keys():
#     #         print(tf_idf[(i, token)])

# # len(rel_indices) == 2074 + 83 + 99
# rel_indices = set(rel_indices)
# len(rel_indices)

# test_answers = []
# for i in rel_indices:
#     # print(i)
#     for token in test_query_tokens:
#         score = 0
#         try:
#             score += (tf_idf[(i, token)])
#         except: continue
#     test_answers.append([i, score])

# test_answers.sort(key = lambda x: x[1], reverse=True)
# test_answers[:10]
# tsed_DF.iloc[19040]

In [None]:
# new_tf_idf
# all_words_dict = dict(zip(all_words, range(len(all_words))))


# tf_idf_array = np.zeros((num_rows, len(all_words)), dtype="float32")

# for i in tf_idf:
#     try:
#         ind = all_words_dict[i[1]]
#         tf_idf_array[i[0]][ind] = tf_idf[i]
#     except:
#         pass


In [None]:
# # Function to get the document frequency of a word/token
# def doc_freq(word):
#     c = 0
#     try:
#         c = inverted_index[word]
#     except:
#         pass

#     if type(c) == list:
#         return len(c)
#     else:
#         return 0

In [None]:
# # Function to get the document frequency of a word/token
# def doc_freq(word):
#     c = 0
#     try:
#         c = inverted_index[word]
#     except:
#         pass

#     if type(c) == list:
#         return len(c)
#     else:
#         return 0

In [24]:
#Function which given a query, returns in a tf_idf vector
# def gen_vector(s):
#     # This is where we'd do more processing of the query
#     tokens = s.split()

#     q_vector = np.zeros((len(all_words)))
    
#     counter = Counter(tokens)
#     words_count = len(tokens)

#     for token in np.unique(tokens):
        
#         tf = counter[token]/words_count
#         try:
#             df = len(inverted_index[token])
#         except:
#             df = 0
#         # df = doc_freq(token)
#         idf = np.log((num_rows+1)/(df+1))

#         try:
#             ind = all_words_dict[token]
#             q_vector[ind] = tf*idf
#         except:
#             pass
#     return q_vector

In [None]:
# gen_vector("pandas how to select first 10 rows").shape

In [None]:
# def test_find_best_tfidf(query, df, colname):
#     query_vector = gen_vector(query)
#     df["cosine_sim"] = df[colname].apply(lambda x: 1 - (spatial.distance.cosine(query_vector, x)))
#     return df


In [None]:
# test_find_best_tfidf("pandas how to select first 10 rows", tsed_DF, "tf_idf_vector")
# [ for x in range(num_rows)]
# test_q_vector = gen_vector("pandas how to select first 10 rows")
# xa = tsed_DF["tf_idf_vector"]""

In [None]:
# This code took 90s. That's buttcheeks
# tsed_DF["tf_idf_vector"].apply(lambda row: 1 - (spatial.distance.cosine(test_q_vector, row)))

In [None]:
# (np.linalg.norm(xa, axis = 1) * np.linalg.norm(test_q_vector))


In [None]:
# faiss.normalize_L2(xa)
# vector_dimension = tf_idf_array.shape[1]
# index = faiss.IndexFlatIP(vector_dimension)
# faiss.normalize_L2(tf_idf_array)


In [None]:
# index.add(tf_idf_array)

In [None]:
# _vector = np.array([test_q_vector], dtype="float32")


In [None]:
# faiss.normalize_L2(test_q_vector)

In [None]:
# _vector.shape
# _vector

In [None]:
# test_find_best_tfidf("pandas how to select first 10 rows", tsed_DF, "tf_idf_vector")
# [ for x in range(num_rows)]
# test_q_vector = gen_vector("pandas how to select first 10 rows")
# xa = tsed_DF["tf_idf_vector"]""

In [28]:
# This code took 90s. That's buttcheeks
# tsed_DF["tf_idf_vector"].apply(lambda row: 1 - (spatial.distance.cosine(test_q_vector, row)))

In [29]:
# (np.linalg.norm(xa, axis = 1) * np.linalg.norm(test_q_vector))


In [30]:
# faiss.normalize_L2(xa)
# vector_dimension = tf_idf_array.shape[1]
# index = faiss.IndexFlatIP(vector_dimension)
# faiss.normalize_L2(tf_idf_array)


In [31]:
# index.add(tf_idf_array)

In [32]:
# _vector = np.array([test_q_vector], dtype="float32")


In [33]:
# faiss.normalize_L2(test_q_vector)

In [34]:
# _vector.shape
# _vector

In [35]:
#Function for cosine_similarity. #Look into np.cos Annoy FAISS. look into applying and vectorizing
def cosine_sim(a, b):
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

In [36]:
# Method to find the best match
# query param: the string query
# k param: the k number of results to return
# alpha: the value which determines the linear split alpha * tfidf portion + (1-alpha)*semantic search portion
# def find_best_matches(query, k, alpha = 0.5):
#     q_vector = gen_vector(query)
#     # q_embedding_vector = get_embeddings([query]).cpu().detach().numpy()[0]
    
    
#     cosine_lst = []
    
#     for i, x in enumerate(tf_idf_array):
#         # col = tfidf_DF[x].to_numpy()
#         # Tensor.cpu()
#         # embedding = tsed_DF.iloc[i]["embeddings"]

#         # cosine_lst[i] = [i, (alpha) * cosine_sim(q_vector, x) + (1 - alpha) * cosine_sim(q_embedding_vector, embedding)]

#         # cosine_lst[i] = [i, (alpha) * 1 - (spatial.distance.cosine(q_vector, x))]
#         cosine_lst.append([i, (alpha) * cosine_sim(q_vector, x)])
    
    
#     cosine_lst.sort(reverse = True, key = lambda x: x[1])
#     return cosine_lst[:k]

In [None]:
# test_vector = gen_vector("pandas how to select first 10 rows")

# for i, x in enumerate(tf_idf_array):
#     (cosine_sim(x, test_vector))

# for i, x in enumerate(tf_idf_array):
#     (1 - spatial.distance.cosine(x, test_vector))

# tfidf_series = pd.Series(list(tf_idf_array))

# vectorized_test = tfidf_series.apply(lambda x: cosine_sim(x, test_vector))

# tfidf_series.apply(lambda x: np.dot(x, test_vector)/(np.linalg.norm(x)*np.linalg.norm(test_vector)))

In [None]:
# find_best_matches("read csv to dataframe", 10, alpha = 1)

In [None]:
# # Function which runs all 99 queries, and returns a pd df of the results
# def create_results(query_filepath, results_per_query = 100):
#     queries = pd.read_csv(query_filepath)
#     # display(queries)
#     q_lst = queries["query"].to_list()
#     # print(q_lst)

#     lang_lst = []
#     func_code_url_lst = []
#     query_lst = []

#     for i, query in enumerate(q_lst):
#         print(i)
#         fbm_lst = find_best_matches(query, results_per_query, 0.2)
#         query_lst += [query for j in range(len(fbm_lst))]
        
#         for lst in fbm_lst:
#             # print(tsed_DF.iloc[lst[0]]["language"])
#             # print(tsed_DF.iloc[lst[0]]["func_name"])
#             # print(tsed_DF.iloc[lst[0]]["func_code_url"])
#             # print(f"SCORE: {lst[1]}")
#             # print("-" * 100)

#             lang_lst.append(tsed_DF.iloc[lst[0]]["language"])
#             func_code_url_lst.append(tsed_DF.iloc[lst[0]]["func_code_url"])
        
#         # break

#     # print(lang_lst)
#     # print(func_code_url_lst)
#     # print(query_lst)
#     prediction_df = pd.DataFrame({'language' : lang_lst, 'url': func_code_url_lst, "query" : query_lst})
#     return prediction_df
        
# res_df = create_results("./Dataset/Testing/queries.csv", results_per_query=50)
# res_df.to_csv("./csv_output/baseline_20k.csv")