In [None]:
import os
import pickle
# import string library function
import string
from collections import Counter

import datasets
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel

In [None]:
# dataset = load_dataset("code_search_net", "all")

# Load the dataset from the local directory; the commented-out `load_dataset` call above is the alternative source.
dataset_dict = datasets.load_from_disk("./Dataset/CodeSearchCorpus/")

In [None]:
# Report the GPU set-up. The device name is only queried when CUDA is usable,
# because torch.cuda.current_device() raises on machines without a CUDA device.
print(torch.backends.cudnn.enabled)
print(torch.cuda.is_available()) #We have GPU on deck and ready
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
else:
    print("CUDA device: not available")

In [None]:
# Print the number of examples in each split, one per line.
for split_name in ("train", "validation", "test"):
    print(len(dataset_dict[split_name]))

In [None]:
# Keep a handle on the "train" split; the bare name on the last line displays its repr in the notebook.
train_dataset = dataset_dict["train"]
train_dataset

In [None]:
# Keep a handle on the "test" split; the bare name on the last line displays its repr in the notebook.
test_dataset = dataset_dict["test"]
test_dataset

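# 1.8M rows is too much to work with. For week 5 at least, we've decided to train on a random sample of 10k rows from the training split, 1k from validation, and 1k from test. Note: the training sample size actually used below is set by `num_rows`.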

# Column for semantic search: func_documentation_string
# Column for tfidf: func_code_tokens


In [None]:
# Run configuration: the sample size and the cache file names derived from it.
num_rows = 20000  # number of training rows drawn for the subset
filepath_pkl_obj = "./PickleObjects/"  # directory prefix used for the pickle caches
inverted_index_name = f"inverted_index_{num_rows}.pkl"  # cache file name for the inverted index
tsed_name = f"train_subset_embeddings_dataset_{num_rows}.pkl"  # cache file name for the embedded training subset

print(inverted_index_name, tsed_name)

In [None]:
# Seed NumPy's global RNG so the same row indices are drawn on every run.
np.random.seed(1)
# Draw num_rows distinct indices (replace=False) and select those rows from the training split.
train_subset_indices = np.random.choice(len(train_dataset), num_rows, replace = False)
train_dataset_subset = train_dataset.select(train_subset_indices)

# Displayed as the cell output: the size of the sampled subset.
len(train_dataset_subset)

In [None]:
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device

#Following code from: https://huggingface.co/learn/nlp-course/chapter5/6?fw=pt

In [None]:
# Checkpoint identifier; the tokenizer and the model are both loaded from it so they match.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [None]:
# Use the GPU when one is available, otherwise the CPU, so the notebook
# still runs (more slowly) on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
#From Hugging Face Tutorials
def cls_pooling(model_output):
    """Pool a model output by keeping index 0 along the second axis.

    Reads ``model_output.last_hidden_state`` and returns ``[:, 0]`` of it.
    Presumably the layout is (batch, sequence, hidden), which makes this the
    first-token ([CLS]) vector of each sequence -- confirm against the model.
    """
    hidden_states = model_output.last_hidden_state
    first_position = hidden_states[:, 0]
    return first_position

def get_embeddings(text_list):
    """Embed text with the loaded tokenizer/model and return the pooled vectors.

    Parameters
    ----------
    text_list : str or list of str
        Text to embed; it is passed straight to the tokenizer with padding and
        truncation enabled.

    Returns
    -------
    torch.Tensor
        Result of ``cls_pooling`` on the model output. The tensor lives on
        ``device`` and carries no autograd history.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # Move every tokenizer output tensor to the same device as the model.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: without no_grad() each call records an autograd graph,
    # which wastes memory when embedding many documents.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [None]:
# Credit: https://huggingface.co/docs/datasets/use_with_pytorch
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
#Trained embeddings for semantic search portion
f'{filepath_pkl_obj}{tsed_name}'

In [None]:
#Train embeddings

#REMEMBER TO KEEP THE FILENAMES THE SAME 0_0
# Load the embedded training subset from its pickle cache, or compute and cache it.
# NOTE(security): pickle.load can execute arbitrary code -- only load cache files you created yourself.
tsed_path = f'{filepath_pkl_obj}{tsed_name}'
try:
    with open(tsed_path, 'rb') as f:
        train_subset_embeddings_dataset = pickle.load(f)
except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    # Only a missing or truncated/corrupt cache triggers a recompute; any other
    # error (including KeyboardInterrupt) now surfaces instead of being swallowed.
    train_subset_embeddings_dataset = train_dataset_subset.map(
        lambda x: {"embeddings": get_embeddings(x["func_documentation_string"]).detach().cpu().numpy()[0]}
    )

    # Make sure the cache directory exists so the expensive result can be saved.
    os.makedirs(filepath_pkl_obj, exist_ok=True)
    with open(tsed_path, 'wb') as f:
        pickle.dump(train_subset_embeddings_dataset, f)



In [None]:
# with open(f'{filepath_pkl_obj}{tsed_name}', 'wb') as f:  # open a text file
#         pickle.dump(train_subset_embeddings_dataset, f) # serialize the list
#         f.close()

In [None]:

# with open('./pickleObjects/train_subset_embeddings_dataset.pkl', 'wb') as f:  # open a text file
#     pickle.dump(train_subset_embeddings_dataset, f) # serialize the list
#     f.close()


In [None]:
train_subset_embeddings_dataset

In [None]:
#Creating dictionary for tf-idf

In [None]:
train_dataset_subset[0]["func_code_tokens"]

In [None]:
tsed_DF = train_subset_embeddings_dataset.to_pandas() #train-subset-embeddings-dataset_DF

In [None]:
def clean_code_tokens(lst):
    """Return the tokens of ``lst`` that are not made up solely of punctuation.

    A token is dropped when every one of its characters is in
    ``string.punctuation``; the empty string is dropped as well. A substring
    test against ``string.punctuation`` would not do this: it drops "()" but
    keeps operators such as "==" or "!=".

    Parameters
    ----------
    lst : iterable of str
        Code tokens of one function.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    punctuation_chars = set(string.punctuation)
    return [
        token
        for token in lst
        # all() over an empty token is True, so "" is removed too.
        if not all(ch in punctuation_chars for ch in token)
    ]


# # Creating inverted index based off this article: https://www.geeksforgeeks.org/inverted-index/
# def make_documents(data, col_name):
#     documents = data[col_name].dropna().apply(process_text).to_dict()
#     return documents

# def make_inverted_index(documents):
#     word_array = np.array(list(documents.values()))
#     all_words = []
#     for words in word_array:
#         all_words +=  words.split(" ")
# #     terms = dict(zip( range(len(set(all_words))),set(all_words)))
# #     return terms
#     all_words = set(all_words)
#     inverted_index = {}
    
#     for word in all_words:
#         if word != "":
#             lst_docs = []
#             for i, doc in documents.items():
#                 if word in doc.split():
#                     lst_docs.append(i)
        
#             inverted_index[word] = lst_docs
#     return inverted_index

In [None]:
# torch.cuda.empty_cache()

#Cleaned func_code_tokens and set to "clean_code_tokens"
tsed_DF["clean_code_tokens"] =  tsed_DF["func_code_tokens"].apply(clean_code_tokens)

# list(tsed_DF["clean_code_tokens"].to_dict().values())

In [None]:
f'{filepath_pkl_obj}{inverted_index_name}'

### Much of this code is based on William Scott's implementation of TF-IDF: https://github.com/williamscott701/Information-Retrieval/blob/master/2.%20TF-IDF%20Ranking%20-%20Cosine%20Similarity%2C%20Matching%20Score/TF-IDF.ipynb


In [None]:
# Map each row label of tsed_DF to its cleaned token list.
documents = tsed_DF["clean_code_tokens"].to_dict()

# Vocabulary: every distinct token. Sorting makes the word -> column order
# reproducible across kernel restarts (set iteration order for strings is not).
all_words = sorted({word for doc in documents.values() for word in doc})

# The cache path uses inverted_index_name as-is; it already ends in ".pkl".
# NOTE(security): pickle.load can execute arbitrary code -- only load cache files you created yourself.
inverted_index_path = f'{filepath_pkl_obj}{inverted_index_name}'
try:
    with open(inverted_index_path, 'rb') as f:
        inverted_index = pickle.load(f)
except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    # Build word -> list of document ids in one pass over the documents,
    # instead of scanning every document once per vocabulary word.
    inverted_index = {}
    for doc_id, doc in documents.items():
        for word in set(doc):
            if word != "":
                inverted_index.setdefault(word, []).append(doc_id)

    # Pickle afterwards, creating the cache directory if needed.
    os.makedirs(filepath_pkl_obj, exist_ok=True)
    with open(inverted_index_path, 'wb') as f:
        pickle.dump(inverted_index, f)

In [None]:
len(all_words) == len(inverted_index)

In [None]:
#pickle inverted indices
len(inverted_index)

In [None]:
string.punctuation

In [None]:
def doc_freq(word, index=None):
    """Return the number of documents that contain ``word``.

    Parameters
    ----------
    word : hashable
        Token to look up.
    index : dict, optional
        Mapping from token to a list of document ids. Defaults to the
        module-level ``inverted_index``.

    Returns
    -------
    int
        Length of the posting list, or 0 when the word is unknown, cannot be
        used as a dictionary key, or its entry is not a list.
    """
    if index is None:
        index = inverted_index
    try:
        postings = index[word]
    except (KeyError, TypeError):
        # KeyError: word not in the index; TypeError: unhashable word.
        return 0
    return len(postings) if isinstance(postings, list) else 0



#number of rows sampled
# tf-idf weight for every (document position, token) pair of the sampled rows.
tf_idf = {}
token_column = tsed_DF["clean_code_tokens"]
for doc_pos in range(num_rows):
    doc_tokens = token_column.iloc[doc_pos]
    occurrences = Counter(doc_tokens)
    n_tokens = len(doc_tokens)

    for term in np.unique(doc_tokens):
        # Term frequency is relative to the document length; idf is smoothed by +1 on both sides.
        term_freq = occurrences[term] / n_tokens
        inverse_doc_freq = np.log((num_rows + 1) / (doc_freq(term) + 1))
        tf_idf[doc_pos, term] = term_freq * inverse_doc_freq


In [None]:
# Show only a few entries: the dictionary holds one entry per (document, token)
# pair, which is too much to render as cell output.
list(tf_idf.items())[:10]

In [None]:
# def make_tfidf_DF(documents, inverted_index, total_vocab):
#     tf_idf = {}
#     df = pd.DataFrame()
#     for i, doc in documents.items():
#         term_lst = []
#         for term in total_vocab:
#             # doc_lst = doc.split()
#             tf = doc.count(term) / len(doc)

#             idf = np.log(len(documents) / len(inverted_index[term]))
#     #         if tf*idf > 0:
#     #             print(tf*idf)
#     #             print(term)
#             term_lst.append(tf*idf)
#             tf_idf[i, term] = tf*idf
#         df[i] = term_lst
#     return df


In [None]:
# Column index of every vocabulary word in the dense tf-idf matrix.
all_words_dict = {word: col for col, word in enumerate(all_words)}

# Dense (documents x vocabulary) matrix of tf-idf weights; unset cells stay 0.
# NOTE(review): this allocates num_rows * len(all_words) float64 values -- watch memory for large vocabularies.
tf_idf_array = np.zeros((num_rows, len(all_words)))

for (doc_pos, token), weight in tf_idf.items():
    col = all_words_dict.get(token)
    # Tokens missing from the vocabulary are skipped explicitly rather than
    # hidden behind a bare except.
    if col is not None:
        tf_idf_array[doc_pos, col] = weight

In [None]:
def gen_vector(s):
    """Turn a query string into a tf-idf vector over the corpus vocabulary.

    The query is split on whitespace. Each token found in ``all_words_dict``
    gets the weight ``tf * idf``, where tf is its share of the query tokens and
    idf is ``log((num_rows + 1) / (doc_freq(token) + 1))``. Tokens outside the
    vocabulary are ignored.

    Parameters
    ----------
    s : str
        Raw query text.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``len(all_words)``; all zeros when no query token
        is in the vocabulary (including the empty query).
    """
    # This is where we'd do more processing of the query
    tokens = s.split()

    q_vector = np.zeros((len(all_words)))
    if not tokens:
        return q_vector

    counter = Counter(tokens)
    words_count = len(tokens)

    for token, count in counter.items():
        ind = all_words_dict.get(token)
        if ind is None:
            # Out-of-vocabulary token: it has no column, so skip it.
            continue
        idf = np.log((num_rows+1)/(doc_freq(token)+1))
        q_vector[ind] = (count/words_count)*idf
    return q_vector

In [None]:
gen_vector("pandas how to select first 10 rows").shape

In [None]:
tf_idf_array[0].shape

In [None]:
# #Process query. Make it into a vector of tf-idfs
# def process_query(s, inverted_index, total_vocab, documents):
    
# #     print(processed_s)
#     lst_words = s.split()
# #     print(lst_words)
#     q = np.zeros(len(total_vocab))
# #     print(len(q))
#     counter = Counter(lst_words)
#     for word in np.unique(lst_words):
#         if word in inverted_index:
#             tf = counter[word] / len(lst_words)
#             df = len(inverted_index[word])
#             idf = np.log(len(documents) / df)
#             q[total_vocab.index(word)] = tf*idf
    
#     return q


In [None]:
# process_query("flatten nested loop python", inverted_index, total_vocab, documents)

In [None]:
#Got from William Scott https://github.com/williamscott701/Information-Retrieval/blob/master/2.%20TF-IDF%20Ranking%20-%20Cosine%20Similarity%2C%20Matching%20Score/TF-IDF.ipynb
def cosine_sim(a, b):
    """Cosine similarity of two 1-D vectors.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(a, b) / (|a| * |b|)``, or 0.0 when either vector has zero norm.
        Without that guard the result would be 0/0 = NaN, which breaks any
        ranking that sorts on the score.
    """
    denominator = np.linalg.norm(a)*np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b)/denominator

In [None]:
# tsed_DF

In [None]:
def find_best_matches(query, k, alpha = 0.5):
    """Rank the sampled documents against ``query`` with a hybrid score.

    For every row of ``tf_idf_array`` the score is
    ``alpha * cosine(tf-idf query, tf-idf row)
    + (1 - alpha) * cosine(query embedding, row embedding)``.

    Parameters
    ----------
    query : str
        Search query.
    k : int
        Maximum number of results to return.
    alpha : float, default 0.5
        Weight of the tf-idf similarity; ``1 - alpha`` weights the embedding
        similarity.

    Returns
    -------
    list of [int, float]
        ``[row_position, score]`` pairs, highest score first, at most ``k``.
    """
    q_vector = gen_vector(query)
    q_embedding_vector = get_embeddings([query]).cpu().detach().numpy()[0]

    # Fetch the embeddings column once instead of building a row Series per document.
    doc_embeddings = tsed_DF["embeddings"].to_numpy()

    def _similarity(a, b):
        # An all-zero vector (query with no known tokens, or an empty document)
        # can give a NaN similarity; count it as 0 so it cannot poison the sort.
        sim = cosine_sim(a, b)
        return 0.0 if np.isnan(sim) else sim

    cosine_lst = [
        [i, alpha * _similarity(q_vector, row)
            + (1 - alpha) * _similarity(q_embedding_vector, doc_embeddings[i])]
        for i, row in enumerate(tf_idf_array)
    ]

    cosine_lst.sort(reverse = True, key = lambda x: x[1])
    return cosine_lst[:k]

In [None]:
# fbm_result = find_best_matches("How to split string by newline PYTHON", 10, 0.2)

In [None]:
# tsed_DF.iloc[0]

In [None]:
# lang_lst = []
# func_code_url_lst = []
# for lst in fbm_result:
#     # print(tsed_DF.iloc[lst[0]]["language"])
#     # print(tsed_DF.iloc[lst[0]]["func_name"])
#     # print(tsed_DF.iloc[lst[0]]["func_code_url"])
#     # print(f"SCORE: {lst[1]}")
#     # print("-" * 100)

#     lang_lst.append(tsed_DF.iloc[lst[0]]["language"])
#     func_code_url_lst.append(tsed_DF.iloc[lst[0]]["func_code_url"])



In [None]:
# lang_lst
# func_code_url_lst
# pd.DataFrame({'language' : lang_lst, 'url': func_code_url_lst})

In [None]:
"can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first."


In [None]:
def create_results(query_filepath, results_per_query = 100, alpha = 0.2):
    """Run every query in a CSV file through ``find_best_matches``.

    Parameters
    ----------
    query_filepath : str
        Path to a CSV file with a ``query`` column.
    results_per_query : int, default 100
        Number of matches requested per query.
    alpha : float, default 0.2
        tf-idf weight forwarded to ``find_best_matches``.

    Returns
    -------
    tuple of (list, list, list)
        Three parallel lists with one entry per returned match: the
        ``language`` value, the ``func_code_url`` value and the query text.
    """
    queries = pd.read_csv(query_filepath)
    q_lst = queries["query"].to_list()

    # Fetch the two output columns once; matches are addressed by row position.
    languages = tsed_DF["language"].to_numpy()
    urls = tsed_DF["func_code_url"].to_numpy()

    lang_lst = []
    func_code_url_lst = []
    query_lst = []

    for i, query in enumerate(q_lst):
        print(i)  # progress indicator: index of the query being processed
        fbm_lst = find_best_matches(query, results_per_query, alpha)
        # Repeat the query once per match so the three lists stay aligned.
        query_lst.extend([query] * len(fbm_lst))

        for row_pos, _score in fbm_lst:
            lang_lst.append(languages[row_pos])
            func_code_url_lst.append(urls[row_pos])

    return lang_lst, func_code_url_lst, query_lst
        
    

In [None]:
lang_lst, func_code_url_lst, query_lst = create_results("./Dataset/Testing/queries.csv", results_per_query=50)

In [None]:
prediction_df = pd.DataFrame({'language' : lang_lst, 'url': func_code_url_lst, "query" : query_lst})

In [None]:
prediction_df

In [None]:
# Create the output directory first so the export cannot fail on a missing folder.
os.makedirs("./csv_output", exist_ok=True)
prediction_df.to_csv("./csv_output/baseline_20k.csv")