### Two Million Dataset Training

This jupyter notebook's purpose is to run our most optimal model from the searchmodel (The 75% BM-25/ 25% CodeBert Embeddings Linear Combination) on the entire training set and calculating the subsequent NDCG score of it.

In [1]:
# Import packages
import pandas as pd
import numpy as np
import pickle
from transformers import AutoTokenizer, AutoModel
from datasets import Dataset
from datasets import load_dataset
import datasets
import torch
from collections import Counter
import string
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity
import faiss
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import RegexpStemmer
from nltk.util import ngrams
from nltk.corpus import stopwords
import gen_results
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Check GPU, initialize models
device = ("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu"
hg_model = "huggingface/CodeBERTa-small-v1" #"sentence-transformers/multi-qa-mpnet-base-dot-v1"
model_ckpt = hg_model #Can/Should test different models
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
trained_model = AutoModel.from_pretrained(model_ckpt)
trained_model.to(device)

#Tokenizer initialization
st = RegexpStemmer('ing$|s$|e$|able$', min=4)
rgx_tokenizer = RegexpTokenizer(r'\w+')

In [3]:
def cls_pooling(model_output):
    """
    Function that helps in creating the semantic search embeddings
    Code was taken from HuggingFace tutorials

    Parameters
    ----------
    model_output : str
        The file location of the answers csv file
    """
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    """
    Function that obtains the semantic search embeddings
    given a list of strings comprising of the tokens

    Parameters
    ----------
    text_list : list
        The list of string tokens to obtain the embeddings
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = trained_model(**encoded_input)
    return cls_pooling(model_output)

def cosine_sim(a, b):
    """
    Class Function that calculates the cosine similarities of two vectors

    Parameters
    ----------
    a : list
        The list vectors to be compared 
    """
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

In [4]:
# Load in the CodeSearchNet Dataset
dataset_dict = datasets.load_from_disk("./Dataset/CodeSearchCorpus/")
train_dataset = dataset_dict["train"]

In [5]:
# Take the relevant columns only
train_dataset = train_dataset.select_columns(["func_documentation_string", "language", "func_code_url", "whole_func_string", "func_name"])

In [6]:
check_file_lst = ([os.path.isfile("./pickleObjects/two_million_stuff/two_million_inverted_index.pkl"), os.path.isfile("./pickleObjects/two_million_stuff/bigram_set.pkl"), os.path.isfile("./pickleObjects/two_million_stuff/bm_avg_DL.pkl")])


In [7]:
if sum(check_file_lst) != 3:
    print("Files don't exist. Creating")

    # Create tsed_DF variable comprising only of the "function_documentation_string"
    tsed_DF = train_dataset.select_columns(["func_documentation_string"]).to_pandas()

    # Initialize English stopwords
    stop_words = set(stopwords.words('english'))

    # Clean the "func_documentation_string" column and tokenize and stem it
    tsed_DF["func_doc_tokens"] = tsed_DF["func_documentation_string"].apply(lambda x: rgx_tokenizer.tokenize(x))
    tsed_DF["func_doc_stem_tokens"] = tsed_DF["func_doc_tokens"].apply(lambda x: [st.stem(word.lower()) for word in x if word not in stop_words])
    inverted_index = {}

    #Create bigrams
    bigram_lst = []
    for i in range(len(tsed_DF)):
        bigram_lst += list(ngrams(tsed_DF.iloc[i]["func_doc_stem_tokens"], 2))

    # Make a bigram counter
    bigram_counter = Counter(bigram_lst)

    # Limit the bigrams such they occur more than or equal to 50 times, and less than or equal to 1000
    bigram_counter = (Counter({k: c for k, c in bigram_counter.items() if c <= 1000 and c >= 50}))

    # Make a set of the bigrams to add to the inverted index
    bigram_set = set(list(bigram_counter.keys()))
    total_len_sum = 0

    #Go through and populate the inverted index
    for i in range(len(tsed_DF)):
        token_counter = Counter(tsed_DF.iloc[i]["func_doc_stem_tokens"])
        bigram_counter_i = Counter(list(ngrams(tsed_DF.iloc[i]["func_doc_stem_tokens"], 2)))
        total_len = sum(token_counter.values()) + sum(bigram_counter_i.values())
        total_len_sum += total_len

        for token in token_counter:
            if token not in inverted_index:
                inverted_index[token] = {}
            inverted_index[token][i] = (token_counter[token], total_len)
        
        for bigram in bigram_counter_i:
            if bigram in bigram_set:
                if bigram not in inverted_index:
                    inverted_index[bigram] = {}
                inverted_index[bigram][i] = (bigram_counter_i[bigram], total_len)

    bm_avg_DL = total_len_sum / len(tsed_DF)

    #Pickle afterwards
    with open(f"./pickleObjects/two_million_stuff/two_million_inverted_index.pkl", 'wb') as f:  # open a text file
        pickle.dump(inverted_index, f) # serialize the list
        f.close()

    with open(f"./pickleObjects/two_million_stuff/bigram_set.pkl", 'wb') as f:  # open a text file
        pickle.dump(bigram_set, f) # serialize the list
        f.close()

    with open(f"./pickleObjects/two_million_stuff/bm_avg_DL.pkl", 'wb') as f:  # open a text file
        pickle.dump(bm_avg_DL, f) # serialize the list
        f.close()
else:
    print("Files already exist. Loading")

    with open('./pickleObjects/two_million_stuff/two_million_inverted_index.pkl', 'rb') as f:
        inverted_index = pickle.load(f) # deserialize using load()
        f.close()

    with open('./pickleObjects/two_million_stuff/bigram_set.pkl', 'rb') as f:
        bigram_set = pickle.load(f) # deserialize using load()
        f.close()
    
    with open('./pickleObjects/two_million_stuff/bm_avg_DL.pkl', 'rb') as f:
        bm_avg_DL = pickle.load(f) # deserialize using load()
        f.close()


Files already exist. Loading


In [8]:
with open('./pickleObjects/tsed_codeBERT_FULL.pkl', 'rb') as f:
        tsed = pickle.load(f) # deserialize using load()
        f.close()

In [9]:
def query_results_lc_naive_custom(query_string, k = 10, tf_alpha = 0.5, bigrams = True, kw_method = "TFIDF", bm_k = 1.2, bm_b = 0.75): 
    # lc = Linear-combination, Implicitly weighted towards Keyword Matching Method
    # kw_method can either be "TFIDF" OR "BM25"
    stop_words = set(stopwords.words('english'))
    query_tokens = [st.stem(word.lower()) for word in tokenizer.tokenize(query_string) if word not in stop_words]
    query_embedding = get_embeddings([query_string]).cpu().detach().numpy()
    # query_embedding = get_embeddings([query_string]).detach().numpy()

    # rel_indices = []
    answer_dict = {}

    for token in query_tokens:
        if token in inverted_index:
            rel_indices = list(set(inverted_index[token].keys()))

            for rel_i in rel_indices:
                if rel_i not in answer_dict:
                    # query_embedding_2 = get_embeddings([train_dataset[rel_i]["whole_func_string"]]).detach().numpy()
                    answer_dict[rel_i] = [0,cosine_sim(query_embedding[0], tsed[rel_i]["embeddings"])]
                tf = inverted_index[token][rel_i][0] / inverted_index[token][rel_i][1]
                df = len(inverted_index[token])
                idf = np.log((len(train_dataset) + 1) / (df + 1))
                if kw_method == "TFIDF":
                    answer_dict[rel_i][0] += tf * idf
                else: #kw_method == "BM25"
                    bm_comp = (tf * (bm_k + 1)) / (tf + bm_k * (1 - bm_b + bm_b * (inverted_index[token][rel_i][1] / bm_avg_DL)))
                    answer_dict[rel_i][0] += bm_comp * idf

    if bigrams:
        bigram_lst = list(ngrams(query_tokens, 2))
        for bigram in bigram_lst:
            if bigram in inverted_index:
                rel_indices = list(set(inverted_index[bigram].keys()))
                for rel_i in rel_indices:
                    if rel_i not in answer_dict:
                        # query_embedding_2 = get_embeddings([train_dataset[rel_i]["whole_func_string"]]).cpu().detach().numpy()
                        # query_embedding_2 = get_embeddings([train_dataset[rel_i]["whole_func_string"]]).detach().numpy()
                        answer_dict[rel_i] = [0,cosine_sim(query_embedding[0], tsed[rel_i]["embeddings"])]
                    tf = inverted_index[bigram][rel_i][0] / inverted_index[bigram][rel_i][1]
                    df = len(inverted_index[bigram])
                    idf = np.log((len(train_dataset) + 1) / (df + 1))
                    
                    if kw_method == "TFIDF":
                        answer_dict[rel_i][0] += 2 * tf * idf
                    else:
                        bm_comp = (tf * (bm_k + 1)) / (tf + bm_k * (1 - bm_b + bm_b * (inverted_index[bigram][rel_i][1] / bm_avg_DL)))
                        answer_dict[rel_i][0] += 2 * bm_comp * idf

    result_lst = [[a,b] for (a,b) in answer_dict.items()]
    result_lst.sort(reverse=True, key = lambda x: x[1][0] * tf_alpha + x[1][1] * (1-tf_alpha))
    # print(result_lst[:k])
    return result_lst[:k]

In [11]:
def create_results(query_filepath, results_per_query = 100):
    queries = pd.read_csv(query_filepath)
    q_lst = queries["query"].to_list()

    lang_lst = []
    func_code_url_lst = []
    query_lst = []
    func_docs_lst = []

    for i, query in enumerate(q_lst):
        print(i)
        fbm_lst = query_results_lc_naive_custom(query, results_per_query, kw_method="BM25", tf_alpha=0.75,bigrams=True) #CHANGE THIS LINE TO CHECK DIFFERENT METHODS. FALSE = NO BIGRAMS
        query_lst += [query for j in range(len(fbm_lst))]
        
        for lst in fbm_lst:
            # print(lst)
            lang_lst.append(train_dataset[lst[0]]["language"])
            func_code_url_lst.append(train_dataset[lst[0]]["func_code_url"])
            func_docs_lst.append(train_dataset[lst[0]]['func_documentation_string'])
            # func_names_lst.append(self.tsed_DF.iloc[lst[0]]['func_name'])


    prediction_df = pd.DataFrame({'language' : lang_lst, 'url': func_code_url_lst, "query" : query_lst, "documentation" : func_docs_lst})
    return prediction_df

In [20]:
res_df = create_results("./Dataset/Testing/queries.csv", results_per_query=50) #CHANGE THIS
res_df.to_csv(f"./csv_output/baseline_2MILLION.csv") #CHANGE THIS

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98


In [21]:
gen_results.create_lj_answers_NEW(f"./csv_output/baseline_2MILLION.csv", "./Dataset/Testing/annotationStore_UNIQUE.csv")

For the predictions, a total of 20 queries were matched. 5 of the Queries had no relevant results
['convert a date string into yyyymmdd', 'priority queue', 'sort string list', 'postgresql connection', 'linear regression', 'scatter plot', 'html encode string', 'pretty print json', 'replace in file', 'readonly array', 'copying a file to a path', 'regex case insensitive', 'heatmap from 3d coordinates', 'unzipping large files', 'convert json to csv']
NDCG: 0.23844571589394653
