In [1]:
from datasets import load_dataset
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

import time
import pandas as pd
import numpy as np
import pickle

  from .autonotebook import tqdm as notebook_tqdm


## Step 0: Read in the data

In [2]:
python_data = load_dataset("code_search_net", "python")
python_data

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 412178
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 22176
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 23107
    })
})

In [3]:
# Use the training data for Python code only
data_train = python_data['train']

# if you would like a smaller dataset size
# data_train = python_data['train][:10000]

# convert Dataset --> DataFrame
train_df = pd.DataFrame(data_train)

## Step 1: Setting up ElasticSearch

In [4]:
es = Elasticsearch("http://localhost:9200")
# es.info().body

In [5]:
data_train[0].keys()

dict_keys(['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'])

In [6]:
# Creating the mappings (structure) for the python index
mappings = {
    "properties": {
        'repository_name': {"type": "text"}, 
        'func_path_in_repository': {"type": "text"}, 
        'func_name': {"type": "text"}, 
        'whole_func_string': {"type": "text"}, 
        'language': {"type": "text"}, 
        'func_code_string': {"type": "text"}, 
        'func_code_tokens': {"type": "text"}, 
        'func_documentation_string': {"type": "text"}, 
        'func_documentation_tokens': {"type": "text"}, 
        'split_name': {"type": "text"}, 
        'func_code_url': {"type": "text"}
    }
}

#analyzer settings
analyzer_settings = {
    "settings": {
        "analysis": {
            "analyzer": {
                "code_analyzer": {
                    "type": "custom",
                    "tokenizer": "whitespace",
                    "filter": ["lowercase"],
                    "char_filter": ["symbol_char_filter"]
                }
            },
            "char_filter": {
                "symbol_char_filter": {
                    "type": "mapping",
                    "mappings": [
                        "_=> ",    # Replace underscore with space
                        ";=>",     # Remove semicolons
                        "{=>",     # Remove opening curly braces
                        "}=>",     # Remove closing curly braces
                        ")=>",
                        "(=>"
                    ]
                }
            }
        }
    }
}

In [7]:
# Creating the index named python with the mappings above
# Will have an error if running more than once since indiced will already be created
try:
    es.indices.create(index="python", mappings=mappings, body=analyzer_settings)
except:
    pass

## Step 2: Adding Data into ElasticSearch

In [8]:
start = time.time()

# Creating a list of dictionaries with all the data to be added in to the ElasticSearch index
bulk_data = []
for i, row in enumerate(data_train):
    bulk_data.append(
        {
            "_index": "python",
            "_id": i,
            "_source": {
                "repository_name": row['repository_name'],
                'func_path_in_repository': row['func_path_in_repository'], 
                'func_name': row['func_name'], 
                'whole_func_string': row['whole_func_string'], 
                'language': row['language'], 
                'func_code_string': row['func_code_string'], 
                'func_code_tokens': row['func_code_tokens'], 
                'func_documentation_string': row['func_documentation_string'], 
                'func_documentation_tokens': row['func_documentation_tokens'], 
                'split_name': row['split_name'], 
                'func_code_url': row['func_code_url']
            }
        }
    )
    
end = time.time()
print("Time taken to read data into bulk_data:", end - start)

Time taken to read data into bulk_data: 138.64055919647217


In [9]:
start = time.time()

# Adding data into the index
bulk(es, bulk_data)

end = time.time()
print("time taken to add data into the index:", end - start)

time taken to add data into the index: 310.7415289878845


In [10]:
# verifying that all data has been read into the python index properly
es.indices.refresh(index="python")
es.cat.count(index="python", format="json")

ListApiResponse([{'epoch': '1702307093', 'timestamp': '15:04:53', 'count': '412178'}])

## Step 3: Implementing the Search Function

In [11]:
def es_search(query, k=10):
    """
    Searches the data using ElasticSearch to find the k most similar documents to the query.
    Returns a list of the k most similar functions, along with their GitHub URLs and their similarity scores to the query
    """

    es_query = {
        "query": {
            "bool": {
                "must": {
                    "query_string": {
                        "query": query,
                        "fields": [
                            'repository_name',
                            'func_path_in_repository',
                            'func_name',
                            'whole_func_string^3', #boost 3x
                            'language',
                            'func_code_string',
                            'func_code_tokens^2', # boost 2x
                            'func_documentation_string', 
                            'func_documentation_tokens',
                            'split_name',
                            'func_code_url'
                        ],
                        "phrase_slop": 2  # still considered a match if they are up to two terms apart
                    }
                },
            }
        },
        "size": k
    }
    
    response = es.search(index="python", body=es_query)
    
    results = []
    # for each result, add the function name, the GitHub URL of the function, and the similarity score to the results list
    for hit in response['hits']['hits']:
        row = hit['_source']
        results.append((row['func_name'], row['func_code_url'], hit['_score']))
        
    return results

## Step 4: Testing the Search Engine

In [12]:
query = "unique elements"
es_search(query, 5)

[('unique',
  'https://github.com/odlgroup/odl/blob/b8443f6aca90e191ba36c91d32253c5a36249a6c/odl/util/utility.py#L1573-L1611',
  45.621704),
 ('generate_random_sframe',
  'https://github.com/apple/turicreate/blob/74514c3f99e25b46f22c6e02977fe3da69221c2e/src/unity/python/turicreate/util/_sframe_generation.py#L13-L71',
  44.050896),
 ('unique',
  'https://github.com/limix/numpy-sugar/blob/4bdfa26913135c76ef3cd542a332f4e5861e948b/numpy_sugar/_array.py#L132-L149',
  43.924316),
 ('BaseProvider.random_sample',
  'https://github.com/joke2k/faker/blob/965824b61132e52d92d1a6ce470396dbbe01c96c/faker/providers/__init__.py#L243-L248',
  43.847404),
 ('unique',
  'https://github.com/dedupeio/dedupe/blob/9f7c9f84473a4bcacf0f2b11152d8ed3eb35d48b/dedupe/labeler.py#L383-L390',
  42.582573)]

In [13]:
query = "create cookie"
es_search(query, 3)

[('WHTTPCookieJar.import_header_text',
  'https://github.com/a1ezzz/wasp-general/blob/1029839d33eb663f8dec76c1c46754d53c1de4a9/wasp_general/network/web/cookies.py#L316-L324',
  46.92044),
 ('IIIFAuth.access_token',
  'https://github.com/zimeon/iiif/blob/9d10018d01202fa2a76dfa61598dc6eca07b471f/iiif/auth.py#L257-L269',
  46.747246),
 ('make_cookie',
  'https://github.com/IdentityPython/pysaml2/blob/d3aa78eeb7d37c12688f783cb4db1c7263a14ad6/src/saml2/httputil.py#L320-L346',
  45.509644)]

## Step 5: Evaluating the Search Engine

In [14]:
def run_test_query_python_es(query_list, k=10):
    """
    Takes in a list of Python queries. Runs the search engine on those queries and returns
    the top k results for that query
    """
    test_results = []
    i = 1
    
    total_start = time.time()
    for query in query_list:
        # get top k results of query in our elastic search engine
        query_search = es_search(query, k)
        
        # for each result of the query, add a row to test_results with
        # the language (python), the query, and the GitHub Url to the result
        for result in query_search:
            # query_results = ["python", query, result[1]]
            query_results = ["python", query, result[1], result[2]]
            test_results.append(query_results)
        
        
    total_end = time.time()
    print("Time taken for all queries:", total_end - total_start)
    return test_results

In [15]:
# read in the annotated test dataset and get only the Python queries
test_queries = pd.read_csv('annotation_store.csv')
python_queries = test_queries[test_queries['Language'] == 'Python']

query_list = python_queries['Query'].unique().tolist()

In [16]:
# run the tests for the evaluation data, return top 300 results for each
test_es_results = run_test_query_python_es(query_list, 300)

Time taken for all queries: 9.671287298202515


In [18]:
# store the test results as a Pandas DataFrame
test_es_results_df = pd.DataFrame(test_es_results)
test_es_results_df.columns=['language', 'query', 'url', 'score']

In [19]:
# Export the model predictions to a csv file
out = test_es_results_df[['language', 'query', 'url']]
out.to_csv("predictions/es_model_predictions.csv", index=False)

**Simple Model (query_string search):**
- Using 200 top results from ElasticSearch engine
```
% of URLs in predictions that exist in the annotation dataset:
        python: 31.24%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
        python: 32.42%
NDCG:
        python: 0.355
NDCG (full ranking):
        python: 0.203
```

**Improved Model (field boosting + analyzer )**
- Using 300 top results from ElasticSearch engine
```
% of URLs in predictions that exist in the annotation dataset:
        python: 31.75%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
        python: 32.76%
NDCG:
        python: 0.367
NDCG (full ranking):
        python: 0.202
```


# Implementing LTR with LGBMRanker
reference: https://towardsdatascience.com/how-to-implement-learning-to-rank-model-using-python-569cd9c49b08 

In [22]:
from sentence_transformers import SentenceTransformer
from torch.nn.functional import cosine_similarity
from lightgbm import LGBMRanker
from sklearn.model_selection import train_test_split

## Step 1: Creating Training Data

In [23]:
sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')

In [24]:
# if you would like to make the embeddings yourself (WARNING: MAY BE SLOW):
# code_tokens = [sentence_transformer.encode(train_df['func_code_string'][i], convert_to_tensor=True) for i in range(len(train_df))]
# docu_tokens = [sentence_transformer.encode(train_df['func_documentation_string'][i], convert_to_tensor=True) for i in range(len(train_df))]

# otherwise can load in embeddings:
with open('python_train_code_tokens.pkl', 'rb') as f:
    code_tokens = pickle.load(f)
    
with open('python_train_docu_tokens.pkl', 'rb') as f:
    docu_tokens = pickle.load(f)

In [25]:
def parse_query(q):
    """
    Given an input, encodes it using the sentence_transformer.
    Returns a tensor vector representation of the input.
    """
    encoded = sentence_transformer.encode(q, convert_to_tensor=True)
    return encoded

In [26]:
train_df.head()

Unnamed: 0,repository_name,func_path_in_repository,func_name,whole_func_string,language,func_code_string,func_code_tokens,func_documentation_string,func_documentation_tokens,split_name,func_code_url
0,ageitgey/face_recognition,examples/face_recognition_knn.py,train,"def train(train_dir, model_save_path=None, n_n...",python,"def train(train_dir, model_save_path=None, n_n...","[def, train, (, train_dir, ,, model_save_path,...",Trains a k-nearest neighbors classifier for fa...,"[Trains, a, k, -, nearest, neighbors, classifi...",train,https://github.com/ageitgey/face_recognition/b...
1,ageitgey/face_recognition,examples/face_recognition_knn.py,predict,"def predict(X_img_path, knn_clf=None, model_pa...",python,"def predict(X_img_path, knn_clf=None, model_pa...","[def, predict, (, X_img_path, ,, knn_clf, =, N...",Recognizes faces in given image using a traine...,"[Recognizes, faces, in, given, image, using, a...",train,https://github.com/ageitgey/face_recognition/b...
2,ageitgey/face_recognition,examples/face_recognition_knn.py,show_prediction_labels_on_image,"def show_prediction_labels_on_image(img_path, ...",python,"def show_prediction_labels_on_image(img_path, ...","[def, show_prediction_labels_on_image, (, img_...",Shows the face recognition results visually.\n...,"[Shows, the, face, recognition, results, visua...",train,https://github.com/ageitgey/face_recognition/b...
3,ageitgey/face_recognition,face_recognition/api.py,_rect_to_css,"def _rect_to_css(rect):\n """"""\n Convert ...",python,"def _rect_to_css(rect):\n """"""\n Convert ...","[def, _rect_to_css, (, rect, ), :, return, rec...",Convert a dlib 'rect' object to a plain tuple ...,"[Convert, a, dlib, rect, object, to, a, plain,...",train,https://github.com/ageitgey/face_recognition/b...
4,ageitgey/face_recognition,face_recognition/api.py,_trim_css_to_bounds,"def _trim_css_to_bounds(css, image_shape):\n ...",python,"def _trim_css_to_bounds(css, image_shape):\n ...","[def, _trim_css_to_bounds, (, css, ,, image_sh...","Make sure a tuple in (top, right, bottom, left...","[Make, sure, a, tuple, in, (, top, right, bott...",train,https://github.com/ageitgey/face_recognition/b...


In [27]:
# add in the embeddings
train_df['code_token'] = code_tokens.values()
train_df['docu_token'] = docu_tokens.values()

In [28]:
# we want to get the token representation of the ES results
es_model_preds = test_es_results_df

# match embeddings to the url of our ES results
to_merge = train_df[['func_code_url', 'code_token', 'docu_token']]
es_with_tokens = es_model_preds.merge(to_merge, left_on='url', right_on='func_code_url')
es_with_tokens.head()

Unnamed: 0,language,query,url,score,func_code_url,code_token,docu_token
0,python,sorting multiple arrays based on another array...,https://github.com/pyviz/holoviews/blob/ae0dd2...,95.27759,https://github.com/pyviz/holoviews/blob/ae0dd2...,"[tensor(-0.0594), tensor(0.0142), tensor(-0.04...","[tensor(-0.0302), tensor(0.0654), tensor(-0.00..."
1,python,sorting multiple arrays based on another array...,https://github.com/google/prettytensor/blob/75...,91.74193,https://github.com/google/prettytensor/blob/75...,"[tensor(0.0362), tensor(0.0141), tensor(-0.079...","[tensor(0.0491), tensor(0.0401), tensor(-0.064..."
2,python,sorting multiple arrays based on another array...,https://github.com/rkday/nose2dep/blob/135a529...,84.326645,https://github.com/rkday/nose2dep/blob/135a529...,"[tensor(-0.0707), tensor(0.0311), tensor(-0.05...","[tensor(-0.0843), tensor(0.0162), tensor(-0.06..."
3,python,sorting multiple arrays based on another array...,https://github.com/Zitrax/nose-dep/blob/fd29c9...,82.950325,https://github.com/Zitrax/nose-dep/blob/fd29c9...,"[tensor(0.0173), tensor(-0.0341), tensor(0.009...","[tensor(0.0289), tensor(-0.0360), tensor(0.022..."
4,python,sorting multiple arrays based on another array...,https://github.com/bcbio/bcbio-nextgen/blob/6a...,80.83913,https://github.com/bcbio/bcbio-nextgen/blob/6a...,"[tensor(0.0325), tensor(-0.0050), tensor(-0.02...","[tensor(0.0463), tensor(-0.0255), tensor(-0.01..."


In [29]:
# create embeddings of the queries
queries = es_with_tokens['query'].unique()
query_mapping = {q:parse_query(q) for q in queries}
es_with_tokens['query_tokens'] = es_with_tokens['query'].map(query_mapping)

In [30]:
# cosine similarity function
def cos_sim(tensor1, tensor2):
    return cosine_similarity(tensor1.unsqueeze(0), tensor2.unsqueeze(0)).item()

# get the cosine similarities for code and documentation
es_with_tokens['code_similarity'] = es_with_tokens.apply(lambda row: cos_sim(row['query_tokens'], row['code_token']), axis=1)
es_with_tokens['docu_similarity'] = es_with_tokens.apply(lambda row: cos_sim(row['query_tokens'], row['docu_token']), axis=1)

In [31]:
# reading in relevance scores and creating a merged dataset
relevance_scores = pd.read_csv("evaluation/annotation_store.csv")

#only working with Python
python_rs = relevance_scores[relevance_scores['Language']=='Python']

#merging and dropping irrelevant columns
merged_scores = pd.merge(es_with_tokens, python_rs,  how='left', left_on=['query','url'], right_on = ['Query','GitHubUrl'])
merged_scores = merged_scores.drop(columns=['Language', 'Query', 'GitHubUrl', 'Notes'])

#search results that were not in the relevance csv get a relevance score of 0
merged_scores['Relevance'] = merged_scores['Relevance'].fillna(0)

In [32]:
# need to have queries grouped together for our ranking model
merged_scores = merged_scores.sort_values(by=["query"], ascending=[True])

# our feature will be the relevance score our es model returns
X = merged_scores[['score', 'code_similarity', 'docu_similarity']]
y = merged_scores['Relevance']

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False, random_state=11)
train_groups = merged_scores.iloc[:len(X_train)].groupby('query').size().to_numpy()
test_groups = merged_scores.iloc[len(X_train):].groupby('query').size().to_numpy()

#checking lengths match up
assert len(X_train) == sum(train_groups)
assert len(X_test) == sum(test_groups)

## Step 2: Fitting the Model

In [33]:
# create model
model = LGBMRanker(objective="lambdarank", metric="ndcg")

#fit model
model.fit(X_train, y_train, group=train_groups, eval_set=[(X_test,y_test)],eval_group=[test_groups],eval_metric=['ndcg'])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000524 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 765
[LightGBM] [Info] Number of data points in the train set: 27051, number of used features: 3


## Step 3: Using the Model to Edit Results

In [34]:
# use the model to predict an adjusted score
merged_scores['adjusted_score'] = model.predict(merged_scores[['score', 'code_similarity', 'docu_similarity']])
merged_scores.head()

Unnamed: 0,language,query,url,score,func_code_url,code_token,docu_token,query_tokens,code_similarity,docu_similarity,Relevance,adjusted_score
5193,python,aes encryption,https://github.com/emory-libraries/eulfedora/b...,26.64624,https://github.com/emory-libraries/eulfedora/b...,"[tensor(-0.0138), tensor(0.0153), tensor(-0.00...","[tensor(-0.0269), tensor(0.0363), tensor(-0.03...","[tensor(-0.0402), tensor(0.0566), tensor(-0.03...",0.044184,-0.000421,0.0,-4.201495
4923,python,aes encryption,https://github.com/UCL-INGI/INGInious/blob/cbd...,36.234085,https://github.com/UCL-INGI/INGInious/blob/cbd...,"[tensor(0.0086), tensor(0.0712), tensor(-0.080...","[tensor(-0.0148), tensor(0.0829), tensor(-0.04...","[tensor(-0.0402), tensor(0.0566), tensor(-0.03...",-0.034489,-0.022924,0.0,-2.807974
4924,python,aes encryption,https://github.com/saltstack/salt/blob/e8541fd...,36.123787,https://github.com/saltstack/salt/blob/e8541fd...,"[tensor(0.0209), tensor(-0.1407), tensor(-0.01...","[tensor(0.0332), tensor(-0.1304), tensor(-0.04...","[tensor(-0.0402), tensor(0.0566), tensor(-0.03...",-0.06027,-0.096251,0.0,-4.500716
4926,python,aes encryption,https://github.com/UCSBarchlab/PyRTL/blob/0988...,36.123787,https://github.com/UCSBarchlab/PyRTL/blob/0988...,"[tensor(-0.0091), tensor(0.0212), tensor(-0.12...","[tensor(-0.0562), tensor(0.0040), tensor(-0.06...","[tensor(-0.0402), tensor(0.0566), tensor(-0.03...",-0.005832,0.04836,0.0,-4.343052
4928,python,aes encryption,https://github.com/openid/JWTConnect-Python-Cr...,35.89798,https://github.com/openid/JWTConnect-Python-Cr...,"[tensor(0.0162), tensor(0.0723), tensor(-0.030...","[tensor(0.0249), tensor(-0.0223), tensor(-0.03...","[tensor(-0.0402), tensor(0.0566), tensor(-0.03...",-0.027712,-0.053528,0.0,-4.132794


In [35]:
# only keep the results that have an adjusted score above the threshold (found through trial and error)
thresh = -0.75
new_preds = merged_scores[(merged_scores['adjusted_score'] >= thresh)]

#output results
new_preds[['language', 'query', 'url']].to_csv('predictions/ltr_es_model_predictions.csv')

### ElasticSearch + LTR
threshold = -0.75
```
% of URLs in predictions that exist in the annotation dataset:
        python: 20.32%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
        python: 21.39%
NDCG:
        python: 0.438
NDCG (full ranking):
        python: 0.241
```

### Elasticsearch with Cosim Sim (both) + LTR
threshold = -0.75
```
% of URLs in predictions that exist in the annotation dataset:
        python: 23.05%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
        python: 25.37%
NDCG:
        python: 0.549
NDCG (full ranking):
        python: 0.408
```