In [58]:
from datasets import load_dataset
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

import time
import pandas as pd
import numpy as np
import pickle

## Step 0: Read in the data

In [59]:
python_data = load_dataset("code_search_net", "python")
python_data

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 412178
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 22176
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 23107
    })
})

In [60]:
# Use the training data for Python code only
data_train = python_data['train']
# data_train_snippet = data_train['whole_func_string']

## Step 1: Setting up ElasticSearch

In [61]:
es = Elasticsearch("http://localhost:9200")
# es.info().body

In [62]:
data_train[0].keys()

dict_keys(['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'])

In [63]:
# Creating the mappings (structure) for the python index
mappings = {
    "properties": {
        'repository_name': {"type": "text"}, 
        'func_path_in_repository': {"type": "text"}, 
        'func_name': {"type": "text"}, 
        'whole_func_string': {"type": "text"}, 
        'language': {"type": "text"}, 
        'func_code_string': {"type": "text"}, 
        'func_code_tokens': {"type": "text"}, 
        'func_documentation_string': {"type": "text"}, 
        'func_documentation_tokens': {"type": "text"}, 
        'split_name': {"type": "text"}, 
        'func_code_url': {"type": "text"}
    }
}

#analyzer settings
analyzer_settings = {
    "settings": {
        "analysis": {
            "analyzer": {
                "code_analyzer": {
                    "type": "custom",
                    "tokenizer": "whitespace",
                    "filter": ["lowercase"],
                    "char_filter": ["symbol_char_filter"]
                }
            },
            "char_filter": {
                "symbol_char_filter": {
                    "type": "mapping",
                    "mappings": [
                        "_=> ",    # Replace underscore with space
                        ";=>",     # Remove semicolons
                        "{=>",     # Remove opening curly braces
                        "}=>",     # Remove closing curly braces
                        ")=>",
                        "(=>"
                    ]
                }
            }
        }
    }
}

In [64]:
# Creating the index python with the mappings above
# es.indices.create(index="python", mappings=mappings, body=analyzer_settings)

## Step 2: Adding Data into ElasticSearch

In [65]:
start = time.time()

# Creating a list of dictionaries with all the data to be added in to the ElasticSearch index
bulk_data = []
for i, row in enumerate(data_train):
    bulk_data.append(
        {
            "_index": "python",
            "_id": i,
            "_source": {
                "repository_name": row['repository_name'],
                'func_path_in_repository': row['func_path_in_repository'], 
                'func_name': row['func_name'], 
                'whole_func_string': row['whole_func_string'], 
                'language': row['language'], 
                'func_code_string': row['func_code_string'], 
                'func_code_tokens': row['func_code_tokens'], 
                'func_documentation_string': row['func_documentation_string'], 
                'func_documentation_tokens': row['func_documentation_tokens'], 
                'split_name': row['split_name'], 
                'func_code_url': row['func_code_url']
            }
        }
    )
    
end = time.time()
print("Time taken to read data into bulk_data:", end - start)

Time taken to read data into bulk_data: 644.6600196361542


In [68]:
start = time.time()

# Adding data into the index
bulk(es, bulk_data)

end = time.time()
print("time taken to add data into the index:", end - start)

time taken to add data into the index: 1660.0469880104065


In [69]:
# verifying that all data has been read into the python index properly
es.indices.refresh(index="python")
es.cat.count(index="python", format="json")

[{'epoch': '1701667572', 'timestamp': '05:26:12', 'count': '412178'}]

## Step 3: Implementing the Search Function

In [70]:
def es_search(query, k=10):
    """
    Searches the data using ElasticSearch to find the k most similar documents to the query.
    Returns a list of the k most similar functions, along with their GitHub URLs and their similarity scores to the query
    """

    es_query = {
        "query": {
            "bool": {
                "must": {
                    "query_string": {
                        "query": query,
                        "fields": [
                            'repository_name',
                            'func_path_in_repository',
                            'func_name',
                            'whole_func_string^3', #boost
                            'language',
                            'func_code_string',
                            'func_code_tokens^2', # boost
                            'func_documentation_string',
                            'func_documentation_tokens',
                            'split_name',
                            'func_code_url'
                        ],
                        "phrase_slop": 2  # allows for flexibility in phrase matching
                    }
                },
                # "should": [
                #     {"match_phrase": {"func_name": {"query": query, "boost": 2}}},
                #     {"match_phrase": {"whole_func_string": {"query": query, "boost": 3}}}
                # ]
            }
        },
        "size": k
    }
    
    response = es.search(index="python", body=es_query)
    
    results = []
    # for each result, add the function name, the GitHub URL of the function, and the similarity score to the results list
    for hit in response['hits']['hits']:
        row = hit['_source']
        results.append((row['func_name'], row['func_code_url'], hit['_score']))
        
    return results

## Step 4: Testing the Search Engine

In [None]:
query = "unique elements"
es_search(query, 5)

  response = es.search(index="python", body=es_query)


[('unique',
  'https://github.com/odlgroup/odl/blob/b8443f6aca90e191ba36c91d32253c5a36249a6c/odl/util/utility.py#L1573-L1611',
  45.621704),
 ('generate_random_sframe',
  'https://github.com/apple/turicreate/blob/74514c3f99e25b46f22c6e02977fe3da69221c2e/src/unity/python/turicreate/util/_sframe_generation.py#L13-L71',
  44.050896),
 ('unique',
  'https://github.com/limix/numpy-sugar/blob/4bdfa26913135c76ef3cd542a332f4e5861e948b/numpy_sugar/_array.py#L132-L149',
  43.924316),
 ('BaseProvider.random_sample',
  'https://github.com/joke2k/faker/blob/965824b61132e52d92d1a6ce470396dbbe01c96c/faker/providers/__init__.py#L243-L248',
  43.847404),
 ('unique',
  'https://github.com/dedupeio/dedupe/blob/9f7c9f84473a4bcacf0f2b11152d8ed3eb35d48b/dedupe/labeler.py#L383-L390',
  42.582573)]

In [None]:
es.info()

{'name': '222f85e45a05',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'OwyYYkU1RSeGf5Nx4VAQjA',
 'version': {'number': '8.7.0',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '09520b59b6bc1057340b55750186466ea715e30e',
  'build_date': '2023-03-27T16:31:09.816451435Z',
  'build_snapshot': False,
  'lucene_version': '9.5.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [None]:
query = "create cookie"
es_search(query, 3)

  response = es.search(index="python", body=es_query)


[('WHTTPCookieJar.import_header_text',
  'https://github.com/a1ezzz/wasp-general/blob/1029839d33eb663f8dec76c1c46754d53c1de4a9/wasp_general/network/web/cookies.py#L316-L324',
  46.92044),
 ('IIIFAuth.access_token',
  'https://github.com/zimeon/iiif/blob/9d10018d01202fa2a76dfa61598dc6eca07b471f/iiif/auth.py#L257-L269',
  46.747246),
 ('make_cookie',
  'https://github.com/IdentityPython/pysaml2/blob/d3aa78eeb7d37c12688f783cb4db1c7263a14ad6/src/saml2/httputil.py#L320-L346',
  45.509644)]

## Step 5: Evaluating the Search Engine

In [71]:
def run_test_query_python_es(query_list, k=10):
    """
    Takes in a list of Python queries. Runs the search engine on those queries and returns
    the top k results for that query
    """
    test_results = []
    i = 1
    
    total_start = time.time()
    for query in query_list:
        # get top k results of query in our elastic search engine
        query_search = es_search(query, k)
        
        # for each result of the query, add a row to test_results with
        # the language (python), the query, and the GitHub Url to the result
        for result in query_search:
            # query_results = ["python", query, result[1]]
            query_results = ["python", query, result[1], result[2]]
            test_results.append(query_results)
        
        
    total_end = time.time()
    print("Time taken for all queries:", total_end - total_start)
    return test_results

In [72]:
# read in the annotated test dataset and get only the Python queries
test_queries = pd.read_csv('annotation_store.csv')
python_queries = test_queries[test_queries['Language'] == 'Python']

query_list = python_queries['Query'].unique().tolist()

In [73]:
# run the tests for the evaluation data
test_es_results = run_test_query_python_es(query_list, 300)

  response = es.search(index="python", body=es_query)


Time taken for all queries: 37.51243734359741


In [74]:
# store the test results as a Pandas DataFrame
test_es_results_df = pd.DataFrame(test_es_results)
test_es_results_df.columns=['language', 'query', 'url', 'score']


Unnamed: 0,language,query,url,score
0,python,sorting multiple arrays based on another array...,https://github.com/pyviz/holoviews/blob/ae0dd2...,95.294620
1,python,sorting multiple arrays based on another array...,https://github.com/google/prettytensor/blob/75...,91.741810
2,python,sorting multiple arrays based on another array...,https://github.com/rkday/nose2dep/blob/135a529...,84.342710
3,python,sorting multiple arrays based on another array...,https://github.com/Zitrax/nose-dep/blob/fd29c9...,82.965920
4,python,sorting multiple arrays based on another array...,https://github.com/bcbio/bcbio-nextgen/blob/6a...,80.840324
...,...,...,...,...
29695,python,convert html to pdf,https://github.com/Phyks/libbmc/blob/9ef1a29d2...,30.736752
29696,python,convert html to pdf,https://github.com/acutesoftware/AIKIF/blob/fc...,30.733560
29697,python,convert html to pdf,https://github.com/PlaidWeb/Publ/blob/ce789363...,30.733560
29698,python,convert html to pdf,https://github.com/DS-100/nb-to-gradescope/blo...,30.722044


In [89]:
test_es_results_df.to_pickle("test_es_results_df.pkl")

In [19]:
# Export the model predictions to a csv file
out = test_es_results_df[['language', 'query', 'url']]
out.to_csv("es_model_predictions.csv", index=False)

**Simple Model (query_string search):**
- Using 200 top results from ElasticSearch engine
```
% of URLs in predictions that exist in the annotation dataset:
        python: 31.24%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
        python: 32.42%
NDCG:
        python: 0.355
NDCG (full ranking):
        python: 0.203
```

**Improved Model (field boosting + analyzer)**
- Using 300 top results from ElasticSearch engine
```
% of URLs in predictions that exist in the annotation dataset:
        python: 31.75%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
        python: 32.76%
NDCG:
        python: 0.373
NDCG (full ranking):
        python: 0.196
```

**Improved Model (field boosting + analyzer with 'should' statement removed)**
- Using 300 top results from ElasticSearch engine
```
% of URLs in predictions that exist in the annotation dataset:
        python: 31.75%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
        python: 32.76%
NDCG:
        python: 0.367
NDCG (full ranking):
        python: 0.202
```


# Implementing LTR with LGBMRanker
reference: https://towardsdatascience.com/how-to-implement-learning-to-rank-model-using-python-569cd9c49b08 

## Step 1: Creating Training Data

In [4]:
from sentence_transformers import SentenceTransformer
sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')

In [7]:
with open('python_train_code_tokens.pkl', 'rb') as f:
    code_tokens = pickle.load(f)
    
with open('python_train_docu_tokens.pkl', 'rb') as f:
    docu_tokens = pickle.load(f)

In [93]:
train_df = pd.DataFrame(data_train)

In [8]:
def parse_query(q):
    encoded = sentence_transformer.encode(q, convert_to_tensor=True)
    return encoded

In [None]:
train_df.head()

IndexError: index 11 is out of bounds for axis 0 with size 11

IndexError: index 11 is out of bounds for axis 0 with size 11

In [88]:
train_df['code_token'] = code_tokens.values()

IndexError: index 11 is out of bounds for axis 0 with size 11

In [87]:
train_df['docu_token'] = docu_tokens.values()

KeyboardInterrupt: 

In [75]:
# get the token representation of the es_results
es_model_preds = test_es_results_df
# pd.read_csv("es_model_predictions.csv")

to_merge = train_df[['func_code_url', 'code_token', 'docu_tokens']]
to_merge.head()

Unnamed: 0,func_code_url,code_token
0,https://github.com/ageitgey/face_recognition/b...,"[tensor(0.0289), tensor(0.0024), tensor(0.0457..."
1,https://github.com/ageitgey/face_recognition/b...,"[tensor(0.0075), tensor(0.0143), tensor(-0.003..."
2,https://github.com/ageitgey/face_recognition/b...,"[tensor(-0.0106), tensor(0.0305), tensor(0.001..."
3,https://github.com/ageitgey/face_recognition/b...,"[tensor(0.0011), tensor(0.0091), tensor(0.0238..."
4,https://github.com/ageitgey/face_recognition/b...,"[tensor(0.0342), tensor(0.0096), tensor(0.0531..."


In [76]:
es_with_tokens = es_model_preds.merge(to_merge, left_on='url', right_on='func_code_url')
es_with_tokens.head()

Unnamed: 0,language,query,url,score,func_code_url,code_token
0,python,sorting multiple arrays based on another array...,https://github.com/pyviz/holoviews/blob/ae0dd2...,95.29462,https://github.com/pyviz/holoviews/blob/ae0dd2...,"[tensor(-0.0594), tensor(0.0142), tensor(-0.04..."
1,python,sorting multiple arrays based on another array...,https://github.com/google/prettytensor/blob/75...,91.74181,https://github.com/google/prettytensor/blob/75...,"[tensor(0.0362), tensor(0.0141), tensor(-0.079..."
2,python,sorting multiple arrays based on another array...,https://github.com/rkday/nose2dep/blob/135a529...,84.34271,https://github.com/rkday/nose2dep/blob/135a529...,"[tensor(-0.0707), tensor(0.0311), tensor(-0.05..."
3,python,sorting multiple arrays based on another array...,https://github.com/Zitrax/nose-dep/blob/fd29c9...,82.96592,https://github.com/Zitrax/nose-dep/blob/fd29c9...,"[tensor(0.0173), tensor(-0.0341), tensor(0.009..."
4,python,sorting multiple arrays based on another array...,https://github.com/bcbio/bcbio-nextgen/blob/6a...,80.840324,https://github.com/bcbio/bcbio-nextgen/blob/6a...,"[tensor(0.0325), tensor(-0.0050), tensor(-0.02..."


In [77]:
queries = es_with_tokens['query'].unique()
query_mapping = {q:parse_query(q) for q in queries}

In [78]:
es_with_tokens['query_tokens'] = es_with_tokens['query'].map(query_mapping)

In [79]:
from torch.nn.functional import cosine_similarity

def cos_sim(tensor1, tensor2):
    return cosine_similarity(tensor1.unsqueeze(0), tensor2.unsqueeze(0)).item()

In [80]:
es_with_tokens['code_similarity'] = es_with_tokens.apply(lambda row: cos_sim(row['query_tokens'], row['code_token']), axis=1)

In [None]:
es_with_tokens['docu_similarity'] = es_with_tokens.apply(lambda row: cos_sim(row['query_tokens'], row['docu_token']), axis=1)

In [None]:
# reading in relevance scores and creating a merged dataset
relevance_scores = pd.read_csv("annotation_store.csv")

#only working with Python
python_rs = relevance_scores[relevance_scores['Language']=='Python']

#merging and dropping irrelevant columns
merged_scores = pd.merge(es_with_tokens, python_rs,  how='left', left_on=['query','url'], right_on = ['Query','GitHubUrl'])
merged_scores = merged_scores.drop(columns=['Language', 'Query', 'GitHubUrl', 'Notes'])

#search results that were not in the relevance csv get a relevance score of 0
merged_scores['Relevance'] = merged_scores['Relevance'].fillna(0)

In [None]:
from sklearn.model_selection import train_test_split

# need to have queries grouped together for our ranking model
merged_scores = merged_scores.sort_values(by=["query"], ascending=[True])

# our feature will be the relevance score our es model returns
X = merged_scores[['score', 'code_similarity', 'docu_similarity']]
y = merged_scores['Relevance']

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False, random_state=11)
train_groups = merged_scores.iloc[:len(X_train)].groupby('query').size().to_numpy()
test_groups = merged_scores.iloc[len(X_train):].groupby('query').size().to_numpy()

#checking lengths match up
assert len(X_train) == sum(train_groups)
assert len(X_test) == sum(test_groups)

## Step 2: Fitting the Model

In [83]:
from lightgbm import LGBMRanker
# create model
model = LGBMRanker(objective="lambdarank", metric="ndcg")

#fit model
model.fit(X_train, y_train, group=train_groups, eval_set=[(X_test,y_test)],eval_group=[test_groups],eval_metric=['ndcg'])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000716 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 27051, number of used features: 2


## Step 3: Using the Model to Edit Results

In [85]:
# use the model to predict an adjusted score
merged_scores['adjusted_score'] = model.predict(merged_scores[['score', 'code_similarity']])
merged_scores

Unnamed: 0,language,query,url,score,func_code_url,code_token,query_tokens,code_similarity,Relevance,adjusted_score
4852,python,aes encryption,https://github.com/saltstack/salt/blob/e8541fd...,41.184580,https://github.com/saltstack/salt/blob/e8541fd...,"[tensor(0.0483), tensor(0.0898), tensor(-0.010...","[tensor(-0.0402), tensor(0.0566), tensor(-0.03...",-0.003748,0.0,-1.426020
5201,python,aes encryption,https://github.com/brocade/pynos/blob/bd8a34e9...,26.465645,https://github.com/brocade/pynos/blob/bd8a34e9...,"[tensor(-0.0653), tensor(0.0220), tensor(0.060...","[tensor(-0.0402), tensor(0.0566), tensor(-0.03...",0.025786,0.0,-4.536407
5202,python,aes encryption,https://github.com/has2k1/plotnine/blob/566e57...,26.345700,https://github.com/has2k1/plotnine/blob/566e57...,"[tensor(-0.0260), tensor(0.0442), tensor(0.037...","[tensor(-0.0402), tensor(0.0566), tensor(-0.03...",-0.010518,0.0,-5.123511
5203,python,aes encryption,https://github.com/terrycain/aioboto3/blob/0fd...,26.345700,https://github.com/terrycain/aioboto3/blob/0fd...,"[tensor(-0.0127), tensor(0.0771), tensor(-0.01...","[tensor(-0.0402), tensor(0.0566), tensor(-0.03...",0.071042,0.0,-5.384475
5204,python,aes encryption,https://github.com/spulec/moto/blob/4a286c4bc2...,26.191330,https://github.com/spulec/moto/blob/4a286c4bc2...,"[tensor(0.0392), tensor(-0.0297), tensor(-0.00...","[tensor(-0.0402), tensor(0.0566), tensor(-0.03...",0.037631,0.0,-4.328918
...,...,...,...,...,...,...,...,...,...,...
14789,python,write csv,https://github.com/i3visio/osrframework/blob/8...,38.171497,https://github.com/i3visio/osrframework/blob/8...,"[tensor(0.0807), tensor(0.0592), tensor(-0.038...","[tensor(0.0423), tensor(0.0717), tensor(-0.097...",0.191607,0.0,-3.425596
14787,python,write csv,https://github.com/coinkite/connectrum/blob/99...,38.294220,https://github.com/coinkite/connectrum/blob/99...,"[tensor(0.0184), tensor(0.0284), tensor(-0.026...","[tensor(0.0423), tensor(0.0717), tensor(-0.097...",0.265386,0.0,-3.950415
14786,python,write csv,https://github.com/mrstephenneal/databasetools...,38.859993,https://github.com/mrstephenneal/databasetools...,"[tensor(0.0158), tensor(0.0631), tensor(0.0847...","[tensor(0.0423), tensor(0.0717), tensor(-0.097...",-0.008689,0.0,-3.985775
14794,python,write csv,https://github.com/Cognexa/cxflow/blob/dd609e6...,37.291153,https://github.com/Cognexa/cxflow/blob/dd609e6...,"[tensor(-0.0889), tensor(0.0501), tensor(0.062...","[tensor(0.0423), tensor(0.0717), tensor(-0.097...",-0.013258,0.0,-4.512487


In [86]:
# only keep the results that have an adjusted score above the threshold (found through trial and error)
thresh = -0.75
new_preds = merged_scores[(merged_scores['adjusted_score'] >= thresh)]

#output results
new_preds[['language', 'query', 'url']].to_csv('ltr_es_model_predictions.csv')

threshold = -0.75
```
% of URLs in predictions that exist in the annotation dataset:
        python: 20.32%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
        python: 21.39%
NDCG:
        python: 0.438
NDCG (full ranking):
        python: 0.241
```

threshold = -5
```
% of URLs in predictions that exist in the annotation dataset:
        python: 30.84%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
        python: 31.97%
NDCG:
        python: 0.583
NDCG (full ranking):
        python: 0.213
```

### With Cosine Sim + LTR

```
% of URLs in predictions that exist in the annotation dataset:
        python: 22.45%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
        python: 24.35%
NDCG:
        python: 0.515
NDCG (full ranking):
        python: 0.341
```