In [1]:
from datasets import load_dataset
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

import time
import pandas as pd
import numpy as np

## Step 0: Read in the data

In [2]:
python_data = load_dataset("code_search_net", "python")
python_data

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 412178
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 22176
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 23107
    })
})

In [3]:
# Use the training data for Python code only
data_train = python_data['train']
data_train_snippet = data_train['whole_func_string']

## Step 1: Setting up ElasticSearch

In [4]:
es = Elasticsearch("http://localhost:9200")
es.info().body

{'name': '8dfd8d5f67dd',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': '3jzC3PX1R2SACsSjjagp6w',
 'version': {'number': '8.7.0',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '09520b59b6bc1057340b55750186466ea715e30e',
  'build_date': '2023-03-27T16:31:09.816451435Z',
  'build_snapshot': False,
  'lucene_version': '9.5.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [5]:
data_train[0].keys()

dict_keys(['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'])

In [6]:
# Creating the mappings (structure) for the python index
mappings = {
    "properties": {
        'repository_name': {"type": "text"}, 
        'func_path_in_repository': {"type": "text"}, 
        'func_name': {"type": "text"}, 
        'whole_func_string': {"type": "text"}, 
        'language': {"type": "text"}, 
        'func_code_string': {"type": "text"}, 
        'func_code_tokens': {"type": "text"}, 
        'func_documentation_string': {"type": "text"}, 
        'func_documentation_tokens': {"type": "text"}, 
        'split_name': {"type": "text"}, 
        'func_code_url': {"type": "text"}
    }
}

#analyzer settings
analyzer_settings = {
    "settings": {
        "analysis": {
            "analyzer": {
                "code_analyzer": {
                    "type": "custom",
                    "tokenizer": "whitespace",
                    "filter": ["lowercase"],
                    "char_filter": ["symbol_char_filter"]
                }
            },
            "char_filter": {
                "symbol_char_filter": {
                    "type": "mapping",
                    "mappings": [
                        "_=> ",    # Replace underscore with space
                        ";=>",     # Remove semicolons
                        "{=>",     # Remove opening curly braces
                        "}=>",     # Remove closing curly braces
                        ")=>",
                        "(=>"
                    ]
                }
            }
        }
    }
}

In [7]:
# Creating the index python with the mappings above
es.indices.create(index="python", mappings=mappings, body=analyzer_settings)

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [python/_lKLwfeiRP2DUYz6EyT7bA] already exists')

## Step 2: Adding Data into ElasticSearch

In [11]:
start = time.time()

# Creating a list of dictionaries with all the data to be added in to the ElasticSearch index
bulk_data = []
for i, row in enumerate(data_train):
    bulk_data.append(
        {
            "_index": "python",
            "_id": i,
            "_source": {
                "repository_name": row['repository_name'],
                'func_path_in_repository': row['func_path_in_repository'], 
                'func_name': row['func_name'], 
                'whole_func_string': row['whole_func_string'], 
                'language': row['language'], 
                'func_code_string': row['func_code_string'], 
                'func_code_tokens': row['func_code_tokens'], 
                'func_documentation_string': row['func_documentation_string'], 
                'func_documentation_tokens': row['func_documentation_tokens'], 
                'split_name': row['split_name'], 
                'func_code_url': row['func_code_url']
            }
        }
    )
    
end = time.time()
print("Time taken to read data into bulk_data:", end - start)

Time taken to read data into bulk_data: 191.51014494895935


In [13]:
start = time.time()

# Adding data into the index
bulk(es, bulk_data)

end = time.time()
print("time taken to add data into the index:", end - start)

time taken to add data into the index: 527.269453048706


In [14]:
# verifying that all data has been read into the python index properly
es.indices.refresh(index="python")
es.cat.count(index="python", format="json")

ListApiResponse([{'epoch': '1700604381', 'timestamp': '22:06:21', 'count': '412178'}])

## Step 3: Implementing the Search Function

In [118]:
def es_search(query, k=10):
    """
    Searches the data using ElasticSearch to find the k most similar documents to the query.
    Returns a list of the k most similar functions, along with their GitHub URLs and their similarity scores to the query
    """

    es_query = {
        "query": {
            "bool": {
                "must": {
                    "query_string": {
                        "query": query,
                        "fields": [
                            'repository_name',
                            'func_path_in_repository',
                            'func_name',
                            'whole_func_string^3', #boost
                            'language',
                            'func_code_string',
                            'func_code_tokens^2', # boost
                            'func_documentation_string',
                            'func_documentation_tokens',
                            'split_name',
                            'func_code_url'
                        ],
                        "phrase_slop": 2  # allows for flexibility in phrase matching
                    }
                },
                "should": [
                    {"match_phrase": {"func_name": {"query": query, "boost": 2}}},
                    {"match_phrase": {"whole_func_string": {"query": query, "boost": 3}}}
                ]
            }
        },
        "size": k
    }
    
    response = es.search(index="python", body=es_query)
    
    results = []
    # for each result, add the function name, the GitHub URL of the function, and the similarity score to the results list
    for hit in response['hits']['hits']:
        row = hit['_source']
        results.append((row['func_name'], row['func_code_url'], hit['_score']))
        
    return results

## Step 4: Testing the Search Engine

In [119]:
query = "unique elements"
es_search(query, 5)

[('unique',
  'https://github.com/odlgroup/odl/blob/b8443f6aca90e191ba36c91d32253c5a36249a6c/odl/util/utility.py#L1573-L1611',
  45.621704),
 ('generate_random_sframe',
  'https://github.com/apple/turicreate/blob/74514c3f99e25b46f22c6e02977fe3da69221c2e/src/unity/python/turicreate/util/_sframe_generation.py#L13-L71',
  44.050896),
 ('unique',
  'https://github.com/limix/numpy-sugar/blob/4bdfa26913135c76ef3cd542a332f4e5861e948b/numpy_sugar/_array.py#L132-L149',
  43.924316),
 ('BaseProvider.random_sample',
  'https://github.com/joke2k/faker/blob/965824b61132e52d92d1a6ce470396dbbe01c96c/faker/providers/__init__.py#L243-L248',
  43.847404),
 ('unique',
  'https://github.com/dedupeio/dedupe/blob/9f7c9f84473a4bcacf0f2b11152d8ed3eb35d48b/dedupe/labeler.py#L383-L390',
  42.582573)]

In [120]:
es.info()

ObjectApiResponse({'name': '8dfd8d5f67dd', 'cluster_name': 'docker-cluster', 'cluster_uuid': '3jzC3PX1R2SACsSjjagp6w', 'version': {'number': '8.7.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '09520b59b6bc1057340b55750186466ea715e30e', 'build_date': '2023-03-27T16:31:09.816451435Z', 'build_snapshot': False, 'lucene_version': '9.5.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [121]:
query = "create cookie"
es_search(query, 3)

[('WHTTPCookieJar.import_header_text',
  'https://github.com/a1ezzz/wasp-general/blob/1029839d33eb663f8dec76c1c46754d53c1de4a9/wasp_general/network/web/cookies.py#L316-L324',
  46.92044),
 ('IIIFAuth.access_token',
  'https://github.com/zimeon/iiif/blob/9d10018d01202fa2a76dfa61598dc6eca07b471f/iiif/auth.py#L257-L269',
  46.747246),
 ('make_cookie',
  'https://github.com/IdentityPython/pysaml2/blob/d3aa78eeb7d37c12688f783cb4db1c7263a14ad6/src/saml2/httputil.py#L320-L346',
  45.509644)]

## Step 5: Evaluating the Search Engine

In [122]:
def run_test_query_python_es(query_list, k=10):
    """
    Takes in a list of Python queries. Runs the search engine on those queries and returns
    the top k results for that query
    """
    test_results = []
    i = 1
    
    total_start = time.time()
    for query in query_list:
        # get top k results of query in our elastic search engine
        query_search = es_search(query, k)
        
        # for each result of the query, add a row to test_results with
        # the language (python), the query, and the GitHub Url to the result
        for result in query_search:
            query_results = ["python", query, result[1]]
            test_results.append(query_results)
        
        
    total_end = time.time()
    print("Time taken for all queries:", total_end - total_start)
    return test_results

In [123]:
# read in the annotated test dataset and get only the Python queries
test_queries = pd.read_csv('annotation_store.csv')
python_queries = test_queries[test_queries['Language'] == 'Python']

query_list = python_queries['Query'].unique().tolist()

In [124]:
# run the tests for the evaluation data
test_es_results = run_test_query_python_es(query_list, 300)

Time taken for all queries: 8.889116287231445


In [125]:
# store the test results as a Pandas DataFrame
test_es_results_df = pd.DataFrame(test_es_results)
test_es_results_df = test_es_results_df.rename(columns={0: "language", 1: "query", 2: "url"})
test_es_results_df

Unnamed: 0,language,query,url
0,python,sorting multiple arrays based on another array...,https://github.com/pyviz/holoviews/blob/ae0dd2...
1,python,sorting multiple arrays based on another array...,https://github.com/google/prettytensor/blob/75...
2,python,sorting multiple arrays based on another array...,https://github.com/rkday/nose2dep/blob/135a529...
3,python,sorting multiple arrays based on another array...,https://github.com/Zitrax/nose-dep/blob/fd29c9...
4,python,sorting multiple arrays based on another array...,https://github.com/bcbio/bcbio-nextgen/blob/6a...
...,...,...,...
29695,python,convert html to pdf,https://github.com/acutesoftware/AIKIF/blob/fc...
29696,python,convert html to pdf,https://github.com/PlaidWeb/Publ/blob/ce789363...
29697,python,convert html to pdf,https://github.com/Phyks/libbmc/blob/9ef1a29d2...
29698,python,convert html to pdf,https://github.com/DS-100/nb-to-gradescope/blo...


In [126]:
# Export the model predictions to a csv file
test_es_results_df.to_csv("es_model_predictions.csv")

**Simple Model (query_string search):**
- Using 200 top results from ElasticSearch engine
```
% of URLs in predictions that exist in the annotation dataset:
        python: 31.24%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
        python: 32.42%
NDCG:
        python: 0.355
NDCG (full ranking):
        python: 0.203
```

**Improved Model (field boosting + analyzer)
- Using 300 top results from ElasticSearch engine
```
% of URLs in predictions that exist in the annotation dataset:
        python: 31.75%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
        python: 32.76%
NDCG:
        python: 0.373
NDCG (full ranking):
        python: 0.196
```

**Improved Model (field boosting + analyzer with 'should' statement removed)**
- Using 300 top results from ElasticSearch engine
```
% of URLs in predictions that exist in the annotation dataset:
        python: 31.75%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
        python: 32.76%
NDCG:
        python: 0.367
NDCG (full ranking):
        python: 0.202
```
