In [1]:
from datasets import load_dataset
from FlagEmbedding import FlagModel

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import numpy as np
import re
import time
import gzip
import pandas as pd
import heapq
import json

In [2]:
# if you haven't downloaded stopwords or wordnet before, uncomment the lines below
# nltk.download('stopwords')
# nltk.download('wordnet')

## Step 0: Read in the data

In [3]:
python_data = load_dataset("code_search_net", "python")
python_data

Downloading builder script:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/18.5k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/941M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 412178
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 22176
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 23107
    })
})

In [4]:
# Use the training data for Python code only
data_train = python_data['train']
data_train_snippet = data_train['whole_func_string']

## Step 1: Preprocess the Data

In [5]:
def preprocess(text):
     # Convert to lowercase
    text = text.lower()
    
    # Tokenizes text
    tokens = re.split(r'[^a-zA-Z0-9]+', text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return ' '.join(tokens)

In [6]:
start = time.time()
documents = [preprocess(raw_text) for raw_text in data_train_snippet]
end = time.time()
print("Time taken to preprocess data:", round(end - start, 2), "seconds")

Time taken to preprocess data: 134.78 seconds


## Step 2: Retrieve Off-the-Shelf Embeddings

In [7]:
model = FlagModel('BAAI/bge-small-en-v1.5', 
                  query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
                  use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation

Downloading (…)okenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

----------using 4*GPUs----------


## Step 3a: Generate Embeddings for Documents
- **Note:** Only if you want to generate new embeddings. If you don't, skip to step 3b

In [27]:
start = time.time()

# document_embeddings = [compute_document_embedding(doc, embeddings) for doc in documents]
document_embeddings = model.encode(documents)

end = time.time()
print("Time taken to generate document embeddings:", round(end - start, 2), "seconds")

Inference Embeddings:   0%|          | 0/202 [00:00<?, ?it/s]2023-10-30 16:11:12.678728: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Inference Embeddings: 100%|██████████| 202/202 [11:09<00:00,  3.31s/it]


Time taken to generate document embeddings: 669.49 seconds


In [77]:
# convert embeddings to dictionary, where key is the index of the code snippet and value is the embeddings
embeddings_dict = dict(zip(range(len(document_embeddings)), document_embeddings.tolist()))

In [80]:
# export embeddings_dict as a json file
embeddings_json = json.dumps(embeddings_dict)
with open('embeddings.json', 'w') as f:
    f.write(embeddings_json)

## Step 3b: Load in the Document Embeddings
- **Note:** If you ran step 3a already, skip this step and proceed to step 4

In [8]:
start = time.time()

# read the json file from embeddings.json
with open('embeddings.json', 'r') as json_file:
    embeddings_dict = json.load(json_file)

# convert each list value to a numpy array
document_embeddings = []
for key, value in embeddings_dict.items():
    document_embeddings.append(np.array(value))
end = time.time()

print("Time taken to load in the document embeddings:", end - start, "seconds")

Time taken to load in the document embeddings: 34.01527118682861 seconds


## Step 4: Implement the Search Function

In [9]:
def cosine_similarity(vec1, vec2):
    """
    Computes the cosine similarity between two vectors
    """
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [10]:
def search(query, document_embeddings, data, k=10):
    """
    Computes the embedding of the query and retrieves the k most similar documents
    """
    # gets the embedding of the query
    query_embedding = model.encode_queries(query)
    
    # get the similarities between the query embedding and the document embeddings
    similarities = [cosine_similarity(query_embedding, doc_emb) for doc_emb in document_embeddings]
#     similarities = [query_embedding @ doc_emb.T for doc_emb in document_embeddings]
    
    # ranks similarities by most similar to query embedding
    index_similarity_pair_ranked =  heapq.nlargest(k, enumerate(similarities), key=lambda x: x[1])
    
    
    ranked_docs = []
    for ind, sim in index_similarity_pair_ranked:
        # don't include results that have similarity score < 0.7 unless it's the top result
        if sim < 0.7 and ranked_docs: break
        # grab the function name, the github url, and the similarity score
        ranked_docs.append((data['func_name'][ind], data['func_code_url'][ind], sim))
    
    return ranked_docs

## Step 5: Testing the Search Engine

In [15]:
start = time.time()
query = "unique elements"
search_results = search(query, document_embeddings, data_train, 25)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

2023-10-31 17:32:29.754588: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Search time: 20.542781352996826 seconds


[('unique_by_index',
  'https://github.com/matthew-brett/delocate/blob/ed48de15fce31c3f52f1a9f32cae1b02fc55aa60/delocate/tools.py#L72-L89',
  0.7713445625225607),
 ('unique_element',
  'https://github.com/konstantinstadler/pymrio/blob/d764aa0dd2150200e867a9713a98ddae203e12d4/pymrio/tools/ioutil.py#L324-L333',
  0.7660439202589988),
 ('unique_everseen',
  'https://github.com/secynic/ipwhois/blob/b5d634d36b0b942d538d38d77b3bdcd815f155a0/ipwhois/utils.py#L424-L457',
  0.7613811770842612),
 ('unique',
  'https://github.com/dedupeio/dedupe/blob/9f7c9f84473a4bcacf0f2b11152d8ed3eb35d48b/dedupe/labeler.py#L383-L390',
  0.7599133009642218),
 ('unique_everseen',
  'https://github.com/bsolomon1124/pyfinance/blob/c95925209a809b4e648e79cbeaf7711d8e5ff1a6/pyfinance/utils.py#L768-L776',
  0.7598204538419598),
 ('unique_everseen',
  'https://github.com/althonos/pronto/blob/a768adcba19fb34f26f67cde4a03d317f932c274/pronto/utils.py#L30-L38',
  0.7511108412946457),
 ('unique_justseen',
  'https://github.c

In [20]:
start = time.time()
query = "create cookie"
search_results = search(query, document_embeddings, data_train, 25)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

Search time: 17.220950841903687 seconds


[('Latch._make_cookie',
  'https://github.com/dw/mitogen/blob/a7fdb55e1300a7e0a5e404b09eb730cf9a525da7/mitogen/core.py#L2158-L2165',
  0.802719833574395),
 ('make_cookie_content',
  'https://github.com/IdentityPython/oidcendpoint/blob/6c1d729d51bfb6332816117fe476073df7a1d823/src/oidcendpoint/cookie.py#L132-L181',
  0.779596272108196),
 ('create_cookie',
  'https://github.com/pgjones/quart/blob/7cb2d3bd98e8746025764f2b933abc12041fa175/quart/utils.py#L29-L59',
  0.7765420503446522),
 ('create_cookie',
  'https://github.com/quantmind/pulsar/blob/fee44e871954aa6ca36d00bb5a3739abfdb89b26/pulsar/utils/httpurl.py#L396-L425',
  0.7672891740365899),
 ('create_cookie',
  'https://github.com/pypa/pipenv/blob/cae8d76c210b9777e90aab76e9c4b0e53bb19cde/pipenv/vendor/requests/cookies.py#L441-L474',
  0.7646073618570387),
 ('CookieDealer.append_cookie',
  'https://github.com/IdentityPython/oidcendpoint/blob/6c1d729d51bfb6332816117fe476073df7a1d823/src/oidcendpoint/cookie.py#L369-L404',
  0.764575518113

In [21]:
start = time.time()
query = "buffered file reader read text"
search_results = search(query, document_embeddings, data_train, 25)
end = time.time()
print("Search time:", end - start, "seconds")
search_results

Search time: 16.98989725112915 seconds


[('_read',
  'https://github.com/pandas-dev/pandas/blob/9feb3ad92cc0397a04b665803a49299ee7aa1037/pandas/io/parsers.py#L405-L452',
  0.7475948457056484),
 ('smartread',
  'https://github.com/MacHu-GWU/dataIO-project/blob/7e1cc192b5e53426eed6dbd742918619b8fd60ab/dataIO/textfile.py#L82-L88',
  0.7450667959125123),
 ('BaseIOStream._read_from_buffer',
  'https://github.com/tornadoweb/tornado/blob/b8b481770bcdb333a69afde5cce7eaa449128326/tornado/iostream.py#L887-L895',
  0.7382655057853018),
 ('from_buffer',
  'https://github.com/ahupp/python-magic/blob/c5b386b08bfbc01330e2ba836d97749d242429dc/magic.py#L146-L156',
  0.7349693599202292),
 ('BufferWorkSpace.yieldable',
  'https://github.com/RobinNil/file_read_backwards/blob/e56443095b58aae309fbc43a0943eba867dc8500/file_read_backwards/buffer_work_space.py#L42-L55',
  0.7320194985576927),
 ('AsyncioTelnetServer._read',
  'https://github.com/GNS3/gns3-server/blob/a221678448fb5d24e977ef562f81d56aacc89ab1/gns3server/utils/asyncio/telnet_server.py#L

## Step 6: Evaluating Search Engine

In [11]:
test_queries = pd.read_csv('annotation_store.csv')
test_queries.head()

Unnamed: 0,Language,Query,GitHubUrl,Relevance,Notes
0,Go,unique elements,https://github.com/tylertreat/BoomFilters/blob...,0,
1,Go,read properties file,https://github.com/gobs/httpclient/blob/a93d46...,0,
2,Go,heatmap from 3d coordinates,https://github.com/twpayne/go-geom/blob/e21b3a...,1,
3,Go,create cookie,https://github.com/volatiletech/abcweb/blob/9e...,2,
4,Go,readonly array,https://github.com/ericchiang/k8s/blob/68fb216...,0,


In [12]:
python_queries = test_queries[test_queries['Language'] == 'Python']
python_queries.head()

Unnamed: 0,Language,Query,GitHubUrl,Relevance,Notes
1612,Python,sorting multiple arrays based on another array...,https://github.com/JoseAntFer/pyny3d/blob/fb81...,2,
1613,Python,priority queue,https://github.com/keon/algorithms/blob/4d6569...,3,
1614,Python,custom http error response,https://github.com/mapbox/mapbox-sdk-py/blob/7...,3,
1615,Python,randomly extract x items from a list,https://github.com/CitrineInformatics/python-c...,0,
1616,Python,extract data from html content,https://github.com/dragnet-org/dragnet/blob/53...,1,


In [13]:
def run_test_query_python(query_list, document_embeddings, k=5):
    """
    Takes in a list of Python queries. Runs the search engine on those queries and returns
    the top k results for that query
    """
    test_results = []
    i = 1
    
    total_start = time.time()
    for query in query_list:
        start = time.time()
        print("Running query", i, ":", query)
        
        # get top k results of query in our search engine
        query_search = search(query, document_embeddings, data_train, k)
        
        # for each result of the query, add a row to test_results with
        # the language (python), the query, and the GitHub Url to the result
        for result in query_search:
            query_results = ["python", query, result[1]]
            test_results.append(query_results)
        
        end = time.time()
        print("Time taken for query", i, ":", end - start)
        i += 1
        
    total_end = time.time()
    print("Time taken for all queries", i, ":", total_end - total_start)
    return test_results

In [14]:
# list of the 99 test queries
query_list = python_queries['Query'].unique().tolist()

In [102]:
# running all test queries on search engine
test_results = run_test_query_python(query_list, document_embeddings, 25)

Running query 1 : sorting multiple arrays based on another arrays sorted order
Time taken for query 1 : 26.899136304855347
Running query 2 : priority queue
Time taken for query 2 : 26.554736852645874
Running query 3 : custom http error response
Time taken for query 3 : 26.579458475112915
Running query 4 : randomly extract x items from a list
Time taken for query 4 : 26.75666069984436
Running query 5 : extract data from html content
Time taken for query 5 : 26.176507234573364
Running query 6 : html encode string
Time taken for query 6 : 29.602113723754883
Running query 7 : how to determine a string is a valid word
Time taken for query 7 : 31.568598747253418
Running query 8 : convert json to csv
Time taken for query 8 : 28.396673917770386
Running query 9 : get executable path
Time taken for query 9 : 29.96829319000244
Running query 10 : how to empty array
Time taken for query 10 : 29.222440004348755
Running query 11 : parse query string in url
Time taken for query 11 : 29.39021325111389


Time taken for query 94 : 26.931403636932373
Running query 95 : concatenate several file remove header lines
Time taken for query 95 : 26.266358137130737
Running query 96 : buffered file reader read text
Time taken for query 96 : 27.063509225845337
Running query 97 : how to get database table name
Time taken for query 97 : 26.61334204673767
Running query 98 : underline text in label widget
Time taken for query 98 : 26.77951979637146
Running query 99 : convert html to pdf
Time taken for query 99 : 26.344631671905518
Time taken for all queries 100 : 2654.525272846222


In [103]:
# store the test results as a Pandas DataFrame
test_results_df = pd.DataFrame(test_results)
test_results_df = test_results_df.rename(columns={0: "language", 1: "query", 2: "url"})
test_results_df

Unnamed: 0,language,query,url
0,python,sorting multiple arrays based on another array...,https://github.com/mlperf/training/blob/1c6ae7...
1,python,sorting multiple arrays based on another array...,https://github.com/Spinmob/spinmob/blob/f037f5...
2,python,sorting multiple arrays based on another array...,https://github.com/yamins81/tabular/blob/1caf0...
3,python,sorting multiple arrays based on another array...,https://github.com/OLC-Bioinformatics/sipprver...
4,python,sorting multiple arrays based on another array...,https://github.com/datastore/datastore/blob/7c...
...,...,...,...
2404,python,convert html to pdf,https://github.com/fhamborg/news-please/blob/7...
2405,python,convert html to pdf,https://github.com/BlendedSiteGenerator/Blende...
2406,python,convert html to pdf,https://github.com/henrysher/kotocore/blob/c52...
2407,python,convert html to pdf,https://github.com/apragacz/django-rest-regist...


In [104]:
# Export the model predictions to a csv file
test_results_df.to_csv("model_predictions.csv")

**Current best model**: 

- Using hybrid of k=25 search results and/or search results with > 0.7 similarity score

```
% of URLs in predictions that exist in the annotation dataset:
        python: 20.83%
% of URLs in predictions that exist in the annotation dataset (avg relevance > 0):
        python: 22.64%
NDCG:
        python: 0.346
NDCG (full ranking):
        python: 0.226
```