In [1]:
!pip install tree-sitter
!pip install tree-sitter-language-pack
!pip install tree-sitter-javascript
!pip install tree-sitter-typescript
!pip install hnswlib

Collecting tree-sitter
  Downloading tree_sitter-0.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.8 kB)
Downloading tree_sitter-0.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (575 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m575.6/575.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tree-sitter
Successfully installed tree-sitter-0.24.0
Collecting tree-sitter-language-pack
  Downloading tree_sitter_language_pack-0.7.2-cp39-abi3-manylinux2014_x86_64.whl.metadata (21 kB)
Collecting tree-sitter-c-sharp>=0.23.1 (from tree-sitter-language-pack)
  Downloading tree_sitter_c_sharp-0.23.1-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting tree-sitter-embedded-template>=0.23.2 (from tree-sitter-language-pack)
  Downloading tree_sitter_embedded_template-0.23.2-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_1

# 1. Parse Code

In [2]:
from tree_sitter import Language, Parser
import tree_sitter_typescript
import tree_sitter_javascript

JS_LANGUAGE = Language(tree_sitter_javascript.language())
TS_LANGUAGE = Language(tree_sitter_typescript.language_typescript())

parser = Parser(JS_LANGUAGE)

code = """
import('./bootstrap')
export {}
"""

xtree = parser.parse(bytes(code, "utf8"))

# 2. Explore parsed tree structure

In [3]:
root = xtree.root_node

for child in root.children:
    print(child.type, " -> ", code[child.start_byte:child.end_byte])

expression_statement  ->  import('./bootstrap')
export_statement  ->  export {}


# 3. Chunking Code with Tree-Sitter

In [4]:
terminal = [
    'import_statement',
    'lexical_declaration',
    'expression_statement',
    'export_statement'
]

def extract_subtree(subtree_root):
  queue = [subtree_root]
  subtree_nodes = []
  ignore_types = ["\n"]
  while queue:
    current_node = queue.pop(0)
    for child in current_node.children:
      child_type = str(child.type)
      if child_type not in ignore_types:
        queue.append(child)
      if child_type in terminal:
        subtree_nodes.append(child)
  return subtree_nodes

def extract_subtrees(tree):
  root = tree.root_node
  all_subtrees = []
  queue = [root]
  while queue:
    current_node = queue.pop(0)
    if str(current_node.type) in terminal:
      all_subtrees.append(current_node)
    else:
      subtree = extract_subtree(current_node)
      all_subtrees.extend(subtree)
      children = [x for x in current_node.children]
      queue.extend(children)
  return all_subtrees



In [5]:
subtrees = extract_subtrees(xtree)

print(subtrees)

[<Node type=expression_statement, start_point=(1, 0), end_point=(1, 21)>, <Node type=export_statement, start_point=(2, 0), end_point=(2, 9)>, <Node type=expression_statement, start_point=(1, 0), end_point=(1, 21)>, <Node type=export_statement, start_point=(2, 0), end_point=(2, 9)>]


# 4. Convert AST Nodes to Text for Embeddings

In [6]:
src_texts = []
for subtree in subtrees:
  if code[subtree.start_byte:subtree.end_byte] not in src_texts:
    src_texts.append(code[subtree.start_byte:subtree.end_byte])

In [7]:
import torch
from transformers import AutoModel, AutoTokenizer
### from optimum.bettertransformer import BetterTransformer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_model = "Salesforce/codet5p-110m-embedding"

tokenizer = AutoTokenizer.from_pretrained(embedding_model, trust_remote_code=True)
model = AutoModel.from_pretrained(embedding_model, trust_remote_code=True).to(device)
### model = AutoModel.from_pretrained(embedding_model, trust_remote_code=True, torch_dtype=torch.float16).to(device)
### model.config.model_type = 't5'
### model = model.to_bettertransformer()
### model.eval()

"""
from transformers import AutoModel, AutoTokenizer

checkpoint = "Salesforce/codet5p-110m-embedding"
device = "cuda"  # for GPU usage or "cpu" for CPU usage

tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(device)

inputs = tokenizer.encode("def print_hello_world():\tprint('Hello World!')", return_tensors="pt").to(device)
embedding = model(inputs)[0]
print(f'Dimension of the embedding: {embedding.size()[0]}, with norm={embedding.norm().item()}')
# Dimension of the embedding: 256, with norm=1.0
print(embedding)
"""

def get_embedding(texts, max_length=2048):
  ### inputs = tokenizer(texts, return_tensors="pt", max_length=max_length, padding='max_length', truncation=True).to(device)
  with torch.no_grad():
    inputs = tokenizer.encode(texts, return_tensors="pt").to(device)
    return model(inputs)[0]
  """
  with torch.no_grad():
    outputs = model(**inputs)
    return outputs.cpu().detach()
  """

embeddings = []
for src_text in src_texts:
  embedding = get_embedding(src_text)
  embeddings.append(embedding)

query_embedding = get_embedding("find code that import bootstrap")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/511k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

configuration_codet5p_embedding.py:   0%|          | 0.00/2.62k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Salesforce/codet5p-110m-embedding:
- configuration_codet5p_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_codet5p_embedding.py:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Salesforce/codet5p-110m-embedding:
- modeling_codet5p_embedding.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [27]:
import numpy as np

print(type(embeddings)) ## list of tensor
print(len(embeddings))
print(type(query_embedding)) ## tensor
# print(embeddings[0])

list_of_arrays = [emb.numpy() for emb in embeddings]
src_emb = np.stack(list_of_arrays)


print(type(src_emb))
print(len(src_emb))
print(src_emb.shape[1])
print(src_emb.shape)
print(src_emb)

<class 'list'>
2
<class 'torch.Tensor'>
<class 'numpy.ndarray'>
2
256
(2, 256)
[[ 5.27880229e-02 -1.06922667e-02 -4.52732742e-02  3.71475518e-03
  -1.02984674e-01  2.16619577e-02  1.19901381e-01  2.39841342e-02
   3.98024209e-02  1.04995422e-01  7.34024793e-02 -3.46152298e-02
  -8.33136141e-02  3.51612903e-02  1.35052070e-01  6.12260140e-02
   7.01959133e-02 -1.03019506e-01  8.41637477e-02  9.37056988e-02
  -2.84229461e-02  9.95419770e-02  1.17883189e-02 -5.78139424e-02
   4.76059178e-03  1.13701344e-01 -9.41876322e-02  3.28374244e-02
  -4.99504767e-02 -1.28368109e-01 -3.73877361e-02  6.85024336e-02
  -6.91653714e-02 -4.68419902e-02 -1.72553826e-02  1.89101193e-02
   3.26877758e-02  1.13916071e-02  4.60650474e-02 -6.39562011e-02
  -7.77734676e-03 -1.25879452e-01 -3.11552305e-02  8.24935585e-02
  -2.33982448e-02  9.08889771e-02  1.21338390e-01 -8.07112902e-02
   5.38107194e-02 -4.30807732e-02 -3.12907957e-02  1.52375670e-02
   1.18480131e-01  2.55304463e-02 -8.69178697e-02  1.69561747e-

# 5. Storing and Retrieving Code Chunks

In [26]:
import hnswlib
import numpy as np

dim = src_emb.shape[1]
print(f"Dimension: {dim}")

num_elements = len(src_emb)
print(f"Number of elements: {num_elements}")

index = hnswlib.Index(space='cosine', dim=dim)
index.init_index(max_elements=num_elements, ef_construction=200, M=16)
index.add_items(src_emb, np.arange(num_elements))

# emb is the embedding of our query code chunk
# k=5 returns 5 most similar code chunks
labels, distances = index.knn_query(query_embedding.numpy(), k=1)

print(f"Nearest neighbors' labels: {labels}")
print(f"Distance: {distances}")

print(f"Retrieved documents: {[src_texts[i] for i in labels[0]]}")

Dimension: 256
Number of elements: 2
Nearest neighbors' labels: [[0]]
Distance: [[0.4322331]]
Retrieved documents: ["import('./bootstrap')"]
