In [44]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [None]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [None]:
import os
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel
from torch.optim import AdamW
from tqdm import tqdm
from datasets import load_dataset
import faiss
import numpy as np
from peft import LoraConfig, get_peft_model

In [None]:
dataset = load_dataset("code_search_net", "python")
train_data = dataset["train"]
valid_data = dataset["validation"]
test_data  = dataset["test"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

code_search_net.py:   0%|          | 0.00/8.44k [00:00<?, ?B/s]

The repository for code_search_net contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/code_search_net.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


python.zip:   0%|          | 0.00/941M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/412178 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22176 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/23107 [00:00<?, ? examples/s]

In [None]:
def extract_pairs(split):
    pairs = []
    for ex in split:
        doc = ex["func_documentation_string"]
        code = ex["func_code_string"]
        if doc and code:
            pairs.append((doc, code))
    return pairs

In [None]:
train_pairs = extract_pairs(train_data)
valid_pairs = extract_pairs(valid_data)
test_pairs  = extract_pairs(test_data)

In [None]:
class CodeSearchDataset(Dataset):
    def __init__(self, pairs, tokenizer, max_length=256):
        self.pairs      = pairs
        self.tokenizer  = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        q, c = self.pairs[idx]
        q_enc = self.tokenizer(
            q, return_tensors="pt", truncation=True,
            padding="max_length", max_length=self.max_length
        )
        c_enc = self.tokenizer(
            c, return_tensors="pt", truncation=True,
            padding="max_length", max_length=self.max_length
        )
        return q_enc, c_enc

In [None]:
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
base_model = RobertaModel.from_pretrained("microsoft/codebert-base")  #  [oai_citation:4‡Medium](https://jaotheboss.medium.com/peft-with-bert-8763d8b8a4ca?utm_source=chatgpt.com)

# Configure LoRA to adapt only the query and value projection layers
lora_config = LoraConfig(
    r=8,                            # LoRA rank
    lora_alpha=16,                  # LoRA scaling
    target_modules=["query","value"],
    lora_dropout=0.1,               # Dropout on LoRA adapters
    bias="none"
)
model = get_peft_model(base_model, lora_config)  #  [oai_citation:5‡Hugging Face](https://huggingface.co/docs/peft/main/en/developer_guides/lora?utm_source=chatgpt.com)
model.cuda()
model.train()
model.print_trainable_parameters()

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

trainable params: 294,912 || all params: 124,940,544 || trainable%: 0.2360


In [None]:
def info_nce_loss(q_emb, c_emb, temperature=0.07):
    """
    InfoNCE: maximizes similarity of true pairs vs in-batch negatives.  [oai_citation:6‡Hugging Face](https://huggingface.co/docs/peft/en/package_reference/lora?utm_source=chatgpt.com)
    """
    q_norm = F.normalize(q_emb, p=2, dim=1)
    c_norm = F.normalize(c_emb, p=2, dim=1)
    logits = torch.matmul(q_norm, c_norm.T) / temperature
    labels = torch.arange(q_emb.size(0)).cuda()
    return F.cross_entropy(logits, labels)

In [None]:
batch_size   = 64
train_loader = DataLoader(CodeSearchDataset(train_pairs, tokenizer),
                          batch_size=batch_size, shuffle=True,  num_workers=4)
valid_loader = DataLoader(CodeSearchDataset(valid_pairs, tokenizer),
                          batch_size=batch_size, shuffle=False, num_workers=4)

optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
from tqdm.auto import tqdm

# … (rest of your imports and setup) …

for epoch in range(2):
    model.train()
    total_train_loss = 0.0

    # Wrap DataLoader with tqdm
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False)
    for q_batch, c_batch in progress_bar:
        # Move to GPU
        q_ids = q_batch["input_ids"].squeeze(1).cuda()
        q_mask = q_batch["attention_mask"].squeeze(1).cuda()
        c_ids = c_batch["input_ids"].squeeze(1).cuda()
        c_mask = c_batch["attention_mask"].squeeze(1).cuda()

        # Forward
        q_out = model(input_ids=q_ids, attention_mask=q_mask)
        c_out = model(input_ids=c_ids, attention_mask=c_mask)
        q_emb = q_out.last_hidden_state[:,0,:]
        c_emb = c_out.last_hidden_state[:,0,:]

        # Loss & backward
        loss = info_nce_loss(q_emb, c_emb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        # Update the tqdm bar with current loss
        progress_bar.set_postfix(train_loss=loss.item())

    avg_train_loss = total_train_loss / len(train_loader)

    # Validation step (unchanged) …
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for q_batch, c_batch in valid_loader:
            # … compute val loss …
            total_val_loss += info_nce_loss(
                model(input_ids=q_batch["input_ids"].squeeze(1).cuda(),
                      attention_mask=q_batch["attention_mask"].squeeze(1).cuda()
                     ).last_hidden_state[:,0,:],
                model(input_ids=c_batch["input_ids"].squeeze(1).cuda(),
                      attention_mask=c_batch["attention_mask"].squeeze(1).cuda()
                     ).last_hidden_state[:,0,:]
            ).item()

    avg_val_loss = total_val_loss / len(valid_loader)
    print(f"Epoch {epoch+1} — Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

Epoch 1:   0%|          | 0/6441 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7bab28ffeca0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1601, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7bab28ffeca0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 16

Epoch 1 — Train Loss: 0.0160 | Val Loss: 0.2174


Epoch 2:   0%|          | 0/6441 [00:00<?, ?it/s]

Epoch 2 — Train Loss: 0.0117 | Val Loss: 0.2140


In [45]:
# 9. Save the fine‑tuned LoRA model and tokenizer
# ------------------------------------------------
output_dir = "/content/drive/MyDrive/IR Project/checkpoint"
os.makedirs(output_dir, exist_ok=True)

# This will save:
# - the base model config + weights (frozen)
# - the LoRA adapter weights in adapters_config.json + adapter_model.bin
model.save_pretrained(output_dir)

# And save the tokenizer so you can reload it exactly
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}/")

Model and tokenizer saved to /content/drive/MyDrive/IR Project/checkpoint/


In [54]:
from transformers import RobertaTokenizer, RobertaModel
from peft import PeftModel

# Load base + adapters
base = RobertaModel.from_pretrained("microsoft/codebert-base")
lora_model = PeftModel.from_pretrained(base, "/content/drive/MyDrive/IR Project/checkpoint")

tokenizer = RobertaTokenizer.from_pretrained("/content/drive/MyDrive/IR Project/checkpoint")
lora_model.eval()

PeftModel(
  (base_model): LoraModel(
    (model): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSdpaSelfAttention(
                (query): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=8, bias=False)
                  )


In [55]:
batch_size = 64
lora_model.cuda()
lora_model.eval()
all_codes = [code for (_,code) in valid_pairs]
code_embs  = []
with torch.no_grad():
    for i in range(0, len(all_codes), batch_size):
        batch_codes = all_codes[i:i+batch_size]
        enc = tokenizer(batch_codes, return_tensors="pt",
                        truncation=True, padding="max_length", max_length=256).to("cuda")
        out = lora_model(**enc)
        emb = F.normalize(out.last_hidden_state[:,0,:], p=2, dim=1)
        code_embs.append(emb.cpu().numpy())

code_matrix = np.vstack(code_embs)
index = faiss.IndexFlatIP(code_matrix.shape[1])  # inner-product ≃ cosine if vectors are normalized  [oai_citation:8‡GitHub](https://github.com/huggingface/diffusers/issues/2719?utm_source=chatgpt.com)
index.add(code_matrix)

In [None]:
def retrieve(query, k=5):
    enc = tokenizer([query], return_tensors="pt",
                    truncation=True, padding="max_length", max_length=256).to("cuda")
    with torch.no_grad():
        out = lora_model(**enc)
        qv = F.normalize(out.last_hidden_state[:,0,:], p=2, dim=1).cpu().numpy()
    _, I = index.search(qv, k)
    return [all_codes[i] for i in I[0]]

In [None]:
def evaluate(pairs, K=5):
    mrr, recall = 0.0, 0.0
    for q, c_true in pairs:
        results = retrieve(q, k=K)
        rank = next((i+1 for i,c in enumerate(results) if c==c_true), None)
        if rank:
            mrr += 1.0/rank
            recall+= 1.0
    n = len(pairs)
    return mrr/n, recall/n

mrr5, rec5 = evaluate(valid_pairs[:1000], K=1)
print(f"MRR@1: {mrr5:.4f}, Recall@1: {rec5:.4f}")
mrr5, rec5 = evaluate(valid_pairs[:1000], K=3)
print(f"MRR@3: {mrr5:.4f}, Recall@3: {rec5:.4f}")
mrr5, rec5 = evaluate(valid_pairs[:1000], K=5)
print(f"MRR@5: {mrr5:.4f}, Recall@5: {rec5:.4f}")
mrr5, rec5 = evaluate(valid_pairs[:1000], K=10)
print(f"MRR@10: {mrr5:.4f}, Recall@10: {rec5:.4f}")

MRR@1: 0.9590, Recall@1: 0.9590
MRR@3: 0.9702, Recall@3: 0.9840
MRR@5: 0.9709, Recall@5: 0.9870
MRR@10: 0.9718, Recall@10: 0.9930


In [53]:
import random
N = 5
sampled = random.sample(train_pairs, N)
for i, (query, true_code) in enumerate(sampled, 1):
    print(f"\n=== Sample {i} ===")
    print(f"Query:\n{query}\n")

    # Retrieve top-5 snippets
    results = retrieve(query, k=5)

    # Print retrieved snippets
    print("Top 5 Retrieved Code Snippets:")
    for rank, code_snippet in enumerate(results, start=1):
        print(f"{rank}. {code_snippet.splitlines()[0]} ...")



=== Sample 1 ===
Query:
Returns a function that returns both value and gradient. Suitable for use
    in scipy.optimize

Top 5 Retrieved Code Snippets:
1. def main(): ...
2. def plot_errors_single(rad, crb, errors, labels=['trackpy', 'peri']): ...
3. def divide(ol,interval): ...
4. def broken_seqs(ol,break_points): ...
5. def is_lop(ch,block_op_pairs_dict=get_block_op_pairs('{}[]()')): ...

=== Sample 2 ===
Query:
Unpacks a JWT into its parts and base64 decodes the parts
        individually

        :param token: The JWT
        :param kwargs: A possible empty set of claims to verify the header
            against.

Top 5 Retrieved Code Snippets:
1. def main(): ...
2. def plot_errors_single(rad, crb, errors, labels=['trackpy', 'peri']): ...
3. def divide(ol,interval): ...
4. def is_lop(ch,block_op_pairs_dict=get_block_op_pairs('{}[]()')): ...
5. def broken_seqs(ol,break_points): ...

=== Sample 3 ===
Query:
_sum(data [, start]) -> value
    Return a high-precision sum of the given nu