In [1]:
!pip install datasets



In [2]:
import pandas as pd
import re
import nltk
from tqdm import tqdm
from datasets import load_dataset


# Make sure the 'punkt_tab' tokenizer data is available locally: the lookup
# raises LookupError when the resource is missing, in which case it is downloaded.
# NOTE(review): presumably this is the resource nltk.sent_tokenize (used by
# split_passages) relies on — confirm against the installed NLTK version.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

def clean_text(text):
    """Normalize raw passage text to single-spaced ASCII.

    Args:
        text: The input string, or None.

    Returns:
        An empty string when `text` is None. Otherwise the text with every run
        of non-ASCII characters replaced by one space, every run of whitespace
        collapsed to one space, and leading/trailing whitespace removed.
    """
    if text is None:
        return ""
    # Order matters: dropping non-ASCII runs first can create adjacent spaces
    # that the whitespace collapse then merges.
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', text)
    single_spaced = re.sub(r'\s+', ' ', ascii_only)
    return single_spaced.strip()

def split_passages(text, min_tokens=80, max_tokens=300):
    """Greedily pack consecutive sentences of `text` into passages.

    Sentences (from ``nltk.sent_tokenize``) are appended to the current passage
    while its whitespace-token count stays at or below `max_tokens`. When the
    next sentence would overflow, the current passage is emitted only if it has
    at least `min_tokens` tokens (shorter ones are discarded) and a new passage
    starts with that sentence. A single sentence longer than `max_tokens`
    therefore still forms a passage on its own.

    Args:
        text: The text to split.
        min_tokens: Minimum whitespace-token count for a passage to be kept.
        max_tokens: Token budget used when deciding whether to add a sentence.

    Returns:
        A list of passage strings, in document order.
    """
    passages = []
    current_sentences = []
    # Running token count of the current passage; avoids re-splitting the whole
    # accumulated string for every sentence (which was quadratic).
    current_tokens = 0

    for sentence in nltk.sent_tokenize(text):
        sentence_tokens = len(sentence.split())

        if current_tokens + sentence_tokens <= max_tokens:
            current_sentences.append(sentence)
            current_tokens += sentence_tokens
            continue

        # Overflow: flush the current passage if it is long enough, then start
        # a new one with the sentence that did not fit.
        if current_tokens >= min_tokens:
            passages.append(" ".join(current_sentences).strip())
        current_sentences = [sentence]
        current_tokens = sentence_tokens

    if current_tokens >= min_tokens:
        passages.append(" ".join(current_sentences).strip())

    return passages

print("Loading MS MARCO dataset...")
dataset = load_dataset("ms_marco", "v2.1", split="train")

print("Processing documents...")
# Parallel lists: query_list[i] is the query paired with passage_list[i].
query_list = []
passage_list = []

for record in tqdm(dataset, desc="Building (query, passage) pairs"):
    query_text = record.get('query', '')
    passage_info = record.get('passages', {})
    selected_flags = passage_info.get('is_selected', [])
    candidate_texts = passage_info.get('passage_text', [])

    for flag, raw_text in zip(selected_flags, candidate_texts):
        # Only passages flagged as selected for this query are kept.
        if flag != 1:
            continue

        chunks = split_passages(clean_text(raw_text))

        # One (query, passage) row per chunk produced from the selected passage.
        query_list.extend([query_text] * len(chunks))
        passage_list.extend(chunks)

print(f"Total (query, passage) pairs extracted: {len(query_list)}")

df = pd.DataFrame({"query": query_list, "passage": passage_list})

# Persist the pairs so the training cell can reload them without re-processing.
output_path = "cleaned_query_passage_pairs.parquet"
df.to_parquet(output_path)

print(f"Saved cleaned (query, passage) pairs to {output_path}")



Loading MS MARCO dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Processing documents...


Building (query, passage) pairs: 100%|██████████| 808731/808731 [03:26<00:00, 3921.01it/s]


Total (query, passage) pairs extracted: 99394
Saved cleaned (query, passage) pairs to cleaned_query_passage_pairs.parquet


In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler


# --- Training configuration -------------------------------------------------
model_name = "distilbert-base-uncased"
batch_size = 64
epochs = 3
lr = 2e-5
max_length = 256
seed = 42
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed PyTorch's RNG so the shuffle order and other torch randomness are
# repeatable from one run of the notebook to the next.
torch.manual_seed(seed)

# torch.cuda.amp.GradScaler() is deprecated in favour of torch.amp.GradScaler.
# Loss scaling is only enabled when training on CUDA; when disabled the scaler's
# scale/step/update calls simply pass through to the optimizer.
scaler = torch.amp.GradScaler('cuda', enabled=device.type == 'cuda')


# --- Training data: the pairs written by the preprocessing cell ---------------
df = pd.read_parquet("cleaned_query_passage_pairs.parquet")
queries = df['query'].tolist()
passages = df['passage'].tolist()


class QueryPassageDataset(Dataset):
    """Map-style dataset over index-aligned (query, passage) text pairs.

    Texts are tokenized lazily in `__getitem__`; both the query and the passage
    are truncated and padded to `max_length` tokens.

    Args:
        queries: Sequence of query strings.
        passages: Sequence of passage strings; passages[i] pairs with queries[i].
        tokenizer: Callable tokenizer returning 'input_ids' and 'attention_mask'
            tensors with a leading batch dimension of 1.
        max_length: Fixed token length used for truncation and padding.
    """

    def __init__(self, queries, passages, tokenizer, max_length=256):
        self.queries, self.passages = queries, passages
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # The dataset length follows the query list.
        return len(self.queries)

    def _tokenize(self, text):
        """Tokenize one string to a fixed-length, batch-of-one encoding."""
        return self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the token ids and attention masks for pair `idx`."""
        query_enc = self._tokenize(self.queries[idx])
        passage_enc = self._tokenize(self.passages[idx])

        # squeeze(0) drops the batch-of-one dimension so the DataLoader can
        # stack samples into (batch, max_length) tensors.
        sample = {}
        sample['query_input_ids'] = query_enc['input_ids'].squeeze(0)
        sample['query_attention_mask'] = query_enc['attention_mask'].squeeze(0)
        sample['passage_input_ids'] = passage_enc['input_ids'].squeeze(0)
        sample['passage_attention_mask'] = passage_enc['attention_mask'].squeeze(0)
        return sample


# One tokenizer, loaded for the configured checkpoint, serves queries and passages.
tokenizer = AutoTokenizer.from_pretrained(model_name)
train_dataset = QueryPassageDataset(queries, passages, tokenizer, max_length)

# Important: drop_last keeps every batch at full size for clean in-batch negatives.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)


class BiEncoder(nn.Module):
    """Bi-encoder that embeds queries and passages with one shared encoder.

    Args:
        model_name: Checkpoint name or path passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)

    def encode(self, input_ids, attention_mask):
        """Return one embedding per sequence by averaging the last hidden states.

        NOTE(review): the average runs over every position along dim 1, padded
        positions included — `attention_mask` is only forwarded to the encoder.
        `embed_texts` pools the same way at inference time, so if this is ever
        switched to mask-aware pooling both places must change together.
        """
        hidden_states = self.encoder(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        return hidden_states.mean(dim=1)  # Mean pooling

    def forward(self, query_input_ids, query_attention_mask, passage_input_ids, passage_attention_mask):
        """Encode a batch of queries and a batch of passages.

        Returns:
            A `(query_emb, passage_emb)` tuple of pooled embeddings.
        """
        return (
            self.encode(query_input_ids, query_attention_mask),
            self.encode(passage_input_ids, passage_attention_mask),
        )


model = BiEncoder(model_name).to(device)
model = torch.compile(model)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()

# Mixed precision is only used on CUDA; on any other device autocast is a no-op.
use_amp = device.type == 'cuda'

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        # The batch keys match the parameter names of BiEncoder.forward, so the
        # device-resident tensors can be passed straight through as keywords.
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()

        # torch.cuda.amp.autocast() is deprecated; torch.autocast is the
        # device-agnostic replacement.
        with torch.autocast(device_type=device.type, enabled=use_amp):
            query_emb, passage_emb = model(**inputs)

            # In-batch negatives: entry (i, j) scores query i against passage j,
            # and the matching passage for query i sits on the diagonal.
            similarity_matrix = torch.matmul(query_emb, passage_emb.T)
            labels = torch.arange(similarity_matrix.size(0), device=device)

            loss = loss_fn(similarity_matrix, labels)

        # Scaled backward/step/update sequence of the gradient scaler.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}: Average Loss = {total_loss/len(train_loader):.4f}")


# Persist only the underlying transformer and the tokenizer; that is everything
# the evaluation cell reloads.
save_path = "trained_biencoder_distilbert_better"
model.encoder.save_pretrained(f"{save_path}/encoder")
tokenizer.save_pretrained(f"{save_path}/tokenizer")

print(f"Model saved to {save_path}")



  scaler = GradScaler()
  with autocast():
W0426 19:04:28.431000 35423 torch/_inductor/utils.py:1137] [0/0] Not enough SMs to use max_autotune_gemm mode
  with autocast():
Epoch 1: 100%|██████████| 1553/1553 [16:52<00:00,  1.53it/s]


Epoch 1: Average Loss = 0.0693


Epoch 2: 100%|██████████| 1553/1553 [16:06<00:00,  1.61it/s]


Epoch 2: Average Loss = 0.0260


Epoch 3: 100%|██████████| 1553/1553 [16:02<00:00,  1.61it/s]


Epoch 3: Average Loss = 0.0194
Model saved to trained_biencoder_distilbert_better


In [None]:
# No other cell imports NearestNeighbors or defines top_k, so this cell only ran
# because of leftover kernel state; define both here so Restart & Run All works.
from sklearn.neighbors import NearestNeighbors

# Number of neighbours retrieved per query (the metrics below report Recall@1 and Recall@5).
top_k = 5

# Reload the fine-tuned encoder and tokenizer from the directory written after training.
model = AutoModel.from_pretrained(f"{save_path}/encoder").to(device)
tokenizer = AutoTokenizer.from_pretrained(f"{save_path}/tokenizer")
model.eval()
queries = df['query'].tolist()
passages = df['passage'].tolist()


def embed_texts(texts, batch_size=64):
    """Embed a list of strings with the notebook-level `model` and `tokenizer`.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        A CPU tensor with one row per input text, obtained by averaging the
        encoder's last hidden states over the sequence dimension.

    NOTE(review): every batch is padded to `max_length` and the average includes
    padded positions. This mirrors `BiEncoder.encode` used for training, so the
    two must be changed together if mask-aware pooling is introduced.
    """
    pooled_batches = []
    batch_starts = range(0, len(texts), batch_size)

    with torch.no_grad():
        for start in tqdm(batch_starts, desc="Embedding"):
            encoded = tokenizer(
                texts[start:start + batch_size],
                padding="max_length",
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            ).to(device)
            hidden_states = model(**encoded).last_hidden_state
            # Move each pooled batch to the CPU right away so results do not
            # accumulate on the accelerator.
            pooled_batches.append(hidden_states.mean(dim=1).cpu())  # mean pooling

    return torch.cat(pooled_batches, dim=0)


print("Embedding all passages...")
passage_embeddings = embed_texts(passages)


print("Fitting NearestNeighbors index...")
index = NearestNeighbors(n_neighbors=top_k, metric="cosine")
index.fit(passage_embeddings)


print("Embedding all queries...")
query_embeddings = embed_texts(queries)


print("Retrieving top-k passages...")
distances, indices = index.kneighbors(query_embeddings, return_distance=True)


recall_at_1 = 0
recall_at_5 = 0
total = len(queries)

# Queries and passages are index-aligned: the relevant passage for query i is passage i.
# NOTE(review): these are the same pairs the encoder was trained on, so the
# numbers below are training-set retrieval scores, not held-out performance.
# NOTE(review): a hit is only counted when the exact row index is retrieved; if
# the same passage text occurs in several rows, retrieving a duplicate counts as a miss.
query_to_passage = {i: i for i in range(total)}

for query_idx in range(total):
    retrieved_indices = indices[query_idx]
    correct_idx = query_to_passage[query_idx]

    if correct_idx == retrieved_indices[0]:
        recall_at_1 += 1
    # Restrict the check to the five nearest neighbours so the reported number
    # really is Recall@5 even when top_k is larger than 5.
    if correct_idx in retrieved_indices[:5]:
        recall_at_5 += 1

recall_at_1 /= total
recall_at_5 /= total

print("\n--- Retrieval Results ---")
print(f"Recall@1: {recall_at_1:.4f}")
print(f"Recall@5: {recall_at_5:.4f}")

print("\nShowing first 5 queries and retrievals:\n")
for i in range(5):
    print(f"Query {i+1}: {queries[i]}\n")
    for rank, idx in enumerate(indices[i]):
        print(f"Top {rank+1} passage: {passages[idx][:300]}...")  # show first 300 chars
    print("-" * 80)


Embedding all passages...


Embedding: 100%|██████████| 1554/1554 [11:57<00:00,  2.17it/s]


Fitting NearestNeighbors index...
Embedding all queries...


Embedding: 100%|██████████| 1554/1554 [11:31<00:00,  2.25it/s]


Retrieving top-k passages...

--- Retrieval Results ---
Recall@1: 0.4653
Recall@5: 0.7419

Showing first 5 queries and retrievals:

Query 1: nyu tuition cost

Top 1 passage: tuition for new york university is $ 43746 for the 2014 2015 academic year this is 73 % more expensive than the national average private non profit four year college tuition of $ 25240he net out of pocket total cost you end up paying or financing though student loans is known as the net price the re...
Top 2 passage: Tuition, Room, Board and Fees. On average, tuition and fees are approximately $47,750 for two semesters; room and board cost approximately $17,580 per year. Most NYU students receive one or more forms of financial aid to support contributions made by them and their families.uition, Room, Board and F...
Top 3 passage: Annual Total Cost. The annual total list price cost to go to New York University was $64,022 for the 2014/2015 academic year. The cost is the same for all students regardless of New York r

In [15]:
# Colab-only cell: mount Google Drive and copy the trained artifacts onto it.
from google.colab import drive
drive.mount('/content/drive')
# Reload the encoder from the local directory the training cell saved it to.
load_path = "trained_biencoder_distilbert_better"
model = AutoModel.from_pretrained(f"{load_path}/encoder").to(device)
# NOTE(review): this rebinds the notebook-level `save_path` to the Drive location;
# re-running the evaluation cell afterwards will load the encoder from Drive
# instead of the local directory.
save_path = "/content/drive/MyDrive/trained_biencoder_distilbert_better"
model.save_pretrained(f"{save_path}/encoder")
# `tokenizer` is whatever the previously executed cell left in the kernel.
tokenizer.save_pretrained(f"{save_path}/tokenizer")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


('/content/drive/MyDrive/trained_biencoder_distilbert_better/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/trained_biencoder_distilbert_better/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/trained_biencoder_distilbert_better/tokenizer/vocab.txt',
 '/content/drive/MyDrive/trained_biencoder_distilbert_better/tokenizer/added_tokens.json',
 '/content/drive/MyDrive/trained_biencoder_distilbert_better/tokenizer/tokenizer.json')