Data Loading

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import pandas as pd

# ---- Load Data ----
queries = pd.read_csv("queries.train.tsv", sep="\t", names=["qid", "query"])
collection = pd.read_csv("collection.tsv", sep="\t", names=["pid", "passage"])
qrels = pd.read_csv("qrels.train.tsv", sep="\t", names=["qid", "pid", "relevance"])

Formatting for the cross encoder

In [2]:
# Merge to get (query, passage, label)
df = qrels.merge(queries, on="qid").merge(collection, on="pid")
df["label"] = (df["relevance"] > 0).astype(int)  # convert to binary relevance

Model Training

For cross encoder training, you need (query, passage, label) triplet

In [3]:
import pandas as pd

# Load 1000 samples
triples = pd.read_csv(
    "triples.train.small.tsv",
    sep="\t",
    names=["query", "pos", "neg"],
    nrows=1000
)

# Convert to (query, passage, label)
pos_df = triples[["query", "pos"]].rename(columns={"pos": "passage"})
pos_df["label"] = 1

neg_df = triples[["query", "neg"]].rename(columns={"neg": "passage"})
neg_df["label"] = 0

df = pd.concat([pos_df, neg_df]).reset_index(drop=True)

print("Training pairs:", df.shape)
df.head()

Training pairs: (2000, 3)


Unnamed: 0,query,passage,label
0,is a little caffeine ok during pregnancy,We donât know a lot about the effects of caf...,1
1,what fruit is native to australia,Passiflora herbertiana. A rare passion fruit n...,1
2,how large is the canadian military,The Canadian Armed Forces. 1 The first large-...,1
3,types of fruit trees,Cherry. Cherry trees are found throughout the ...,1
4,how many calories a day are lost breastfeeding,"Not only is breastfeeding better for the baby,...",1


In [4]:
from torch.utils.data import Dataset

class CrossEncoderDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=256):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        enc = self.tokenizer(
            row["query"], row["passage"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        item = {key: val.squeeze(0) for key, val in enc.items()}
        item["labels"] = row["label"]
        return item

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_dataset = CrossEncoderDataset(df, tokenizer)
print("Dataset length:", len(train_dataset))  # should print 2000

Dataset length: 2000


In [6]:
train_dataset = CrossEncoderDataset(df, tokenizer)
print("Dataset length:", len(train_dataset))  # should print 2000

Dataset length: 2000


In [12]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

training_args = TrainingArguments(
    output_dir="./cross-encoder-msmarco",
    eval_strategy="steps",
    save_strategy="steps",
    save_total_limit=2,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=500,
    eval_steps=5000,
    save_steps=5000,
    load_best_model_at_end=True,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_df,  # you can create a dev set with qrels.dev.tsv
)

# ---- Train ----
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


TrainOutput(global_step=250, training_loss=0.36691259765625, metrics={'train_runtime': 49.5668, 'train_samples_per_second': 80.699, 'train_steps_per_second': 5.044, 'total_flos': 526222110720000.0, 'train_loss': 0.36691259765625, 'epoch': 2.0})

Save the model

In [15]:
# Save the model
trainer.model.save_pretrained("./cross-encoder-msmarco")
# Save the tokenizer
tokenizer.save_pretrained("./cross-encoder-msmarco")

('./cross-encoder-msmarco/tokenizer_config.json',
 './cross-encoder-msmarco/special_tokens_map.json',
 './cross-encoder-msmarco/vocab.txt',
 './cross-encoder-msmarco/added_tokens.json',
 './cross-encoder-msmarco/tokenizer.json')

Sampling a test set (different from the training set) to pass in the trainer function

In [11]:
# Load 100 triples for testing (not overlapping with training if you like)
test_triples = pd.read_csv(
    "triples.train.small.tsv",
    sep="\t",
    names=["query", "pos", "neg"],
    skiprows=1000,  # skip the first 1000 used for training
    nrows=100
)

# Convert to (query, passage, label)
pos_test = test_triples[["query", "pos"]].rename(columns={"pos": "passage"})
pos_test["label"] = 1

neg_test = test_triples[["query", "neg"]].rename(columns={"neg": "passage"})
neg_test["label"] = 0

test_df = pd.concat([pos_test, neg_test]).reset_index(drop=True)

print("Test pairs:", test_df.shape)
test_df.head()

Test pairs: (200, 3)


Unnamed: 0,query,passage,label
0,does long term heat harm bottled water,IBWA advises consumers to store bottled water ...,1
1,closehandle function,CloseHandle. The CloseHandle function closes a...,1
2,how to get fitbit cheaper,"Amazon release this product on May 1, 2013. Th...",1
3,bastrop jail,Here is jail inmate information for the Bastro...,1
4,wiley x customer service phone number,Wiley X Customer Service Phone Numbers The Cus...,1


Saving the test data for cross encoder

In [None]:
# Save test dataset to a TSV file
test_df.to_csv("test_cross_encoder.tsv", sep="\t", index=False)

print("Saved test dataset to test_cross_encoder.tsv")

Load the trained model for evaluation

In [16]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load saved model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("./cross-encoder-msmarco")
model = AutoModelForSequenceClassification.from_pretrained("./cross-encoder-msmarco")
model.eval()  # set to evaluation mode

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [17]:
import pandas as pd

test_df = pd.read_csv("test_cross_encoder.tsv", sep="\t")

Score passages for a query and rank them

In [18]:
query = "does long term heat harm bottled water"
candidates = test_df[test_df["query"] == query]
passages = candidates["passage"].tolist()
labels = candidates["label"].tolist()

# Tokenize query-passage pairs
inputs = tokenizer([query]*len(passages), passages, padding=True, truncation=True, return_tensors="pt")

# Score
with torch.no_grad():
    outputs = model(**inputs)
    scores = outputs.logits[:, 1].tolist()  # relevance score for label=1

# Rank passages by score descending
ranked = sorted(zip(passages, labels, scores), key=lambda x: x[2], reverse=True)
print("Ranking:")
for passage, label, score in ranked:
    print(f"{score:.4f} | label={label} | {passage[:100]}...")

Ranking:
1.2381 | label=1 | IBWA advises consumers to store bottled water at room temperature or cooler, out of direct sunlight ...
0.8556 | label=0 | According to the International Bottled Water Association, âBottled water can be used indefinitely ...


Compute ranking metrics

In [20]:
import numpy as np
from sklearn.metrics import average_precision_score

y_true = [label for _, label, _ in ranked]
y_score = [score for _, _, score in ranked]

# Mean Average Precision (MAP)
map_score = average_precision_score(y_true, y_score)
print("MAP:", map_score)

# Precision@k
k = 3
precision_at_k = sum(y_true[:k])/k
print(f"Precision@{k}:", precision_at_k)

# nDCG@k
def dcg(rels):
    return sum((2**r - 1)/np.log2(i+2) for i,r in enumerate(rels))

ideal = sorted(y_true, reverse=True)[:k]
ndcg_score = dcg(y_true[:k])/dcg(ideal) if dcg(ideal) > 0 else 0
print(f"nDCG@{k}:", ndcg_score)

MAP: 1.0
Precision@3: 0.3333333333333333
nDCG@3: 1.0
