In [13]:
import pandas as pd

# -----------------------------
# 1. Load your triples file
# -----------------------------
df = pd.read_csv(
    "qidpidtriples.top3.clean.tsv",
    sep="\t",
    encoding="utf-16",
    names=["qid", "query", "pos_pid", "positive", "neg_pid", "negative"],
    header=0  
)

df

Unnamed: 0,qid,query,pos_pid,positive,neg_pid,negative
0,1000094,where is whitemarsh island,5399011,"Whitemarsh Island, Georgia. Whitemarsh Island ...",271630,Underwater Volcano Forms New South Pacific Isl...
1,1000094,where is whitemarsh island,5399011,"Whitemarsh Island, Georgia. Whitemarsh Island ...",5534953,"Komodo is one of the 17,508 islands that make ..."
2,1000684,where is your perineum,6133670,That part of the floor of the PELVIS that lies...,54955,rule of nines (rÅ«l nÄ«nz) Method used in calc...
3,1000684,where is your perineum,6133670,That part of the floor of the PELVIS that lies...,5952792,This delicate triangle is important during chi...
4,1000684,where is your perineum,6133670,That part of the floor of the PELVIS that lies...,4455896,"1 abdomen: Latin abdomen = the belly, the part..."
...,...,...,...,...,...,...
60052,112246,cracking definition,5457832,( Extractive engineering : Refinery processes ...,99501,
60053,112246,cracking definition,5457832,( Extractive engineering : Refinery processes ...,3830360,
60054,112324,crawford county indiana population,4935331,"Crawford County, Indiana. Crawford County is a...",7418714,
60055,112324,crawford county indiana population,4935331,"Crawford County, Indiana. Crawford County is a...",5445465,


In [16]:
import pandas as pd

# -----------------------------
# 2. Convert positives
# -----------------------------
df_pos = df[["query", "positive"]].copy()
df_pos = df_pos.rename(columns={"positive":"passage"})
df_pos["label"] = 1

# -----------------------------
# 3. Convert negatives
# -----------------------------
df_neg = df[["query", "negative"]].copy()
df_neg = df_neg.rename(columns={"negative":"passage"})
df_neg["label"] = 0

# -----------------------------
# 4. Combine into single dataframe
# -----------------------------
cross_df = pd.concat([df_pos, df_neg], ignore_index=True)

# -----------------------------
# 5. Optional: shuffle rows
# -----------------------------
cross_df = cross_df.dropna().sample(frac=1, random_state=42).reset_index(drop=True)

# -----------------------------
# 6. Check
# -----------------------------

cross_df

Unnamed: 0,query,passage,label
0,who was the marshall plan named after,Really big trucks are coming to Marshall Motor...,0
1,what does disclaimer mean,Understanding the Causes of Acne Part 1: Hormo...,0
2,what is an epidural made of,"By April, they were complete. I actually had 3...",0
3,why did the cuban missile crisis originate,"Introduction. During the Cuban Missile Crisis,...",0
4,flat anvil definition,An anvil is a heavy block of iron or steel tha...,1
...,...,...,...
102325,where is antrim,3D map of Antrim in United Kingdom. You can al...,1
102326,how many milligrams of potassium is safe,"Children should have 3,000 to 4,000 milligrams...",1
102327,calories in a cup of baby carrots,"More from Red Grapes, 1/2 Cup. 1 Red ;Seedles...",0
102328,dr kate temme npi number,"Kate Temme is a provider in Philadelphia, PA. ...",1


In [17]:
cross_df = cross_df.drop_duplicates(subset=["query", "passage", "label"]).reset_index(drop=True)
print("Total rows after removing duplicates:", len(cross_df))

Total rows after removing duplicates: 60700


Training of the cross-encoder

In [18]:
from sentence_transformers import CrossEncoder, InputExample
from torch.utils.data import DataLoader
import torch

print("Torch CUDA available:", torch.cuda.is_available())

# Convert dataframe rows into InputExamples
train_samples = [
    InputExample(texts=[row['query'], row['passage']], label=float(row['label']))
    for _, row in cross_df.iterrows()
]

# Wrap them in a DataLoader
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=16)

# Initialize cross encoder
model = CrossEncoder(
    'cross-encoder/ms-marco-MiniLM-L-6-v2',
    num_labels=1   # binary (0/1) regression-style output
)

# Train the model
model.fit(
    train_dataloader=train_dataloader,
    epochs=3,
    warmup_steps=100,
    output_path="./cross-encoder-model",
    use_amp=True # for mixed precision training which uses float16 and is faster on modern GPUs
)

Torch CUDA available: True


Step,Training Loss
500,0.19
1000,0.1615
1500,0.148
2000,0.1578
2500,0.1579
3000,0.1493
3500,0.1579
4000,0.1392
4500,0.1115
5000,0.1094


Saving the model

Using the common dataset for evaluation of the cross-encoder

In [19]:
model.save("./cross-encoder-model")

In [25]:
common_dataset = pd.read_csv("common_dataset_80k.tsv", sep="\t", names=["pid", "passage"], dtype={"pid": str, "passage": str})
common_dataset.head()

Unnamed: 0,pid,passage
0,448,A postal code (also known locally in various E...
1,466,"Therefore, all pathologists must have complete..."
2,646,Obesity is a complex disorder involving an exc...
3,1212,Which president appointed FBI Director James C...
4,1213,"Comey was confirmed by the Senate on July 29, ..."


Reworking the common dataset with the help of other dataset. From format (pid,text) ----> (query,text,label)

In [26]:
import pandas as pd

# Load queries (qid, query)
queries = pd.read_csv("queries.dev.tsv", sep="\t", names=["qid", "query"], dtype=str)

# Load qrels.dev (qid, _, pid, rel)
qrels = pd.read_csv("qrels.dev.tsv", sep="\t", names=["qid", "_", "pid", "rel"], dtype=str)

# Merge qrels with queries and collection
eval_df = qrels.merge(queries, on="qid").merge(common_dataset, on="pid")

# Rename columns to match (query, passage, label)
eval_df = eval_df[["query", "passage", "rel"]].rename(columns={"rel": "label"})

# Convert label to float
eval_df["label"] = eval_df["label"].astype(float)

# Save to file for later use
# eval_df.to_csv("cross_encoder_eval.tsv", sep="\t", index=False)

In [27]:
eval_df

Unnamed: 0,query,passage,label
0,. what is a corporation?,McDonald's Corporation is one of the most reco...,1.0
1,why did rachel carson write an obligation to e...,The Obligation to Endure by Rachel Carson Rach...,1.0
2,why did rachel carson write an obligation to e...,Carson believes that as man tries to eliminate...,1.0
3,symptoms of a dying mouse,The symptoms are similar but the mouse will be...,1.0
4,average number of lightning strikes per day,Although many lightning flashes are simply clo...,1.0
...,...,...,...
59268,different types of cuisines,Italian Cuisine. One of the oldest cuisines of...,1.0
59269,are electric cleansers safe?,clear. 1 EL2020 - Plastics Safe Contact Clean...,1.0
59270,define width,"Width is defined as the quality of being wide,...",1.0
59271,forbes definition of human resource management,Human Resource Management (HRM) is the term us...,1.0


In [29]:
import pandas as pd

# Pick 10 queries only
sample_queries = eval_df["query"].drop_duplicates().sample(10, random_state=42).tolist()
sample_eval_df = eval_df[eval_df["query"].isin(sample_queries)].copy()

print(sample_eval_df.head())
print("Sample size:", sample_eval_df.shape)

                                                query  \
17606             hotel owned by al capone in chicago   
20081               are chlorophylls natural products   
24794                    can edibles affect erections   
26237  what happens when you lose your amniotic fluid   
31077           what causes constant bloating and gas   

                                                 passage  label  
17606  Tuesday, November 11, 2008. Capone's Cicero He...    1.0  
20081  1 Chlorophyll a and chlorophyll b are natural,...    1.0  
24794  The Golden Rule of edibles: start small and be...    1.0  
26237  Later in pregnancy, the baby drinks the amniot...    1.0  
31077  Excess gas may be a symptom of a more serious ...    1.0  
Sample size: (12, 3)


In [31]:
from sentence_transformers import CrossEncoder

# Load the trained model
model = CrossEncoder("./cross-encoder-model")

# Predict scores
pairs = sample_eval_df[["query", "passage"]].values.tolist()
scores = model.predict(pairs)

# Attach scores
sample_eval_df["score"] = scores

In [30]:
from eval_metrics import ndcg_at_k, average_precision_at_k, recall_at_k
from collections import defaultdict
import numpy as np

# Using standardized metrics from eval_metrics.py:
# - ndcg_at_k
# - average_precision_at_k
# - recall_at_k
# We'll compute P@k manually (simple hits/k) since it's not in the helper file.


In [32]:
metrics = defaultdict(list)

K_MAP = 10          # cutoff for AP@K
K_NDCG = 10         # nDCG@10
K_RECALL = 100      # recall@100 (adjust if needed)

for query, group in sample_eval_df.groupby("query"):
    ranked = group.sort_values("score", ascending=False)
    ranked_pids = ranked.index.tolist()  # using index as pseudo pid
    ranked_labels = ranked["label"].astype(int).tolist()

    # Build relevance dict/set
    rel_dict = {i: rel for i, rel in zip(ranked_pids, ranked_labels)}
    rel_set = {i for i, rel in zip(ranked_pids, ranked_labels) if rel > 0}

    ap = average_precision_at_k(ranked_pids, rel_set, k=K_MAP)
    ndcg10 = ndcg_at_k(ranked_pids, rel_dict, k=K_NDCG)
    recall100 = recall_at_k(ranked_pids, rel_set, k=K_RECALL)

    metrics["AP@10"].append(ap)
    metrics["nDCG@10"].append(ndcg10)
    metrics["Recall@100"].append(recall100)

results = {m: float(np.mean(v)) for m, v in metrics.items()}
print("Evaluation Results:", results)

Evaluation Results: {'AP@10': 1.0, 'nDCG@10': 1.0, 'Recall@100': 1.0}


In [33]:
import pandas as pd
import numpy as np
from sentence_transformers import CrossEncoder
from sklearn.metrics import average_precision_score
from tqdm import tqdm

def evaluate_cross_encoder(model, eval_df, k=10, sample_size=100):
    """
    Evaluate cross-encoder on a subset of eval_df
    using Precision@k, MAP, and nDCG.
    """
    results = []
    
    # Take a small sample for faster testing (optional)
    queries = eval_df["query"].unique()[:sample_size]
    
    for query in tqdm(queries, desc="Evaluating"):
        # Get all candidate passages + labels for this query
        group = eval_df[eval_df["query"] == query]
        passages = group["passage"].tolist()
        labels = group["label"].tolist()
        
        # Predict relevance scores
        pairs = [[query, passage] for passage in passages]
        scores = model.predict(pairs)
        
        # Rank by score
        ranked = sorted(zip(passages, labels, scores), key=lambda x: x[2], reverse=True)
        ranked_labels = [lbl for _, lbl, _ in ranked]
        
        # Precision@k
        prec_k = np.mean(ranked_labels[:k])
        
        # MAP
        map_score = average_precision_score(ranked_labels, [s for _, _, s in ranked])
        
        # nDCG
        dcg = 0.0
        for i, rel in enumerate(ranked_labels[:k]):
            dcg += (2**rel - 1) / np.log2(i+2)
        ideal_labels = sorted(ranked_labels, reverse=True)[:k]
        idcg = 0.0
        for i, rel in enumerate(ideal_labels):
            idcg += (2**rel - 1) / np.log2(i+2)
        ndcg = dcg / idcg if idcg > 0 else 0.0
        
        results.append((prec_k, map_score, ndcg))
    
    # Aggregate
    precision = np.mean([r[0] for r in results])
    map_ = np.mean([r[1] for r in results])
    ndcg = np.mean([r[2] for r in results])
    
    return {"Precision@k": precision, "MAP": map_, "nDCG": ndcg}

# Example: evaluate on 10 queries only
metrics = evaluate_cross_encoder(model, sample_eval_df, k=10, sample_size=10)
print(metrics)

Evaluating: 100%|██████████| 10/10 [00:00<00:00, 102.87it/s]

{'Precision@k': np.float64(1.0), 'MAP': np.float64(1.0), 'nDCG': np.float64(1.0)}





In [34]:
import pandas as pd

def test_single_query_table(model, eval_df, query_text, top_k=3):
    # Get all passages for this query
    group = eval_df[eval_df["query"] == query_text]
    if group.empty:
        print(f"No matches found for query: {query_text}")
        return
    
    passages = group["passage"].tolist()
    labels = group["label"].tolist()
    
    # Predictions
    pairs = [[query_text, passage] for passage in passages]
    scores = model.predict(pairs)
    
    # Rank passages by score
    ranked = sorted(zip(passages, scores), key=lambda x: x[1], reverse=True)[:top_k]
    
    # True passage(s)
    true_passages = [p for p, l in zip(passages, labels) if l == 1]
    true_text = true_passages[0] if true_passages else "N/A"
    
    # Create clean dataframe
    data = {
        "Query": [query_text],
        "True Passage": [true_text]
    }
    for i, (p, s) in enumerate(ranked, start=1):
        data[f"Top-{i}"] = [f"[{s:.4f}] {p}"]
    
    return pd.DataFrame(data)

# Example usage:
query = "hotel owned by al capone in chicago"
result_df = test_single_query_table(model, sample_eval_df, query, top_k=3)
result_df

Unnamed: 0,Query,True Passage,Top-1
0,hotel owned by al capone in chicago,"Tuesday, November 11, 2008. Capone's Cicero He...","[5.6006] Tuesday, November 11, 2008. Capone's ..."
