In [2]:
import pandas as pd

# -----------------------------
# 1. Read the (query, positive, negative) triples file
# -----------------------------
# Labels applied to the six columns. Because header=0 is passed together with
# explicit names, the file's own first row is treated as a header and replaced.
triple_columns = ["qid", "query", "pos_pid", "positive", "neg_pid", "negative"]

# Parsing options: tab-separated text stored as UTF-16.
# NOTE(review): some displayed passages contain sequences such as "rÅ«l" that
# look like earlier mis-decoding upstream of this file — confirm before relying
# on non-ASCII text.
read_options = {
    "sep": "\t",
    "encoding": "utf-16",
    "names": triple_columns,
    "header": 0,
}
df = pd.read_csv("qidpidtriples.top3.clean.tsv", **read_options)

df

Unnamed: 0,qid,query,pos_pid,positive,neg_pid,negative
0,1000094,where is whitemarsh island,5399011,"Whitemarsh Island, Georgia. Whitemarsh Island ...",271630,Underwater Volcano Forms New South Pacific Isl...
1,1000094,where is whitemarsh island,5399011,"Whitemarsh Island, Georgia. Whitemarsh Island ...",5534953,"Komodo is one of the 17,508 islands that make ..."
2,1000684,where is your perineum,6133670,That part of the floor of the PELVIS that lies...,54955,rule of nines (rÅ«l nÄ«nz) Method used in calc...
3,1000684,where is your perineum,6133670,That part of the floor of the PELVIS that lies...,5952792,This delicate triangle is important during chi...
4,1000684,where is your perineum,6133670,That part of the floor of the PELVIS that lies...,4455896,"1 abdomen: Latin abdomen = the belly, the part..."
...,...,...,...,...,...,...
60052,112246,cracking definition,5457832,( Extractive engineering : Refinery processes ...,99501,
60053,112246,cracking definition,5457832,( Extractive engineering : Refinery processes ...,3830360,
60054,112324,crawford county indiana population,4935331,"Crawford County, Indiana. Crawford County is a...",7418714,
60055,112324,crawford county indiana population,4935331,"Crawford County, Indiana. Crawford County is a...",5445465,


In [11]:
import pandas as pd

# -----------------------------
# 2-3. Build labelled (query, passage) pairs
# -----------------------------
def _labelled_pairs(triples, passage_column, label):
    """Return a new frame with columns ``query``, ``passage`` and ``label``.

    ``passage_column`` is selected next to ``query`` and renamed to
    "passage"; every row receives the constant ``label``.
    """
    pairs = triples[["query", passage_column]].rename(
        columns={passage_column: "passage"}
    )
    return pairs.assign(label=label)


df_pos = _labelled_pairs(df, "positive", 1)  # positives get label 1
df_neg = _labelled_pairs(df, "negative", 0)  # negatives get label 0

# -----------------------------
# 4. Stack positives and negatives into a single dataframe
# -----------------------------
all_pairs = pd.concat([df_pos, df_neg], ignore_index=True)

# -----------------------------
# 5. Drop rows with any missing value, then shuffle (fixed seed)
# -----------------------------
complete_pairs = all_pairs.dropna()
shuffled_pairs = complete_pairs.sample(frac=1, random_state=42)
cross_df = shuffled_pairs.reset_index(drop=True)

# -----------------------------
# 6. Check
# -----------------------------

cross_df

Unnamed: 0,query,passage,label
0,who was the marshall plan named after,Really big trucks are coming to Marshall Motor...,0
1,what does disclaimer mean,Understanding the Causes of Acne Part 1: Hormo...,0
2,what is an epidural made of,"By April, they were complete. I actually had 3...",0
3,why did the cuban missile crisis originate,"Introduction. During the Cuban Missile Crisis,...",0
4,flat anvil definition,An anvil is a heavy block of iron or steel tha...,1
...,...,...,...
102325,where is antrim,3D map of Antrim in United Kingdom. You can al...,1
102326,how many milligrams of potassium is safe,"Children should have 3,000 to 4,000 milligrams...",1
102327,calories in a cup of baby carrots,"More from Red Grapes, 1/2 Cup. 1 Red ;Seedles...",0
102328,dr kate temme npi number,"Kate Temme is a provider in Philadelphia, PA. ...",1


Training of the cross-encoder

In [12]:
from sentence_transformers import CrossEncoder, InputExample
from torch.utils.data import DataLoader
import torch

print("Torch CUDA available:", torch.cuda.is_available())

# Seed torch's global generator so the DataLoader shuffle order is repeatable
# across runs (same seed value as the dataframe shuffle's random_state).
torch.manual_seed(42)

# Convert dataframe rows into InputExamples.
# The three columns are iterated in lockstep rather than via
# DataFrame.iterrows(), which constructs a Series object for every row and is
# needlessly slow on a frame of this size. The values produced are the same.
train_samples = [
    InputExample(texts=[query, passage], label=float(label))
    for query, passage, label in zip(
        cross_df["query"], cross_df["passage"], cross_df["label"]
    )
]

# Wrap them in a DataLoader (reshuffled each epoch, 16 pairs per batch)
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=16)

# Initialize cross encoder
model = CrossEncoder(
    'cross-encoder/ms-marco-MiniLM-L-6-v2',
    num_labels=1   # binary (0/1) regression-style output
)

# Train the model
model.fit(
    train_dataloader=train_dataloader,
    epochs=3,
    warmup_steps=100,
    output_path="./cross-encoder-model",
    # Mixed precision (float16) is faster on modern GPUs; only request it when
    # CUDA is actually present so the cell also runs on a CPU-only machine.
    use_amp=torch.cuda.is_available(),
)
# Explicit save of the final weights.
# NOTE(review): whether fit() already writes to output_path without an
# evaluator depends on the sentence-transformers version — confirm.
model.save("./cross-encoder-model")

  from .autonotebook import tqdm as notebook_tqdm


Torch CUDA available: True


Step,Training Loss
500,0.1765
1000,0.1494
1500,0.1453
2000,0.1345
2500,0.1369
3000,0.1369
3500,0.1293
4000,0.1371
4500,0.1348
5000,0.1283
