In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load train data
df = pd.read_csv("/kaggle/input/nlpassignment-5/train.csv")

# Split train/validation
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['is_duplicate'])


In [2]:
# === Quora Duplicate Questions: SBERT trainer (CosineSimilarityLoss) ===
import os
import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn import model_selection
from sklearn.metrics import f1_score

import torch
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
from sentence_transformers.trainer import SentenceTransformerTrainer

# -----------------------
# Config
# -----------------------
config = {
    "model_path": "microsoft/xtremedistil-l6-h256-uncased",  
    "learning_rate": 5e-4,
    "train_batch_size": 128,
    "eval_batch_size": 128,
    "epochs": 3,
    "warmup_ratio": 0.1,
    "output_dir": "/kaggle/working/sbert-out"
}
os.makedirs(config["output_dir"], exist_ok=True)

# -----------------------
# Load & split
# -----------------------
df = pd.read_csv("/kaggle/input/nlpassignment-5/train.csv")
df = df.dropna(subset=["question1", "question2", "is_duplicate"]).copy()
df = df.rename(columns={"is_duplicate": "label"})[["question1", "question2", "label"]]
df["label"] = df["label"].astype(float)

train_df, val_df = model_selection.train_test_split(
    df, test_size=0.20, random_state=42, stratify=df["label"]
)

train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
val_ds   = Dataset.from_pandas(val_df.reset_index(drop=True))

# -----------------------
# Model, loss, evaluator
# -----------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
model  = SentenceTransformer(config["model_path"], device=device)

train_loss = losses.CosineSimilarityLoss(model=model)

dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_ds["question1"],
    sentences2=val_ds["question2"],
    scores=val_ds["label"],
    main_similarity=SimilarityFunction.COSINE,
    name="dev-score",
)

# -----------------------
# Training args (no wandb)
# -----------------------
args = SentenceTransformerTrainingArguments(
    output_dir=config["output_dir"],
    num_train_epochs=config["epochs"],
    learning_rate=config["learning_rate"],
    per_device_train_batch_size=config["train_batch_size"],
    per_device_eval_batch_size=config["eval_batch_size"],
    warmup_ratio=config["warmup_ratio"],
    fp16=True,
    logging_steps=100,
    report_to="none"   # <--- disables wandb / hf logging
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    loss=train_loss,
    evaluator=dev_evaluator,
)
trainer.train()

# -----------------------
# Pairwise F1 on validation
# -----------------------
def encode_norm(texts, batch_size=256, dev=device):
    return model.encode(
        list(texts),
        batch_size=batch_size,
        convert_to_tensor=True,
        device=dev,
        normalize_embeddings=True,
        show_progress_bar=True,
    )

with torch.no_grad():
    q1v = encode_norm(val_df["question1"], batch_size=256)
    q2v = encode_norm(val_df["question2"], batch_size=256)
    cos_scores = (q1v * q2v).sum(dim=1).detach().cpu().numpy()

y_true = val_df["label"].to_numpy().astype(int)

best_t, best_f1 = 0.5, 0.0
for t in np.linspace(0.2, 0.8, 49):
    f1 = f1_score(y_true, (cos_scores > t).astype(int))
    if f1 > best_f1:
        best_t, best_f1 = t, f1

print(f"\nValidation Results:")
print(f"Best threshold = {best_t:.4f}")
print(f"Best F1-score  = {best_f1:.4f}")


2025-09-07 06:37:08.317533: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757227028.509253      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757227028.567122      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/525 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/51.0M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/51.0M [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
100,0.3181
200,0.1991
300,0.1988
400,0.1942
500,0.1885
600,0.1984
700,0.2455
800,0.2325
900,0.2809
1000,0.2728


Batches:   0%|          | 0/316 [00:00<?, ?it/s]

Batches:   0%|          | 0/316 [00:00<?, ?it/s]


Validation Results:
Best threshold = 0.7625
Best F1-score  = 0.5603


In [3]:
# === Bi-encoder with ContrastiveLoss ===
from sentence_transformers import losses

train_loss = losses.ContrastiveLoss(model=model)  # <-- only change

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    loss=train_loss,
    evaluator=dev_evaluator,
)
trainer.train()

# Evaluation
with torch.no_grad():
    q1v = encode_norm(val_df["question1"])
    q2v = encode_norm(val_df["question2"])
    cos_scores = (q1v * q2v).sum(dim=1).cpu().numpy()

y_true = val_df["label"].to_numpy().astype(int)

best_t, best_f1 = 0, 0
for t in np.linspace(0.2, 0.8, 49):
    f1 = f1_score(y_true, (cos_scores > t).astype(int))
    if f1 > best_f1:
        best_t, best_f1 = t, f1

print("\nContrastiveLoss Results:")
print(f"Best threshold = {best_t:.4f} | F1 = {best_f1:.4f}")


Step,Training Loss
100,0.0378
200,0.0292
300,0.0291
400,0.0301
500,0.0292
600,0.0294
700,0.0355
800,0.0366
900,0.0357
1000,0.0345


Batches:   0%|          | 0/316 [00:00<?, ?it/s]

Batches:   0%|          | 0/316 [00:00<?, ?it/s]


ContrastiveLoss Results:
Best threshold = 0.2000 | F1 = 0.5393


In [4]:
# === Bi-encoder with MultipleNegativesRankingLoss ===
from sentence_transformers import losses

train_loss = losses.MultipleNegativesRankingLoss(model=model)  # <-- only change

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    loss=train_loss,
    evaluator=dev_evaluator,
)
trainer.train()

# Evaluation
with torch.no_grad():
    q1v = encode_norm(val_df["question1"])
    q2v = encode_norm(val_df["question2"])
    cos_scores = (q1v * q2v).sum(dim=1).cpu().numpy()

y_true = val_df["label"].to_numpy().astype(int)

best_t, best_f1 = 0, 0
for t in np.linspace(0.2, 0.8, 49):
    f1 = f1_score(y_true, (cos_scores > t).astype(int))
    if f1 > best_f1:
        best_t, best_f1 = t, f1

print("\nMNRL Results:")
print(f"Best threshold = {best_t:.4f} | F1 = {best_f1:.4f}")


Step,Training Loss
100,5.0238
200,4.8426
300,4.8259
400,4.822
500,4.8196
600,4.8303
700,4.815
800,4.8152
900,4.8129
1000,4.8075


Batches:   0%|          | 0/316 [00:00<?, ?it/s]

Batches:   0%|          | 0/316 [00:00<?, ?it/s]


MNRL Results:
Best threshold = 0.2000 | F1 = 0.5393


In [5]:
# ---- Disable wandb (transformers may warn; it's fine) ----
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"

import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.metrics import f1_score
import torch
from sentence_transformers import CrossEncoder, InputExample
from torch.utils.data import DataLoader

# === Load & prep (keeps split constant) ===
df = pd.read_csv("/kaggle/input/nlpassignment-5/train.csv")
if "label" not in df.columns and "is_duplicate" in df.columns:
    df = df.rename(columns={"is_duplicate": "label"})
df = df[["question1", "question2", "label"]].dropna().copy()

# labels must be ints (0/1) for CrossEntropy
df["label"] = df["label"].astype(int)

train_df, val_df = model_selection.train_test_split(
    df, test_size=0.20, random_state=42, stratify=df["label"]
)

# === Build dataset (INT labels) ===
train_examples = [
    InputExample(texts=[q1, q2], label=int(y))
    for q1, q2, y in zip(train_df["question1"], train_df["question2"], train_df["label"])
]
train_loader = DataLoader(train_examples, shuffle=True, batch_size=32)

val_pairs = [[q1, q2] for q1, q2 in zip(val_df["question1"], val_df["question2"])]
y_true = val_df["label"].to_numpy().astype(int)

# === Model ===
device = "cuda" if torch.cuda.is_available() else "cpu"
cross_model = CrossEncoder("distilroberta-base", num_labels=2, device=device)

# === Train ===
cross_model.fit(
    train_dataloader=train_loader,
    epochs=1,               # 3–5 for stronger results
    warmup_steps=100,
    show_progress_bar=True
)

# === Predict (get probabilities) ===
probs = cross_model.predict(val_pairs, apply_softmax=True)[:, 1]  # P(class=1)

# === Threshold sweep for best F1 ===
best_t, best_f1 = 0.0, 0.0
for t in np.linspace(0.2, 0.8, 49):
    f1 = f1_score(y_true, (probs > t).astype(int))
    if f1 > best_f1:
        best_t, best_f1 = t, f1

print("\nCross-Encoder Results")
print(f"Best threshold = {best_t:.4f}")
print(f"Best F1-score  = {best_f1:.4f}")


config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.50k [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
500,0.4839
1000,0.3878
1500,0.3652
2000,0.3402
2500,0.3323
3000,0.3224
3500,0.3127
4000,0.3215
4500,0.3074
5000,0.3046


Batches:   0%|          | 0/2527 [00:00<?, ?it/s]


Cross-Encoder Results
Best threshold = 0.4125
Best F1-score  = 0.8563
