In [None]:
from sentence_transformers import losses, evaluation
import zipfile

import pandas as pd
from sklearn import model_selection
from datasets import Dataset

from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

2025-09-07 14:36:40.802963: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757255801.135513      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757255801.228029      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
import zipfile
import pandas as pd
from sklearn import model_selection

with zipfile.ZipFile("/kaggle/input/quora-question-pairs/train.csv.zip", 'r') as zip_ref:
    zip_ref.extractall("./train/")

# Load and clean data
df = pd.read_csv("/kaggle/working/train/train.csv").dropna()
df = df.rename(columns={'is_duplicate': 'label'})[['question1', 'question2', 'label']]

# Split into train+val and test (80%/20%)
train_val, test = model_selection.train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label']
)

# Split train+val into train and val (75%/25% of train_val)
train, val = model_selection.train_test_split(
    train_val, test_size=0.25, random_state=42, stratify=train_val['label']
)

# Convert to Dataset objects
from datasets import Dataset
train_ds = Dataset.from_pandas(train.reset_index(drop=True))
val_ds = Dataset.from_pandas(val.reset_index(drop=True))
test_ds = Dataset.from_pandas(test.reset_index(drop=True))

In [None]:
import torch
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy as np

def evaluate_f1(model, dataset, threshold=0.5, is_cross_encoder=False, batch_size=128):
    q1, q2, labels = dataset['question1'], dataset['question2'], dataset['label']
    n_samples = len(labels)
    predictions = []

    if is_cross_encoder:
        from sentence_transformers import CrossEncoder
        if not isinstance(model, CrossEncoder):
            model = CrossEncoder(model)

        # Process in batches
        for i in tqdm(range(0, n_samples, batch_size), desc="Evaluating Cross-Encoder"):
            batch_q1 = q1[i:i+batch_size]
            batch_q2 = q2[i:i+batch_size]
            batch_scores = model.predict(list(zip(batch_q1, batch_q2)))
            batch_preds = (batch_scores >= threshold).astype(int)
            predictions.extend(batch_preds)

    else:
        # Process question1 and question2 separately in batches
        emb1_list, emb2_list = [], []

        # Encode question1 in batches
        for i in tqdm(range(0, n_samples, batch_size), desc="Encoding question1"):
            batch_q1 = q1[i:i+batch_size]
            emb1_batch = model.encode(batch_q1, convert_to_tensor=False, show_progress_bar=False)
            emb1_list.append(emb1_batch)

        # Encode question2 in batches
        for i in tqdm(range(0, n_samples, batch_size), desc="Encoding question2"):
            batch_q2 = q2[i:i+batch_size]
            emb2_batch = model.encode(batch_q2, convert_to_tensor=False, show_progress_bar=False)
            emb2_list.append(emb2_batch)

        # Concatenate all batches
        emb1 = np.concatenate(emb1_list, axis=0)
        emb2 = np.concatenate(emb2_list, axis=0)

        # Normalize embeddings for cosine similarity
        emb1_norm = emb1 / np.linalg.norm(emb1, axis=1, keepdims=True)
        emb2_norm = emb2 / np.linalg.norm(emb2, axis=1, keepdims=True)

        # Compute cosine similarity element-wise (much more memory efficient)
        cos_scores = np.sum(emb1_norm * emb2_norm, axis=1)
        scores = (cos_scores + 1) / 2  # Convert from [-1,1] to [0,1]
        predictions = (scores >= threshold).astype(int)

    return f1_score(labels, predictions)

In [None]:
config = {
    "model_path": "microsoft/xtremedistil-l6-h256-uncased",

    "learning_rate": 5e-4,
    "train_batch_size": 256,
    "eval_batch_size": 256,
    "epochs": 5,
    "warmup_ratio": 0.1,

    "output_dir": "."
}

In [None]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer(config["model_path"])
val_f1 = evaluate_f1(model, val_ds)
print(f"Benchmark F1-Score: {val_f1:.4f}")

config.json:   0%|          | 0.00/525 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/51.0M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/51.0M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Encoding question1: 100%|██████████| 632/632 [00:19<00:00, 31.82it/s]
Encoding question2: 100%|██████████| 632/632 [00:19<00:00, 32.22it/s]


Benchmark F1-Score: 0.5393


In [None]:
# Create a more memory-efficient evaluator
class F1Evaluator:
    def __init__(self, dataloader, threshold=0.5):
        self.dataloader = dataloader
        self.threshold = threshold

    def __call__(self, model, output_path=None, epoch=-1, steps=-1):
        all_labels = []
        all_predictions = []

        for batch in tqdm(self.dataloader, desc="Evaluating"):
            features, labels = batch
            emb1 = model.encode(features['question1'], convert_to_tensor=False)
            emb2 = model.encode(features['question2'], convert_to_tensor=False)

            # Compute cosine similarity
            emb1_norm = emb1 / np.linalg.norm(emb1, axis=1, keepdims=True)
            emb2_norm = emb2 / np.linalg.norm(emb2, axis=1, keepdims=True)
            cos_scores = np.sum(emb1_norm * emb2_norm, axis=1)
            scores = (cos_scores + 1) / 2

            predictions = (scores >= self.threshold).astype(int)
            all_labels.extend(labels.numpy())
            all_predictions.extend(predictions)

        f1 = f1_score(all_labels, all_predictions)
        return f1

In [None]:
# Train with cosine similarity loss
train_loss = losses.CosineSimilarityLoss(model=model)

args = SentenceTransformerTrainingArguments(
    output_dir=config["output_dir"],

    num_train_epochs=config["epochs"],
    learning_rate=config["learning_rate"],
    per_device_train_batch_size=config["train_batch_size"],
    per_device_eval_batch_size=config["eval_batch_size"],

    warmup_ratio=config["warmup_ratio"],

    fp16=True,

    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,

    report_to="none"
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    loss=train_loss,
)
trainer.train()

# Evaluate on test set
test_f1 = evaluate_f1(model, test_ds, batch_size=256)
print(f"CosineSimilarityLoss F1: {test_f1:.4f}")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Epoch,Training Loss,Validation Loss
1,No log,0.163787
2,0.183700,0.136259
3,0.136800,0.126353
4,0.116500,0.117705
5,0.101700,0.115979


Encoding question1: 100%|██████████| 316/316 [00:18<00:00, 16.69it/s]
Encoding question2: 100%|██████████| 316/316 [00:19<00:00, 16.49it/s]


CosineSimilarityLoss F1: 0.5991


In [None]:
contrastive_loss = losses.ContrastiveLoss(model=model)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    loss=contrastive_loss,
)
trainer.train()

test_f1 = evaluate_f1(model, test_ds, batch_size=256)
print(f"ContrastiveLoss F1: {test_f1:.4f}")

Epoch,Training Loss,Validation Loss
1,No log,0.014748
2,0.012500,0.014018
3,0.010700,0.013862
4,0.008900,0.013769
5,0.007600,0.013544


Encoding question1: 100%|██████████| 316/316 [00:18<00:00, 17.00it/s]
Encoding question2: 100%|██████████| 316/316 [00:19<00:00, 16.57it/s]


ContrastiveLoss F1: 0.5451


In [None]:
# Filter only positive pairs for MNRL
pos_indices = [i for i, label in enumerate(train_ds['label']) if label == 1]
pos_train_ds = train_ds.select(pos_indices)

mnrl_loss = losses.MultipleNegativesRankingLoss(model=model)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=pos_train_ds,
    eval_dataset=val_ds,
    loss=mnrl_loss,
)
trainer.train()

test_f1 = evaluate_f1(model, test_ds, batch_size=256)
print(f"MNRL F1: {test_f1:.4f}")

Epoch,Training Loss,Validation Loss
1,No log,2.06844
2,No log,1.914759
3,0.349400,1.854747
4,0.349400,1.829723
5,0.349400,1.832685


Encoding question1: 100%|██████████| 316/316 [00:18<00:00, 17.11it/s]
Encoding question2: 100%|██████████| 316/316 [00:18<00:00, 16.71it/s]


MNRL F1: 0.5534


In [None]:
import zipfile
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report
from datasets import Dataset
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding


# Load tokenizer and model for sequence classification
tokenizer = AutoTokenizer.from_pretrained(config["model_path"])
model = AutoModelForSequenceClassification.from_pretrained(
    config["model_path"], num_labels=2
)

# Tokenization function
def tokenize_function(examples):
    texts = [(q1, q2) for q1, q2 in zip(examples["question1"], examples["question2"])]
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

# Tokenize datasets
tokenized_train = train_ds.map(tokenize_function, batched=True)
tokenized_val = val_ds.map(tokenize_function, batched=True)
tokenized_test = test_ds.map(tokenize_function, batched=True)

# Format datasets for PyTorch
tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_val = tokenized_val.rename_column("label", "labels")
tokenized_test = tokenized_test.rename_column("label", "labels")

tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_val.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metrics function for evaluation
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    f1 = f1_score(labels, predictions, average='weighted')
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')

    return {
        'f1': f1,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall
    }
# Training arguments
training_args = TrainingArguments(
    output_dir=config["output_dir"],
    learning_rate=config["learning_rate"],
    per_device_train_batch_size=config["train_batch_size"],
    per_device_eval_batch_size=config["eval_batch_size"],
    num_train_epochs=config["epochs"],
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    warmup_ratio=config["warmup_ratio"],
    logging_dir='./logs',
    logging_steps=500,
    save_total_limit=2,
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
print("Training cross-encoder model...")
trainer.train()

# Save the model
trainer.save_model()
tokenizer.save_pretrained(config["output_dir"])
# Evaluate on test set
print("\nEvaluating on test set...")
test_results = trainer.evaluate(tokenized_test)
print(f"Test Results:")
print(f"F1-Score: {test_results['eval_f1']:.4f}")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"Precision: {test_results['eval_precision']:.4f}")
print(f"Recall: {test_results['eval_recall']:.4f}")

# Get detailed predictions on test set
test_predictions = trainer.predict(tokenized_test)
predicted_labels = np.argmax(test_predictions.predictions, axis=1)
true_labels = test_predictions.label_ids

# Detailed classification report
print("\nDetailed Classification Report:")
print(classification_report(true_labels, predicted_labels, target_names=['Not Duplicate', 'Duplicate']))

# Calculate and display F1-score
f1 = f1_score(true_labels, predicted_labels)
print(f"Final F1-Score on test set: {f1:.4f}")

# Example predictions
print("\nSample predictions from test set:")
sample_indices = np.random.choice(len(test), 5, replace=False)
for i in sample_indices:
    q1 = test.iloc[i]["question1"]
    q2 = test.iloc[i]["question2"]
    true_label = test.iloc[i]["label"]
    pred_label = predicted_labels[i]

    print(f"Q1: {q1}")
    print(f"Q2: {q2}")
    print(f"True: {'Duplicate' if true_label == 1 else 'Not Duplicate'}")
    print(f"Pred: {'Duplicate' if pred_label == 1 else 'Not Duplicate'}")
    print(f"Correct: {true_label == pred_label}")
    print("-" * 80)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/242571 [00:00<?, ? examples/s]

Map:   0%|          | 0/80858 [00:00<?, ? examples/s]

Map:   0%|          | 0/80858 [00:00<?, ? examples/s]

  trainer = Trainer(


Training cross-encoder model...




Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,No log,0.308227,0.861322,0.862017,0.861139,0.862017
2,0.391000,0.287632,0.876773,0.875436,0.882196,0.875436
3,0.277700,0.310986,0.888358,0.887618,0.890378,0.887618
4,0.219600,0.284841,0.892064,0.891835,0.892408,0.891835
5,0.174900,0.304377,0.893085,0.892552,0.894306,0.892552





Evaluating on test set...




Test Results:
F1-Score: 0.8939
Accuracy: 0.8933
Precision: 0.8950
Recall: 0.8933





Detailed Classification Report:
               precision    recall  f1-score   support

Not Duplicate       0.93      0.90      0.91     51005
    Duplicate       0.84      0.88      0.86     29853

     accuracy                           0.89     80858
    macro avg       0.88      0.89      0.89     80858
 weighted avg       0.90      0.89      0.89     80858

Final F1-Score on test set: 0.8592

Sample predictions from test set:
Q1: Is the new TV show “Westworld” worth watching?
Q2: Is westworld worth watching?
True: Duplicate
Pred: Duplicate
Correct: True
--------------------------------------------------------------------------------
Q1: Is daily masturbation causes any hair fall?
Q2: Does excessive masturbation lead to hair loss?
True: Duplicate
Pred: Duplicate
Correct: True
--------------------------------------------------------------------------------
Q1: How do I study 7th grade?
Q2: I am in college studying the foundation of science. I really want to keep up my good grade. H