In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-question-pairs/train.csv.zip
/kaggle/input/quora-question-pairs/sample_submission.csv.zip
/kaggle/input/quora-question-pairs/test.csv
/kaggle/input/quora-question-pairs/test.csv.zip


In [2]:
import os
os.environ["WANDB_MODE"] = "offline"  # Disable wandb logging for Kaggle


In [3]:
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, f1_score

# SentenceTransformers imports
from sentence_transformers import SentenceTransformer, InputExample, losses, util
from sentence_transformers.cross_encoder import CrossEncoder
from torch.utils.data import DataLoader

# Reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Paths
TRAIN_CSV = "/kaggle/input/quora-question-pairs/train.csv.zip"
OUTPUT_DIR = Path("/kaggle/working/models_quora_experiments")
OUTPUT_DIR.mkdir(exist_ok=True)

# Training hyperparams
EPOCHS = 1 
BATCH_SIZE = 32
LR = 2e-5
MAX_LEN = 128

# Model choices
BASE_BIEncoder = 'sentence-transformers/all-MiniLM-L6-v2'
BI_ENCODER_SAVE_PREFIX = OUTPUT_DIR / 'bi-'
CROSS_ENCODER_SAVE_PREFIX = OUTPUT_DIR / 'cross-'

# --- Load and prepare dataset ---
def load_quora(csv_path: str):
    df = pd.read_csv(csv_path)
    col_map = {}
    for c in df.columns:
        lc = c.lower()
        if 'question1' in lc:
            col_map[c] = 'question1'
        elif 'question2' in lc:
            col_map[c] = 'question2'
        elif 'is_duplicate' in lc or 'isduplicat' in lc:
            col_map[c] = 'is_duplicate'
    df = df.rename(columns=col_map)
    df = df[['question1', 'question2', 'is_duplicate']].dropna()
    df['is_duplicate'] = df['is_duplicate'].astype(int)
    return df

df = load_quora(TRAIN_CSV)
print(f"Total rows after dropna: {len(df)}")

# Split dataset
def prepare_splits(df, test_size=0.10, val_size=0.10, random_state=RANDOM_SEED):
    train_val, test = train_test_split(df, test_size=test_size, stratify=df['is_duplicate'], random_state=random_state)
    relative_val = val_size / (1 - test_size)
    train, val = train_test_split(train_val, test_size=relative_val, stratify=train_val['is_duplicate'], random_state=random_state)
    return train.reset_index(drop=True), val.reset_index(drop=True), test.reset_index(drop=True)

train_df, val_df, test_df = prepare_splits(df)
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

# Convert to InputExample
def create_input_examples(df):
    return [InputExample(texts=[str(r['question1']), str(r['question2'])], label=float(r['is_duplicate'])) for _, r in df.iterrows()]

train_examples = create_input_examples(train_df)
val_examples = create_input_examples(val_df)

# --- Evaluation function for bi-encoder ---
def evaluate_bi_encoder(model, val_df, test_df, batch_size=64):
    val_emb1 = model.encode(val_df['question1'].astype(str).tolist(), convert_to_tensor=True, batch_size=batch_size)
    val_emb2 = model.encode(val_df['question2'].astype(str).tolist(), convert_to_tensor=True, batch_size=batch_size)
    test_emb1 = model.encode(test_df['question1'].astype(str).tolist(), convert_to_tensor=True, batch_size=batch_size)
    test_emb2 = model.encode(test_df['question2'].astype(str).tolist(), convert_to_tensor=True, batch_size=batch_size)

    val_sims = util.cos_sim(val_emb1, val_emb2).diag().cpu().numpy()
    test_sims = util.cos_sim(test_emb1, test_emb2).diag().cpu().numpy()
    y_val = val_df['is_duplicate'].values
    y_test = test_df['is_duplicate'].values

    best_thresh, best_f1 = 0.5, 0.0
    for t in np.linspace(0.1, 0.95, 85):
        f1 = f1_score(y_val, (val_sims >= t).astype(int))
        if f1 > best_f1:
            best_f1, best_thresh = f1, t

    test_preds = (test_sims >= best_thresh).astype(int)
    p, r, f, _ = precision_recall_fscore_support(y_test, test_preds, average='binary', zero_division=0)
    return {'val_best_thresh': float(best_thresh), 'val_best_f1': float(best_f1),
            'test_precision': float(p), 'test_recall': float(r), 'test_f1': float(f)}

# --- Benchmark ---
bench_model = SentenceTransformer(BASE_BIEncoder)
bench_metrics = evaluate_bi_encoder(bench_model, val_df, test_df, batch_size=BATCH_SIZE)
bench_model.save(str(BI_ENCODER_SAVE_PREFIX) + 'benchmark')
print("Benchmark metrics:", bench_metrics)

# --- Bi-Encoder Training ---
bi_encoder_results = {}

# 1) CosineSimilarityLoss
print("\n=== Training Bi-Encoder: CosineSimilarityLoss ===")
model_cosine = SentenceTransformer(BASE_BIEncoder)
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)
train_loss = losses.CosineSimilarityLoss(model_cosine)

model_cosine.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=100,
    show_progress_bar=True
)
model_cosine.save(str(BI_ENCODER_SAVE_PREFIX) + 'cosine')
bi_encoder_results['cosine_loss'] = evaluate_bi_encoder(model_cosine, val_df, test_df, batch_size=BATCH_SIZE)
print("CosineSimilarityLoss metrics:", bi_encoder_results['cosine_loss'])

# 2) ContrastiveLoss
print("\n=== Training Bi-Encoder: ContrastiveLoss ===")
model_contrastive = SentenceTransformer(BASE_BIEncoder)
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)
train_loss = losses.ContrastiveLoss(model_contrastive)

model_contrastive.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=100,
    show_progress_bar=True
)
model_contrastive.save(str(BI_ENCODER_SAVE_PREFIX) + 'contrastive')
bi_encoder_results['contrastive'] = evaluate_bi_encoder(model_contrastive, val_df, test_df, batch_size=BATCH_SIZE)
print("ContrastiveLoss metrics:", bi_encoder_results['contrastive'])

# 3) MultipleNegativesRankingLoss
print("\n=== Training Bi-Encoder: MultipleNegativesRankingLoss ===")
model_mnrl = SentenceTransformer(BASE_BIEncoder)
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)
train_loss = losses.MultipleNegativesRankingLoss(model_mnrl)

model_mnrl.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=100,
    show_progress_bar=True
)
model_mnrl.save(str(BI_ENCODER_SAVE_PREFIX) + 'mnrl')
bi_encoder_results['mnrl'] = evaluate_bi_encoder(model_mnrl, val_df, test_df, batch_size=BATCH_SIZE)
print("MNRL metrics:", bi_encoder_results['mnrl'])

# --- Cross-Encoder ---
print("\n=== Training Cross-Encoder ===")
cross_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2',
                           num_labels=1,
                           max_length=MAX_LEN)

# Convert df to InputExamples
train_ce = [InputExample(texts=[q1, q2], label=float(label))
            for q1, q2, label in zip(train_df['question1'], train_df['question2'], train_df['is_duplicate'])]

# Create dataloader
train_dataloader = DataLoader(train_ce, shuffle=True, batch_size=BATCH_SIZE)

# Train (no evaluator)
cross_model.fit(
    train_dataloader=train_dataloader,
    epochs=EPOCHS,
    save_best_model=True,
    output_path=str(CROSS_ENCODER_SAVE_PREFIX) + 'quora'
)

# --- Manual evaluation after training ---
def evaluate_cross_encoder(cross_model, val_df, test_df, batch_size=64):
    val_pairs = list(zip(val_df['question1'].astype(str), val_df['question2'].astype(str)))
    test_pairs = list(zip(test_df['question1'].astype(str), test_df['question2'].astype(str)))
    val_scores = cross_model.predict(val_pairs, batch_size=batch_size)
    test_scores = cross_model.predict(test_pairs, batch_size=batch_size)

    y_val = val_df['is_duplicate'].values
    y_test = test_df['is_duplicate'].values

    best_thresh, best_f1 = 0.5, 0.0
    for t in np.linspace(min(val_scores), max(val_scores), 100):
        f1 = f1_score(y_val, (np.array(val_scores) >= t).astype(int))
        if f1 > best_f1:
            best_f1, best_thresh = f1, t

    test_preds = (np.array(test_scores) >= best_thresh).astype(int)
    p, r, f, _ = precision_recall_fscore_support(y_test, test_preds, average='binary', zero_division=0)
    return {'val_best_thresh': float(best_thresh), 'val_best_f1': float(best_f1),
            'test_precision': float(p), 'test_recall': float(r), 'test_f1': float(f)}

cross_metrics = evaluate_cross_encoder(cross_model, val_df, test_df, batch_size=BATCH_SIZE)
print("Cross-Encoder metrics:", cross_metrics)

# --- Save all results ---
results_df = pd.DataFrame([
    {'experiment': 'benchmark', **bench_metrics},
    {'experiment': 'cosine_loss', **bi_encoder_results['cosine_loss']},
    {'experiment': 'contrastive', **bi_encoder_results['contrastive']},
    {'experiment': 'mnrl', **bi_encoder_results['mnrl']},
    {'experiment': 'cross_encoder', **cross_metrics}
])

print(results_df)


2025-09-07 18:17:55.863839: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757269076.188271      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757269076.281485      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Total rows after dropna: 404287
Train: 323429, Val: 40429, Test: 40429


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Benchmark metrics: {'val_best_thresh': 0.7273809523809524, 'val_best_f1': 0.7384992509873347, 'test_precision': 0.6176860789606112, 'test_recall': 0.9045960069677074, 'test_f1': 0.7341035748266957}

=== Training Bi-Encoder: CosineSimilarityLoss ===


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[34m[1mwandb[0m: Tracking run with wandb version 0.20.1
[34m[1mwandb[0m: W&B syncing is set to [1m`offline`[0m in this directory. Run [1m`wandb online`[0m or set [1mWANDB_MODE=online[0m to enable cloud syncing.


Step,Training Loss
500,0.1572
1000,0.1327
1500,0.1261
2000,0.1215
2500,0.121
3000,0.1186
3500,0.1169
4000,0.1166
4500,0.1151
5000,0.1134


Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

CosineSimilarityLoss metrics: {'val_best_thresh': 0.5755952380952382, 'val_best_f1': 0.8194056547525653, 'test_precision': 0.7646678424456202, 'test_recall': 0.8714323998392067, 'test_f1': 0.8145666332665331}

=== Training Bi-Encoder: ContrastiveLoss ===


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0154
1000,0.0136
1500,0.013
2000,0.0126
2500,0.0127
3000,0.012
3500,0.0121
4000,0.0123
4500,0.012
5000,0.0119


Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

ContrastiveLoss metrics: {'val_best_thresh': 0.7880952380952381, 'val_best_f1': 0.8445589264135344, 'test_precision': 0.8024884294043397, 'test_recall': 0.8944794318638617, 'test_f1': 0.8459905585654088}

=== Training Bi-Encoder: MultipleNegativesRankingLoss ===


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.5043
1000,0.4708
1500,0.4661
2000,0.4493
2500,0.4539
3000,0.4536
3500,0.4449
4000,0.4475
4500,0.4658
5000,0.4444


Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

MNRL metrics: {'val_best_thresh': 0.7273809523809524, 'val_best_f1': 0.6924876946663439, 'test_precision': 0.5765870704717531, 'test_recall': 0.8622537853410157, 'test_f1': 0.6910623674389884}

=== Training Cross-Encoder ===


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Step,Training Loss
500,0.7662
1000,0.5283
1500,0.442
2000,0.4127
2500,0.3982
3000,0.3792
3500,0.3682
4000,0.3614
4500,0.3453
5000,0.3408


Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Cross-Encoder metrics: {'val_best_thresh': -0.3595892540132155, 'val_best_f1': 0.8174940602726021, 'test_precision': 0.7671800256679501, 'test_recall': 0.8810799946402251, 'test_f1': 0.8201945865036797}
      experiment  val_best_thresh  val_best_f1  test_precision  test_recall  \
0      benchmark         0.727381     0.738499        0.617686     0.904596   
1    cosine_loss         0.575595     0.819406        0.764668     0.871432   
2    contrastive         0.788095     0.844559        0.802488     0.894479   
3           mnrl         0.727381     0.692488        0.576587     0.862254   
4  cross_encoder        -0.359589     0.817494        0.767180     0.881080   

    test_f1  
0  0.734104  
1  0.814567  
2  0.845991  
3  0.691062  
4  0.820195  
