In [41]:
# Mount Google Drive at /content/drive; the data and model paths used by this
# notebook are located under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
import os
from pathlib import Path
from typing import Dict, List

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sentence_transformers import (
    InputExample,
    SentenceTransformer,
    losses,
    models,
    util,
)
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader

# Set WANDB_DISABLED to "true" only if the variable is not already present in
# the environment (setdefault never overwrites an existing value).
# NOTE(review): the training logs in this notebook report that WANDB_DISABLED
# is deprecated in favour of `report_to` - confirm before upgrading libraries.
os.environ.setdefault("WANDB_DISABLED", "true")

'true'

In [43]:
# Root folder of the datasets; load_split reads DATA_ROOT/<pair_type>/<split>.jsonl.
DATA_ROOT = Path('/content/drive/MyDrive/MIDS/266/FinalProject/Data/')
# Root folder for fine-tuned models; train_sbert writes to MODEL_ROOT/<save_name>.
MODEL_ROOT = Path('/content/drive/MyDrive/MIDS/266/FinalProject/Model/')

In [44]:
# Defaults for train_sbert / run_sbert_pipeline; each can be overridden per call.
DEFAULT_MODEL = "sentence-transformers/all-mpnet-base-v2"  # base checkpoint name
DEFAULT_BATCH = 32  # batch size for the training DataLoader and for encoding
DEFAULT_EPOCHS = 1  # number of training epochs
DEFAULT_MAX_LENGTH = 256  # passed to models.Transformer as max_seq_length
# Fraction of the total training steps (batches per epoch * epochs) used as warmup steps.
WARMUP_RATIO = 0.1

In [45]:
def load_split(pair_type: str, split: str) -> pd.DataFrame:
    """Read one JSON-Lines split of a pair dataset into a DataFrame.

    Args:
        pair_type: Sub-directory of ``DATA_ROOT`` that holds the dataset.
        split: Split name; the file read is ``<split>.jsonl``.

    Returns:
        The parsed records, one row per JSON line.

    Raises:
        FileNotFoundError: If the split file does not exist.
    """
    split_file = Path(DATA_ROOT).joinpath(pair_type, f"{split}.jsonl")
    if split_file.exists():
        return pd.read_json(split_file, lines=True)
    raise FileNotFoundError(f"Missing split file: {split_file}")

In [46]:
def df_to_examples(df: pd.DataFrame, field1: str, field2: str) -> List[InputExample]:
    """Convert a pair DataFrame into a list of ``InputExample`` objects.

    Args:
        df: Frame containing the two text columns and a ``"label"`` column.
        field1: Name of the column holding the first text of each pair.
        field2: Name of the column holding the second text of each pair.

    Returns:
        One ``InputExample`` per row, with both texts converted via ``str`` and
        the label converted via ``float``.

    Raises:
        KeyError: If ``field1``, ``field2`` or ``"label"`` is not a column of ``df``.
    """
    # Walk the three columns in parallel rather than using df.iterrows():
    # iterrows builds a Series for every row (slow on large training sets) and
    # coerces each row to a single dtype, which can alter how numeric cells
    # are rendered by str().
    return [
        InputExample(texts=[str(first), str(second)], label=float(label))
        for first, second, label in zip(df[field1], df[field2], df["label"])
    ]

In [47]:
def make_evaluator(df: pd.DataFrame, field1: str, field2: str) -> EmbeddingSimilarityEvaluator:
    """Build an ``EmbeddingSimilarityEvaluator`` from a labelled pair DataFrame.

    Args:
        df: Frame containing the two text columns and a ``"label"`` column.
        field1: Column with the first text of each pair.
        field2: Column with the second text of each pair.

    Returns:
        An evaluator constructed from the stringified texts of both columns and
        the labels converted to ``float``.
    """
    first_sentences = list(map(str, df[field1].tolist()))
    second_sentences = list(map(str, df[field2].tolist()))
    gold_scores = list(map(float, df["label"].tolist()))
    return EmbeddingSimilarityEvaluator(first_sentences, second_sentences, gold_scores)

In [48]:
def compute_threshold(scores: np.ndarray, labels: np.ndarray) -> float:
    """Pick the decision threshold that maximises F1 on scored, labelled pairs.

    A pair is predicted positive when ``score >= threshold``. The candidate
    thresholds are every distinct score (ascending) followed by ``0.5``. The
    winner is the candidate with the highest F1 for the positive class
    (label 1); ties are broken by higher accuracy, then by the earliest
    candidate.

    All candidates are evaluated from one sort plus suffix sums, instead of
    recomputing the metrics over the whole data set for each candidate.

    Args:
        scores: Similarity scores, one per pair.
        labels: Binary ground-truth labels (0 or 1), aligned with ``scores``.

    Returns:
        The selected threshold. ``0.5`` is returned for empty input.

    Raises:
        ValueError: If the inputs differ in length or ``labels`` contains
            values other than 0 and 1.
    """
    scores = np.asarray(scores, dtype=np.float64).ravel()
    labels = np.asarray(labels).ravel()
    if scores.size != labels.size:
        raise ValueError(
            f"scores and labels must have the same length, got {scores.size} and {labels.size}"
        )
    total = scores.size
    if total == 0:
        return 0.5
    if not np.isin(labels, (0, 1)).all():
        raise ValueError("labels must be binary (0 or 1)")

    is_pos = labels == 1
    n_pos = int(is_pos.sum())

    # NaN scores never satisfy `score >= thr`: they are excluded from the
    # candidate thresholds and always count as predicted-negative.
    usable = ~np.isnan(scores)
    uniq, group = np.unique(scores[usable], return_inverse=True)
    group = group.ravel()
    group_size = np.bincount(group, minlength=uniq.size)
    group_pos = np.bincount(group[is_pos[usable]], minlength=uniq.size)

    # At threshold uniq[i] every score in groups i, i+1, ... is predicted
    # positive, so the counts are suffix sums over the sorted groups.
    pred_pos = np.cumsum(group_size[::-1])[::-1]
    true_pos = np.cumsum(group_pos[::-1])[::-1]

    # The fixed 0.5 fallback is evaluated last so it only wins when strictly better.
    at_half = scores >= 0.5
    candidates = np.concatenate([uniq, [0.5]])
    pred_pos = np.append(pred_pos, np.count_nonzero(at_half))
    true_pos = np.append(true_pos, np.count_nonzero(at_half & is_pos))

    # F1 = 2*TP / (2*TP + FP + FN) = 2*TP / (predicted positives + actual positives).
    # It is defined as 0 when there are neither predicted nor actual positives.
    f1_denom = pred_pos + n_pos
    f1 = np.zeros(candidates.shape, dtype=np.float64)
    np.divide(2.0 * true_pos, f1_denom, out=f1, where=f1_denom > 0)

    # Correct predictions = TP + TN. Integer counts rank candidates exactly as
    # accuracy would, because every candidate shares the same denominator.
    n_correct = 2 * true_pos + (total - n_pos) - pred_pos

    top_f1 = f1 == f1.max()
    winner = np.flatnonzero(top_f1 & (n_correct == n_correct[top_f1].max()))[0]
    return float(candidates[winner])

In [49]:
def evaluate_pairs(
    model: SentenceTransformer,
    df: pd.DataFrame,
    field1: str,
    field2: str,
    batch_size: int,
    threshold: float,
) -> Dict[str, float]:
    """Score text pairs with ``model`` and report accuracy and F1 at ``threshold``.

    Args:
        model: Encoder used to embed both sides of every pair.
        df: Frame containing the two text columns and a ``"label"`` column.
        field1: Column with the first text of each pair.
        field2: Column with the second text of each pair.
        batch_size: Batch size passed to ``model.encode``.
        threshold: A pair is predicted positive when its cosine similarity is
            greater than or equal to this value.

    Returns:
        A dict with keys ``"accuracy"``, ``"f1"`` and ``"threshold"``.
    """
    texts1 = [str(x) for x in df[field1].tolist()]
    texts2 = [str(x) for x in df[field2].tolist()]
    labels = np.array(df["label"].tolist())

    emb1 = model.encode(texts1, batch_size=batch_size, convert_to_tensor=True, show_progress_bar=False)
    emb2 = model.encode(texts2, batch_size=batch_size, convert_to_tensor=True, show_progress_bar=False)
    # Only the similarity of row i with row i is needed. pairwise_cos_sim
    # computes exactly those N values, whereas cos_sim(...).diagonal() would
    # first materialise the full N x N similarity matrix.
    scores = util.pairwise_cos_sim(emb1, emb2).cpu().numpy()

    preds = (scores >= threshold).astype(int)
    return {
        "accuracy": float(accuracy_score(labels, preds)),
        "f1": float(f1_score(labels, preds)),
        "threshold": float(threshold),
    }

In [50]:
def train_sbert(
    pair_type: str,
    field1: str,
    field2: str,
    save_name: str,
    model_name: str = DEFAULT_MODEL,
    max_length: int = DEFAULT_MAX_LENGTH,
    epochs: int = DEFAULT_EPOCHS,
    batch_size: int = DEFAULT_BATCH,
    eval_steps: int = 500,
) -> str:
    """Fine-tune a sentence encoder on the train split of ``pair_type``.

    The model is a ``models.Transformer`` followed by a ``models.Pooling``
    layer, trained with ``CosineSimilarityLoss`` and evaluated on the val
    split during training.

    Args:
        pair_type: Dataset sub-directory handed to ``load_split``.
        field1: Column with the first text of each pair.
        field2: Column with the second text of each pair.
        save_name: Directory name under ``MODEL_ROOT`` used as the output path.
        model_name: Checkpoint name passed to ``models.Transformer``.
        max_length: ``max_seq_length`` of the transformer module.
        epochs: Number of training epochs.
        batch_size: Batch size of the training ``DataLoader``.
        eval_steps: Requested evaluation interval in steps. The value actually
            used is capped at half an epoch and is never below 10.

    Returns:
        The output path (``MODEL_ROOT / save_name``) as a string.

    Raises:
        FileNotFoundError: Propagated from ``load_split`` when the train or val
            split file is missing.
    """
    train_df = load_split(pair_type, "train")
    val_df = load_split(pair_type, "val")

    MODEL_ROOT.mkdir(parents=True, exist_ok=True)
    output_dir = str(MODEL_ROOT / save_name)

    # Assemble the encoder from its two modules: transformer, then pooling.
    encoder = models.Transformer(model_name, max_seq_length=max_length)
    pooler = models.Pooling(encoder.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[encoder, pooler])

    train_loader = DataLoader(
        df_to_examples(train_df, field1, field2),
        shuffle=True,
        batch_size=batch_size,
    )
    objective = (train_loader, losses.CosineSimilarityLoss(model))
    steps_per_epoch = len(train_loader)

    model.fit(
        train_objectives=[objective],
        evaluator=make_evaluator(val_df, field1, field2),
        epochs=epochs,
        # Warm up over WARMUP_RATIO of all optimisation steps.
        warmup_steps=int(steps_per_epoch * epochs * WARMUP_RATIO),
        # Evaluate at most every half epoch, with a floor of 10 steps.
        evaluation_steps=max(10, min(eval_steps, max(1, steps_per_epoch // 2))),
        output_path=output_dir,
        optimizer_params={"lr": 2e-5},
        show_progress_bar=True,
        use_amp=True,
    )

    return output_dir

In [51]:
def run_sbert_pipeline(
    pair_type: str,
    field1: str,
    field2: str,
    save_name: str,
    model_name: str = DEFAULT_MODEL,
    max_length: int = DEFAULT_MAX_LENGTH,
    epochs: int = DEFAULT_EPOCHS,
    batch_size: int = DEFAULT_BATCH,
    train_first: bool = True,
) -> Dict[str, float]:
    """Optionally train, then tune a threshold on val and evaluate on test.

    Steps:
        1. Fine-tune via ``train_sbert`` (``train_first=True``) or reuse the
           model already stored at ``MODEL_ROOT / save_name``.
        2. Score the val pairs and choose the decision threshold with
           ``compute_threshold``.
        3. Evaluate the test pairs at that threshold with ``evaluate_pairs``.

    Args:
        pair_type: Dataset sub-directory handed to ``load_split``.
        field1: Column with the first text of each pair.
        field2: Column with the second text of each pair.
        save_name: Model directory name under ``MODEL_ROOT``.
        model_name: Base checkpoint, used only when training.
        max_length: Maximum sequence length, used only when training.
        epochs: Number of training epochs, used only when training.
        batch_size: Batch size for training and for encoding.
        train_first: When False, skip training and load the saved model.

    Returns:
        The test metrics dict produced by ``evaluate_pairs``.

    Raises:
        FileNotFoundError: If ``train_first`` is False and no saved model
            exists, or if a required split file is missing.
    """
    if train_first:
        print(f"Training SBERT on {pair_type} -> {save_name}")
        model_path = train_sbert(
            pair_type=pair_type,
            field1=field1,
            field2=field2,
            save_name=save_name,
            model_name=model_name,
            max_length=max_length,
            epochs=epochs,
            batch_size=batch_size,
        )
    else:
        model_path = MODEL_ROOT / save_name
        if not model_path.exists():
            raise FileNotFoundError(f"Model not found: {model_path}")
        print(f"Skipping training, loading SBERT from {model_path}")

    # Reload from disk so evaluation uses exactly what was saved.
    model = SentenceTransformer(str(model_path))

    val_df = load_split(pair_type, "val")
    test_df = load_split(pair_type, "test")

    val_texts1 = [str(x) for x in val_df[field1].tolist()]
    val_texts2 = [str(x) for x in val_df[field2].tolist()]
    val_labels = np.array(val_df["label"].tolist())
    val_emb1 = model.encode(val_texts1, batch_size=batch_size, convert_to_tensor=True, show_progress_bar=False)
    val_emb2 = model.encode(val_texts2, batch_size=batch_size, convert_to_tensor=True, show_progress_bar=False)
    # pairwise_cos_sim returns one score per (row i, row i) pair. It avoids the
    # N x N matrix that cos_sim(...).diagonal() would allocate for N val pairs.
    val_scores = util.pairwise_cos_sim(val_emb1, val_emb2).cpu().numpy()
    threshold = compute_threshold(val_scores, val_labels)
    print(f"Validation-tuned threshold: {threshold:.4f}")

    test_metrics = evaluate_pairs(
        model=model,
        df=test_df,
        field1=field1,
        field2=field2,
        batch_size=batch_size,
        threshold=threshold,
    )
    print(f"Test metrics for {pair_type}: acc={test_metrics['accuracy']:.4f}, f1={test_metrics['f1']:.4f}")
    return test_metrics

In [52]:
# Title pairs: train from the default checkpoint (model_name is not overridden),
# then tune the threshold on val and report test metrics.
# max_length=128 is half of DEFAULT_MAX_LENGTH.
run_sbert_pipeline(
    pair_type="title-title-pair",
    field1="title1",
    field2="title2",
    save_name="sbert-title-mpnet",
    max_length=128,
    epochs=1,
    batch_size=32,
    train_first=True,
)

Training SBERT on title-title-pair -> sbert-title-mpnet


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Pearson Cosine,Spearman Cosine
500,0.0797,No log,0.922652,0.855863
1000,0.0474,No log,0.921692,0.854221
1500,0.0451,No log,0.918905,0.853243
2000,0.0438,No log,0.923239,0.85334
2500,0.0422,No log,0.920007,0.852088
3000,0.0415,No log,0.922609,0.852745


Step,Training Loss,Validation Loss,Pearson Cosine,Spearman Cosine
500,0.0797,No log,0.922652,0.855863
1000,0.0474,No log,0.921692,0.854221
1500,0.0451,No log,0.918905,0.853243
2000,0.0438,No log,0.923239,0.85334
2500,0.0422,No log,0.920007,0.852088
3000,0.0415,No log,0.922609,0.852745
3500,0.0409,No log,0.923674,0.853358
4000,0.0418,No log,0.923991,0.852829
4500,0.041,No log,0.924483,0.853106
5000,0.0393,No log,0.924759,0.852665


Validation-tuned threshold: 0.4351
Test metrics for title-title-pair: acc=0.9704, f1=0.9705


{'accuracy': 0.9704492122024807,
 'f1': 0.9704813729593973,
 'threshold': 0.43507665395736694}

In [53]:
# Body pairs: same pipeline with max_length=256 and batch_size=16.
run_sbert_pipeline(
    pair_type="body-body-pair",
    field1="body1",
    field2="body2",
    save_name="sbert-body-mpnet",
    max_length=256,
    epochs=1,
    batch_size=16,
    train_first=True,
)

Training SBERT on body-body-pair -> sbert-body-mpnet


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Pearson Cosine,Spearman Cosine
500,0.0746,No log,0.949944,0.863369
1000,0.0349,No log,0.951896,0.861949
1500,0.0286,No log,0.950725,0.861163
2000,0.0278,No log,0.949653,0.860198
2500,0.0287,No log,0.951214,0.860977
3000,0.027,No log,0.951976,0.860797
3500,0.0275,No log,0.951926,0.860443
4000,0.0277,No log,0.951449,0.860248
4500,0.0272,No log,0.953319,0.860915
5000,0.0246,No log,0.953902,0.859979


Validation-tuned threshold: 0.4209
Test metrics for body-body-pair: acc=0.9830, f1=0.9831


{'accuracy': 0.9830351011835037,
 'f1': 0.9831162566328991,
 'threshold': 0.42086976766586304}

In [54]:
# Post pairs: same pipeline with max_length=256 and batch_size=12.
run_sbert_pipeline(
    pair_type="post-post-pair",
    field1="post1",
    field2="post2",
    save_name="sbert-post-mpnet",
    max_length=256,
    epochs=1,
    batch_size=12,
    train_first=True,
)

Training SBERT on post-post-pair -> sbert-post-mpnet


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Pearson Cosine,Spearman Cosine
500,0.0615,No log,0.963526,0.865049
1000,0.0271,No log,0.965928,0.864548
1500,0.0237,No log,0.965131,0.864004
2000,0.0213,No log,0.964198,0.863505
2500,0.0211,No log,0.965545,0.863632
3000,0.0218,No log,0.963512,0.863204
3500,0.0192,No log,0.962567,0.862944
4000,0.0221,No log,0.963096,0.863083
4500,0.0208,No log,0.964811,0.863328
5000,0.0204,No log,0.964497,0.862872


Validation-tuned threshold: 0.4469
Test metrics for post-post-pair: acc=0.9913, f1=0.9914


{'accuracy': 0.9913442346077077,
 'f1': 0.9913603187678397,
 'threshold': 0.44687163829803467}