In [75]:
# Mount Google Drive at /content/drive; DATA_ROOT and MODEL_ROOT in this notebook
# both point below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [76]:
import os
from pathlib import Path
from typing import Dict

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.data import Dataset
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Default W&B logging to disabled. setdefault only writes the variable when it
# is not already present, so a value exported before this cell runs is kept.
os.environ.setdefault("WANDB_DISABLED", "true")


'true'

In [77]:
# Root of the dataset folders; load_split reads <DATA_ROOT>/<pair_type>/<split>.jsonl.
DATA_ROOT = Path('/content/drive/MyDrive/MIDS/266/FinalProject/Data/')
# Root for fine-tuned models; each run saves into <MODEL_ROOT>/<save_name>.
MODEL_ROOT = Path('/content/drive/MyDrive/MIDS/266/FinalProject/Model/')

In [78]:
# Fallback values used as keyword defaults by the training / evaluation helpers
# in this notebook when the caller does not override them.
DEFAULT_MODEL = "microsoft/deberta-v3-base"  # checkpoint name passed to from_pretrained
DEFAULT_MAX_LENGTH = 256  # truncation / padding length handed to the tokenizer
DEFAULT_EPOCHS = 1  # num_train_epochs
DEFAULT_BATCH_SIZE = 16  # per-device batch size for both training and evaluation

In [79]:
def load_split(pair_type: str, split: str) -> pd.DataFrame:
    """Load one split of a pair-type dataset from its JSON-lines file.

    Args:
        pair_type: Name of the sub-directory of ``DATA_ROOT`` holding the data.
        split: Split name; the file read is ``<split>.jsonl``.

    Returns:
        The file's records as a DataFrame (one row per JSON line).

    Raises:
        FileNotFoundError: If the expected split file does not exist.
    """
    split_file = (DATA_ROOT / pair_type).joinpath(f"{split}.jsonl")
    if split_file.exists():
        return pd.read_json(split_file, lines=True)
    raise FileNotFoundError(f"Missing split file: {split_file}")

In [80]:
class PairDataset(Dataset):
    """Map-style dataset that tokenizes a pair of text columns on access.

    Each item is built from one DataFrame row: the values in ``field1`` and
    ``field2`` are converted to ``str`` and passed to the tokenizer as a
    two-segment input, truncated and padded to ``max_length``. The row's
    ``label`` column is converted to an integer class target.

    Args:
        df: Source rows; must contain ``field1``, ``field2`` and ``label``.
            The index is reset so items are addressed positionally.
        tokenizer: Callable accepting two strings plus ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keywords and
            returning a mapping with ``input_ids`` and ``attention_mask``
            tensors that carry a leading batch axis of size 1.
        field1: Column name of the first text in the pair.
        field2: Column name of the second text in the pair.
        max_length: Fixed length every encoded pair is truncated/padded to.
    """

    def __init__(self, df: pd.DataFrame, tokenizer, field1: str, field2: str, max_length: int):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.field1 = field1
        self.field2 = field2
        self.max_length = max_length

    def __len__(self) -> int:
        """Return the number of rows (examples) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Tokenize row ``idx`` and return ``input_ids``, ``attention_mask`` and ``labels``."""
        row = self.df.iloc[idx]
        encoded = self.tokenizer(
            str(row[self.field1]),
            str(row[self.field2]),
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Drop only the leading batch axis added by return_tensors="pt".
        # A bare squeeze() would remove every size-1 axis, collapsing the
        # sequence axis as well when max_length == 1 and yielding 0-d tensors.
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": torch.tensor(int(row["label"]), dtype=torch.long),
        }


In [81]:
def compute_metrics(eval_pred) -> Dict[str, float]:
    """Score predictions for the Trainer's ``compute_metrics`` hook.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        A dict with ``accuracy`` and ``f1``, both computed by scikit-learn
        with its default settings from the arg-max class of each logit row.
    """
    scores, gold = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(gold, predicted)
    f1 = f1_score(gold, predicted)
    return {"accuracy": accuracy, "f1": f1}

In [82]:
def train_deberta(
    pair_type: str,
    field1: str,
    field2: str,
    save_name: str,
    model_name: str = DEFAULT_MODEL,
    max_length: int = DEFAULT_MAX_LENGTH,
    epochs: int = DEFAULT_EPOCHS,
    batch_size: int = DEFAULT_BATCH_SIZE,
    learning_rate: float = 2e-5,
) -> str:
    """Fine-tune a two-label pair classifier and save it under ``MODEL_ROOT``.

    Reads the ``train`` and ``val`` splits for ``pair_type``, fine-tunes
    ``model_name`` with the Hugging Face ``Trainer`` (evaluating and
    checkpointing once per epoch, selecting on validation F1), then writes the
    model and tokenizer to ``MODEL_ROOT / save_name``.

    Args:
        pair_type: Dataset sub-directory passed to ``load_split``.
        field1: Column holding the first text of each pair.
        field2: Column holding the second text of each pair.
        save_name: Directory name (under ``MODEL_ROOT``) for the saved model.
        model_name: Pretrained checkpoint to start from.
        max_length: Tokenizer truncation/padding length.
        epochs: Number of training epochs.
        batch_size: Per-device batch size for training and evaluation.
        learning_rate: Optimizer learning rate.

    Returns:
        The save directory as a string.

    Raises:
        FileNotFoundError: Propagated from ``load_split`` when a split file is missing.
    """
    train_frame = load_split(pair_type, "train")
    val_frame = load_split(pair_type, "val")

    MODEL_ROOT.mkdir(parents=True, exist_ok=True)
    out_dir = MODEL_ROOT / save_name

    tok = AutoTokenizer.from_pretrained(model_name)
    classifier = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Both splits share the same tokenizer, text columns and sequence length.
    train_set, val_set = (
        PairDataset(frame, tok, field1, field2, max_length)
        for frame in (train_frame, val_frame)
    )

    hyperparams = dict(
        # Intermediate checkpoints live in a sub-folder of the final save directory.
        output_dir=str(out_dir / "checkpoints"),
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01,
        # Model selection is driven by validation F1 (higher is better).
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        save_total_limit=1,
        # Mixed precision is requested only when a CUDA device is visible.
        fp16=torch.cuda.is_available(),
        report_to="none",
        logging_steps=50,
    )

    trainer = Trainer(
        model=classifier,
        args=TrainingArguments(**hyperparams),
        train_dataset=train_set,
        eval_dataset=val_set,
        tokenizer=tok,
        compute_metrics=compute_metrics,
    )
    trainer.train()

    # Persist the trainer's current model together with its tokenizer.
    saved_to = str(out_dir)
    trainer.save_model(saved_to)
    tok.save_pretrained(saved_to)
    return saved_to

In [91]:
def evaluate_deberta(
    model_path: Path,
    test_df: pd.DataFrame,
    field1: str,
    field2: str,
    max_length: int,
    batch_size: int,
    model_name: str = DEFAULT_MODEL,
) -> Dict[str, float]:
    """Evaluate a saved pair classifier on a test DataFrame.

    Args:
        model_path: Directory containing the saved model and its config.
        test_df: Rows to evaluate; must contain ``field1``, ``field2`` and ``label``.
        field1: Column holding the first text of each pair.
        field2: Column holding the second text of each pair.
        max_length: Tokenizer truncation/padding length.
        batch_size: Per-device evaluation batch size.
        model_name: Checkpoint name the tokenizer is loaded from.

    Returns:
        A dict with ``test_loss``, ``test_accuracy`` and ``test_f1``; any
        metric the Trainer did not report is ``nan``.
    """
    model_dir = str(model_path)
    saved_config = AutoConfig.from_pretrained(model_dir)
    classifier = AutoModelForSequenceClassification.from_pretrained(model_dir, config=saved_config)

    # NOTE(review): the tokenizer is loaded from `model_name`, not from
    # `model_path`, even though training also saves a tokenizer next to the
    # model. Callers must pass the same model_name that was used for training;
    # confirm whether this is deliberate before switching to model_path.
    tok = AutoTokenizer.from_pretrained(model_name)

    test_set = PairDataset(test_df, tok, field1, field2, max_length)

    # Evaluation-only arguments; Trainer output goes to an "eval" sub-folder
    # of the model directory.
    run_args = TrainingArguments(
        output_dir=str(model_path / "eval"),
        per_device_eval_batch_size=batch_size,
        report_to="none",
        fp16=torch.cuda.is_available(),
    )

    evaluator = Trainer(
        model=classifier,
        args=run_args,
        eval_dataset=test_set,
        tokenizer=tok,
        compute_metrics=compute_metrics,
    )
    raw = evaluator.evaluate()

    # Rename the Trainer's eval_* keys to test_*; absent metrics become NaN.
    missing = float("nan")
    renames = (
        ("test_loss", "eval_loss"),
        ("test_accuracy", "eval_accuracy"),
        ("test_f1", "eval_f1"),
    )
    return {public: raw.get(internal, missing) for public, internal in renames}

In [92]:
def run_deberta_pipeline(
    pair_type: str,
    field1: str,
    field2: str,
    save_name: str,
    model_name: str = DEFAULT_MODEL,
    max_length: int = DEFAULT_MAX_LENGTH,
    epochs: int = DEFAULT_EPOCHS,
    batch_size: int = DEFAULT_BATCH_SIZE,
    train_first: bool = True,
) -> Dict[str, float]:
    """Optionally train, then evaluate a pair classifier on the test split.

    Args:
        pair_type: Dataset sub-directory passed to ``load_split``.
        field1: Column holding the first text of each pair.
        field2: Column holding the second text of each pair.
        save_name: Model directory name under ``MODEL_ROOT``.
        model_name: Checkpoint name forwarded to training and evaluation.
        max_length: Tokenizer truncation/padding length.
        epochs: Training epochs (only used when ``train_first`` is true).
        batch_size: Per-device batch size for training and evaluation.
        train_first: When true, fine-tune before evaluating; otherwise load
            the existing model at ``MODEL_ROOT / save_name``.

    Returns:
        The metrics dict produced by ``evaluate_deberta``.

    Raises:
        FileNotFoundError: If ``train_first`` is false and the model directory
            does not exist, or if a required split file is missing.
    """
    if not train_first:
        # Reuse a previously saved model; fail early if it is not there.
        model_path = MODEL_ROOT / save_name
        if not model_path.exists():
            raise FileNotFoundError(f"Model not found: {model_path}")
        print(f"Skipping training, loading DeBERTa from {model_path}")
    else:
        print(f"Training DeBERTa on {pair_type} -> {save_name}")
        # learning_rate is not forwarded, so train_deberta's default applies.
        trained_dir = train_deberta(
            pair_type=pair_type,
            field1=field1,
            field2=field2,
            save_name=save_name,
            model_name=model_name,
            max_length=max_length,
            epochs=epochs,
            batch_size=batch_size,
        )
        model_path = Path(trained_dir)

    held_out = load_split(pair_type, "test")
    metrics = evaluate_deberta(
        model_path=model_path,
        test_df=held_out,
        field1=field1,
        field2=field2,
        max_length=max_length,
        batch_size=batch_size,
        model_name=model_name,
    )
    print(f"Test metrics for {pair_type}: acc={metrics['test_accuracy']:.4f}, f1={metrics['test_f1']:.4f}")
    return metrics

In [93]:
# Evaluate the title-title model on its test split. train_first=False skips
# training and loads the saved model from MODEL_ROOT / save_name, so `epochs`
# has no effect on this run.
run_deberta_pipeline(
    pair_type="title-title-pair",
    field1="title1",
    field2="title2",
    save_name="deberta-title-v3-base",
    max_length=128,
    epochs=1,
    batch_size=32,
    train_first=False,
)

Skipping training, loading DeBERTa from /content/drive/MyDrive/MIDS/266/FinalProject/Model/deberta-title-v3-base


  trainer = Trainer(


{'eval_loss': 0.0769580826163292, 'eval_model_preparation_time': 0.0029, 'eval_accuracy': 0.9752430439155213, 'eval_f1': 0.9752617033749267, 'eval_runtime': 95.34, 'eval_samples_per_second': 625.76, 'eval_steps_per_second': 19.562}
Test metrics for title-title-pair: acc=0.9752, f1=0.9753


{'test_loss': 0.0769580826163292,
 'test_accuracy': 0.9752430439155213,
 'test_f1': 0.9752617033749267}

In [94]:
# Evaluate the body-body model on its test split. train_first=False skips
# training and loads the saved model from MODEL_ROOT / save_name, so `epochs`
# has no effect on this run.
run_deberta_pipeline(
    pair_type="body-body-pair",
    field1="body1",
    field2="body2",
    save_name="deberta-body-v3-base",
    max_length=256,
    epochs=1,
    batch_size=12,
    train_first=False,
)

Skipping training, loading DeBERTa from /content/drive/MyDrive/MIDS/266/FinalProject/Model/deberta-body-v3-base


  trainer = Trainer(


{'eval_loss': 0.0740351751446724, 'eval_model_preparation_time': 0.003, 'eval_accuracy': 0.9820656783939896, 'eval_f1': 0.9821313586606568, 'eval_runtime': 243.0305, 'eval_samples_per_second': 203.736, 'eval_steps_per_second': 16.981}
Test metrics for body-body-pair: acc=0.9821, f1=0.9821


{'test_loss': 0.0740351751446724,
 'test_accuracy': 0.9820656783939896,
 'test_f1': 0.9821313586606568}

In [95]:
# Evaluate the post-post model on its test split. train_first=False skips
# training and loads the saved model from MODEL_ROOT / save_name, so `epochs`
# has no effect on this run.
run_deberta_pipeline(
    pair_type="post-post-pair",
    field1="post1",
    field2="post2",
    save_name="deberta-post-v3-base",
    max_length=256,
    epochs=1,
    batch_size=12,
    train_first=False,
)

Skipping training, loading DeBERTa from /content/drive/MyDrive/MIDS/266/FinalProject/Model/deberta-post-v3-base


  trainer = Trainer(


{'eval_loss': 0.038536615669727325, 'eval_model_preparation_time': 0.0029, 'eval_accuracy': 0.9912482726853984, 'eval_f1': 0.9912428944538332, 'eval_runtime': 267.0531, 'eval_samples_per_second': 195.107, 'eval_steps_per_second': 16.259}
Test metrics for post-post-pair: acc=0.9912, f1=0.9912


{'test_loss': 0.038536615669727325,
 'test_accuracy': 0.9912482726853984,
 'test_f1': 0.9912428944538332}