# BERT-base

In [None]:
!pip install transformers datasets accelerate -q
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import torch
from torch.utils.data import Dataset
import os
os.environ["WANDB_DISABLED"] = "true"

# ----------------------------
# Dataset class and tokenization
# ----------------------------
class PairDataset(Dataset):
    """Map-style dataset that tokenizes one text pair per DataFrame row on demand.

    For each row, the values in ``field1`` and ``field2`` are passed to the
    tokenizer as a sequence pair, truncated and padded to ``max_length``, and
    the row's ``"label"`` column becomes the classification target.

    Args:
        df: DataFrame with the columns ``field1``, ``field2`` and ``"label"``.
        tokenizer: Callable invoked as ``tokenizer(text1, text2, ...)`` with
            ``return_tensors="pt"``; its tensors are expected to carry a
            leading batch axis of size 1 (one pair per call).
        field1: Column name of the first text of the pair.
        field2: Column name of the second text of the pair.
        max_length: Length every encoded pair is truncated/padded to.
    """

    def __init__(self, df, tokenizer, field1, field2, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.field1 = field1
        self.field2 = field2
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (pairs) in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize the pair at positional index ``idx``.

        Returns:
            A dict with ``input_ids``, ``attention_mask`` and ``labels``, plus
            ``token_type_ids`` whenever the tokenizer produces them.
        """
        row = self.df.iloc[idx]
        encoded = self.tokenizer(
            row[self.field1],
            row[self.field2],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        item = {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": torch.tensor(row["label"], dtype=torch.long),
        }
        # Keep the segment ids so the model can tell the first text from the
        # second; without them every token is treated as segment 0.
        # NOTE(review): models fine-tuned before this change never saw segment
        # ids, so re-evaluate or retrain them rather than comparing scores directly.
        if "token_type_ids" in encoded:
            item["token_type_ids"] = encoded["token_type_ids"].squeeze(0)
        return item


# ----------------------------
# Metric helper function
# ----------------------------
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and F1 scores.

    The predicted class of each example is the index of its largest logit
    along the last axis.

    Args:
        eval_pred: Two-element sequence ``(logits, labels)``.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predicted)
    f1 = f1_score(labels, predicted)
    return dict(accuracy=accuracy, f1=f1)


# ----------------------------
# train_and_save helper function
# ----------------------------
def train_and_save_bert(train_df, val_df, field1, field2, save_name,
                        max_length=128, epochs=1, batch=16):
    """Fine-tune ``bert-base-uncased`` as a 2-label pair classifier and save it.

    The model is evaluated and checkpointed once per epoch; the checkpoint
    with the best validation F1 is reloaded at the end and written, together
    with the tokenizer, to the project's model directory on Google Drive.

    Args:
        train_df: Training rows (``field1``, ``field2`` and ``"label"`` columns).
        val_df: Validation rows with the same columns.
        field1: Column name of the first text of each pair.
        field2: Column name of the second text of each pair.
        save_name: Sub-directory name the final model is saved under.
        max_length: Token length every pair is truncated/padded to.
        epochs: Number of training epochs.
        batch: Per-device batch size for both training and evaluation.

    Returns:
        The directory the model and tokenizer were saved to.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    train_dataset = PairDataset(train_df, tokenizer, field1, field2, max_length)
    val_dataset   = PairDataset(val_df, tokenizer, field1, field2, max_length)

    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=2
    )

    training_args = TrainingArguments(
        output_dir="./bert-base-checkpoints",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch,
        per_device_eval_batch_size=batch,
        num_train_epochs=epochs,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        # Mixed precision needs a CUDA device; fall back to fp32 elsewhere
        # instead of failing when the arguments are validated.
        fp16=torch.cuda.is_available(),
        report_to="none"
    )

    # NOTE(review): recent transformers releases appear to deprecate
    # `tokenizer=` in favor of `processing_class=`; confirm against the
    # installed version before switching.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # load_best_model_at_end=True means the trainer now holds the best-F1
    # checkpoint, so this saves the best model rather than the last one.
    save_path = f"/content/drive/MyDrive/266NoteBooks/FinalProject/Model/{save_name}"
    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)

    print("Model saved to:", save_path)

    return save_path

# ----------------------------
# load_and_eval helper function
# ----------------------------
def load_and_eval_bert(model_path, test_df, field1, field2, max_length=128,
                       batch=8):
    """Load a saved classifier and report loss, accuracy and F1 on ``test_df``.

    Args:
        model_path: Directory containing the saved model and tokenizer.
        test_df: Test rows (``field1``, ``field2`` and ``"label"`` columns).
        field1: Column name of the first text of each pair.
        field2: Column name of the second text of each pair.
        max_length: Token length every pair is truncated/padded to.
        batch: Per-device evaluation batch size. The default of 8 matches what
            the Trainer used when no arguments were supplied.

    Returns:
        Dict with the keys ``"test_loss"``, ``"test_accuracy"`` and ``"test_f1"``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    test_dataset = PairDataset(test_df, tokenizer, field1, field2, max_length)

    # Explicit arguments instead of the Trainer's implicit defaults:
    # report_to="none" switches off logging integrations without relying on
    # the deprecated WANDB_DISABLED environment variable.
    eval_args = TrainingArguments(
        output_dir="./bert-base-eval",
        per_device_eval_batch_size=batch,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=eval_args,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    raw = trainer.evaluate(test_dataset)

    # Keep only the quality metrics and rename the "eval_" prefix to "test_".
    metrics = {
        "test_loss": raw["eval_loss"],
        "test_accuracy": raw["eval_accuracy"],
        "test_f1": raw["eval_f1"],
    }

    print("\nTest Metrics:", metrics)
    return metrics


# ----------------------------
# Run pipeline (train-save-load-eval OR load-eval)
# ----------------------------
def run_bert_pipeline(
        pair_type, field1, field2, save_name,
        max_length=128, epochs=1, batch=16,
        train_first=True):  # True: train and save first; False: load the saved model directly
    """Run the BERT experiment for one pair type: optionally train, then evaluate.

    Args:
        pair_type: Dataset sub-directory under the data root (holds
            ``train.jsonl``, ``val.jsonl`` and ``test.jsonl``).
        field1: Column name of the first text of each pair.
        field2: Column name of the second text of each pair.
        save_name: Sub-directory name of the model under the model root.
        max_length: Token length every pair is truncated/padded to.
        epochs: Number of training epochs (used only when training).
        batch: Per-device training batch size (used only when training).
        train_first: When true, fine-tune and save the model before
            evaluating; when false, evaluate the previously saved model.

    Returns:
        The metrics dict produced by ``load_and_eval_bert``.

    Raises:
        FileNotFoundError: If ``train_first`` is false and no saved model
            directory exists for ``save_name``.
    """
    data_root = '/content/drive/MyDrive/266NoteBooks/FinalProject/Data/'
    model_root = "/content/drive/MyDrive/266NoteBooks/FinalProject/Model"
    model_path = f"{model_root}/{save_name}"

    # Fail before any data is loaded if there is nothing to evaluate.
    if not train_first and not os.path.isdir(model_path):
        raise FileNotFoundError(f"No saved model found at: {model_path}")

    def load_split(split):
        # One JSON-lines file per split inside the pair-type directory.
        return pd.read_json(f"{data_root}{pair_type}/{split}.jsonl", lines=True)

    # ----------------------------
    # Load Data
    # ----------------------------
    # The train/val splits are only needed for training, so skip reading them
    # when an existing model is merely being evaluated.
    train_df = val_df = None
    if train_first:
        train_df = load_split("train")
        val_df = load_split("val")
    test_df = load_split("test")

    print(f"\n Loaded dataset: {pair_type}")
    if train_first:
        print("Train:", train_df.shape, "Val:", val_df.shape, "Test:", test_df.shape)
    else:
        print("Test:", test_df.shape)

    # ----------------------------
    # Option 1: Train + Save
    # ----------------------------
    if train_first:
        print(f"\n Training & Saving Model: {save_name}")
        model_path = train_and_save_bert(
            train_df=train_df,
            val_df=val_df,
            field1=field1,
            field2=field2,
            save_name=save_name,
            max_length=max_length,
            epochs=epochs,
            batch=batch
        )
    else:
        print(f"\n Skipping training. Loading existing model: {model_path}")

    # ----------------------------
    # Load & Evaluate
    # ----------------------------
    print(f"\n Evaluating model: {save_name}")
    metrics = load_and_eval_bert(
        model_path=model_path,
        test_df=test_df,
        field1=field1,
        field2=field2,
        max_length=max_length
    )

    print(f"\n Finished: {save_name}")
    return metrics



In [None]:
# Check the max token length of title, body, and post to decide max_length
# The max token limit for BERT is 512 (ModernBERT can take 4k tokens; will try that later).
def get_length_stats(df, field1, field2, tokenizer=None):
    """Summarize the token length of every ``(field1, field2)`` pair in ``df``.

    Each pair is encoded as a single sequence pair including special tokens,
    without truncation, so the lengths show how much a given ``max_length``
    would cut off.

    Args:
        df: DataFrame containing the two text columns.
        field1: Column name of the first text of each pair.
        field2: Column name of the second text of each pair.
        tokenizer: Optional object exposing
            ``encode(text1, text2, add_special_tokens=True)``. When omitted,
            the ``bert-base-uncased`` tokenizer is loaded; pass one in to
            avoid reloading it on every call.

    Returns:
        Dict with ``"mean"``, ``"median"`` and ``"95th_percentile"`` as floats
        and ``"max"`` and ``"min"`` as ints.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError("get_length_stats needs at least one row")

    if tokenizer is None:
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Walk the two columns directly; this avoids building a Series per row.
    lengths = np.array([
        len(tokenizer.encode(str(text1), str(text2), add_special_tokens=True))
        for text1, text2 in zip(df[field1], df[field2])
    ])
    return {
        "mean": float(np.mean(lengths)),
        "median": float(np.median(lengths)),
        "95th_percentile": float(np.percentile(lengths, 95)),
        "max": int(np.max(lengths)),
        "min": int(np.min(lengths)),
    }
MydriveRootPath = '/content/drive/MyDrive/266NoteBooks/FinalProject/Data/'

# Read the training split of each pair type.
train_title = pd.read_json(f"{MydriveRootPath}title-title-pair/train.jsonl", lines=True)
train_body = pd.read_json(f"{MydriveRootPath}body-body-pair/train.jsonl", lines=True)
train_post = pd.read_json(f"{MydriveRootPath}post-post-pair/train.jsonl", lines=True)

# Token-length statistics per pair type, used to choose max_length.
stats_title = get_length_stats(train_title, "title1", "title2")
stats_body = get_length_stats(train_body, "body1", "body2")
stats_post = get_length_stats(train_post, "post1", "post2")

# One stats dict per line: title, body, post.
print(stats_title, stats_body, stats_post, sep="\n")

Token indices sequence length is longer than the specified maximum sequence length for this model (1388 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (568 > 512). Running this sequence through the model will result in indexing errors


{'mean': 30.999358712523488, 'median': 28.0, '95th_percentile': 57.0, 'max': 216, 'min': 7}
{'mean': 344.0520482478365, 'median': 270.0, '95th_percentile': 854.0, 'max': 3307, 'min': 20}
{'mean': 372.38553132675077, 'median': 298.0, '95th_percentile': 888.0, 'max': 3730, 'min': 38}


In [None]:
# Train and evaluate BERT on the title pairs. The 95th-percentile pair length
# reported above is 57 tokens, so max_length=64 covers most inputs.
run_bert_pipeline(
    pair_type="title-title-pair",
    field1="title1",
    field2="title2",
    save_name="bert-base-title-v1",
    max_length=64,
    epochs=1,
    batch=32,
    train_first=True,
)



 Loaded dataset: title-title-pair
Train: (489640, 3) Val: (58976, 3) Test: (59660, 3)

 Training & Saving Model → bert-base-title-v1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1088,0.130025,0.954236,0.953749


  trainer = Trainer(
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Model saved to: /content/drive/MyDrive/266NoteBooks/FinalProject/Model/bert-base-title-v1

 Evaluating model: bert-base-title-v1


Test Evaluation: {'eval_loss': 0.119511678814888, 'eval_model_preparation_time': 0.0027, 'eval_accuracy': 0.9571237009721757, 'eval_f1': 0.9567057071288335, 'eval_runtime': 126.7469, 'eval_samples_per_second': 470.702, 'eval_steps_per_second': 58.842}

 Finished: bert-base-title-v1
{'eval_loss': 0.119511678814888, 'eval_model_preparation_time': 0.0027, 'eval_accuracy': 0.9571237009721757, 'eval_f1': 0.9567057071288335, 'eval_runtime': 126.7469, 'eval_samples_per_second': 470.702, 'eval_steps_per_second': 58.842}


{'eval_loss': 0.119511678814888,
 'eval_model_preparation_time': 0.0027,
 'eval_accuracy': 0.9571237009721757,
 'eval_f1': 0.9567057071288335,
 'eval_runtime': 126.7469,
 'eval_samples_per_second': 470.702,
 'eval_steps_per_second': 58.842}

In [None]:
# Train and evaluate BERT on the body pairs. max_length is set to BERT's
# 512-token limit, so longer pairs are truncated.
run_bert_pipeline(
    pair_type="body-body-pair",
    field1="body1",
    field2="body2",
    save_name="bert-base-body-v1",
    max_length=512,
    epochs=1,
    batch=12,
    train_first=True,
)


 Loaded dataset: body-body-pair
Train: (402588, 3) Val: (48390, 3) Test: (49514, 3)

 Training & Saving Model → bert-base-body-v1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0644,0.096891,0.97609,0.976033


Model saved to: /content/drive/MyDrive/266NoteBooks/FinalProject/Model/bert-base-body-v1

 Evaluating model: bert-base-body-v1


  trainer = Trainer(
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Test Evaluation: {'eval_loss': 0.08460821956396103, 'eval_model_preparation_time': 0.0029, 'eval_accuracy': 0.9786525023225754, 'eval_f1': 0.978651208822282, 'eval_runtime': 428.8374, 'eval_samples_per_second': 115.461, 'eval_steps_per_second': 14.434}

 Finished: bert-base-body-v1
{'eval_loss': 0.08460821956396103, 'eval_model_preparation_time': 0.0029, 'eval_accuracy': 0.9786525023225754, 'eval_f1': 0.978651208822282, 'eval_runtime': 428.8374, 'eval_samples_per_second': 115.461, 'eval_steps_per_second': 14.434}


{'eval_loss': 0.08460821956396103,
 'eval_model_preparation_time': 0.0029,
 'eval_accuracy': 0.9786525023225754,
 'eval_f1': 0.978651208822282,
 'eval_runtime': 428.8374,
 'eval_samples_per_second': 115.461,
 'eval_steps_per_second': 14.434}

In [None]:
# Train and evaluate BERT on the post pairs. max_length is set to BERT's
# 512-token limit, so longer pairs are truncated.
run_bert_pipeline(
    pair_type="post-post-pair",
    field1="post1",
    field2="post2",
    save_name="bert-base-post-v1",
    max_length=512,
    epochs=1,
    batch=8,
    train_first=True,
)


 Loaded dataset: post-post-pair
Train: (402276, 3) Val: (46504, 3) Test: (52104, 3)

 Training & Saving Model → bert-base-post-v1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0548,0.059949,0.98684,0.98681


  trainer = Trainer(
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Model saved to: /content/drive/MyDrive/266NoteBooks/FinalProject/Model/bert-base-post-v1

 Evaluating model: bert-base-post-v1


Test Evaluation: {'eval_loss': 0.06528539955615997, 'eval_model_preparation_time': 0.0028, 'eval_accuracy': 0.9859319821894672, 'eval_f1': 0.9859062854505951, 'eval_runtime': 455.476, 'eval_samples_per_second': 114.395, 'eval_steps_per_second': 14.299}

 Finished: bert-base-post-v1
{'eval_loss': 0.06528539955615997, 'eval_model_preparation_time': 0.0028, 'eval_accuracy': 0.9859319821894672, 'eval_f1': 0.9859062854505951, 'eval_runtime': 455.476, 'eval_samples_per_second': 114.395, 'eval_steps_per_second': 14.299}


{'eval_loss': 0.06528539955615997,
 'eval_model_preparation_time': 0.0028,
 'eval_accuracy': 0.9859319821894672,
 'eval_f1': 0.9859062854505951,
 'eval_runtime': 455.476,
 'eval_samples_per_second': 114.395,
 'eval_steps_per_second': 14.299}