In [1]:
!pip install transformers datasets evaluate

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6

Imports

In [2]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import load_dataset
import numpy as np
import evaluate
from types import SimpleNamespace
import os
os.environ["WANDB_MODE"] = "disabled"

Tokenize MRPC with Truncation (optionally limit dataset size - max_samples)

In [3]:
def preprocess_data(dataset, tokenizer, max_samples=-1):
    """Tokenize the sentence pairs of a split, optionally on a leading subset.

    Args:
        dataset: Split exposing ``select``, ``map`` and ``len()`` whose rows
            carry ``sentence1`` and ``sentence2`` fields.
        tokenizer: Callable invoked as
            ``tokenizer(sentence1, sentence2, truncation=True)``.
        max_samples: Upper bound on the number of leading examples to keep.
            ``None`` or any negative value (``-1`` by default) keeps the whole
            split; a value larger than the split is clamped to its size.

    Returns:
        Whatever ``dataset.map`` returns after applying the tokenizer in
        batched mode.
    """
    def tokenize(batch):
        # With batched=True the mapper receives columns of sentences at once.
        return tokenizer(batch["sentence1"], batch["sentence2"], truncation=True)

    if max_samples is not None and max_samples >= 0:
        # Clamp the limit so an oversized request keeps the full split
        # instead of asking ``select`` for indices that do not exist.
        keep = min(max_samples, len(dataset))
        dataset = dataset.select(range(keep))

    return dataset.map(tokenize, batched=True)

GLUE MRPC scorer (evaluates accuracy & F1)

In [4]:
# Lazily loaded GLUE/MRPC scorer, shared by every call to compute_metrics.
_MRPC_METRIC = None


def compute_metrics(eval_preds):
    """Score predictions with the GLUE MRPC metric.

    Args:
        eval_preds: Pair ``(logits, labels)``. ``logits`` is an array whose
            last axis indexes the classes; if it is a tuple, its first
            element is used as the logits.

    Returns:
        The result of the metric's ``compute`` for the arg-max predictions
        against ``labels``.
    """
    global _MRPC_METRIC
    if _MRPC_METRIC is None:
        # Load once: this function runs on every evaluation pass, and
        # re-loading the metric each time is wasted work.
        _MRPC_METRIC = evaluate.load("glue", "mrpc")

    logits, labels = eval_preds
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    return _MRPC_METRIC.compute(predictions=predictions, references=labels)

Calculate Accuracy

In [5]:
def calculate_accuracy(preds, labels):
    """Return the fraction of predictions that equal their reference label.

    Args:
        preds: Sized sequence of predicted labels.
        labels: Sized sequence of reference labels, aligned with ``preds``.

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` when there are no labels.

    Raises:
        ValueError: If ``preds`` and ``labels`` differ in length. Pairing them
            anyway would silently drop the surplus items and skew the score.
    """
    total = len(labels)
    if len(preds) != total:
        raise ValueError(
            f"preds and labels must have the same length "
            f"(got {len(preds)} and {total})"
        )
    if total == 0:
        return 0.0

    correct = sum(1 for p, l in zip(preds, labels) if p == l)
    return correct / total

Run the Model with specified hyperparams

In [8]:
def _print_mismatches(model_name, dataset, preds, labels, limit=5):
    """Print how many predictions miss their label and show the first ``limit`` of them."""
    mismatches = [(i, p, l) for i, (p, l) in enumerate(zip(preds, labels)) if p != l]
    print(f"\nMismatches for {model_name}: {len(mismatches)} out of {len(labels)}\n")
    for i, pred, label in mismatches[:limit]:
        example = dataset[i]
        print(f"Example {i}:")
        print(f"  Sentence 1: {example['sentence1']}")
        print(f"  Sentence 2: {example['sentence2']}")
        print(f"  True Label: {label}")
        print(f"  Predicted Label: {pred}\n")


def run_model(args, model_name):
    """Fine-tune a sequence classifier on GLUE MRPC and report its accuracy.

    Loads the dataset, tokenizes the train/validation/test splits, trains
    with the given hyperparameters, prints validation and test accuracy, and
    prints a few misclassified validation examples.

    Args:
        args: Namespace providing ``max_train_samples``, ``max_eval_samples``,
            ``max_predict_samples``, ``num_train_epochs``, ``lr``,
            ``batch_size`` and ``model_path`` (falls back to
            ``"bert-base-uncased"`` when empty).
        model_name: Label used in the printed report.

    Returns:
        dict with ``eval_accuracy`` (from ``trainer.evaluate``),
        ``proxy_test_accuracy`` (validation split, via ``trainer.predict``)
        and ``test_accuracy`` (test split), each a fraction in ``[0, 1]``.
    """
    print(f"Running {model_name}...")
    checkpoint = args.model_path if args.model_path else "bert-base-uncased"

    # Load dataset
    raw_datasets = load_dataset("glue", "mrpc")
    # Tokenizer and model must come from the same checkpoint so the
    # vocabulary matches the model's embeddings.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # Tokenize
    tokenized_train = preprocess_data(raw_datasets["train"], tokenizer, args.max_train_samples)
    tokenized_eval = preprocess_data(raw_datasets["validation"], tokenizer, args.max_eval_samples)
    tokenized_test = preprocess_data(raw_datasets["test"], tokenizer, args.max_predict_samples)

    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

    # Training
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=args.lr,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.num_train_epochs,
        eval_strategy="epoch",
        logging_strategy="steps",
        logging_steps=10,
        save_strategy="no",
        load_best_model_at_end=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        # ``processing_class`` is the replacement for the deprecated
        # ``tokenizer`` argument of Trainer.
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer)
    )

    trainer.train()
    eval_result = trainer.evaluate()
    print(f"Validation accuracy for {model_name}: {eval_result['eval_accuracy'] * 100:.2f}%")

    # Prediction on validation set. Labels are read from the (possibly
    # truncated) tokenized split so they always line up with the predictions.
    predictions_eval = trainer.predict(tokenized_eval)
    preds_eval = np.argmax(predictions_eval.predictions, axis=-1)
    labels_eval = tokenized_eval["label"]
    accuracy_eval = calculate_accuracy(preds_eval, labels_eval)
    print(f"Proxy Test (Validation) accuracy for {model_name}: {accuracy_eval * 100:.2f}%")

    # Prediction on the actual test set
    predictions_test = trainer.predict(tokenized_test)
    preds_test = np.argmax(predictions_test.predictions, axis=-1)
    labels_test = tokenized_test["label"]
    accuracy_test = calculate_accuracy(preds_test, labels_test)
    print(f"Real Test accuracy for {model_name}: {accuracy_test * 100:.2f}%\n")

    # Qualitative Analysis: Identify mismatches
    _print_mismatches(model_name, tokenized_eval, preds_eval, labels_eval)

    return {
        "eval_accuracy": eval_result["eval_accuracy"],
        "proxy_test_accuracy": accuracy_eval,
        "test_accuracy": accuracy_test,
    }

Main

In [9]:
def main():
    """Train and evaluate three preset hyperparameter configurations in turn."""
    # Settings every run has in common; -1 leaves a split at its full size.
    shared = dict(
        max_train_samples=-1,
        max_eval_samples=-1,
        max_predict_samples=-1,
        do_train=True,
        do_predict=True,
        model_path="bert-base-uncased",
    )

    # (report label, per-run hyperparameters), listed in execution order.
    experiments = [
        # Configuration 1: Small LR - Low Accuracy
        (
            "Model 1 (Small LR - Low Acc)",
            dict(num_train_epochs=3, lr=1e-6, batch_size=32),
        ),
        # Configuration 2: High LR & Few Epochs - Medium Accuracy
        (
            "Model 2 (High LR & Few Epochs - Medium Acc)",
            dict(num_train_epochs=1, lr=5e-5, batch_size=32),
        ),
        # Configuration 3: Good Configuration - High Accuracy
        (
            "Model 3 (Good Configuration - High Acc)",
            dict(num_train_epochs=3, lr=2e-5, batch_size=16),
        ),
    ]

    # Run all configurations
    for label, hyperparams in experiments:
        run_model(SimpleNamespace(**shared, **hyperparams), label)

# Entry point: executing this cell runs every configuration defined in main().
main()

Running Model 1 (Small LR - Low Acc)...


Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6062,0.61635,0.683824,0.812227
2,0.6421,0.610336,0.683824,0.812227
3,0.622,0.607445,0.683824,0.812227


Validation accuracy for Model 1 (Small LR - Low Acc): 68.38%
Proxy Test (Validation) accuracy for Model 1 (Small LR - Low Acc): 68.38%


Real Test accuracy for Model 1 (Small LR - Low Acc): 66.49%


Mismatches for Model 1 (Small LR - Low Acc): 129 out of 408

Example 1:
  Sentence 1: Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .
  Sentence 2: His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
  True Label: 0
  Predicted Label: 1

Example 2:
  Sentence 1: The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .
  Sentence 2: The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
  True Label: 0
  Predicted Label: 1

Example 4:
  Sentence 1: No dates have been set for the civil or the criminal trial .
  Sentence 2: No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
  True Label: 0
  Predicted Label: 1

Ex

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4663,0.453073,0.803922,0.87013


Validation accuracy for Model 2 (High LR & Few Epochs - Medium Acc): 80.39%
Proxy Test (Validation) accuracy for Model 2 (High LR & Few Epochs - Medium Acc): 80.39%


Real Test accuracy for Model 2 (High LR & Few Epochs - Medium Acc): 79.65%


Mismatches for Model 2 (High LR & Few Epochs - Medium Acc): 80 out of 408

Example 2:
  Sentence 1: The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .
  Sentence 2: The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
  True Label: 0
  Predicted Label: 1

Example 6:
  Sentence 1: While dioxin levels in the environment were up last year , they have dropped by 75 percent since the 1970s , said Caswell .
  Sentence 2: The Institute said dioxin levels in the environment have fallen by as much as 76 percent since the 1970s .
  True Label: 0
  Predicted Label: 1

Example 11:
  Sentence 1: " Sanitation is poor ... there could be typhoid and cholera , " he said .
  Sentence 2: " Sanitation is poor , drinking water is generally left behind . . . there could be typhoid an

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4404,0.427266,0.823529,0.868613
2,0.2582,0.380771,0.862745,0.90378
3,0.1653,0.432356,0.860294,0.90087


Validation accuracy for Model 3 (Good Configuration - High Acc): 86.03%
Proxy Test (Validation) accuracy for Model 3 (Good Configuration - High Acc): 86.03%


Real Test accuracy for Model 3 (Good Configuration - High Acc): 83.42%


Mismatches for Model 3 (Good Configuration - High Acc): 57 out of 408

Example 6:
  Sentence 1: While dioxin levels in the environment were up last year , they have dropped by 75 percent since the 1970s , said Caswell .
  Sentence 2: The Institute said dioxin levels in the environment have fallen by as much as 76 percent since the 1970s .
  True Label: 0
  Predicted Label: 1

Example 35:
  Sentence 1: Bush wanted " to see an aircraft landing the same way that the pilots saw an aircraft landing , " White House press secretary Ari Fleischer said yesterday .
  Sentence 2: On Tuesday , before Byrd 's speech , Fleischer said Bush wanted ' ' to see an aircraft landing the same way that the pilots saw an aircraft landing .
  True Label: 0
  Predicted Label: 1

Example 60:
  Sentence 1: Terri Schiavo , 39 , is expected to die sometime in the next two weeks in the Tampa-area hospice where she has spent the past several yea