In [1]:
!pip -q install transformers datasets torch scikit-learn tqdm


In [2]:
import random
import numpy as np
import torch
import torch.nn as nn

from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader

from transformers import BertTokenizer, BertModel
from sklearn.metrics import classification_report, precision_recall_fscore_support


In [3]:
# One seed for every random number generator the notebook uses (Python, NumPy, torch CPU and CUDA).
SEED = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(SEED)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)


Device: cpu


In [4]:
# Number of leading examples kept from each split.
N_TRAIN_EXAMPLES = 500
N_TEST_EXAMPLES = 5  # tiny for quick runs per assignment; one test example is worth 0.2 accuracy

dataset = load_dataset("amazon_polarity")

# select(range(k)) keeps the first k rows of the split, in their stored order.
train_dataset = dataset["train"].select(range(N_TRAIN_EXAMPLES))
test_dataset = dataset["test"].select(range(N_TEST_EXAMPLES))


In [5]:
# Fixed sequence length used for truncation and padding wherever the tokenizer is called.
MAX_LEN = 128  # you can tune this later

# Tokenizer for the same "bert-base-uncased" checkpoint the classifiers load by default.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    """Tokenize one amazon_polarity record.

    The record's "title" and "content" strings are joined with a single space
    (then stripped) so the model sees both fields, and the text is truncated or
    padded to exactly MAX_LEN tokens.

    Args:
        example: mapping with string fields "title" and "content"
            (the dataset also provides "label", which is not read here).

    Returns:
        The tokenizer's output for the combined text.
    """
    review_text = " ".join((example["title"], example["content"])).strip()
    encoding_options = dict(truncation=True, padding="max_length", max_length=MAX_LEN)
    return tokenizer(review_text, **encoding_options)

# Apply tokenize_function to every example of both splits (train first, then test).
train_dataset = train_dataset.map(function=tokenize_function)
test_dataset = test_dataset.map(function=tokenize_function)


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [6]:
# Only the model inputs and the label are exposed, as torch tensors, when examples are read.
columns = ["input_ids", "attention_mask", "token_type_ids", "label"]
for split in (train_dataset, test_dataset):
    split.set_format(type="torch", columns=columns)


In [7]:
BATCH_SIZE = 16  # tune later

# Training batches are reshuffled; evaluation batches keep the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [8]:
class CustomBERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and one linear classification layer."""

    def __init__(self, bert_model_name="bert-base-uncased", num_labels=2, dropout=0.3):
        """Load the encoder and build the head.

        Args:
            bert_model_name: checkpoint name passed to BertModel.from_pretrained.
            num_labels: number of output classes (width of the logits).
            dropout: dropout probability applied to the pooled encoder output.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        encoder_width = self.bert.config.hidden_size
        self.dropout = nn.Dropout(p=dropout)
        self.classifier = nn.Linear(in_features=encoder_width, out_features=num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return the raw class logits for a batch of tokenized inputs."""
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # pooler_output is one vector per example: [batch, hidden]
        return self.classifier(self.dropout(encoded.pooler_output))


In [9]:
# Baseline setup: single-linear-head classifier, cross-entropy loss, one learning rate for all parameters.
LEARNING_RATE = 2e-5

model = CustomBERTClassifier().to(device)
criterion = nn.CrossEntropyLoss()

# Simple Adam optimizer (fine for assignment). You can switch to AdamW if you want.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [10]:
def train_one_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over `dataloader` and report the epoch metrics.

    Args:
        model: module called as model(input_ids, attention_mask, token_type_ids) -> logits.
        dataloader: yields dict batches with "input_ids", "attention_mask",
            "token_type_ids" and "label" tensors.
        optimizer: optimizer over the model's parameters; stepped once per batch.
        criterion: loss called as criterion(logits, labels).
        device: device the batch tensors are moved to.

    Returns:
        (avg_loss, avg_acc): example-weighted mean loss and accuracy over the
        epoch (both 0 when the dataloader yields nothing). The same values are
        printed; returning them lets callers log, plot or compare epochs.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()

        logits = model(input_ids, attention_mask, token_type_ids)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        # Weight the batch loss by its size so a short final batch does not skew the epoch mean.
        batch_examples = labels.size(0)
        loss_sum += loss.item() * batch_examples
        # Accuracy uses the logits computed before this step's update, with the model in train mode.
        n_correct += (logits.argmax(dim=1) == labels).sum().item()
        n_seen += batch_examples

    # max(..., 1) guards the division when no batch was processed.
    avg_loss = loss_sum / max(n_seen, 1)
    avg_acc = n_correct / max(n_seen, 1)
    print(f"Train Loss: {avg_loss:.4f} | Train Acc: {avg_acc:.4f}")
    return avg_loss, avg_acc


In [11]:
def evaluate(model, dataloader, criterion, device):
    """Score `model` on `dataloader` without updating it.

    Puts the model in eval mode, runs every batch under torch.no_grad(), prints
    the example-weighted mean loss and accuracy, and collects the per-example
    outputs.

    Args:
        model: module called as model(input_ids, attention_mask, token_type_ids) -> logits.
        dataloader: yields dict batches with "input_ids", "attention_mask",
            "token_type_ids" and "label" tensors.
        criterion: loss called as criterion(logits, labels).
        device: device the batch tensors are moved to.

    Returns:
        (all_labels, all_preds): two Python lists of ints, in dataloader order.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0
    gold, predicted = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            labels = batch["label"].to(device)

            logits = model(input_ids, attention_mask, token_type_ids)
            batch_examples = labels.size(0)
            # Weight by batch size so the mean is per example, not per batch.
            loss_sum += criterion(logits, labels).item() * batch_examples

            preds = logits.argmax(dim=1)
            n_correct += (preds == labels).sum().item()
            n_seen += batch_examples

            predicted += preds.cpu().tolist()
            gold += labels.cpu().tolist()

    # max(..., 1) keeps the division defined for an empty dataloader.
    avg_loss = loss_sum / max(n_seen, 1)
    avg_acc = n_correct / max(n_seen, 1)

    print(f"Eval Loss: {avg_loss:.4f} | Eval Acc: {avg_acc:.4f}")
    return gold, predicted


In [12]:
NUM_EPOCHS = 3

# Fine-tune the baseline model; after the loop `labels` and `preds` hold the
# test-set outputs of the last epoch, which the report cell reads.
for epoch in range(NUM_EPOCHS):
    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS}")
    train_one_epoch(
        model=model,
        dataloader=train_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
    )
    labels, preds = evaluate(
        model=model,
        dataloader=test_loader,
        criterion=criterion,
        device=device,
    )



Epoch 1/3


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:46<00:00,  8.94s/it]


Train Loss: 0.6345 | Train Acc: 0.6400


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.08s/it]


Eval Loss: 0.4368 | Eval Acc: 0.8000

Epoch 2/3


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:51<00:00,  9.10s/it]


Train Loss: 0.3220 | Train Acc: 0.9080


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.00it/s]


Eval Loss: 0.5501 | Eval Acc: 0.8000

Epoch 3/3


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:44<00:00,  8.90s/it]


Train Loss: 0.1987 | Train Acc: 0.9400


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.20it/s]

Eval Loss: 0.3715 | Eval Acc: 0.8000





In [13]:
# Per-class precision/recall/F1 for the predictions of the last baseline epoch.
print("\nClassification Report:")
baseline_report = classification_report(y_true=labels, y_pred=preds, digits=4)
print(baseline_report)



Classification Report:
              precision    recall  f1-score   support

           0     0.5000    1.0000    0.6667         1
           1     1.0000    0.7500    0.8571         4

    accuracy                         0.8000         5
   macro avg     0.7500    0.8750    0.7619         5
weighted avg     0.9000    0.8000    0.8190         5



In [14]:
# Class index -> human-readable sentiment.
label_map = {0: "negative", 1: "positive"}

def predict_sentiment(text: str):
    """Classify a single piece of text with the notebook-level `model`.

    The text is tokenized with the same truncation/padding settings used for
    the datasets (MAX_LEN tokens). `model`, `tokenizer` and `device` are looked
    up as globals at call time, so the prediction comes from whichever model is
    currently bound to `model`.

    Args:
        text: raw review text.

    Returns:
        "negative" or "positive", per `label_map`.
    """
    model.eval()
    encoded = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
        return_tensors="pt",
    )

    # token_type_ids may be absent from the encoding; the model accepts None in that case.
    segment_ids = encoded.get("token_type_ids", None)
    if segment_ids is not None:
        segment_ids = segment_ids.to(device)

    with torch.no_grad():
        logits = model(
            encoded["input_ids"].to(device),
            encoded["attention_mask"].to(device),
            segment_ids,
        )
        class_index = torch.argmax(logits, dim=1).item()

    return label_map[class_index]

# Quick smoke test of predict_sentiment on one hand-written review.
sample_text = "The product quality was amazing and it exceeded my expectations!"
predicted_label = predict_sentiment(sample_text)
print("Predicted sentiment:", predicted_label)


Predicted sentiment: positive


In [15]:
class CustomBERTClassifierV2(nn.Module):
    """Pretrained BERT encoder with a two-layer head: hidden_size -> hidden_dim -> num_labels."""

    def __init__(self, bert_model_name="bert-base-uncased", num_labels=2, dropout=0.3, hidden_dim=256):
        """Load the encoder and build the head.

        Args:
            bert_model_name: checkpoint name passed to BertModel.from_pretrained.
            num_labels: number of output classes (width of the logits).
            dropout: probability used by both dropout layers.
            hidden_dim: width of the intermediate linear layer.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        encoder_width = self.bert.config.hidden_size

        # Attribute names double as state_dict keys, so they stay unchanged.
        self.dropout1 = nn.Dropout(p=dropout)
        self.fc1 = nn.Linear(in_features=encoder_width, out_features=hidden_dim)
        self.act = nn.ReLU()
        self.dropout2 = nn.Dropout(p=dropout)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return the raw class logits for a batch of tokenized inputs."""
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # dropout -> linear -> ReLU -> dropout -> linear on the pooled output.
        hidden = self.fc1(self.dropout1(encoded.pooler_output))
        hidden = self.dropout2(self.act(hidden))
        return self.fc2(hidden)


In [16]:
# Stand-alone V2 model with its own loss and optimizer.
# NOTE(review): no cell visible in this notebook reads model_v2 / criterion_v2 / optimizer_v2
# (run_experiment builds its own model through a factory) - confirm whether this cell is still needed.
model_v2 = CustomBERTClassifierV2(dropout=0.3, hidden_dim=256).to(device)
criterion_v2 = nn.CrossEntropyLoss()
optimizer_v2 = torch.optim.Adam(params=model_v2.parameters(), lr=2e-5)


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [17]:
def run_experiment(model_factory, train_ds, test_ds, *, batch_size, lr, epochs, name, seed=None):
    """Train a fresh model for one hyper-parameter setting and score it on `test_ds`.

    Args:
        model_factory: zero-argument callable returning a new, untrained-head model.
        train_ds: dataset used for training (shuffled every epoch).
        test_ds: dataset evaluated after every epoch.
        batch_size: batch size for both loaders.
        lr: Adam learning rate.
        epochs: number of training epochs. With epochs <= 0 no training happens
            and the freshly built model is evaluated once.
        name: label used in the progress output and stored in the result.
        seed: optional int. When given, Python, NumPy and torch RNGs are reseeded
            before the model and loaders are built, so a run no longer depends on
            how much randomness earlier runs consumed. Default None leaves the
            RNG state untouched.

    Returns:
        dict with "experiment", "batch_size", "lr", "epochs", the float metrics
        "accuracy", "precision", "recall", "f1" (binary averaging) and the text
        "report" from classification_report, all computed from the final
        evaluation pass.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size)

    model = model_factory().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    labels = preds = None
    for epoch in range(epochs):
        print(f"\n[{name}] Epoch {epoch+1}/{epochs}")
        train_one_epoch(model, train_loader, optimizer, criterion, device)
        labels, preds = evaluate(model, test_loader, criterion, device)

    if labels is None:
        # No epoch ran: evaluate once so the metrics below are defined
        # instead of failing on unbound `labels` / `preds`.
        labels, preds = evaluate(model, test_loader, criterion, device)

    # zero_division=0 yields the same 0.0 score as the default for an undefined
    # metric (e.g. a class that was never predicted) but without the warning.
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    acc = sum(int(p == y) for p, y in zip(preds, labels)) / max(len(labels), 1)

    result = {
        "experiment": name,
        "batch_size": batch_size,
        "lr": lr,
        "epochs": epochs,
        "accuracy": acc,
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "report": classification_report(labels, preds, digits=4, zero_division=0)
    }
    return result


In [18]:
def base_factory():
    """Build a new baseline classifier (single linear head) with dropout 0.3."""
    baseline_model = CustomBERTClassifier(dropout=0.3)
    return baseline_model

def v2_factory():
    """Build a new V2 classifier (two-layer head, 256 hidden units) with dropout 0.3."""
    v2_model = CustomBERTClassifierV2(dropout=0.3, hidden_dim=256)
    return v2_model


In [19]:
class CustomBERTClassifierV3(nn.Module):
    """Pretrained BERT encoder with a three-layer head: hidden_size -> h1 -> h2 -> num_labels."""

    def __init__(self, bert_model_name="bert-base-uncased", num_labels=2, dropout=0.3, h1=256, h2=128):
        """Load the encoder and build the head.

        Args:
            bert_model_name: checkpoint name passed to BertModel.from_pretrained.
            num_labels: number of output classes (width of the logits).
            dropout: probability used by all three dropout layers.
            h1: width of the first hidden layer.
            h2: width of the second hidden layer.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        encoder_width = self.bert.config.hidden_size

        # Each linear layer is preceded by dropout; ReLU follows the two hidden layers.
        head_layers = [
            nn.Dropout(dropout),
            nn.Linear(encoder_width, h1),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(h1, h2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(h2, num_labels),
        ]
        self.net = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return the raw class logits for a batch of tokenized inputs."""
        encoded = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return self.net(encoded.pooler_output)

def v3_factory():
    """Build a new V3 classifier (three-layer head, 256 and 128 hidden units) with dropout 0.3."""
    v3_model = CustomBERTClassifierV3(dropout=0.3, h1=256, h2=128)
    return v3_model


In [20]:
def evaluate_with_metrics(model, dataloader, criterion, device):
    """Evaluate `model` on `dataloader`, print the metrics and return them.

    The forward passes are delegated to `evaluate` (which prints loss and
    accuracy itself); accuracy and binary-averaged precision, recall and F1 are
    then computed from the labels and predictions it returns.

    Args:
        model: model to score.
        dataloader: batches to score it on.
        criterion: loss passed through to `evaluate`.
        device: device passed through to `evaluate`.

    Returns:
        dict with float values under "accuracy", "precision", "recall" and "f1".
    """
    labels, preds = evaluate(model, dataloader, criterion, device)

    # zero_division=0 gives the same 0.0 as the default when a metric is
    # undefined (e.g. a class that was never predicted) but suppresses the warning.
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    acc = sum(int(p == y) for p, y in zip(preds, labels)) / max(len(labels), 1)

    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print("\nClassification Report:")
    print(classification_report(labels, preds, digits=4, zero_division=0))

    return {"accuracy": acc, "precision": float(precision), "recall": float(recall), "f1": float(f1)}


In [21]:
# Hyper-parameter sweep. Each entry is one run_experiment call; entries run in
# list order and each one trains a brand-new model on train_dataset and scores
# it on test_dataset.
experiment_grid = [
    # Combo 1 (your original): V2 head, bs=32, lr=2e-5, epochs=5
    dict(
        model_factory=v2_factory, batch_size=32, lr=2e-5, epochs=5,
        name="Combo1: V2 head (2-layer), epochs=5, bs=32, lr=2e-5",
    ),
    # Combo 2 (your original): V2 head, bs=16, lr=2e-5, epochs=5
    dict(
        model_factory=v2_factory, batch_size=16, lr=2e-5, epochs=5,
        name="Combo2: V2 head (2-layer), epochs=5, bs=16, lr=2e-5",
    ),
    # REQUIRED: Learning rate experiment (same V2 head, different LR)
    dict(
        model_factory=v2_factory, batch_size=16, lr=1e-5, epochs=5,
        name="LR Test: V2 head (2-layer), epochs=5, bs=16, lr=1e-5",
    ),
    # REQUIRED: V3 experiment (deeper head)
    dict(
        model_factory=v3_factory, batch_size=16, lr=2e-5, epochs=5,
        name="V3 Test: deeper head (3-layer), epochs=5, bs=16, lr=2e-5",
    ),
]

# Append as each run finishes so completed results survive an interrupted sweep.
results = []
for experiment in experiment_grid:
    results.append(
        run_experiment(train_ds=train_dataset, test_ds=test_dataset, **experiment)
    )

# Print summary for each experiment
for r in results:
    print("\n" + r["experiment"])
    print(f"Acc={r['accuracy']:.4f} | P={r['precision']:.4f} | R={r['recall']:.4f} | F1={r['f1']:.4f}")

# Show best run by F1 (ties go to the earliest run in the list)
best = max(results, key=lambda x: x["f1"])
print("\nBEST RUN (by F1):", best["experiment"])
print(f"Acc={best['accuracy']:.4f} | P={best['precision']:.4f} | R={best['recall']:.4f} | F1={best['f1']:.4f}")



Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.



[Combo1: V2 head (2-layer), epochs=5, bs=32, lr=2e-5] Epoch 1/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 16/16 [04:30<00:00, 16.91s/it]


Train Loss: 0.6832 | Train Acc: 0.5400


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.10it/s]


Eval Loss: 0.6309 | Eval Acc: 0.8000

[Combo1: V2 head (2-layer), epochs=5, bs=32, lr=2e-5] Epoch 2/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 16/16 [04:15<00:00, 15.95s/it]


Train Loss: 0.5834 | Train Acc: 0.7660


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.12s/it]


Eval Loss: 0.5788 | Eval Acc: 0.8000

[Combo1: V2 head (2-layer), epochs=5, bs=32, lr=2e-5] Epoch 3/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 16/16 [04:21<00:00, 16.34s/it]


Train Loss: 0.3520 | Train Acc: 0.9100


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.14s/it]


Eval Loss: 0.5986 | Eval Acc: 0.8000

[Combo1: V2 head (2-layer), epochs=5, bs=32, lr=2e-5] Epoch 4/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 16/16 [04:18<00:00, 16.15s/it]


Train Loss: 0.2224 | Train Acc: 0.9460


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.04s/it]


Eval Loss: 0.7918 | Eval Acc: 0.6000

[Combo1: V2 head (2-layer), epochs=5, bs=32, lr=2e-5] Epoch 5/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 16/16 [04:08<00:00, 15.51s/it]


Train Loss: 0.1313 | Train Acc: 0.9740


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.10it/s]


Eval Loss: 0.6157 | Eval Acc: 0.8000


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.



[Combo2: V2 head (2-layer), epochs=5, bs=16, lr=2e-5] Epoch 1/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:46<00:00,  8.95s/it]


Train Loss: 0.6902 | Train Acc: 0.5240


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.20it/s]


Eval Loss: 0.6611 | Eval Acc: 0.8000

[Combo2: V2 head (2-layer), epochs=5, bs=16, lr=2e-5] Epoch 2/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:40<00:00,  8.75s/it]


Train Loss: 0.5048 | Train Acc: 0.8160


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.08s/it]


Eval Loss: 0.3454 | Eval Acc: 0.8000

[Combo2: V2 head (2-layer), epochs=5, bs=16, lr=2e-5] Epoch 3/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [24:16<00:00, 45.51s/it]


Train Loss: 0.2463 | Train Acc: 0.9320


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.02it/s]


Eval Loss: 0.3403 | Eval Acc: 0.8000

[Combo2: V2 head (2-layer), epochs=5, bs=16, lr=2e-5] Epoch 4/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:31<00:00,  8.49s/it]


Train Loss: 0.1556 | Train Acc: 0.9600


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.11s/it]


Eval Loss: 0.1947 | Eval Acc: 1.0000

[Combo2: V2 head (2-layer), epochs=5, bs=16, lr=2e-5] Epoch 5/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:39<00:00,  8.74s/it]


Train Loss: 0.0904 | Train Acc: 0.9760


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.35it/s]


Eval Loss: 0.1073 | Eval Acc: 1.0000


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.



[LR Test: V2 head (2-layer), epochs=5, bs=16, lr=1e-5] Epoch 1/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [11:22<00:00, 21.34s/it]


Train Loss: 0.6794 | Train Acc: 0.5620


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.14it/s]


Eval Loss: 0.6859 | Eval Acc: 0.8000

[LR Test: V2 head (2-layer), epochs=5, bs=16, lr=1e-5] Epoch 2/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:31<00:00,  8.48s/it]


Train Loss: 0.6129 | Train Acc: 0.7340


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.06s/it]


Eval Loss: 0.5133 | Eval Acc: 0.8000

[LR Test: V2 head (2-layer), epochs=5, bs=16, lr=1e-5] Epoch 3/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [33:28<00:00, 62.78s/it]


Train Loss: 0.4282 | Train Acc: 0.8820


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.33it/s]


Eval Loss: 0.3221 | Eval Acc: 0.8000

[LR Test: V2 head (2-layer), epochs=5, bs=16, lr=1e-5] Epoch 4/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:44<00:00,  8.89s/it]


Train Loss: 0.2606 | Train Acc: 0.9380


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.18it/s]


Eval Loss: 0.1621 | Eval Acc: 1.0000

[LR Test: V2 head (2-layer), epochs=5, bs=16, lr=1e-5] Epoch 5/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:42<00:00,  8.83s/it]


Train Loss: 0.1587 | Train Acc: 0.9720


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.21s/it]


Eval Loss: 0.3453 | Eval Acc: 0.8000


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.



[V3 Test: deeper head (3-layer), epochs=5, bs=16, lr=2e-5] Epoch 1/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:56<00:00,  9.25s/it]


Train Loss: 0.6956 | Train Acc: 0.4980


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.12s/it]


Eval Loss: 0.6794 | Eval Acc: 0.8000

[V3 Test: deeper head (3-layer), epochs=5, bs=16, lr=2e-5] Epoch 2/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:43<00:00,  8.85s/it]


Train Loss: 0.6929 | Train Acc: 0.5000


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.01it/s]


Eval Loss: 0.7116 | Eval Acc: 0.2000

[V3 Test: deeper head (3-layer), epochs=5, bs=16, lr=2e-5] Epoch 3/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:44<00:00,  8.90s/it]


Train Loss: 0.6424 | Train Acc: 0.6680


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.02s/it]


Eval Loss: 0.6144 | Eval Acc: 0.6000

[V3 Test: deeper head (3-layer), epochs=5, bs=16, lr=2e-5] Epoch 4/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:44<00:00,  8.89s/it]


Train Loss: 0.4343 | Train Acc: 0.8960


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.04it/s]


Eval Loss: 0.4784 | Eval Acc: 0.8000

[V3 Test: deeper head (3-layer), epochs=5, bs=16, lr=2e-5] Epoch 5/5


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:34<00:00,  8.57s/it]


Train Loss: 0.2742 | Train Acc: 0.9340


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.08s/it]

Eval Loss: 0.4045 | Eval Acc: 0.8000

Combo1: V2 head (2-layer), epochs=5, bs=32, lr=2e-5
Acc=0.8000 | P=1.0000 | R=0.7500 | F1=0.8571

Combo2: V2 head (2-layer), epochs=5, bs=16, lr=2e-5
Acc=1.0000 | P=1.0000 | R=1.0000 | F1=1.0000

LR Test: V2 head (2-layer), epochs=5, bs=16, lr=1e-5
Acc=0.8000 | P=1.0000 | R=0.7500 | F1=0.8571

V3 Test: deeper head (3-layer), epochs=5, bs=16, lr=2e-5
Acc=0.8000 | P=0.8000 | R=1.0000 | F1=0.8889

BEST RUN (by F1): Combo2: V2 head (2-layer), epochs=5, bs=16, lr=2e-5
Acc=1.0000 | P=1.0000 | R=1.0000 | F1=1.0000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [25]:
import pandas as pd

# Stop with a readable message when the sweep cell has not produced any results yet.
if not globals().get("results"):
    raise RuntimeError("Run the experiments cell first so `results` exists.")

# Columns written to disk; the long text "report" is kept out of the CSV.
cols = ["experiment", "batch_size", "lr", "epochs", "accuracy", "precision", "recall", "f1"]
df_results = pd.DataFrame(results)
summary_table = df_results[cols]
summary_table.to_csv("bert_experiment_results.csv", index=False)

# The full classification report of the highest-F1 run goes to its own text file.
best = max(results, key=lambda x: x["f1"])
with open("best_run_report.txt", "w", encoding="utf-8") as f:
    report_text = best["experiment"] + "\n\n" + best["report"]
    f.write(report_text)

print(summary_table.sort_values("f1", ascending=False))
print("\nBEST RUN:", best["experiment"])


                                          experiment  batch_size       lr  \
1  Combo2: V2 head (2-layer), epochs=5, bs=16, lr...          16  0.00002   
3  V3 Test: deeper head (3-layer), epochs=5, bs=1...          16  0.00002   
0  Combo1: V2 head (2-layer), epochs=5, bs=32, lr...          32  0.00002   
2  LR Test: V2 head (2-layer), epochs=5, bs=16, l...          16  0.00001   

   epochs  accuracy  precision  recall        f1  
1       5       1.0        1.0    1.00  1.000000  
3       5       0.8        0.8    1.00  0.888889  
0       5       0.8        1.0    0.75  0.857143  
2       5       0.8        1.0    0.75  0.857143  

BEST RUN: Combo2: V2 head (2-layer), epochs=5, bs=16, lr=2e-5


In [26]:
import json

# Retrain the best sweep configuration from scratch and persist its weights and settings.
# torch, nn and DataLoader come from the notebook's import cell.
if "results" not in globals() or not results:
    raise RuntimeError("Run the experiments cell first so `results` exists.")

best = max(results, key=lambda x: x["f1"])

batch_size = int(best["batch_size"])
lr = float(best["lr"])
epochs = int(best["epochs"])

# The result dicts carry no reference to the factory that built the model, so the
# architecture is recovered from the experiment name; anything not recognised as
# V3 is treated as V2.
exp_name = best["experiment"].lower()
if any(marker in exp_name for marker in ("v3", "deeper", "3-layer")):
    model_factory = v3_factory
    model_tag = "v3"
else:
    model_factory = v2_factory
    model_tag = "v2"

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# This is a new training run, not the model object from the sweep, so its scores
# can differ from the ones stored in `best`.
model = model_factory().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# The previous fallback to an undefined `train` function could only raise
# NameError, so train_one_epoch is called directly.
for epoch_idx in range(epochs):
    train_one_epoch(model, train_loader, optimizer, criterion, device)

# Keep the metrics of the model that is actually saved.
final_metrics = evaluate_with_metrics(model, test_loader, criterion, device)

weights_path = f"best_bert_model_{model_tag}.pth"
torch.save(model.state_dict(), weights_path)

config_path = f"best_bert_model_{model_tag}_config.json"
with open(config_path, "w", encoding="utf-8") as f:
    json.dump(
        {
            "best_experiment": best["experiment"],
            "model_tag": model_tag,
            "batch_size": batch_size,
            "learning_rate": lr,
            "epochs": epochs,
            # Scores of the retrained (saved) model on test_loader.
            "test_metrics": final_metrics,
        },
        f,
        indent=2
    )

print("Saved:", weights_path)
print("Saved:", config_path)

if torch.cuda.is_available():
    torch.cuda.empty_cache()


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:57<00:00,  9.28s/it]


Train Loss: 0.6824 | Train Acc: 0.5800


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:47<00:00,  8.98s/it]


Train Loss: 0.4647 | Train Acc: 0.8780


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:29<00:00,  8.41s/it]


Train Loss: 0.2119 | Train Acc: 0.9520


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:40<00:00,  8.76s/it]


Train Loss: 0.1100 | Train Acc: 0.9760


Training: 100%|████████████████████████████████████████████████████████████████████████| 32/32 [04:42<00:00,  8.83s/it]


Train Loss: 0.0552 | Train Acc: 0.9920


Evaluating: 100%|████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.14s/it]


Eval Loss: 0.0219 | Eval Acc: 1.0000
Precision: 1.0000
Recall:    1.0000
F1 Score:  1.0000

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000         1
           1     1.0000    1.0000    1.0000         4

    accuracy                         1.0000         5
   macro avg     1.0000    1.0000    1.0000         5
weighted avg     1.0000    1.0000    1.0000         5

Saved: best_bert_model_v2.pth
Saved: best_bert_model_v2_config.json
