In [5]:
!pip install -U evaluate
!pip install -U datasets
!pip install -U accelerate
!pip install -U transformers
!pip install tensorboardX

import torch
import numpy as np
import pandas as pd
from evaluate import load 
import accelerate
from datasets import load_dataset
from transformers import AutoTokenizer, pipeline
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

Collecting tensorboardX
  Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Using cached tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [6]:
dataset = load_dataset("dair-ai/emotion")
print(dataset["train"])

Dataset({
    features: ['text', 'label'],
    num_rows: 16000
})


In [None]:
# Make sure you have this import if you haven't run it already
# !pip install evaluate tensorboard # Or conda install evaluate tensorboard
# Restart kernel if you just installed

import evaluate # Use the new evaluate library
import numpy as np
from datasets import load_dataset # Keep load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import torch
import os # Import os for path manipulation

# --- Configuration ---
MODEL_CHECKPOINT = "google/bert_uncased_L-2_H-128_A-2"
DATASET_NAME = "dair-ai/emotion"
MAX_LEN = 128 # Reduced max_len slightly, often sufficient for smaller models like BERT-Tiny
NUM_EPOCHS = 8 # Reduced epochs for quicker demonstration, adjust as needed
BATCH_SIZE = 16 # Adjusted batch size, tune based on GPU memory
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.01
NUM_LABELS = 6 # For emotion dataset

# --- Load Data ---
print(f"Loading dataset: {DATASET_NAME}...")
dataset = load_dataset(DATASET_NAME)
print("Dataset loaded.")
# print("Train dataset example:", dataset["train"][0])

# --- Load Tokenizer ---
print(f"\nLoading tokenizer: {MODEL_CHECKPOINT}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)
print("Tokenizer loaded.")

# --- Preprocessing ---
def preprocess_function(examples):
    return tokenizer(examples["text"], max_length=MAX_LEN, padding="max_length", truncation=True)

print("\nEncoding dataset...")
encoded_dataset = dataset.map(preprocess_function, batched=True)
print("Encoding complete.")

# --- Load Model ---
print(f"\nLoading model: {MODEL_CHECKPOINT}...")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=NUM_LABELS,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1
)
print("Model loaded.")

# --- Training Arguments ---
model_name = MODEL_CHECKPOINT.split("/")[-1]
# Define the main output directory
output_dir = f"./results/{model_name}-finetuned-{DATASET_NAME.split('/')[-1]}"
logging_dir = f"{output_dir}/logs"

args = TrainingArguments(
    output_dir=output_dir,             # Checkpoints saved here during training
    logging_dir=logging_dir,
    eval_strategy="epoch",
    save_strategy="epoch",             # Saves checkpoints automatically
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    load_best_model_at_end=True,       # Loads best model into memory after training
    metric_for_best_model='eval_f1_score',
    save_total_limit=2,                # Limits disk usage by checkpoints
    push_to_hub=False,
    fp16=True,
    logging_steps=50,
    report_to="tensorboard",
    # report_to="none",
)

# --- Metrics ---
print("\nLoading metrics using 'evaluate' library...")
acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
print("Metrics loaded.")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="micro")
    return {"accuracy": acc["accuracy"], "f1_score": f1["f1"]}

# --- Initialize Trainer ---
print("\nInitializing Trainer...")
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
print("Trainer initialized.")

# --- Start Training ---
print("\nStarting training...")
train_result = trainer.train()
print("Training finished.")
# train_result contains metrics, training loss etc.
# print("Training Summary:", train_result)

# --- Explicitly Save the Best Model ---
# Since load_best_model_at_end=True, trainer.model now holds the best model weights.
# We save this model to a specific directory for easier access later.
print("\n--- Saving the best model identified during training ---")

# Define a path for the final best model (can be inside or outside output_dir)
best_model_path = os.path.join(output_dir, "best_model") # Save to './results/.../best_model/'

# Use trainer.save_model() to save the current state (model & tokenizer)
trainer.save_model(best_model_path)

print(f"Best model and tokenizer saved to: {best_model_path}")
print("(This contains the model identified as best based on 'eval_f1_score')")
# --- End of Save ---


# --- Final Evaluation (Optional but Recommended) ---
# These blocks evaluate the BEST model loaded at the end of training

# --- Evaluate on Test Set ---
print("\n--- Evaluating BEST model on TEST set ---")
test_results = trainer.evaluate(eval_dataset=encoded_dataset['test'])
test_accuracy = test_results.get("eval_accuracy")
test_loss = test_results.get("eval_loss")
print(f"Final Test Loss:     {test_loss:.4f}" if test_loss is not None else "Final Test Loss: N/A")
print(f"Final Test Accuracy: {test_accuracy:.4f}" if test_accuracy is not None else "Final Test Accuracy: N/A")
# print("\nFull Final Test Evaluation Output:")
# print(test_results)

# --- Evaluate on Train Set ---
# print("\n--- Evaluating BEST model on TRAIN set ---")
# train_results = trainer.evaluate(eval_dataset=encoded_dataset['train'])
# train_accuracy = train_results.get("eval_accuracy")
# train_loss = train_results.get("eval_loss")
# print(f"Final Train Loss:     {train_loss:.4f}" if train_loss is not None else "Final Train Loss: N/A")
# print(f"Final Train Accuracy: {train_accuracy:.4f}" if train_accuracy is not None else "Final Train Accuracy: N/A")
# print("\nFull Final Train Evaluation Output:")
# print(train_results)
# --- End of Evaluation ---

# Model Info
## BertForSequenceClassification Model Summary

### Bert Model:
- **Embeddings:**
  - `word_embeddings`: Embedding(30522, 128, padding_idx=0)
  - `position_embeddings`: Embedding(512, 128)
  - `token_type_embeddings`: Embedding(2, 128)
  - `LayerNorm`: LayerNorm((128,), eps=1e-12, elementwise_affine=True)
  - `dropout`: Dropout(p=0.2, inplace=False)

- **Encoder:**
  - **Layers (2x BertLayer):**
    - **Attention:**
      - `query`: Linear(in_features=128, out_features=128, bias=True)
      - `key`: Linear(in_features=128, out_features=128, bias=True)
      - `value`: Linear(in_features=128, out_features=128, bias=True)
      - `dropout`: Dropout(p=0.2, inplace=False)
    - **Self-Output:**
      - `dense`: Linear(in_features=128, out_features=128, bias=True)
      - `LayerNorm`: LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      - `dropout`: Dropout(p=0.2, inplace=False)

### Additional Components:
- `dropout`: Dropout(p=0.2, inplace=False)
- `classifier`: Linear(in_features=128, out_features=10, bias=True)

# Training Results (# of Epochs)

Findings: The higher the # of Epochs, the better the model.

### 3 Epochs - Base

| Epoch | Training Loss | Validation Loss | Accuracy | F1 Score  |
|-------|--------------|----------------|------------|-----------|
| 1     | 1.565900	   | 1.508827       | 0.584000   | 0.584000  |
| 2     | 1.273500     | 1.133131       | 0.647000   | 0.647000  |
| 3     | 1.174500     | 1.043239       | 0.632000   | 0.632000  |


### 5 Epochs

| Epoch | Training Loss | Validation Loss | Accuracy | F1 Score  |
|-------|--------------|----------------|------------|-----------|
| 1     | 1.407300	   | 1.240899       | 0.561500   | 0.561500  |
| 2     | 1.073100     | 0.920887       | 0.675500   | 0.675500  |
| 3     | 0.910300     | 0.741255       | 0.761000   | 0.761000  |
| 4     | 0.770300     | 0.664742       | 0.795500   | 0.795500  |
| 5     | 0.745900     | 0.635276       | 0.805500   | 0.805500  |

### 8 Epochs

| Epoch | Training Loss | Validation Loss | Accuracy | F1 Score  |
|-------|--------------|----------------|------------|-----------|
| 1     | 1.426200	   | 1.256119       | 0.557000   | 0.557000  |
| 2     | 1.020400     | 0.854230       | 0.730000   | 0.730000  |
| 3     | 0.830600     | 0.673912       | 0.797500   | 0.797500  |
| 4     | 0.674600	   | 0.559353       | 0.833000   | 0.833000  |
| 5     | 0.600300     | 0.492488       | 0.849000   | 0.849000  |
| 6     | 0.538900     | 0.457193       | 0.857000   | 0.857000  |
| 7     | 0.528300     | 0.439362       | 0.864500   | 0.864500  |
| 8     | 0.535700     | 0.433461       | 0.865000   | 0.865000  |

# Training Results (Learning Rate)

Findings: Model performs better with higher LR but within the range of 2e-5 to 5e-5

### Learning Rate = 2e-5

| Epoch | Training Loss | Validation Loss | Accuracy | F1 Score  |
|-------|--------------|----------------|------------|-----------|
| 1     | 1.426200	   | 1.256119       | 0.557000   | 0.557000  |
| 2     | 1.020400     | 0.854230       | 0.730000   | 0.730000  |
| 3     | 0.830600     | 0.673912       | 0.797500   | 0.797500  |
| 4     | 0.674600	   | 0.559353       | 0.833000   | 0.833000  |
| 5     | 0.600300     | 0.492488       | 0.849000   | 0.849000  |
| 6     | 0.538900     | 0.457193       | 0.857000   | 0.857000  |
| 7     | 0.528300     | 0.439362       | 0.864500   | 0.864500  |
| 8     | 0.535700     | 0.433461       | 0.865000   | 0.865000  |
    
### Learning Rate = 1e-5

| Epoch | Training Loss | Validation Loss | Accuracy | F1 Score  |
|-------|--------------|----------------|------------|-----------|
| 1     | 1.576500    | 1.535615       | 0.368500   | 0.368500  |
| 2     | 1.440700    | 1.321964       | 0.567500   | 0.567500  |
| 3     | 1.240900    | 1.091327       | 0.668500   | 0.668500  |
| 4     | 1.075400    | 0.952145       | 0.710500   | 0.710500  |
| 5     | 1.001000    | 0.866790       | 0.730500   | 0.730500  |
| 6     | 0.947100    | 0.818463       | 0.737000   | 0.737000  |
| 7     | 0.920000    | 0.793186       | 0.745000   | 0.745000  |
| 8     | 0.910800    | 0.784847       | 0.746500   | 0.746500  |

### Learning Rate = 3e-5

| Epoch | Training Loss | Validation Loss | Accuracy | F1 Score  |
|-------|--------------|----------------|----------|-----------|
| 1     | 1.222200    | 1.009960       | 0.683500 | 0.683500  |
| 2     | 0.793400    | 0.632730       | 0.812500 | 0.812500  |
| 3     | 0.580700    | 0.454973       | 0.862000 | 0.862000  |
| 4     | 0.456200    | 0.395221       | 0.872500 | 0.872500  |
| 5     | 0.426800    | 0.372895       | 0.874500 | 0.874500  |
| 6     | 0.380300    | 0.363105       | 0.878500 | 0.878500  |
| 7     | 0.383800    | 0.360529       | 0.877500 | 0.877500  |
| 8     | 0.393600    | 0.358225       | 0.879000 | 0.879000  |

### Learning Rate = 5e-5
| Epoch | Training Loss | Validation Loss | Accuracy | F1 Score  |
|-------|--------------|----------------|----------|-----------|
| 1     | 0.911700    | 0.718967       | 0.782000 | 0.782000  |
| 2     | 0.562400    | 0.443868       | 0.864000 | 0.864000  |
| 3     | 0.429400    | 0.373164       | 0.880500 | 0.880500  |
| 4     | 0.370700    | 0.353360       | 0.888500 | 0.888500  |
| 5     | 0.360100    | 0.368519       | 0.885500 | 0.885500  |
|**6**  | **0.313700**| **0.354942**   | **0.893500** | **0.893500**  |
| 7     | 0.319100    | 0.355341       | 0.891000 | 0.891000  |
| 8     | 0.323700    | 0.355058       | 0.889500 | 0.889500  |


# Training Results 
### Dropout = 0.2
| Epoch | Training Loss | Validation Loss | Accuracy | F1 Score  |
|-------|--------------|----------------|----------|-----------|
| 1     | 0.911700    | 0.718967       | 0.782000 | 0.782000  |
| 2     | 0.562400    | 0.443868       | 0.864000 | 0.864000  |
| 3     | 0.429400    | 0.373164       | 0.880500 | 0.880500  |
| 4     | 0.370700    | 0.353360       | 0.888500 | 0.888500  |
| 5     | 0.360100    | 0.368519       | 0.885500 | 0.885500  |
|**6**  | **0.313700**| **0.354942**   | **0.893500** | **0.893500**  |
| 7     | 0.319100    | 0.355341       | 0.891000 | 0.891000  |
| 8     | 0.323700    | 0.355058       | 0.889500 | 0.889500  |

### Dropout = 0.1
| Epoch | Training Loss | Validation Loss | Accuracy | F1 Score  |
|-------|--------------|----------------|----------|-----------|
| 1     | 0.687700     | 0.523786       | 0.853500 | 0.853500  |
| 2     | 0.388700     | 0.354088       | 0.886000 | 0.886000  |
| 3     | 0.327800     | 0.322002       | 0.891500 | 0.891500  |
| 4     | 0.295400     | 0.325484       | 0.898000 | 0.898000  |
| 5     | 0.283400     | 0.322322       | 0.899000 | 0.899000  |
| 6     | 0.237700     | 0.316104       | 0.901500 | 0.901500  |
| 7     | 0.231900     | 0.319066       | 0.902000 | 0.902000  |
| **8**     | **0.220700**     | **0.325635**       | **0.903000** | **0.903000**  |

### Dropout = 0.3
| Epoch | Training Loss | Validation Loss | Accuracy | F1 Score  |
|-------|--------------|----------------|----------|-----------|
| 1     | 1.156200     | 0.909088       | 0.701500 | 0.701500  |
| 2     | 0.765800     | 0.580633       | 0.838500 | 0.838500  |
| 3     | 0.588700     | 0.444506       | 0.866500 | 0.866500  |
| 4     | 0.469400     | 0.385133       | 0.877000 | 0.877000  |
| 5     | 0.442800     | 0.374441       | 0.879000 | 0.879000  |
| 6     | 0.392400     | 0.368438       | 0.884500 | 0.884500  |
| 7     | 0.396700     | 0.367876       | 0.884500 | 0.884500  |
| 8     | 0.398700     | 0.366798       | 0.883500 | 0.883500  |

# Training Results 
### Batch size = 8
| Epoch | Training Loss | Validation Loss | Accuracy | F1 Score  |
|-------|--------------|----------------|----------|-----------|
| 1     | 0.687700     | 0.523786       | 0.853500 | 0.853500  |
| 2     | 0.388700     | 0.354088       | 0.886000 | 0.886000  |
| 3     | 0.327800     | 0.322002       | 0.891500 | 0.891500  |
| 4     | 0.295400     | 0.325484       | 0.898000 | 0.898000  |
| 5     | 0.283400     | 0.322322       | 0.899000 | 0.899000  |
| 6     | 0.237700     | 0.316104       | 0.901500 | 0.901500  |
| 7     | 0.231900     | 0.319066       | 0.902000 | 0.902000  |
| **8**     | **0.220700**     | **0.325635**       | **0.903000** | **0.903000**  |

### Batch Size = 32
| Epoch | Training Loss | Validation Loss | Accuracy | F1 Score |
|-------|--------------|----------------|----------|----------|
| 1     | 1.459500    | 1.039186       | 0.672000 | 0.672000 |
| 2     | 0.881500    | 0.615181       | 0.838500 | 0.838500 |
| 3     | 0.568400    | 0.442513       | 0.872000 | 0.872000 |
| 4     | 0.426600    | 0.371119       | 0.876500 | 0.876500 |
| 5     | 0.355400    | 0.338125       | 0.880500 | 0.880500 |
| 6     | 0.317300    | 0.319918       | 0.888000 | 0.888000 |
| 7     | 0.294400    | 0.312631       | 0.888000 | 0.888000 |
| 8     | 0.286600    | 0.310849       | 0.886000 | 0.886000 |

### Batch size = 64
| Epoch | Training Loss | Validation Loss | Accuracy | F1 Score |
|-------|--------------|----------------|----------|----------|
| 1     | No log      | 1.340407       | 0.561000 | 0.561000 |
| 2     | 1.362000    | 0.866351       | 0.756000 | 0.756000 |
| 3     | 1.362000    | 0.631371       | 0.843500 | 0.843500 |
| 4     | 0.703300    | 0.515631       | 0.861000 | 0.861000 |
| 5     | 0.703300    | 0.453638       | 0.872000 | 0.872000 |
| 6     | 0.479100    | 0.418776       | 0.878000 | 0.878000 |
| 7     | 0.479100    | 0.401627       | 0.880500 | 0.880500 |
| 8     | 0.405200    | 0.395891       | 0.879500 | 0.879500 |



## Training Arguments

- **Learning Rate:** `2e-5`
- **Batch Size:**
  - **Train:** `8`
  - **Evaluation:** `8`
- **Number of Training Epochs:** `8`
- **Weight Decay:** `0.01`
- **Metric for Best Model:** `eval_f1_score` (F1 Score)
- **Mixed Precision (FP16):** `True`
