In [1]:
import sys
sys.path.append(r"C:\Users\zhaoh\Desktop\CDS")
import pandas as pd
import numpy as np
from src.utils import evaluate
from datasets import Dataset
from transformers import (
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForSequenceClassification
)
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import torch
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# =====================
# Load data
# =====================
train_metadata_csv = "C:/Users/zhaoh/Desktop/processed_train/metadata.csv"
test_metadata_csv  = "C:/Users/zhaoh/Desktop/processed_test/metadata.csv"

train_df = pd.read_csv(train_metadata_csv)[["utterance", "emotion"]]
test_df  = pd.read_csv(test_metadata_csv)[["utterance", "emotion"]]

# =====================
# Convert to HF Dataset
# =====================
train_dataset = Dataset.from_pandas(train_df)
test_dataset  = Dataset.from_pandas(test_df)
# =====================
# Label encoding
# =====================
label_list = sorted(train_df["emotion"].unique())
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

def encode_labels(example):
    example["label"] = label2id[example["emotion"]]
    return example

train_labels = train_df["emotion"].map(label2id).values

class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_labels),
    y=train_labels
)

class_weights = torch.tensor(class_weights, dtype=torch.float)

train_dataset = train_dataset.map(encode_labels)
test_dataset  = test_dataset.map(encode_labels)

# =====================
# Load tokenizer FIRST
# =====================
model_name = "bhadresh-savani/bert-base-uncased-emotion"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# =====================
# Tokenization
# =====================
def tokenize_function(example):
    return tokenizer(
        example["utterance"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset  = test_dataset.map(tokenize_function, batched=True)

# =====================
# Torch format
# =====================
train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"]
)

test_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"]
)

# =====================
# Load model
# =====================
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

# =====================
# Metrics
# =====================
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }

# =====================
# Training args
# =====================
training_args = TrainingArguments(
    output_dir="C:/Users/zhaoh/Desktop/CDS/text_checkpoint",
    num_train_epochs=6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)
class WeightedTrainer(Trainer):

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")

        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss_fct = nn.CrossEntropyLoss(
            weight=class_weights.to(model.device)
        )

        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

# =====================
# Trainer
# =====================
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics   # <-- keep this
)

# =====================
# Train
# =====================
trainer.train()

# =====================
# Evaluate
# =====================
results = trainer.evaluate()
print("\nEvaluation Results:")
print(results)

# =====================
# Confusion Matrix
# =====================
predictions = trainer.predict(test_dataset)

y_pred = predictions.predictions.argmax(axis=1)
y_true = predictions.label_ids

evaluate(y_true,y_pred)

Map: 100%|██████████| 9989/9989 [00:00<00:00, 28828.43 examples/s]
Map: 100%|██████████| 2610/2610 [00:00<00:00, 43138.69 examples/s]
Map: 100%|██████████| 9989/9989 [00:00<00:00, 20078.43 examples/s]
Map: 100%|██████████| 2610/2610 [00:00<00:00, 16893.12 examples/s]
Loading weights: 100%|██████████| 201/201 [00:00<00:00, 780.57it/s, Materializing param=classifier.weight]                                      
[1mBertForSequenceClassification LOAD REPORT[0m from: bhadresh-savani/bert-base-uncased-emotion
Key                          | Status     |                                                                                       
-----------------------------+------------+---------------------------------------------------------------------------------------
bert.embeddings.position_ids | UNEXPECTED |                                                                                       
classifier.bias              | MISMATCH   | Reinit due to size mismatch - ckpt: torch.Size([6]) 

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.740529,1.470338,0.559387,0.58215
2,1.396162,1.412774,0.536782,0.571562
3,1.159585,1.577146,0.56092,0.58069
4,0.710251,1.704891,0.590805,0.600958
5,0.497361,1.911708,0.587356,0.595331
6,0.414249,1.982632,0.598467,0.605152


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  4.07it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  4.29it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.92it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.44it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.50it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  4.11it/s]
There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.


Evaluation Results:
{'eval_loss': 1.9826315641403198, 'eval_accuracy': 0.5984674329501916, 'eval_f1': 0.6051522347453633, 'eval_runtime': 13.4268, 'eval_samples_per_second': 194.387, 'eval_steps_per_second': 12.214, 'epoch': 6.0}
Validation Accuracy: 0.5985
Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.43      0.43       345
           1       0.34      0.24      0.28        68
           2       0.14      0.24      0.18        50
           3       0.56      0.64      0.60       402
           4       0.80      0.70      0.75      1256
           5       0.33      0.35      0.34       208
           6       0.50      0.61      0.55       281

    accuracy                           0.60      2610
   macro avg       0.44      0.46      0.45      2610
weighted avg       0.62      0.60      0.61      2610

Confusion Matrix:
[[147  14  13  47  48  30  46]
 [ 12  16   3   5  17   6   9]
 [  7   0  12   3  15   7   6]
 [ 35   1 