<a href="https://colab.research.google.com/github/zahraniayudyaa/finnalterm-dl/blob/main/01_GOEmotions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **FINE-TUNING HUGGING FACE MODELS (GoEmotions)**

## **1. Setup dan Instalasi**

In [None]:
# 1. Setup
!pip install transformers datasets torch scikit-learn pandas numpy matplotlib seaborn

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import load_dataset
import seaborn as sns
from typing import List, Dict
import warnings
warnings.filterwarnings('ignore')

# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

## **2. Load Dataset**

In [None]:
# 2. Load Dataset - GoEmotions
print("Loading GoEmotions dataset...")
dataset = load_dataset("google-research-datasets/go_emotions")

print("\nDataset structure:")
print(dataset)
print(f"Train samples: {len(dataset['train'])}")
print(f"Validation samples: {len(dataset['validation'])}")
print(f"Test samples: {len(dataset['test'])}")

# Emotion labels (28 classes). Defined before the sample is printed so the
# integer label ids can be translated to names below.
emotion_labels = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
    'relief', 'remorse', 'sadness', 'surprise', 'neutral'
]

# Check sample
print("\nSample data:")
sample = dataset['train'][0]
print(f"Text: {sample['text']}")
print(f"Labels: {sample['labels']}")
# NOTE(review): the dataset card for the default configuration lists only
# text/labels/id columns, so an 'emotions' column may not exist - confirm.
# Use it when present; otherwise look the names up from the label ids instead
# of failing with a KeyError.
if 'emotions' in sample:
    sample_emotions = sample['emotions']
else:
    sample_emotions = [
        emotion_labels[label_id]
        for label_id in sample['labels']
        if 0 <= label_id < len(emotion_labels)
    ]
print(f"Emotions: {sample_emotions}")

print(f"\nTotal emotion classes: {len(emotion_labels)}")

# 3. Analyze label distribution
def analyze_label_distribution(dataset_split, split_name, label_names=None):
    """Print label statistics for one dataset split.

    Reports how many samples carry exactly one label versus several, and the
    ten most frequent labels.

    Args:
        dataset_split: Iterable of items whose ``'labels'`` entry is a
            sequence of integer label ids.
        split_name: Human-readable split name used in the printed header.
        label_names: Optional list of label names indexed by label id.
            Defaults to the module-level ``emotion_labels``.

    Returns:
        None. All results are printed.
    """
    if label_names is None:
        label_names = emotion_labels
    num_classes = len(label_names)

    print(f"\n{split_name} Label Analysis:")

    # Count samples with single vs multiple labels
    single_label = 0
    multi_label = 0
    unlabeled = 0  # samples with an empty label list are not "multi-label"

    label_counts = np.zeros(num_classes)

    for item in dataset_split:
        labels = item['labels']
        if len(labels) == 1:
            single_label += 1
        elif len(labels) > 1:
            multi_label += 1
        else:
            unlabeled += 1

        for label in labels:
            # Reject negative ids too: they would otherwise wrap around and be
            # counted against a class at the end of the array.
            if 0 <= label < num_classes:
                label_counts[label] += 1

    total = single_label + multi_label + unlabeled
    if total == 0:
        # Nothing to report; avoids dividing by zero in the percentages below.
        print("  No samples to analyze.")
        return

    print(f"  Single-label samples: {single_label} ({single_label/total*100:.1f}%)")
    print(f"  Multi-label samples: {multi_label} ({multi_label/total*100:.1f}%)")
    if unlabeled:
        print(f"  Unlabeled samples: {unlabeled} ({unlabeled/total*100:.1f}%)")

    # Show top emotions
    print(f"\n  Top 10 emotions:")
    sorted_indices = np.argsort(label_counts)[::-1][:10]
    for idx in sorted_indices:
        print(f"    {label_names[idx]}: {int(label_counts[idx])} samples")

# Report label statistics for the training and validation splits, in that order.
for split_key, split_title in (('train', 'Training'), ('validation', 'Validation')):
    analyze_label_distribution(dataset[split_key], split_title)

## **3. Preprocessing Data**

In [None]:
# 4. Preprocessing - Convert to multi-hot encoding
def convert_to_multi_hot(example, num_labels=None):
    """Replace an example's list of label ids with a multi-hot float vector.

    Args:
        example: Mapping with a ``'labels'`` entry holding integer label ids.
        num_labels: Length of the multi-hot vector. Defaults to the number of
            entries in the module-level ``emotion_labels``.

    Returns:
        The same ``example`` object, with ``'labels'`` overwritten by a list
        of ``num_labels`` floats (1.0 at each valid label id, 0.0 elsewhere).
    """
    if num_labels is None:
        num_labels = len(emotion_labels)

    multi_hot = np.zeros(num_labels, dtype=np.float32)
    for label in example['labels']:
        # Ignore ids outside [0, num_labels). The lower bound matters: a
        # negative id (e.g. a -1 "missing" marker) would otherwise wrap around
        # and switch on a class at the end of the vector.
        if 0 <= label < num_labels:
            multi_hot[label] = 1.0
    example['labels'] = multi_hot.tolist()
    return example

print("\nConverting to multi-hot encoding...")
# Rewrite the 'labels' column of every split, one example at a time.
dataset = dataset.map(function=convert_to_multi_hot)

# 5. Tokenization: load the tokenizer that matches the checkpoint to fine-tune.
MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    """Tokenize a batch of examples with the module-level ``tokenizer``.

    Sequences are truncated to 128 tokens but deliberately left unpadded: the
    training pipeline in this file uses ``DataCollatorWithPadding``, which
    pads each mini-batch to its own longest sequence. Padding here as well
    would pad every map batch to its longest member and store those pad
    tokens in the dataset for no benefit.

    Args:
        examples: Batch mapping with a ``'text'`` entry (list of strings).

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=128
    )

print("\nTokenizing dataset...")
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Drop the raw columns the model does not consume. Only columns that actually
# exist are requested, because remove_columns raises on an unknown column name
# (the 'emotions' column is not guaranteed to be part of the loaded dataset).
columns_to_drop = [
    column
    for column in ('text', 'emotions')
    if column in tokenized_datasets['train'].column_names
]
tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)

## **4. Load Model dan Training**

In [None]:
# 6. Load Model for Multi-label Classification
print(f"\nLoading model: {MODEL_NAME}")
# One output per emotion; the multi-label problem type is passed through to
# the model configuration.
classification_head_options = {
    "num_labels": len(emotion_labels),
    "problem_type": "multi_label_classification",
}
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, **classification_head_options
)
model.to(device)

# 7. Custom Metrics for Multi-label
def compute_metrics_multi_label(eval_pred):
    """Compute multi-label metrics from raw logits and multi-hot labels.

    Args:
        eval_pred: Pair ``(logits, labels)``; both are 2-D arrays with one row
            per sample and one column per class.

    Returns:
        Dict with ``accuracy`` (``accuracy_score`` on the full label rows),
        ``f1_micro``, ``f1_macro`` and ``auc_macro``. ``auc_macro`` is 0.0
        when ROC AUC cannot be computed.
    """
    logits, labels = eval_pred
    probabilities = torch.sigmoid(torch.tensor(logits)).numpy()

    # Apply threshold (0.5)
    pred_labels = (probabilities > 0.5).astype(int)
    # Labels arrive as floats (multi-hot); cast so both sides are integer
    # indicator matrices.
    true_labels = np.asarray(labels).astype(int)

    # Calculate metrics. zero_division=0 keeps the score at 0 for classes
    # without predictions/positives, stated explicitly instead of via warning.
    accuracy = accuracy_score(true_labels, pred_labels)
    f1_micro = f1_score(true_labels, pred_labels, average='micro', zero_division=0)
    f1_macro = f1_score(true_labels, pred_labels, average='macro', zero_division=0)

    # ROC AUC is undefined when a label column contains a single class;
    # scikit-learn signals that with ValueError. Catch only that, so unrelated
    # errors (and KeyboardInterrupt) are not silently swallowed.
    try:
        auc = roc_auc_score(true_labels, probabilities, average='macro')
    except ValueError:
        auc = 0.0

    return {
        "accuracy": accuracy,
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "auc_macro": auc
    }

# 8. Training Arguments
import inspect

# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy`, and recent transformers releases no longer accept the old
# name. Since the install above is unpinned, pick whichever keyword the
# installed version exposes.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results_goemotions",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    # Must match the evaluation schedule for load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    save_total_limit=2,
    **{_eval_strategy_key: "epoch"},
)

# 9. Data Collator
import inspect

_padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def data_collator(features):
    """Pad a list of features into a batch and ensure float label targets.

    The multi-label loss (binary cross-entropy with logits) needs float
    targets. The cast is a no-op when labels are already float and guards
    against the multi-hot vectors having been stored with an integer type.
    """
    batch = _padding_collator(features)
    if "labels" in batch:
        batch["labels"] = batch["labels"].float()
    return batch


# 10. Initialize Trainer
# Newer transformers releases take the tokenizer via `processing_class` and
# deprecate the `tokenizer` keyword; use whichever the installed Trainer has.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics_multi_label,
    **{_tokenizer_key: tokenizer},
)

# 11. Train Model
print("\nTraining model for multi-label emotion classification...")
train_result = trainer.train()

## **5. Evaluasi**

In [None]:
# 12. Evaluate
print("\nEvaluating model...")
eval_result = trainer.evaluate()
print(f"\nEvaluation results:")
for key, value in eval_result.items():
    print(f"  {key}: {value:.4f}")

# 13. Test on Test Set
print("\nTesting on test set...")
test_results = trainer.predict(tokenized_datasets['test'])
test_metrics = test_results.metrics
print(f"\nTest set metrics:")
# predict() prefixes its metric keys with "test_" rather than "eval_", so the
# loss/timing entries are filtered by suffix to hide them whatever the prefix.
_hidden_metric_suffixes = ('_loss', '_runtime', '_samples_per_second', '_steps_per_second')
for key, value in test_metrics.items():
    if not key.endswith(_hidden_metric_suffixes):
        print(f"  {key}: {value:.4f}")

In [None]:
# 14. Save Model
# Model weights and tokenizer files go to the same directory.
print("\nSaving model...")
save_directory = "./saved_model_goemotions"
trainer.save_model(save_directory)
tokenizer.save_pretrained(save_directory)

# 15. Inference Function for Multi-label
def predict_emotions(text, model, tokenizer, device, emotion_labels, threshold=0.3):
    """Score a text against every emotion and report those above a threshold.

    Args:
        text: Input string to classify.
        model: Callable model; put into eval mode and invoked with the
            tokenized inputs. Its output must expose ``.logits``.
        tokenizer: Callable returning a mapping of tensors for ``text``.
        device: Device the input tensors are moved to before the forward pass.
        emotion_labels: Label names, indexed by output position.
        threshold: Sigmoid probability an emotion must exceed to be reported.

    Returns:
        Dict with the (possibly shortened) ``text``, ``predicted_emotions`` as
        ``(name, probability)`` pairs sorted by descending probability, the
        full ``probabilities`` list, ``top5_emotions`` and ``has_emotions``.
    """
    encoded = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt"
    )

    model_inputs = {}
    for name, tensor in encoded.items():
        model_inputs[name] = tensor.to(device)

    model.eval()
    with torch.no_grad():
        logits = model(**model_inputs).logits

    # Independent per-class probabilities for the first (only) sequence.
    probabilities = torch.sigmoid(logits).cpu().numpy()[0]

    # Emotions whose probability clears the threshold, strongest first.
    above_threshold = probabilities > threshold
    selected = [index for index, hit in enumerate(above_threshold) if hit]
    predicted_emotions = sorted(
        [(emotion_labels[index], probabilities[index]) for index in selected],
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Five highest-probability emotions regardless of the threshold.
    ranked = np.argsort(probabilities)[::-1][:5]
    top5_emotions = [(emotion_labels[index], probabilities[index]) for index in ranked]

    # Keep long inputs readable in the report.
    if len(text) > 100:
        shown_text = text[:100] + "..."
    else:
        shown_text = text

    return {
        "text": shown_text,
        "predicted_emotions": predicted_emotions,
        "probabilities": probabilities.tolist(),
        "top5_emotions": top5_emotions,
        "has_emotions": len(predicted_emotions) > 0
    }

# 16. Test Inference
# Hand-written sentences used to eyeball the fine-tuned model's predictions.
test_samples = [
    "I'm so excited about this amazing opportunity!",
    "This is absolutely terrible, I can't believe it.",
    "Thank you so much for your help, I really appreciate it.",
    "I'm not sure what to do, this is confusing.",
    "That's hilarious, I can't stop laughing!"
]

print("\nEmotion Prediction Examples:")
print("=" * 80)
for sample_text in test_samples:
    result = predict_emotions(
        sample_text, model, tokenizer, device, emotion_labels, threshold=0.3
    )
    print(f"\nText: {result['text']}")

    # Emotions above the threshold, if any.
    detected = result['predicted_emotions']
    if not detected:
        print("No strong emotions detected (below threshold)")
    else:
        print("Predicted emotions:")
        for name, score in detected:
            print(f"  - {name}: {score:.2%}")

    # Always show the five best-scoring emotions for context.
    print("Top 5 emotions by probability:")
    for name, score in result['top5_emotions']:
        print(f"  - {name}: {score:.2%}")