In [None]:
# Install necessary packages
!pip install transformers datasets scikit-learn torch --quiet

# Import required libraries
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, hamming_loss, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from torch.utils.data import Dataset


In [2]:
# Load your GoEmotions CSV file
df = pd.read_csv('go_emotions_dataset.csv')
df.head()


Unnamed: 0,id,text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eew5j0j,That game hurt.,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,eemcysk,>sexuality shouldn’t be a grouping category I...,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ed2mah1,"You do right, if you don't care then fuck 'em!",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,eeibobj,Man I love reddit.,False,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,eda6yn6,"[NAME] was nowhere near them, he was by the Fa...",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Identify the text and label columns
text_column = 'text'
label_columns = df.columns[3:]  # From 'admiration' to 'neutral'

# Extract features and labels
texts = df[text_column].tolist()
labels = df[label_columns].values

# Show the label names
label_names = label_columns.tolist()
print(f"Emotion Classes: {label_names}")


In [4]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), labels, test_size=0.2, random_state=42)


In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)


In [6]:
class GoEmotionsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).float()
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = GoEmotionsDataset(train_encodings, train_labels)
val_dataset = GoEmotionsDataset(val_encodings, val_labels)


In [None]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_names),
    problem_type="multi_label_classification"
)


In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_dir="./logs",
    logging_steps=10
    # Removed: evaluation_strategy, save_strategy, load_best_model_at_end, metric_for_best_model
)


In [9]:
def compute_metrics(p):
    preds = p.predictions > 0.5
    labels = p.label_ids
    return {
        'f1': f1_score(labels, preds, average='micro'),
        'hamming_loss': hamming_loss(labels, preds)
    }


In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"  # 💡 This disables W&B and other loggers
)


In [14]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


# Rebuild Trainer to fix AcceleratorState issue
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Now safely evaluate the model
eval_results = trainer.evaluate()
print("📊 Evaluation Results:")
print(eval_results)


  trainer = Trainer(


📊 Evaluation Results:
{'eval_loss': 0.6466298699378967, 'eval_model_preparation_time': 0.0119, 'eval_f1': 0.003258243918880962, 'eval_hamming_loss': 0.045000253622575793, 'eval_runtime': 191.0502, 'eval_samples_per_second': 221.12, 'eval_steps_per_second': 3.46}


In [17]:
def predict_emotions(text, threshold=0.5):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)

    # Get model outputs
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.sigmoid(logits).cpu().numpy()[0]

    # Apply threshold and get emotion labels
    predicted_indices = [i for i, prob in enumerate(probs) if prob >= threshold]
    predicted_emotions = [label_names[i] for i in predicted_indices]

    return predicted_emotions


In [None]:
eval_results = trainer.evaluate()
print(eval_results)
