In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments


# Checkpoint name shared by the sequence-classification model and its tokenizer,
# so both are loaded from the same pretrained source.
model_name = "nateraw/bert-base-uncased-emotion"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


  return torch.load(checkpoint_file, map_location="cpu")


In [None]:
# Load dataset
# NOTE(review): machine-specific absolute path; edit DATA_PATH to point at your
# copy of the CSV before running this notebook elsewhere.
DATA_PATH = r"C:\Users\DELL\Desktop\combined_emotion.csv"

# pandas reads empty CSV fields as NaN. A NaN in "sentence" breaks tokenization
# and a NaN in "emotion" breaks the label lookup, so rows missing either field
# are dropped here. dropna raises KeyError if a required column is absent, which
# surfaces a malformed CSV at load time rather than later in the pipeline.
df = (
    pd.read_csv(DATA_PATH)
    .dropna(subset=["sentence", "emotion"])
    .reset_index(drop=True)  # keep a clean 0..n-1 index after dropping rows
)

dataset = Dataset.from_pandas(df)


In [3]:
# Gather the distinct values found in the 'emotion' column and print them.
unique_labels = {label for label in dataset['emotion']}
print(unique_labels)


{'suprise', 'love', 'sad', 'joy', 'fear', 'anger'}


In [4]:
# Maps the emotion strings found in the dataset to integer class ids.
# The keys must match the dataset values exactly (including the 'suprise'
# spelling that appears in the data).
# NOTE(review): the ids are presumably aligned with model.config.id2label of the
# pretrained checkpoint -- confirm before trusting predicted label names.
label2id = {'sad': 0, 'joy': 1, 'love': 2, 'anger': 3, 'fear': 4, 'suprise': 5}

def tokenize(batch):
    """Tokenize a batch of sentences and attach integer class labels.

    Args:
        batch: Mapping with a "sentence" list and an "emotion" list of equal
            length (the form `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the sentences, with an added "labels" entry
        holding the `label2id` id of each emotion.

    Raises:
        KeyError: If an emotion value is not a key of `label2id`.
    """
    # No padding at this stage: padding inside a batched map pads every example
    # to the longest sentence of the whole map batch, which inflates each
    # training batch. Sequences are left at their natural (truncated) length so
    # the Trainer's data collator can pad each training batch on its own.
    tokenized = tokenizer(batch["sentence"], truncation=True)
    tokenized["labels"] = [label2id[label] for label in batch["emotion"]]
    return tokenized


In [5]:
# Apply `tokenize` batch-wise to the whole dataset; the original columns are
# removed so only the fields returned by `tokenize` remain.
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset.column_names,
)


Map:   0%|          | 0/422746 [00:00<?, ? examples/s]

In [None]:
# Train-test split
# A fixed seed makes the shuffled 80/20 split identical on every run, so the
# evaluation set (and therefore the reported metrics) is comparable between
# kernel restarts.
SPLIT_SEED = 42
split = tokenized_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_ds = split["train"]
eval_ds  = split["test"]

In [None]:
# Training arguments
training_args = TrainingArguments(
    # Output locations for checkpoints and logs
    output_dir="./results",
    logging_dir="./logs",
    # Run length and per-device batch sizes
    num_train_epochs=2,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Do not use CUDA for this run
    no_cuda=True,
)

# Metrics function
def compute_metrics(eval_pred):
    """Score an evaluation pass with accuracy and weighted F1.

    Args:
        eval_pred: A pair that unpacks into (logits, labels). The predicted
            class of each row is the argmax of the logits over the last axis.

    Returns:
        dict with keys "accuracy" and "f1" (F1 uses average="weighted").
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(references, predicted_ids),
        "f1": f1_score(references, predicted_ids, average="weighted"),
    }

# Define Trainer
trainer = Trainer(
    # What to train and how
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Data: training portion and held-out evaluation portion of the split
    train_dataset=train_ds,
    eval_dataset=eval_ds,
)

In [8]:
# Train model
# Starts the training run on the `trainer` object defined earlier. The call is
# the last expression of the cell, so its return value is shown as cell output.
trainer.train()



  0%|          | 0/507294 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.7311, 'learning_rate': 4.995071891250439e-05, 'epoch': 0.0}
{'loss': 0.6246, 'learning_rate': 4.9901437825008774e-05, 'epoch': 0.01}
{'loss': 0.6558, 'learning_rate': 4.985215673751316e-05, 'epoch': 0.01}
{'loss': 0.7011, 'learning_rate': 4.9802875650017545e-05, 'epoch': 0.01}
{'loss': 0.6831, 'learning_rate': 4.9753594562521934e-05, 'epoch': 0.01}
{'loss': 0.5287, 'learning_rate': 4.970431347502632e-05, 'epoch': 0.02}
{'loss': 0.6505, 'learning_rate': 4.9655032387530705e-05, 'epoch': 0.02}
{'loss': 0.5391, 'learning_rate': 4.9605751300035094e-05, 'epoch': 0.02}
{'loss': 0.5191, 'learning_rate': 4.9556470212539476e-05, 'epoch': 0.03}
{'loss': 0.4452, 'learning_rate': 4.9507189125043865e-05, 'epoch': 0.03}
{'loss': 0.4798, 'learning_rate': 4.9457908037548254e-05, 'epoch': 0.03}
{'loss': 0.6419, 'learning_rate': 4.9408626950052636e-05, 'epoch': 0.04}


In [None]:
#  Test a sample sentence
# Put the model in evaluation mode explicitly instead of relying on whatever
# mode the training run left it in; this disables dropout so the prediction is
# deterministic.
model.eval()

# Tokenize the sentence and place the tensors on the same device as the model.
inputs = tokenizer("I am so happy today!", return_tensors="pt").to(model.device)

# No gradients are needed for inference; skipping them avoids building an
# autograd graph.
with torch.no_grad():
    outputs = model(**inputs)

# Highest-scoring class index for the single input sentence.
predicted_label = outputs.logits.argmax(dim=-1).item()
label_map = model.config.id2label
print(f"Predicted Emotion: {label_map[predicted_label]}")