In [1]:
import torch

# Environment check: show the installed PyTorch version and, on the next line,
# whether PyTorch can see a usable CUDA device.
print(torch.__version__, torch.cuda.is_available(), sep="\n")


2.5.1
False


In [2]:
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from tqdm import tqdm

In [5]:
# Raw data exactly as stored on disk; kept untouched so it can be inspected later.
df = pd.read_csv("text_emotions.csv")

text_col = "content"      # change if needed
label_col = "sentiment"  # change if needed

# pandas reads empty cells as float NaN. A NaN is neither a string the tokenizer
# can encode nor a real emotion label, so rows missing either field are dropped
# before anything is derived from them. With complete data this is a no-op.
df_clean = df.dropna(subset=[text_col, label_col])

texts = df_clean[text_col].tolist()
labels = df_clean[label_col].tolist()

# Encode labels: map each distinct label value to an integer class id.
# `le` is kept because it is needed later to turn predicted ids back into labels.
le = LabelEncoder()
labels = le.fit_transform(labels)

# Train/Valid Split: hold out 10% of the rows for validation; the fixed
# random_state makes the split reproducible across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.1, random_state=42
)

# Tokenizer matching the "distilbert-base-uncased" checkpoint used for the model.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

class EmotionDataset(Dataset):
    """Dataset that pairs each text with its label and tokenizes on access.

    Nothing is tokenized up front; every ``__getitem__`` call tokenizes the
    requested text and returns it together with its label as tensors.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts`` by position.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding="max_length",
            max_length=max_len, return_tensors="pt")``. Its result must be
            indexable by ``"input_ids"`` and ``"attention_mask"``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx``.

        Returns:
            dict with keys ``"input_ids"`` and ``"attention_mask"`` (the
            tokenizer outputs flattened to one dimension) and ``"labels"``
            (the label as a ``torch.long`` tensor).
        """
        # Look up both fields before tokenizing so a bad index fails early.
        sample_text, sample_label = self.texts[idx], self.labels[idx]

        tokenizer_options = {
            "truncation": True,
            "padding": "max_length",
            "max_length": self.max_len,
            "return_tensors": "pt",
        }
        encoded = self.tokenizer(sample_text, **tokenizer_options)

        # Flatten the tokenizer outputs so each sample is a 1-D tensor.
        item = {
            field: encoded[field].flatten()
            for field in ("input_ids", "attention_mask")
        }
        item["labels"] = torch.tensor(sample_label, dtype=torch.long)
        return item

# Wrap each split in a dataset that tokenizes samples on access.
train_dataset = EmotionDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
)
val_dataset = EmotionDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
)

# Batches of 16 for both splits; only the training data is reshuffled each epoch.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

# ============================
#   MODEL
# ============================

# One output class per distinct label known to the label encoder.
num_classes = len(le.classes_)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained DistilBERT body with a classification head sized to `num_classes`.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=num_classes
)
model = model.to(device)

# AdamW over all model parameters with a learning rate of 2e-5.
optimizer = AdamW(params=model.parameters(), lr=2e-5)

# ============================
#   TRAINING LOOP
# ============================

# Number of full passes over the training data.
epochs = 3

for epoch in range(epochs):
    # --------- Training ----------
    # train() enables training-time behavior (e.g. dropout) in the model.
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        # Gradients accumulate by default, so clear them before each step.
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Deliberately NOT named `labels`: that name already holds the encoded
        # label array for the whole dataset, and reusing it here would silently
        # replace that array with the last batch's tensor.
        batch_labels = batch["labels"].to(device)

        # Passing `labels` makes the model compute the loss itself.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=batch_labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # .item() extracts a plain Python float so no graph is kept alive.
        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} | Training Loss: {avg_loss}")

    # --------- Validation ----------
    # eval() switches off training-time behavior; no_grad() skips gradient tracking.
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            batch_labels = batch["labels"].to(device)

            # No labels passed: only the logits are needed for accuracy.
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Predicted class = index of the largest logit for each sample.
            predictions = outputs.logits.argmax(dim=1)
            correct += (predictions == batch_labels).sum().item()
            total += batch_labels.size(0)

    # Fraction of validation samples classified correctly in this epoch.
    acc = correct / total
    print(f"Validation Accuracy: {acc:.4f}\n")

# ============================
#   SAVE MODEL
# ============================

# Write the fine-tuned model and its tokenizer into the same directory.
model.save_pretrained(save_directory="emotion_model")
tokenizer.save_pretrained(save_directory="emotion_model")

# Store the label names in the encoder's own order, one per row and without
# an index column, so predicted class ids can be mapped back to labels later.
pd.Series(le.classes_).to_csv(path_or_buf="emotion_labels.csv", index=False)

print("Model Saved!")

# ============================
#   PREDICTION FUNCTION
# ============================

def predict_emotion(sentence):
    """Predict the emotion label for ``sentence``.

    Relies on the module-level ``model``, ``tokenizer``, ``device`` and ``le``
    objects created earlier in the notebook.

    Args:
        sentence: Text to classify; passed to the tokenizer as-is.

    Returns:
        The label that ``le`` maps the highest-scoring class id back to.
    """
    # Inference only: put the model in evaluation mode.
    model.eval()

    # Same tokenization settings as used for the training samples.
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
        "return_tensors": "pt",
    }
    inputs = tokenizer(sentence, **tokenizer_options).to(device)

    # No gradients are needed for a forward pass used only for prediction.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Highest logit -> class id -> original label.
    class_id = logits.argmax(dim=1).item()
    decoded = le.inverse_transform([class_id])
    return decoded[0]

# Smoke test: classify one hand-written sentence and print the predicted label.
print(predict_emotion(sentence="I am really upset today"))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████████████████████████████████████████████████████| 1125/1125 [1:36:43<00:00,  5.16s/it]


Epoch 1 | Training Loss: 0.4369593536704779


Validating: 100%|████████████████████████████████████████████████████████████████████| 125/125 [02:34<00:00,  1.23s/it]


Validation Accuracy: 0.9360



Training Epoch 2: 100%|██████████████████████████████████████████████████████████| 1125/1125 [1:20:51<00:00,  4.31s/it]


Epoch 2 | Training Loss: 0.13839342298896776


Validating: 100%|████████████████████████████████████████████████████████████████████| 125/125 [03:12<00:00,  1.54s/it]


Validation Accuracy: 0.9365



Training Epoch 3: 100%|██████████████████████████████████████████████████████████| 1125/1125 [1:23:13<00:00,  4.44s/it]


Epoch 3 | Training Loss: 0.10581489467910594


Validating: 100%|████████████████████████████████████████████████████████████████████| 125/125 [02:26<00:00,  1.17s/it]


Validation Accuracy: 0.9415

Model Saved!
anger
