In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
pashupatigupta_emotion_detection_from_text_path = kagglehub.dataset_download('pashupatigupta/emotion-detection-from-text')

print('Data source import complete.')


Downloading from https://www.kaggle.com/api/v1/datasets/download/pashupatigupta/emotion-detection-from-text?dataset_version_number=1...


100%|██████████| 1.56M/1.56M [00:00<00:00, 129MB/s]

Extracting files...
Data source import complete.





In [1]:
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
df = pd.read_csv(
    './tweet_emotions.csv',
    usecols=['content', 'sentiment'],
    dtype={'content': 'string', 'sentiment': 'category'}
)

In [3]:
df = df.rename(columns={'content': 'tweet', 'sentiment': 'label'})

In [4]:
# Encode labels
label_encoder = LabelEncoder()
df['label_enc'] = label_encoder.fit_transform(df['label'])

df.rename(columns={'label':'label_desc'},inplace=True)
df.rename(columns={'label_enc':'label'},inplace=True)

In [5]:
label_map = {}
for index, row in df[['label_desc', 'label']].drop_duplicates(keep='first').iterrows():
    label_map[row['label']] = row['label_desc']
label_map

{2: 'empty',
 10: 'sadness',
 3: 'enthusiasm',
 8: 'neutral',
 12: 'worry',
 11: 'surprise',
 7: 'love',
 4: 'fun',
 6: 'hate',
 5: 'happiness',
 1: 'boredom',
 9: 'relief',
 0: 'anger'}

In [6]:
# Define emotion-to-category mapping
emotion_to_category = {
    'neutral': 0,
    'happiness': 1, 'love': 1, 'relief': 1, 'enthusiasm': 1, 'surprise': 1, 'fun': 1,
    'anger': 2, 'sadness': 2, 'worry': 2, 'hate': 2, 'boredom': 2, 'empty': 2
}

# Map the 'label_desc' column to new categories
df['label_desc'] = df['label_desc'].map(emotion_to_category)

In [7]:
data_final = df

In [8]:
# Split data
X = data_final['tweet']
y = data_final['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=42)
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_encoder.classes_)).to(device)

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 50

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
train_inputs = [tokenizer.encode(sent, add_special_tokens=True, max_length=256, pad_to_max_length=True) for sent in X_train]
test_inputs = [tokenizer.encode(sent, add_special_tokens=True, max_length=256, pad_to_max_length=True) for sent in X_test]

# Create attention masks
train_masks = [[float(i > 0) for i in seq] for seq in train_inputs]
test_masks = [[float(i > 0) for i in seq] for seq in test_inputs]

# Convert all data into torch tensors
train_inputs = torch.tensor(train_inputs)
train_masks = torch.tensor(train_masks)
train_labels = torch.tensor(y_train.values)

test_inputs = torch.tensor(test_inputs)
test_masks = torch.tensor(test_masks)
test_labels = torch.tensor(y_test.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
batch_size = 32

# Create DataLoader for training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create DataLoader for test set
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [13]:
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
# Training loop
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    model.train()

    total_loss = 0
    train_accuracy = 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
        batch_input_ids, batch_input_mask, batch_labels = batch
        batch_input_ids = batch_input_ids.to(device)
        batch_input_mask = batch_input_mask.to(device)
        batch_labels = batch_labels.to(device)

        model.zero_grad()

        outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask, labels=batch_labels)
        loss = outputs.loss
        logits = outputs.logits

        total_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        preds = torch.argmax(logits, dim=1)
        train_accuracy += (preds == batch_labels).sum().item() / len(batch_labels)

    avg_train_loss = total_loss / len(train_dataloader)
    avg_train_accuracy = train_accuracy / len(train_dataloader)
    print(f"Training loss: {avg_train_loss:.4f}")
    print(f"Training accuracy: {avg_train_accuracy:.4f}")

    # Validation loop
    model.eval()
    val_accuracy = 0
    for batch in tqdm(test_dataloader, desc="Validation"):
        batch_input_ids, batch_input_mask, batch_labels = batch
        batch_input_ids = batch_input_ids.to(device)
        batch_input_mask = batch_input_mask.to(device)
        batch_labels = batch_labels.to(device)

        with torch.no_grad():
            outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask, labels=batch_labels)

        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        val_accuracy += (preds == batch_labels).sum().item() / len(batch_labels)

    avg_val_accuracy = val_accuracy / len(test_dataloader)
    print(f"Validation accuracy: {avg_val_accuracy:.4f}")

Epoch 1/50


Training:   0%|          | 0/125 [00:00<?, ?it/s]

In [None]:
model.save_pretrained('./bert-emotion-classifier')
tokenizer.save_pretrained('./bert-emotion-classifier')

In [None]:
model_path = './bert-emotion-classifier'
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.to(device)
model.eval()

In [None]:
def predict_emotion(text):
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    return label_encoder.inverse_transform([predicted_class])[0]

In [None]:
example_indices = [0, 1, 2]
example_texts = X_test.iloc[example_indices].tolist()
example_labels = y_test.iloc[example_indices].tolist()

for text, true_label in zip(example_texts, example_labels):
    predicted_label = predict_emotion(text)
    print(f"Text: {text}")
    print(f"True Label: {label_map[true_label]}")
    print(f"Predicted Label: {predicted_label}\n")