In [1]:
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import pandas as pd
from tqdm.notebook import tqdm
# from dotenv import load_dotenv
# load_dotenv()

In [2]:
# Load the raw tweets, keeping only the text and its emotion annotation.
# The text is read with the pandas `string` dtype and the annotation as a
# categorical column.
df = pd.read_csv(
    '../tweet_emotions.csv',
    dtype={'content': 'string', 'sentiment': 'category'},
    usecols=['content', 'sentiment'],
)

In [3]:
# Switch to task-oriented column names. `data_final` and `df` are two names
# bound to the same renamed frame (no copy is made between them).
data_final = df = df.rename(columns={'content': 'tweet', 'sentiment': 'label'})

In [4]:
# Encode labels
label_encoder = LabelEncoder()

# Build the encoded frame from `df` without modifying `df` itself, so this
# cell can be re-run safely: every run starts from the string labels again
# instead of re-encoding integers and stacking duplicate columns.
# Resulting columns: `tweet`, `label_desc` (emotion name) and `label`
# (integer id assigned by the encoder).
data_final = (
    df.rename(columns={'label': 'label_desc'})
      .assign(label=lambda frame: label_encoder.fit_transform(frame['label_desc']))
)

In [5]:
# Map each integer label id back to its emotion name. One row per distinct
# (name, id) pair is kept, in order of first appearance in the data.
label_pairs = data_final[['label_desc', 'label']].drop_duplicates(keep='first')
label_map = dict(zip(label_pairs['label'], label_pairs['label_desc']))
label_map

{2: 'empty',
 10: 'sadness',
 3: 'enthusiasm',
 8: 'neutral',
 12: 'worry',
 11: 'surprise',
 7: 'love',
 4: 'fun',
 6: 'hate',
 5: 'happiness',
 1: 'boredom',
 9: 'relief',
 0: 'anger'}

In [6]:
# Split data: 80% train / 20% test with a fixed seed for a repeatable split.
X, y = data_final['tweet'], data_final['label']
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)

# Give every part a fresh 0..n-1 index so positions and index labels agree.
X_train, X_test, y_train, y_test = [
    part.reset_index(drop=True) for part in split_parts
]

In [7]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [8]:
# Tokenizer and classifier both start from the same pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# The classification head gets one output per class known to the label encoder.
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
model = model.to(device)

# Optimizer; the learning-rate schedule is created once the number of
# training steps is known.
optimizer = AdamW(model.parameters(), eps=1e-8, lr=2e-5)
epochs = 3



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def _encode_texts(texts, max_length=256):
    """Tokenize ``texts`` into fixed-length id and attention-mask tensors.

    Args:
        texts: iterable of strings to encode.
        max_length: number of tokens per encoded row; shorter texts are
            padded and longer ones are truncated to this length.

    Returns:
        A ``(input_ids, attention_mask)`` pair of tensors with one row per
        text. The mask is returned as float, with 1.0 on real tokens and 0.0
        on padding.
    """
    encoded = tokenizer(
        list(texts),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        # Without truncation an over-long text would produce a longer row
        # than the others and the rows could not be stacked into one tensor.
        truncation=True,
        # Take the mask from the tokenizer instead of inferring it from the
        # token ids, so it does not depend on the padding token's id.
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask'].float()


train_inputs, train_masks = _encode_texts(X_train)
test_inputs, test_masks = _encode_texts(X_test)

# Labels are already integer ids; convert them to tensors as-is.
train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

In [10]:
batch_size = 32

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
test_data = TensorDataset(test_inputs, test_masks, test_labels)

# Training batches are drawn in random order; evaluation keeps dataset order.
train_sampler = RandomSampler(train_data)
test_sampler = SequentialSampler(test_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
test_dataloader = DataLoader(test_data, batch_size=batch_size, sampler=test_sampler)

In [11]:
# The scheduler is stepped once per training batch, so the schedule length
# is (batches per epoch) x (number of epochs). No warmup steps are used.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

In [12]:
# Training loop: for every epoch, one optimisation pass over the training
# loader followed by one evaluation pass over the test loader.
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    model.train()

    total_loss = 0
    # Accuracy is tracked as correct / seen samples rather than as a mean of
    # per-batch accuracies, so a smaller final batch is not over-weighted.
    train_correct = 0
    train_seen = 0
    for batch in tqdm(train_dataloader, desc="Training"):
        batch_input_ids, batch_input_mask, batch_labels = batch
        batch_input_ids = batch_input_ids.to(device)
        batch_input_mask = batch_input_mask.to(device)
        batch_labels = batch_labels.to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask, labels=batch_labels)
        loss = outputs.loss
        logits = outputs.logits

        total_loss += loss.item()
        loss.backward()

        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        preds = torch.argmax(logits, dim=1)
        train_correct += (preds == batch_labels).sum().item()
        train_seen += len(batch_labels)

    avg_train_loss = total_loss / len(train_dataloader)
    avg_train_accuracy = train_correct / train_seen
    print(f"Training loss: {avg_train_loss:.4f}")
    print(f"Training accuracy: {avg_train_accuracy:.4f}")

    # Validation loop
    model.eval()
    val_correct = 0
    val_seen = 0
    all_val_preds = []
    all_val_labels = []
    for batch in tqdm(test_dataloader, desc="Validation"):
        batch_input_ids, batch_input_mask, batch_labels = batch
        batch_input_ids = batch_input_ids.to(device)
        batch_input_mask = batch_input_mask.to(device)
        batch_labels = batch_labels.to(device)

        # No gradients are needed for evaluation; only the logits are used,
        # so the labels are not passed to the model here.
        with torch.no_grad():
            outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask)

        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)
        val_correct += (preds == batch_labels).sum().item()
        val_seen += len(batch_labels)

        # Collect predictions and labels across batches for the F1 score.
        all_val_preds.extend(preds.cpu().tolist())
        all_val_labels.extend(batch_labels.cpu().tolist())

    avg_val_accuracy = val_correct / val_seen
    val_f1_macro = f1_score(all_val_labels, all_val_preds, average='macro', zero_division=0)

    print(f"Validation accuracy: {avg_val_accuracy:.4f}")
    print(f"Validation F1 Macro: {val_f1_macro:.4f}")

Epoch 1/3


Training:   0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [37]:
# Write the model and the tokenizer files into the same directory so both
# can later be reloaded from a single path.
save_dir = './bert-emotion-classifier'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./bert-emotion-classifier/tokenizer_config.json',
 './bert-emotion-classifier/special_tokens_map.json',
 './bert-emotion-classifier/vocab.txt',
 './bert-emotion-classifier/added_tokens.json')

In [38]:
# Reload tokenizer and classifier from the saved directory, move the model
# to the selected device and switch it to evaluation mode.
model_path = './bert-emotion-classifier'
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path).to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [39]:
def predict_emotion(text):
    """Return the emotion label predicted for a single piece of text.

    Uses the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``label_encoder``.

    Args:
        text: the text to classify. It is padded or truncated to 128 tokens.

    Returns:
        The label obtained by passing the highest-scoring class index
        through ``label_encoder.inverse_transform``.
    """
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        result = model(
            encoding['input_ids'].to(device),
            attention_mask=encoding['attention_mask'].to(device),
        )

    # Highest-scoring class index for the single input row.
    class_id = result.logits.argmax(dim=1).item()
    decoded = label_encoder.inverse_transform([class_id])
    return decoded[0]

In [40]:
# Spot-check the classifier on the first ten test tweets.
example_indices = list(range(10))
example_texts = X_test.iloc[example_indices].tolist()
example_labels = y_test.iloc[example_indices].tolist()

for sample_text, sample_label in zip(example_texts, example_labels):
    predicted_label = predict_emotion(sample_text)
    print(f"Text: {sample_text}")
    # True labels are integer ids; translate them back to emotion names.
    print(f"True Label: {label_map[sample_label]}")
    print(f"Predicted Label: {predicted_label}\n")

Text: Good Morning
True Label: neutral
Predicted Label: neutral

Text: I just put my computer up on craigslist. I've had the same case, monitor, and speakers for over 5 years.
True Label: empty
Predicted Label: neutral

Text: in ten minutes shopping   demi lovato-back around demi lovato-behind enemy lines have you all seen the titanic 2 trailer? its really good!
True Label: love
Predicted Label: happiness

Text: From twitterberry moved to ubertwitter - suffered from BB cache errors
True Label: neutral
Predicted Label: worry

Text: @thriftymom TEAR*
True Label: sadness
Predicted Label: sadness

Text: Ps... I got Ben Button today... completes my Best Picture noms....got all 5 now. It was a GREAT year!!
True Label: happiness
Predicted Label: happiness

Text: Just fell asleep for 2 &amp; 1/2 hours so missed both chances for first-look Hollyoaks, ugh
True Label: worry
Predicted Label: sadness

Text: @VanetaRogers Thank ye, Vaneta.  Much appreciated
True Label: happiness
Predicted Label: lov