# Emotion Detection

In [15]:
from tqdm import tqdm
# Split the data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Dataset Class for PyTorch
class EmotionDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.df.iloc[index]['text']
        labels = self.df.iloc[index][selected_emotions].values.astype(float)  # Convert labels to float
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.float)
        }

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = EmotionDataset(train_df, tokenizer, max_len=128)
test_dataset = EmotionDataset(test_df, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Define LSTM Model
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim=128, hidden_dim=256, output_dim=len(selected_emotions)):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(30522, embedding_dim)  # Assuming vocab size of BERT tokenizer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, input_ids):
        embedded = self.embedding(input_ids)
        lstm_output, _ = self.lstm(embedded)
        logits = self.fc(lstm_output[:, -1, :])
        return logits

# Define RNN Model (Simple RNN)
class RNNClassifier(nn.Module):
    def __init__(self, embedding_dim=128, hidden_dim=256, output_dim=len(selected_emotions)):
        super(RNNClassifier, self).__init__()
        self.embedding = nn.Embedding(30522, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, input_ids):
        embedded = self.embedding(input_ids)
        rnn_output, _ = self.rnn(embedded)
        logits = self.fc(rnn_output[:, -1, :])
        return logits

# Define CNN Model
class CNNClassifier(nn.Module):
    def __init__(self, vocab_size=30522, embedding_dim=128, num_classes=len(selected_emotions)):
        super(CNNClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv1d = nn.Conv1d(in_channels=embedding_dim,
                                 out_channels=128,
                                 kernel_size=3,
                                 padding=1)  # Adjust padding as needed
        self.relu = nn.ReLU()
        self.fc = nn.Linear(128, num_classes)

    def forward(self, input_ids):
        embedded = self.embedding(input_ids).permute(0, 2, 1)  # Change shape to (batch_size, embedding_dim, seq_length)
        conv_output = self.relu(self.conv1d(embedded))
        pooled_output = torch.max(conv_output, dim=2)[0]  # Max pooling over the time dimension
        logits = self.fc(pooled_output)
        return logits

# Initialize models and optimizers
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

lstm_model = LSTMClassifier().to(device)
rnn_model = RNNClassifier().to(device)
cnn_model = CNNClassifier().to(device)

optimizer_lstm = torch.optim.Adam(lstm_model.parameters(), lr=0.001)
optimizer_rnn = torch.optim.Adam(rnn_model.parameters(), lr=0.001)
optimizer_cnn = torch.optim.Adam(cnn_model.parameters(), lr=0.001)

# Training Function
def train_model(model, optimizer):
    model.train()
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        
        outputs = model(input_ids)
        
        loss_fn = nn.BCEWithLogitsLoss()  # Multi-label classification loss function
        loss = loss_fn(outputs, labels)  # No need to apply sigmoid here
        
        loss.backward()
        optimizer.step()

# Evaluate Function
def evaluate_model(model):
    model.eval()
    predictions_list = []
    actuals_list = []
    
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].numpy()

            outputs = model(input_ids).sigmoid()  # Apply sigmoid to get probabilities
            
            predictions_list.extend((outputs.cpu().numpy() > 0.5).astype(int))  # Convert probabilities to binary predictions
            actuals_list.extend(labels)

    return np.array(predictions_list), np.array(actuals_list)

# Train and evaluate each model

# Train LSTM Model
print("Training LSTM Model...")
for epoch in range(3):  # Number of epochs can be adjusted
    print(f"Epoch {epoch + 1}")
    train_model(lstm_model, optimizer_lstm)

lstm_predictions, lstm_actuals = evaluate_model(lstm_model)
print("\nLSTM Model Classification Report:")
print(classification_report(lstm_actuals, lstm_predictions, target_names=selected_emotions))

# Train RNN Model
print("Training RNN Model...")
for epoch in range(3):  # Number of epochs can be adjusted
    print(f"Epoch {epoch + 1}")
    train_model(rnn_model, optimizer_rnn)

rnn_predictions, rnn_actuals = evaluate_model(rnn_model)
print("\nRNN Model Classification Report:")
print(classification_report(rnn_actuals, rnn_predictions, target_names=selected_emotions))

# Train CNN Model
print("Training CNN Model...")
for epoch in range(3):  # Number of epochs can be adjusted
    print(f"Epoch {epoch + 1}")
    train_model(cnn_model, optimizer_cnn)

cnn_predictions, cnn_actuals = evaluate_model(cnn_model)
print("\nCNN Model Classification Report:")
print(classification_report(cnn_actuals, cnn_predictions, target_names=selected_emotions))

Training LSTM Model...
Epoch 1


100%|████████████████████████████████████████████████████████████████████████████████| 156/156 [00:22<00:00,  6.91it/s]


Epoch 2


100%|████████████████████████████████████████████████████████████████████████████████| 156/156 [00:23<00:00,  6.68it/s]


Epoch 3


100%|████████████████████████████████████████████████████████████████████████████████| 156/156 [00:33<00:00,  4.64it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [00:01<00:00, 12.24it/s]



LSTM Model Classification Report:
              precision    recall  f1-score   support

       Anger       0.00      0.00      0.00        29
        Fear       0.54      1.00      0.70       150
         Joy       0.00      0.00      0.00        68
     Sadness       0.00      0.00      0.00        87
    Surprise       0.00      0.00      0.00        85

   micro avg       0.54      0.36      0.43       419
   macro avg       0.11      0.20      0.14       419
weighted avg       0.19      0.36      0.25       419
 samples avg       0.54      0.32      0.39       419

Training RNN Model...
Epoch 1


100%|████████████████████████████████████████████████████████████████████████████████| 156/156 [00:58<00:00,  2.67it/s]


Epoch 2


100%|████████████████████████████████████████████████████████████████████████████████| 156/156 [00:19<00:00,  8.02it/s]


Epoch 3


100%|████████████████████████████████████████████████████████████████████████████████| 156/156 [00:18<00:00,  8.41it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 26.93it/s]



RNN Model Classification Report:
              precision    recall  f1-score   support

       Anger       0.00      0.00      0.00        29
        Fear       0.54      1.00      0.70       150
         Joy       0.00      0.00      0.00        68
     Sadness       0.00      0.00      0.00        87
    Surprise       0.00      0.00      0.00        85

   micro avg       0.54      0.36      0.43       419
   macro avg       0.11      0.20      0.14       419
weighted avg       0.19      0.36      0.25       419
 samples avg       0.54      0.32      0.39       419

Training CNN Model...
Epoch 1


100%|████████████████████████████████████████████████████████████████████████████████| 156/156 [00:12<00:00, 12.19it/s]


Epoch 2


100%|████████████████████████████████████████████████████████████████████████████████| 156/156 [00:12<00:00, 12.53it/s]


Epoch 3


100%|████████████████████████████████████████████████████████████████████████████████| 156/156 [00:12<00:00, 12.16it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 42.23it/s]


CNN Model Classification Report:
              precision    recall  f1-score   support

       Anger       0.00      0.00      0.00        29
        Fear       0.65      0.74      0.69       150
         Joy       0.42      0.12      0.18        68
     Sadness       0.43      0.07      0.12        87
    Surprise       0.68      0.42      0.52        85

   micro avg       0.63      0.38      0.48       419
   macro avg       0.44      0.27      0.30       419
weighted avg       0.53      0.38      0.41       419
 samples avg       0.45      0.34      0.37       419






In [14]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW
from tqdm import tqdm

# Load the training dataset
train_file = 'public_data/train/track_a/eng.csv'
df = pd.read_csv(train_file)

# Select relevant columns and convert emotions to multi-label binary format
selected_emotions = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
df = df[selected_emotions + ['text']]
mlb = MultiLabelBinarizer()
df[selected_emotions] = df[selected_emotions].apply(lambda x: x > 0).astype(int)

# Split the data into training and testing sets (using the same data for both)
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Dataset Class for PyTorch
class EmotionDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.df.iloc[index]['text']
        labels = self.df.iloc[index][selected_emotions].values.astype(float)  # Convert labels to float
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.float)
        }

# Tokenizer for ALBERT
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
train_dataset = EmotionDataset(train_df, tokenizer, max_len=128)
test_dataset = EmotionDataset(test_df, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Initialize ALBERT Model for Multi-Label Classification
model_albert = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=len(selected_emotions))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_albert.to(device)

optimizer_albert = AdamW(model_albert.parameters(), lr=2e-5)

# Training Loop for ALBERT Model
for epoch in range(3):
    model_albert.train()
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}", unit="batch")
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer_albert.zero_grad()
        
        outputs = model_albert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        loss.backward()
        optimizer_albert.step()
        
        progress_bar.set_postfix(loss=loss.item())

# Evaluate ALBERT Model on Test Set
model_albert.eval()
predictions_list = []
actuals_list = []

with torch.no_grad():
    progress_bar = tqdm(test_loader, desc="Evaluating", unit="batch")
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        outputs = model_albert(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.sigmoid(outputs.logits).cpu().numpy()
        
        predictions_list.extend(preds)
        actuals_list.extend(batch['labels'].numpy())

# Convert predictions to binary (0 or 1) based on threshold (0.5)
predictions_binary = (np.array(predictions_list) > 0.5).astype(int)

# Print classification report for ALBERT Model
print("\nALBERT Model Classification Report:")
print(classification_report(actuals_list, predictions_binary, target_names=selected_emotions))

# Save the trained ALBERT model and tokenizer in a specified folder
model_save_path = './albert_emotion_model'
model_albert.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 156/156 [47:10<00:00, 18.14s/batch, loss=0.577]
Epoch 2: 100%|██████████| 156/156 [49:06<00:00, 18.89s/batch, loss=0.46] 
Epoch 3: 100%|██████████| 156/156 [36:41<00:00, 14.11s/batch, loss=0.315]
Evaluating: 100%|██████████| 18/18 [01:02<00:00,  3.49s/batch]


ALBERT Model Classification Report:
              precision    recall  f1-score   support

       Anger       1.00      0.10      0.19        29
        Fear       0.87      0.58      0.70       150
         Joy       0.63      0.57      0.60        68
     Sadness       0.91      0.37      0.52        87
    Surprise       0.84      0.48      0.61        85

   micro avg       0.81      0.48      0.60       419
   macro avg       0.85      0.42      0.52       419
weighted avg       0.84      0.48      0.59       419
 samples avg       0.54      0.45      0.48       419






('./albert_emotion_model\\tokenizer_config.json',
 './albert_emotion_model\\special_tokens_map.json',
 './albert_emotion_model\\spiece.model',
 './albert_emotion_model\\added_tokens.json')

In [3]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from tqdm import tqdm

# Load the training dataset
train_file = 'public_data/train/track_a/eng.csv'
df = pd.read_csv(train_file)

# Select relevant columns and convert emotions to multi-label binary format
selected_emotions = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
df = df[selected_emotions + ['text']]
mlb = MultiLabelBinarizer()
df[selected_emotions] = df[selected_emotions].apply(lambda x: x > 0).astype(int)

# Split the data into training and testing sets (using the same data for both)
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Dataset Class for PyTorch
class EmotionDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.df.iloc[index]['text']
        labels = self.df.iloc[index][selected_emotions].values.astype(float)  # Convert labels to float
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.float)
        }

# Tokenizer for RoBERTa
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
train_dataset = EmotionDataset(train_df, tokenizer, max_len=128)
test_dataset = EmotionDataset(test_df, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Initialize RoBERTa Model for Multi-Label Classification
model_roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(selected_emotions))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_roberta.to(device)

optimizer_roberta = AdamW(model_roberta.parameters(), lr=2e-5)

# Training Loop for RoBERTa Model
for epoch in range(3):
    model_roberta.train()
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}", unit="batch")
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer_roberta.zero_grad()
        
        outputs = model_roberta(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        loss.backward()
        optimizer_roberta.step()
        
        progress_bar.set_postfix(loss=loss.item())

# Evaluate RoBERTa Model on Test Set
model_roberta.eval()
predictions_list = []
actuals_list = []

with torch.no_grad():
    progress_bar = tqdm(test_loader, desc="Evaluating", unit="batch")
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        outputs = model_roberta(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.sigmoid(outputs.logits).cpu().numpy()
        
        predictions_list.extend(preds)
        actuals_list.extend(batch['labels'].numpy())

# Convert predictions to binary (0 or 1) based on threshold (0.5)
predictions_binary = (np.array(predictions_list) > 0.5).astype(int)

# Print classification report for RoBERTa Model
print("\nRoBERTa Model Classification Report:")
print(classification_report(actuals_list, predictions_binary, target_names=selected_emotions))

# Save the trained RoBERTa model and tokenizer in a specified folder
model_save_path = './roberta_emotion_model'
model_roberta.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 156/156 [41:03<00:00, 15.79s/batch, loss=0.448]
Epoch 2: 100%|██████████| 156/156 [40:56<00:00, 15.75s/batch, loss=0.302]
Epoch 3: 100%|██████████| 156/156 [41:30<00:00, 15.96s/batch, loss=0.211]
Evaluating: 100%|██████████| 18/18 [01:26<00:00,  4.83s/batch]



RoBERTa Model Classification Report:
              precision    recall  f1-score   support

       Anger       0.60      0.62      0.61        29
        Fear       0.72      0.89      0.80       150
         Joy       0.77      0.53      0.63        68
     Sadness       0.66      0.70      0.68        87
    Surprise       0.79      0.59      0.68        85

   micro avg       0.72      0.71      0.71       419
   macro avg       0.71      0.67      0.68       419
weighted avg       0.72      0.71      0.71       419
 samples avg       0.63      0.63      0.61       419



('./roberta_emotion_model\\tokenizer_config.json',
 './roberta_emotion_model\\special_tokens_map.json',
 './roberta_emotion_model\\vocab.json',
 './roberta_emotion_model\\merges.txt',
 './roberta_emotion_model\\added_tokens.json')

In [29]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from tqdm import tqdm

# Load the training dataset
train_file = 'public_data/train/track_a/eng.csv'
df = pd.read_csv(train_file)

# Select relevant columns and convert emotions to multi-label binary format
selected_emotions = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
df = df[selected_emotions + ['text']]
mlb = MultiLabelBinarizer()
df[selected_emotions] = df[selected_emotions].apply(lambda x: x > 0).astype(int)

# Split the data into training and testing sets (using the same data for both)
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Dataset Class for PyTorch
class EmotionDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.df.iloc[index]['text']
        labels = self.df.iloc[index][selected_emotions].values.astype(float)  # Convert labels to float
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.float)
        }

# Tokenizer for DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_dataset = EmotionDataset(train_df, tokenizer, max_len=128)
test_dataset = EmotionDataset(test_df, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Initialize DistilBERT Model for Multi-Label Classification
model_distilbert = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(selected_emotions))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_distilbert.to(device)

optimizer_distilbert = AdamW(model_distilbert.parameters(), lr=2e-5)

# Training Loop for DistilBERT Model
for epoch in range(3):
    model_distilbert.train()
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}", unit="batch")
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer_distilbert.zero_grad()
        
        outputs = model_distilbert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        loss.backward()
        optimizer_distilbert.step()
        
        progress_bar.set_postfix(loss=loss.item())

# Evaluate DistilBERT Model on Test Set
model_distilbert.eval()
predictions_list = []
actuals_list = []

with torch.no_grad():
    progress_bar = tqdm(test_loader, desc="Evaluating", unit="batch")
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        outputs = model_distilbert(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.sigmoid(outputs.logits).cpu().numpy()
        
        predictions_list.extend(preds)
        actuals_list.extend(batch['labels'].numpy())

# Convert predictions to binary (0 or 1) based on threshold (0.5)
predictions_binary = (np.array(predictions_list) > 0.5).astype(int)

# Print classification report for DistilBERT Model
print("\nDistilBERT Model Classification Report:")
print(classification_report(actuals_list, predictions_binary, target_names=selected_emotions))

# Save the trained DistilBERT model and tokenizer in a specified folder
model_save_path = './distilbert_emotion_model'
model_distilbert.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 156/156 [13:29<00:00,  5.19s/batch, loss=0.348]
Epoch 2: 100%|██████████| 156/156 [13:02<00:00,  5.02s/batch, loss=0.448]
Epoch 3: 100%|██████████| 156/156 [13:39<00:00,  5.25s/batch, loss=0.184]
Evaluating: 100%|██████████| 18/18 [00:29<00:00,  1.61s/batch]



DistilBERT Model Classification Report:
              precision    recall  f1-score   support

       Anger       0.80      0.28      0.41        29
        Fear       0.73      0.88      0.80       150
         Joy       0.73      0.44      0.55        68
     Sadness       0.62      0.59      0.60        87
    Surprise       0.70      0.62      0.66        85

   micro avg       0.70      0.65      0.68       419
   macro avg       0.72      0.56      0.60       419
weighted avg       0.70      0.65      0.66       419
 samples avg       0.60      0.58      0.57       419



('./distilbert_emotion_model\\tokenizer_config.json',
 './distilbert_emotion_model\\special_tokens_map.json',
 './distilbert_emotion_model\\vocab.txt',
 './distilbert_emotion_model\\added_tokens.json')

In [30]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout
from tqdm import tqdm

# Load the training and test datasets
train_file = 'public_data/train/track_a/eng.csv'
test_file = 'public_data/test/track_a/eng_a.csv'

df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

# Select the relevant columns
selected_emotions = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
df = df[selected_emotions + ['text']]

# Convert emotions to multi-label binary format
df[selected_emotions] = df[selected_emotions].apply(lambda x: x > 0).astype(int)

# Split the data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# Dataset Class for PyTorch
class EmotionDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.df.iloc[index]['text']
        labels = self.df.iloc[index][selected_emotions].values.astype(float)  # Convert labels to float
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.float)
        }

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = EmotionDataset(train_df, tokenizer, max_len=128)
test_dataset = EmotionDataset(test_df, tokenizer, max_len=128)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# BERT Model
model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(selected_emotions))
model_bert = model_bert.to(device)

optimizer_bert = AdamW(model_bert.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_loader) * 3
scheduler_bert = get_linear_schedule_with_warmup(optimizer_bert, num_warmup_steps=0, num_training_steps=total_steps)

# Training Loop for BERT Model
for epoch in range(3):
    model_bert.train()
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}", unit="batch")
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer_bert.zero_grad()
        outputs = model_bert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer_bert.step()
        scheduler_bert.step()
        
        progress_bar.set_postfix(loss=loss.item())

# Evaluate BERT Model on Test Set
model_bert.eval()
predictions = []
actuals = []

with torch.no_grad():
    progress_bar = tqdm(test_loader, desc="Evaluating", unit="batch")
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        outputs = model_bert(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.sigmoid(outputs.logits).cpu().numpy()
        
        predictions.extend(preds)
        actuals.extend(batch['labels'].numpy())

# Convert predictions to binary (0 or 1) based on threshold (0.5)
predictions_binary = (np.array(predictions) > 0.5).astype(int)

# Print classification report for BERT Model
print("\nBERT Model Classification Report:")
print(classification_report(actuals, predictions_binary, target_names=selected_emotions))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 156/156 [45:19<00:00, 17.43s/batch, loss=0.349]
Epoch 2: 100%|██████████| 156/156 [43:06<00:00, 16.58s/batch, loss=0.263]
Epoch 3: 100%|██████████| 156/156 [52:58<00:00, 20.38s/batch, loss=0.145]
Evaluating: 100%|██████████| 18/18 [01:37<00:00,  5.44s/batch]


BERT Model Classification Report:
              precision    recall  f1-score   support

       Anger       0.67      0.41      0.51        29
        Fear       0.78      0.85      0.82       150
         Joy       0.74      0.57      0.64        68
     Sadness       0.73      0.53      0.61        87
    Surprise       0.74      0.65      0.69        85

   micro avg       0.75      0.67      0.71       419
   macro avg       0.73      0.60      0.66       419
weighted avg       0.75      0.67      0.70       419
 samples avg       0.63      0.60      0.60       419






# TESTING

In [39]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download necessary NLTK data
nltk.download('vader_lexicon', quiet=True)

# Initialize the NLTK sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Define the selected emotions
selected_emotions = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']

def classify_emotions(text):
    try:
        # Get the sentiment scores
        scores = sia.polarity_scores(text)
        
        # Map sentiment scores to emotions
        emotions = []
        if scores['compound'] >= 0.5:
            emotions.append('Joy')
        elif scores['compound'] <= -0.5:
            emotions.append('Sadness')
            if scores['neg'] > 0.5:
                emotions.append('Anger')
        
        if abs(scores['compound']) < 0.5 and scores['neu'] > 0.5:
            emotions.append('Surprise')
        
        if scores['neg'] > 0.3 and 'Joy' not in emotions:
            emotions.append('Fear')
        
        # If no emotions detected, return the dominant sentiment
        if not emotions:
            if scores['pos'] > scores['neg']:
                return 'Positive'
            elif scores['neg'] > scores['pos']:
                return 'Negative'
            else:
                return 'Neutral'
        
        return ', '.join(emotions)
    except Exception as e:
        return f"Error: {str(e)}"

# Example text inputs
texts = [
    "I am so excited about the new movie release!",
    "I feel anxious about the upcoming exam.",
    "This betrayal makes me very angry and sad.",
    "I am feeling quite down today but there was also a pleasant surprise.",
    "The surprise party was amazing and it made me very happy!"
]

# Classify emotions for each text
for text in texts:
    emotions = classify_emotions(text)
    print(f"Text: {text}")
    print(f"Detected Emotions: {emotions}\n")

Text: I am so excited about the new movie release!
Detected Emotions: Surprise

Text: I feel anxious about the upcoming exam.
Detected Emotions: Surprise

Text: This betrayal makes me very angry and sad.
Detected Emotions: Sadness, Anger, Fear

Text: I am feeling quite down today but there was also a pleasant surprise.
Detected Emotions: Joy

Text: The surprise party was amazing and it made me very happy!
Detected Emotions: Joy



In [31]:
import torch
from transformers import BertTokenizer

#tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#emotion labels
selected_emotions = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']

#function to preprocess text
def preprocess_text(text, tokenizer, max_len=128):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    return encoding['input_ids'].flatten(), encoding['attention_mask'].flatten()

#prediction function
def predict(text):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        lstm_output = lstm_model(input_ids).sigmoid()
        rnn_output = rnn_model(input_ids).sigmoid()
        cnn_output = cnn_model(input_ids).sigmoid()

    lstm_predicted_labels = (lstm_output.cpu().numpy() > 0.5).astype(int)[0]
    rnn_predicted_labels = (rnn_output.cpu().numpy() > 0.5).astype(int)[0]
    cnn_predicted_labels = (cnn_output.cpu().numpy() > 0.5).astype(int)[0]

    return lstm_predicted_labels, rnn_predicted_labels, cnn_predicted_labels

#sample texts
sample_texts = [
    "I am so happy with the outcome of the project!",
    "The news was really shocking and surprising.",
    "I am angry",
    "The movie was very sad and touching."
]

# format predictions
def format_predictions(predictions):
    return {
        emotion: 'Present' if pred == 1 else 'Not Present'
        for emotion, pred in zip(selected_emotions, predictions)
    }

# Print the formatted predictions for each sample text
for sample_text in sample_texts:
    print(f"Sample Text: '{sample_text}'")
    
    lstm_prediction, rnn_prediction, cnn_prediction = predict(sample_text)
    
    print("LSTM Model Predictions:")
    formatted_lstm_predictions = format_predictions(lstm_prediction)
    for emotion, status in formatted_lstm_predictions.items():
        print(f"  Emotion '{emotion}': {status}")
    print()
    
    print("RNN Model Predictions:")
    formatted_rnn_predictions = format_predictions(rnn_prediction)
    for emotion, status in formatted_rnn_predictions.items():
        print(f"  Emotion '{emotion}': {status}")
    print()
    
    print("CNN Model Predictions:")
    formatted_cnn_predictions = format_predictions(cnn_prediction)
    for emotion, status in formatted_cnn_predictions.items():
        print(f"  Emotion '{emotion}': {status}")
    print()


Sample Text: 'I am so happy with the outcome of the project!'
LSTM Model Predictions:
  Emotion 'Anger': Present
  Emotion 'Fear': Not Present
  Emotion 'Joy': Not Present
  Emotion 'Sadness': Present
  Emotion 'Surprise': Not Present

RNN Model Predictions:
  Emotion 'Anger': Present
  Emotion 'Fear': Present
  Emotion 'Joy': Not Present
  Emotion 'Sadness': Not Present
  Emotion 'Surprise': Not Present

CNN Model Predictions:
  Emotion 'Anger': Not Present
  Emotion 'Fear': Not Present
  Emotion 'Joy': Present
  Emotion 'Sadness': Not Present
  Emotion 'Surprise': Not Present

Sample Text: 'The news was really shocking and surprising.'
LSTM Model Predictions:
  Emotion 'Anger': Present
  Emotion 'Fear': Not Present
  Emotion 'Joy': Not Present
  Emotion 'Sadness': Present
  Emotion 'Surprise': Not Present

RNN Model Predictions:
  Emotion 'Anger': Present
  Emotion 'Fear': Present
  Emotion 'Joy': Not Present
  Emotion 'Sadness': Not Present
  Emotion 'Surprise': Not Present

CNN Mod

In [14]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout
from tqdm import tqdm
from transformers import BertForSequenceClassification, BertTokenizer

model_save_path='./bert_emotion_model'
loaded_model = BertForSequenceClassification.from_pretrained(model_save_path)
loaded_tokenizer = BertTokenizer.from_pretrained(model_save_path)

import torch

# Define the device to use (GPU if available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def predict(text):
    encoding = loaded_tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = loaded_model(input_ids=input_ids, attention_mask=attention_mask)
    
    preds = torch.sigmoid(outputs.logits).cpu().numpy()
    return (preds > 0.5).astype(int)

complex_text = (
    "Despite facing numerous challenges in life, I find solace in my ability to adapt and overcome. "
    "The constant pressure from work and personal expectations can be overwhelming, yet I strive to maintain "
    "a positive outlook. However, there are moments of doubt that creep in, especially when I reflect on past "
    "failures and the fear of repeating them. It's a bittersweet journey filled with highs and lows, but I know "
    "that every experience shapes who I am and I am joyful in my life."
)

predicted_probs = predict(complex_text)

# Convert probabilities to binary predictions (0 or 1)
predicted_labels = (predicted_probs > 0.5).astype(int)[0]

# Display detailed results
emotion_labels = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
detailed_results = {emotion: predicted_labels[i] for i, emotion in enumerate(emotion_labels)}

print(f"Predicted probabilities for '{complex_text}': {predicted_probs}")
print(f"Predicted labels: {detailed_results}")

Predicted probabilities for 'Despite facing numerous challenges in life, I find solace in my ability to adapt and overcome. The constant pressure from work and personal expectations can be overwhelming, yet I strive to maintain a positive outlook. However, there are moments of doubt that creep in, especially when I reflect on past failures and the fear of repeating them. It's a bittersweet journey filled with highs and lows, but I know that every experience shapes who I am and I am joyful in my life.': [[0 0 1 1 0]]
Predicted labels: {'Anger': 0, 'Fear': 0, 'Joy': 1, 'Sadness': 1, 'Surprise': 0}


In [22]:
import torch
from transformers import AlbertTokenizer, AlbertForSequenceClassification

# Define the emotion classes (corresponding to your model's label indices)
emotion_classes = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']

# Load the saved ALBERT model and tokenizer
model_save_path = './albert_emotion_model'
loaded_model = AlbertForSequenceClassification.from_pretrained(model_save_path)
loaded_tokenizer = AlbertTokenizer.from_pretrained(model_save_path)

# Define the device (CPU or GPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
loaded_model.to(device)

# Define a prediction function using the loaded model and tokenizer
def predict(text):
    encoding = loaded_tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
       outputs = loaded_model(input_ids=input_ids, attention_mask=attention_mask)
       preds = torch.sigmoid(outputs.logits).cpu().numpy()

    # Convert predictions to binary labels
    binary_preds = (preds > 0.5).astype(int)
    
    # Map binary predictions to emotion classes
    predicted_emotions = [emotion_classes[i] for i, pred in enumerate(binary_preds[0]) if pred == 1]
    
    return predicted_emotions

# Example usage of prediction function with a complex text:
complex_text = "Despite facing numerous challenges in life, I find solace in my ability to adapt and overcome."
predicted_emotions = predict(complex_text)

# Create a detailed output showing which emotions are present and which are not
detailed_output = {emotion: (emotion in predicted_emotions) for emotion in emotion_classes}

print(f"Text: '{complex_text}'")
print("Detailed Prediction Output:")
for emotion, present in detailed_output.items():
    print(f"{emotion}: {'Present' if present else 'Not Present'}")

Text: 'Despite facing numerous challenges in life, I find solace in my ability to adapt and overcome.'
Detailed Prediction Output:
Anger: Not Present
Fear: Not Present
Joy: Present
Sadness: Not Present
Surprise: Not Present


In [12]:
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Define device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the saved RoBERTa model and tokenizer (for future use)
loaded_model = RobertaForSequenceClassification.from_pretrained(model_save_path)
loaded_model = loaded_model.to(device)  # Move the model to the device
loaded_tokenizer = RobertaTokenizer.from_pretrained(model_save_path)

# List of emotions corresponding to the model's output labels
emotions = ['joy', 'sadness', 'anger', 'fear', 'surprise', 'disgust']

# Define a prediction function using the loaded model and tokenizer
def predict(text):
    encoding = loaded_tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = loaded_model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.sigmoid(outputs.logits).cpu().numpy()  # Get probabilities

    return (preds > 0.5).astype(int)[0]  # Convert probabilities to binary (0 or 1)

# Example usage of prediction function with a complex text:
complex_text = "Despite facing numerous challenges in life, I find solace in my ability to adapt and overcome but i am sad that i never succeeded."
predicted_labels = predict(complex_text)

# Create a detailed output mapping the predicted labels to the emotions
def detailed_emotion_output(predicted_labels):
    emotion_results = {}
    for i, label in enumerate(predicted_labels):
        emotion_results[emotions[i]] = 'Present' if label == 1 else 'Not Present'
    return emotion_results

# Get detailed output
detailed_output = detailed_emotion_output(predicted_labels)

# Print detailed output
print(f"Predicted labels for '{complex_text}':")
for emotion, status in detailed_output.items():
    print(f"{emotion.capitalize()}: {status}")

Predicted labels for 'Despite facing numerous challenges in life, I find solace in my ability to adapt and overcome but i am sad that i never succeeded.':
Joy: Not Present
Sadness: Not Present
Anger: Present
Fear: Present
Surprise: Not Present


In [36]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Define the emotion classes (corresponding to your model's label indices)
emotion_classes = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']

# Load the saved DistilBERT model and tokenizer
model_save_path = './distilbert_emotion_model'
loaded_model = DistilBertForSequenceClassification.from_pretrained(model_save_path)
loaded_tokenizer = DistilBertTokenizer.from_pretrained(model_save_path)

# Define the device (CPU or GPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
loaded_model.to(device)

# Define a prediction function using the loaded model and tokenizer
def predict(text):
    encoding = loaded_tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
       outputs = loaded_model(input_ids=input_ids, attention_mask=attention_mask)
       preds = torch.sigmoid(outputs.logits).cpu().numpy()

    # Convert predictions to binary labels
    binary_preds = (preds > 0.5).astype(int)
    
    # Map binary predictions to emotion classes
    predicted_emotions = [emotion_classes[i] for i, pred in enumerate(binary_preds[0]) if pred == 1]
    
    return predicted_emotions

# Example usage of prediction function with a complex text:
complex_text = "Despite facing numerous challenges in life, I find solace in my ability to adapt and overcome."
predicted_emotions = predict(complex_text)

# Create a detailed output showing which emotions are present and which are not
detailed_output = {emotion: (emotion in predicted_emotions) for emotion in emotion_classes}

print(f"Text: '{complex_text}'")
print("Detailed Prediction Output:")
for emotion, present in detailed_output.items():
    print(f"{emotion}: {'Present' if present else 'Not Present'}")

Text: 'Despite facing numerous challenges in life, I find solace in my ability to adapt and overcome.'
Detailed Prediction Output:
Anger: Not Present
Fear: Not Present
Joy: Present
Sadness: Not Present
Surprise: Not Present


In [21]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AlbertTokenizer, AlbertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader

class EmotionDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.df.iloc[index]['text']
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

class EnsembleModel(nn.Module):
    def __init__(self, num_labels, num_models):
        super(EnsembleModel, self).__init__()
        self.classifier = nn.Linear(num_labels * num_models, num_labels)
        
    def forward(self, model_outputs):
        logits = self.classifier(model_outputs)
        return logits

def predict(model, tokenizer, texts, device, max_len=128):
    model.eval()
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len, return_tensors="pt")
    
    with torch.no_grad():
        input_ids = encodings['input_ids'].to(device)
        attention_mask = encodings['attention_mask'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probs = torch.sigmoid(logits).cpu().numpy()
    
    return probs

def ensemble_predict(ensemble_model, models, tokenizers, texts, device, max_len=128):
    ensemble_model.eval()
    for model in models:
        model.eval()
    
    with torch.no_grad():
        model_outputs = []
        for model, tokenizer in zip(models, tokenizers):
            encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len, return_tensors="pt")
            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            model_outputs.append(torch.sigmoid(logits))
        
        model_outputs = torch.cat(model_outputs, dim=1)
        ensemble_logits = ensemble_model(model_outputs)
        probs = torch.sigmoid(ensemble_logits).cpu().numpy()
    
    return probs

if __name__ == "__main__":
    selected_emotions = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Load models and tokenizers
    distilbert_path = './distilbert_emotion_model'
    albert_path = './albert_emotion_model'
    roberta_path = './roberta_emotion_model'
    
    distilbert_model = DistilBertForSequenceClassification.from_pretrained(distilbert_path).to(device)
    albert_model = AlbertForSequenceClassification.from_pretrained(albert_path).to(device)
    roberta_model = RobertaForSequenceClassification.from_pretrained(roberta_path).to(device)
    
    distilbert_tokenizer = DistilBertTokenizer.from_pretrained(distilbert_path)
    albert_tokenizer = AlbertTokenizer.from_pretrained(albert_path)
    roberta_tokenizer = RobertaTokenizer.from_pretrained(roberta_path)
    
    ensemble_model = EnsembleModel(len(selected_emotions), 3).to(device)
    
    print("Models loaded successfully.")
    
    # Test texts
    test_texts = [
        "The rollercoaster of emotions I experienced during the movie's climax left me breathless, tears streaming down my face even as I smiled at the bittersweet resolution.",
        "As I stood atop the mountain, gazing at the vast expanse before me, I felt a mix of exhilaration and trepidation, my heart racing with the thrill of accomplishment and the fear of the descent ahead.",
        "The unexpected news of my promotion filled me with a paradoxical blend of joy and anxiety, as I celebrated my success while grappling with the weight of new responsibilities.",
        "Watching the sunset over the ocean, I was overcome by a profound sense of peace tinged with a melancholic awareness of life's transient nature.",
        "The heated argument with my best friend left me feeling a tumultuous mix of anger, regret, and a desperate hope for reconciliation."
    ]
    
    # Make predictions
    models = [distilbert_model, albert_model, roberta_model]
    tokenizers = [distilbert_tokenizer, albert_tokenizer, roberta_tokenizer]
    predictions = ensemble_predict(ensemble_model, models, tokenizers, test_texts, device)
    
    # Print results
    for i, text in enumerate(test_texts):
        print(f"\nText {i+1}: {text}")
        print("Predictions:")
        for emotion, prob in zip(selected_emotions, predictions[i]):
            print(f"{emotion}: {prob:.4f} ({'Yes' if prob > 0.5 else 'No'})")
        print("---")

print("Testing completed.")

Models loaded successfully.

Text 1: The rollercoaster of emotions I experienced during the movie's climax left me breathless, tears streaming down my face even as I smiled at the bittersweet resolution.
Predictions:
Anger: 0.4656 (No)
Fear: 0.5063 (Yes)
Joy: 0.4453 (No)
Sadness: 0.5369 (Yes)
Surprise: 0.4720 (No)
---

Text 2: As I stood atop the mountain, gazing at the vast expanse before me, I felt a mix of exhilaration and trepidation, my heart racing with the thrill of accomplishment and the fear of the descent ahead.
Predictions:
Anger: 0.5751 (Yes)
Fear: 0.5885 (Yes)
Joy: 0.3489 (No)
Sadness: 0.4909 (No)
Surprise: 0.4803 (No)
---

Text 3: The unexpected news of my promotion filled me with a paradoxical blend of joy and anxiety, as I celebrated my success while grappling with the weight of new responsibilities.
Predictions:
Anger: 0.5834 (Yes)
Fear: 0.6297 (Yes)
Joy: 0.3331 (No)
Sadness: 0.5186 (Yes)
Surprise: 0.4679 (No)
---

Text 4: Watching the sunset over the ocean, I was over

In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import (
    DistilBertTokenizer, DistilBertForSequenceClassification,
    AlbertTokenizer, AlbertForSequenceClassification,
    RobertaTokenizer, RobertaForSequenceClassification,
    BertTokenizer, BertForSequenceClassification,
    AdamW
)
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Load and preprocess the data
def load_data(file_path):
    df = pd.read_csv(file_path)
    df.fillna('', inplace=True)  # Replace NaN values with an empty string
    return df

# Dataset Class for PyTorch
class EmotionDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.df.iloc[index]['text']
        labels = self.df.iloc[index][['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']].values.astype(float)
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.float)
        }

# Function to get predictions from a model
def get_predictions(model, tokenizer, df):
    dataset = EmotionDataset(df, tokenizer)
    loader = DataLoader(dataset, batch_size=16)

    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in tqdm(loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.sigmoid(outputs.logits).cpu().numpy()
            predictions.extend(preds)

    return np.array(predictions)

# Load data
df = load_data('public_data/train/track_a/eng.csv')

# Initialize models and tokenizers
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

distilbert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5).to(device)
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=5).to(device)
albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=5).to(device)
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Step 1: Train DistilBERT
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
distilbert_dataset = EmotionDataset(train_df, distilbert_tokenizer)
distilbert_loader = DataLoader(distilbert_dataset, batch_size=16)

optimizer_distilbert = AdamW(distilbert_model.parameters(), lr=2e-5)

for epoch in range(3):
    distilbert_model.train()
    for batch in tqdm(distilbert_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer_distilbert.zero_grad()
        
        outputs = distilbert_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        loss.backward()
        optimizer_distilbert.step()

# Get predictions from DistilBERT
predictions_distilbert_train = get_predictions(distilbert_model, distilbert_tokenizer, train_df)

# Check if the number of predictions matches the number of rows
assert len(predictions_distilbert_train) == len(train_df), \
    f"Number of predictions ({len(predictions_distilbert_train)}) does not match the number of rows ({len(train_df)})"

# Assign predictions to train_df_albert
train_df_albert = train_df.copy()
train_df_albert['DistilBERT_Predictions'] = predictions_distilbert_train.tolist()

# Train ALBERT using DistilBERT outputs
albert_dataset = EmotionDataset(train_df_albert, albert_tokenizer)
albert_loader = DataLoader(albert_dataset, batch_size=16)

optimizer_albert = AdamW(albert_model.parameters(), lr=2e-5)

for epoch in range(3):
    albert_model.train()
    for batch in tqdm(albert_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer_albert.zero_grad()
        
        outputs = albert_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        loss.backward()
        optimizer_albert.step()

# Get predictions from ALBERT
predictions_albert_train = get_predictions(albert_model, albert_tokenizer, train_df)

# Check if the number of predictions matches the number of rows
assert len(predictions_albert_train) == len(train_df), \
    f"Number of predictions ({len(predictions_albert_train)}) does not match the number of rows ({len(train_df)})"

# Prepare data for RoBERTa
train_df_roberta = train_df.copy()
train_df_roberta['DistilBERT_Predictions'] = predictions_distilbert_train.tolist()
train_df_roberta['ALBERT_Predictions'] = predictions_albert_train.tolist()

roberta_dataset = EmotionDataset(train_df_roberta, roberta_tokenizer)
roberta_loader = DataLoader(roberta_dataset, batch_size=16)

optimizer_roberta = AdamW(roberta_model.parameters(), lr=2e-5)

for epoch in range(3):
    roberta_model.train()
    for batch in tqdm(roberta_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer_roberta.zero_grad()
        
        outputs = roberta_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        loss.backward()
        optimizer_roberta.step()

# Combined dataset class for BERT
class CombinedEmotionDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.df.iloc[index]['text']
        
        # Combine previous model outputs as features for BERT input
        distilbert_output = self.df.iloc[index]['DistilBERT_Predictions']
        albert_output = self.df.iloc[index]['ALBERT_Predictions']
        
        labels = self.df.iloc[index][['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']].values.astype(float)
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.float),
            'distilbert_output': torch.tensor(distilbert_output, dtype=torch.float),
            'albert_output': torch.tensor(albert_output, dtype=torch.float),
        }

# Update the train_final_bert_model function
def train_final_bert_model(model, tokenizer, df):
    combined_dataset = CombinedEmotionDataset(df, tokenizer)
    combined_loader = DataLoader(combined_dataset, batch_size=16, shuffle=True)

    optimizer_final_bert = AdamW(model.parameters(), lr=2e-5)

    for epoch in range(3):
        model.train()
        progress_bar = tqdm(combined_loader, desc=f"Training BERT Epoch {epoch+1}", unit="batch")
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer_final_bert.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer_final_bert.step()

            progress_bar.set_postfix(loss=loss.item())

# Initialize BERT model and tokenizer
final_bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infer

In [33]:
import torch
import pickle
import os

def save_models_and_data(distilbert_model, albert_model, roberta_model, final_bert_model, 
                         distilbert_tokenizer, albert_tokenizer, roberta_tokenizer,
                         train_df, test_df):
    
    # Create a directory to store all saved files
    os.makedirs('saved_models', exist_ok=True)
    
    # Save models
    torch.save(distilbert_model.state_dict(), 'saved_models/final_distilbert_model.pth')
    torch.save(albert_model.state_dict(), 'saved_models/final_albert_model.pth')
    torch.save(roberta_model.state_dict(), 'saved_models/final_roberta_model.pth')
    torch.save(final_bert_model.state_dict(), 'saved_models/final_bert_model.pth')
    
    # Save tokenizers
    distilbert_tokenizer.save_pretrained('saved_models/final_distilbert_tokenizer')
    albert_tokenizer.save_pretrained('saved_models/final_albert_tokenizer')
    roberta_tokenizer.save_pretrained('saved_models/final_roberta_tokenizer')
    
    # Save BERT tokenizer
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_tokenizer.save_pretrained('saved_models/final_bert_tokenizer')
    
    # Save DataFrames
    train_df.to_pickle('saved_models/final_train_df.pkl')
    test_df.to_pickle('saved_models/final_test_df.pkl')
    
    print("All models, tokenizers, and data have been saved successfully.")

# Assuming you have your models, tokenizers, and data ready, call the function like this:
save_models_and_data(distilbert_model, albert_model, roberta_model, final_bert_model,
                     distilbert_tokenizer, albert_tokenizer, roberta_tokenizer,
                     train_df, test_df)

All models, tokenizers, and data have been saved successfully.




In [45]:
import torch
import pandas as pd
from transformers import (
    DistilBertForSequenceClassification, AlbertForSequenceClassification,
    RobertaForSequenceClassification, BertForSequenceClassification,
    DistilBertTokenizer, AlbertTokenizer, RobertaTokenizer, BertTokenizer
)
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm

class CombinedEmotionDataset(Dataset):
    def __init__(self, df, tokenizer, include_model_outputs=False):
        self.df = df
        self.tokenizer = tokenizer
        self.include_model_outputs = include_model_outputs
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.df.iloc[index]['text']
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }
        
        if self.include_model_outputs:
            item['distilbert_output'] = torch.tensor(self.df.iloc[index]['DistilBERT_Predictions'], dtype=torch.float)
            item['albert_output'] = torch.tensor(self.df.iloc[index]['ALBERT_Predictions'], dtype=torch.float)
        
        if 'Anger' in self.df.columns:
            labels = self.df.iloc[index][['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']].values.astype(float)
            item['labels'] = torch.tensor(labels, dtype=torch.float)
        
        return item

def predict(model, texts, tokenizer, batch_size=16, include_model_outputs=False):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    dataset = CombinedEmotionDataset(texts, tokenizer, include_model_outputs)
    dataloader = DataLoader(dataset, batch_size=batch_size)
    
    predictions = []
    
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probs = torch.sigmoid(logits)
            predictions.extend(probs.cpu().numpy())
    
    return np.array(predictions)

# Load models and data
(distilbert_model, albert_model, roberta_model, final_bert_model,
 distilbert_tokenizer, albert_tokenizer, roberta_tokenizer, bert_tokenizer,
 train_df, test_df) = load_models_and_data()

# Test with complex texts
complex_texts = [
    "The intricate interplay between quantum mechanics and general relativity continues to puzzle physicists, as they strive to reconcile these fundamental theories into a unified framework of quantum gravity.",
    "In the realm of cognitive neuroscience, researchers are exploring the neural correlates of consciousness, attempting to unravel the mysteries of subjective experience and its relationship to brain activity.",
    "The advent of CRISPR-Cas9 gene editing technology has revolutionized molecular biology, offering unprecedented precision in genetic manipulation and raising profound ethical questions about the future of human evolution.",
    "Climate change presents a multifaceted challenge, intertwining environmental, economic, and social factors in a complex web of cause and effect that demands a coordinated global response.",
    "The emergence of artificial general intelligence poses both exciting possibilities and potential risks, as we grapple with the implications of creating machines that can match or surpass human cognitive abilities across a wide range of tasks."
]

# Create a DataFrame for the complex texts
complex_df = pd.DataFrame({'text': complex_texts})

# Get predictions from each model
distilbert_preds = predict(distilbert_model, complex_df, distilbert_tokenizer)
albert_preds = predict(albert_model, complex_df, albert_tokenizer)
roberta_preds = predict(roberta_model, complex_df, roberta_tokenizer)

# Add predictions to the DataFrame
complex_df['DistilBERT_Predictions'] = distilbert_preds.tolist()
complex_df['ALBERT_Predictions'] = albert_preds.tolist()

# Get final predictions from BERT
final_preds = predict(final_bert_model, complex_df, bert_tokenizer, include_model_outputs=True)

# Print results
emotion_labels = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']
for text, pred in zip(complex_texts, final_preds):
    print(f"Text: {text}")
    print("Predictions:")
    for emotion, score in zip(emotion_labels, pred):
        print(f"  {emotion}: {score:.4f}")
    print()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  distilbert_model.load_state_dict(torch.load('saved_models/final_distilbert_model.pth', map_location=device))
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  albert_model.load_state_dict(torch.load('saved_models/final_albert_model.pth', map_location=device))
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['

Text: The intricate interplay between quantum mechanics and general relativity continues to puzzle physicists, as they strive to reconcile these fundamental theories into a unified framework of quantum gravity.
Predictions:
  Anger: 0.4566
  Fear: 0.5543
  Joy: 0.3810
  Sadness: 0.5607
  Surprise: 0.4202

Text: In the realm of cognitive neuroscience, researchers are exploring the neural correlates of consciousness, attempting to unravel the mysteries of subjective experience and its relationship to brain activity.
Predictions:
  Anger: 0.4409
  Fear: 0.5532
  Joy: 0.3919
  Sadness: 0.5261
  Surprise: 0.4516

Text: The advent of CRISPR-Cas9 gene editing technology has revolutionized molecular biology, offering unprecedented precision in genetic manipulation and raising profound ethical questions about the future of human evolution.
Predictions:
  Anger: 0.4519
  Fear: 0.5577
  Joy: 0.3888
  Sadness: 0.5324
  Surprise: 0.4399

Text: Climate change presents a multifaceted challenge, inter


