## N. Shelke, S. Chaudhury, S. Chakrabarti, S. L. Bangare, G. Yogapriya, and P. Pandey, “An efficient way of text-based emotion analysis from social media using LRA-DNN,” Neuroscience Informatics, vol. 2, no. 3, p. 100048, 2022.

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaModel, RobertaTokenizerFast

# Remove the column limit so every column is shown when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [2]:
import torch
# Report whether PyTorch can see a CUDA device in this runtime.
print(torch.cuda.is_available())


False


In [None]:
# Mount Google Drive at /content/drive so the data files referenced below are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Locations of the tab-separated train/dev files on the mounted Drive.
# NOTE(review): these are full file paths, yet a later cell appends
# 'train.txt' / 'test.txt' / 'val.txt' to them — confirm the intended locations.
path1 = '/content/drive/My Drive/PROJECT/Text Data/train.tsv'
path2 = '/content/drive/My Drive/PROJECT/Text Data/dev.tsv'

In [None]:
import pandas as pd

# Both files are parsed the same way: tab-separated, first line treated as data
# (header=None), and three explicitly named columns. Declare the options once.
_tsv_options = {'sep': '\t', 'header': None, 'names': ['Text', 'Class', 'ID']}

df_train = pd.read_csv(path1, **_tsv_options)
df_dev = pd.read_csv(path2, **_tsv_options)

In [None]:
#Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#Load the dataset
# NOTE(review): path1/path2 are defined earlier as full '.tsv' file paths, so these
# concatenations do not look like valid locations — confirm where the ';'-separated
# .txt files actually live. Also, header=0 combined with names= discards the first
# line of every file; verify the files really start with a header row.
train_df = pd.read_csv(path1 + 'train.txt', sep=';', names=['content', 'sentiment'], header=0)
test_df = pd.read_csv(path2 + 'test.txt', sep=';', names=['content', 'sentiment'], header=0)
valid_df = pd.read_csv(path1 + 'val.txt', sep=';', names=['content', 'sentiment'], header=0)
# ignore_index gives the pooled frame a unique 0..n-1 index instead of three
# overlapping ranges.
df_train = pd.concat([train_df, test_df, valid_df], axis=0, ignore_index=True)

#Split the data into train, test, and validation sets
# 80% train; the remaining 20% is halved into validation and test.
X_train, X_test, y_train, y_test = train_test_split(df_train['content'], df_train['sentiment'], test_size=0.2, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

#Oversample the minority classes using RandomOverSampler
# Seeded so the resampled training set is identical on every run. The sampler
# needs a 2-D feature matrix, hence the single-column reshape of the texts;
# the labels stay 1-D.
ros = RandomOverSampler(random_state=42)
X_train, y_train = ros.fit_resample(np.asarray(X_train).reshape(-1, 1), np.asarray(y_train))
train_os = pd.DataFrame({'content': X_train[:, 0], 'sentiment': y_train})

#Encode the labels
# LabelEncoder expects 1-D label arrays; the encoder is fitted on the training
# labels only and then reused for the validation and test labels.
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(np.asarray(y_train))
y_valid_encoded = encoder.transform(np.asarray(y_valid))
y_test_encoded = encoder.transform(np.asarray(y_test))

In [None]:
#Tokenize the input data
MAX_LEN = 128

class EmotionsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        data: Indexable collection of texts; each item is passed through ``str()``.
        labels: Indexable collection of class ids, aligned with ``data``.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors of shape ``(1, max_len)``.
        max_len: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If ``data`` and ``labels`` differ in length.
    """

    def __init__(self, data, labels, tokenizer, max_len):
        if len(data) != len(labels):
            raise ValueError(
                f"data and labels must have the same length, got {len(data)} and {len(labels)}"
            )
        self.data = data
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized text and label at position ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (1-D tensors of
            length ``max_len``) and ``'label'`` (0-d ``torch.long`` tensor).
        """
        text = str(self.data[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt',
            truncation=True
        )
        # Drop only the leading batch dimension. A bare squeeze() would also
        # remove the sequence dimension when max_len == 1 and yield a 0-d tensor.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
#Load the tokenizer that pairs with the RoBERTa checkpoint
# NOTE(review): no visible cell defined `tokenizer`, so the cell failed on a fresh
# kernel. The checkpoint name is the one reported in this notebook's recorded
# model-loading output — confirm it is the intended one.
tokenizer = RobertaTokenizerFast.from_pretrained('cardiffnlp/twitter-roberta-base-emotion')

#Wrap each split in a Dataset (oversampled texts for training, untouched texts otherwise)
train_dataset = EmotionsDataset(train_os['content'].values, y_train_encoded, tokenizer, MAX_LEN)
valid_dataset = EmotionsDataset(X_valid.values, y_valid_encoded, tokenizer, MAX_LEN)
test_dataset = EmotionsDataset(X_test.values, y_test_encoded, tokenizer, MAX_LEN)

#Create data loaders
batch_size = 32

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)



In [None]:
#Create the classification model
class EmotionClassifier(nn.Module):
    """Encoder followed by dropout and a linear classification layer.

    Args:
        bert_model: Encoder module. It must expose ``config.hidden_size`` and, when
            called with ``input_ids`` and ``attention_mask``, return an object with
            ``pooler_output`` and/or ``last_hidden_state``.
        num_classes: Number of output classes (size of the logits' last dimension).
        dropout: Dropout probability applied to the pooled representation.
    """

    def __init__(self, bert_model, num_classes, dropout=0.2):
        super().__init__()
        self.bert_model = bert_model
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(bert_model.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores of shape ``(batch, num_classes)``."""
        outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = getattr(outputs, 'pooler_output', None)
        if pooled_output is None:
            # The encoder returned no pooled vector (e.g. it was built without a
            # pooling head); use the hidden state of the first token instead.
            pooled_output = outputs.last_hidden_state[:, 0]
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

#Set hyperparameters
num_classes = len(encoder.classes_)
lr = 2e-5
num_epochs = 4

#Load the pretrained RoBERTa encoder
# NOTE(review): no visible cell defined `roberta_model`, so the cell failed on a fresh
# kernel. The checkpoint name is the one reported in this notebook's recorded
# model-loading output — confirm it is the intended one.
roberta_model = RobertaModel.from_pretrained('cardiffnlp/twitter-roberta-base-emotion')

#Initialize the classifier
classifier = EmotionClassifier(roberta_model, num_classes).to(device)


In [None]:
#Toggle fine-tuning
# NOTE(review): `is_train` was not defined in any visible cell; it is set here so the
# notebook runs from a fresh kernel. Set it to False to skip training.
is_train = True

#Set loss function
# Created outside the training branch because the validation cell also uses it,
# including when training is skipped.
criterion = nn.CrossEntropyLoss()

if is_train:
    #Set optimizer
    optimizer = torch.optim.Adam(classifier.parameters(), lr=lr)

    #Training loop
    for epoch in range(num_epochs):
        classifier.train()
        # Running totals for the current epoch.
        train_loss = 0.0
        train_correct = 0
        total = 0

        for batch_idx, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = classifier(input_ids, attention_mask)
            _, predicted = torch.max(outputs, dim=1)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total += labels.size(0)
            train_correct += (predicted == labels).sum().item()
            train_loss += loss.item()

            # Every 100 batches, report the running mean loss and accuracy so far
            # in this epoch (not the values of the single latest batch).
            if batch_idx % 100 == 0:
                batch_acc = train_correct / total
                batch_loss = train_loss / (batch_idx + 1)
                print(f"Epoch {epoch + 1}/{num_epochs} | Batch {batch_idx}/{len(train_loader)} | Loss: {batch_loss:.4f} | Accuracy: {batch_acc:.4f}")

        # Epoch-level summary; after the loop these hold the last epoch's values.
        train_accuracy = train_correct / total
        train_loss /= len(train_loader)

# Switch to inference mode and zero the counters consumed by the validation cell.
classifier.eval()
valid_loss = 0.0
valid_correct = 0
total = 0


In [6]:
# Evaluate on the validation split.
# The counters, eval mode and loss function are (re)initialised here so the cell gives
# the same result every time it is run; previously a second run added the new batch
# losses onto the already-averaged valid_loss, and `criterion` only existed if the
# training branch had executed.
classifier.eval()
criterion = nn.CrossEntropyLoss()
valid_loss = 0.0
valid_correct = 0
total = 0

with torch.no_grad():
    for batch in valid_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = classifier(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)
        loss = criterion(outputs, labels)

        total += labels.size(0)
        valid_correct += (predicted == labels).sum().item()

        valid_loss += loss.item()

# Accuracy over all validation examples; loss averaged over batches.
valid_accuracy = valid_correct / total
valid_loss /= len(valid_loader)

# Print training and validation results
# Training figures exist only when the training branch ran in this session.
if is_train:
    print(f'Epoch {epoch + 1}/{num_epochs}')
    print(f'Training Loss: {train_loss:.4f} | Training Accuracy: {train_accuracy:.4f}')
print(f'Validation Loss: {valid_loss:.4f} | Validation Accuracy: {valid_accuracy:.4f}')
print('------------------------------------------')

Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/16 | Batch 0/1009 | Loss: 0.4779 | Accuracy: 0.4213
Epoch 1/16 | Batch 100/1009 | Loss: 0.4779 | Accuracy: 0.4513
Epoch 1/16 | Batch 200/1009 | Loss: 0.4724 | Accuracy: 0.4833
Epoch 1/16 | Batch 300/1009 | Loss: 0.4703 | Accuracy: 0.5093
Epoch 1/16 | Batch 400/1009 | Loss: 0.4607 | Accuracy: 0.5239
Epoch 1/16 | Batch 500/1009 | Loss: 0.4954 | Accuracy: 0.5204
Epoch 1/16 | Batch 600/1009 | Loss: 0.4488 | Accuracy: 0.5741
Epoch 1/16 | Batch 700/1009 | Loss: 0.4863 | Accuracy: 0.5829
Epoch 1/16 | Batch 800/1009 | Loss: 0.4402 | Accuracy: 0.6099
Epoch 1/16 | Batch 900/1009 | Loss: 0.4382 | Accuracy: 0.6265
Epoch 1/16 | Batch 1000/1009 | Loss: 0.4375 | Accuracy: 0.6396
E

In [8]:
#Collect predictions on the test split
# NOTE(review): `true_labels` / `predicted_labels` were not defined in any visible cell.
# They are rebuilt here from `test_loader` (created earlier and otherwise unused) —
# confirm that the test split is the intended evaluation set.
classifier.eval()
true_labels = []
predicted_labels = []
with torch.no_grad():
    for batch in test_loader:
        logits = classifier(batch['input_ids'].to(device), batch['attention_mask'].to(device))
        predicted_labels.extend(logits.argmax(dim=1).cpu().tolist())
        true_labels.extend(batch['label'].tolist())

# Map class ids back to the original label strings so the report rows are readable.
true_labels = encoder.inverse_transform(true_labels)
predicted_labels = encoder.inverse_transform(predicted_labels)

#Calculate classification metrics
classification_metrics = classification_report(true_labels, predicted_labels)
confusion_mtx = confusion_matrix(true_labels, predicted_labels)

print('Classification Report:')
print(classification_metrics)

print('Confusion Matrix:')
print(confusion_mtx)

Classification Report:
              precision    recall  f1-score   support

       anger       0.72      0.70      0.66       265
        fear       0.71      0.71      0.67       245
         joy       0.68      0.75      0.71       694
        love       0.66      0.77      0.69       169
     sadness       0.65      0.71      0.68       563
    surprise       0.66      0.72      0.67        64

    accuracy                          0.72      2000
   macro avg       0.58      0.72      0.42      2000
weighted avg       0.60      0.45      0.51      2000

Confusion Matrix:
[[247   8   0   1   9   0]
 [  5 222   1   0   4  13]
 [  3   3 624  56   6   2]
 [  3   0   4 162   0   0]
 [  7  12   2   0 541   1]
 [  0   6   0   0   0  58]]
