In [1]:

from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from datasets import load_dataset
import torch
import torch.nn as nn
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW

import numpy as np
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset,DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_curve, roc_curve,auc
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

In [2]:
# Download the 'emotion' dataset and switch its output format to pandas so
# that slicing a split hands back a DataFrame.
emotions = load_dataset('emotion')
emotions.set_format('pandas')

# Full training split as a DataFrame. No preview here: only the last
# expression of a cell is displayed, so a mid-cell `df.head()` is discarded.
df = emotions['train'][:]

def label_int2str(row):
    """Translate an integer class id into its class name.

    The lookup goes through the 'label' feature of the training split of the
    notebook-level ``emotions`` dataset.
    """
    label_feature = emotions['train'].features['label']
    return label_feature.int2str(row)

# Attach the readable class name next to each integer label, then preview.
df['label_name'] = df['label'].map(label_int2str)
df.head()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Unnamed: 0,text,label,label_name
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned...,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,anger


In [3]:
def evaluate(model, dataloader, device):
    """Score a classification model on every batch of ``dataloader``.

    Args:
        model: Model that is switched to eval mode, called with the batch
            entries as keyword arguments, and returns an object exposing
            ``logits``.
        dataloader: Iterable of dict batches of tensors; every batch must
            contain a ``'labels'`` entry.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(accuracy, f1, precision, recall)``. F1, precision and recall
        use ``average='weighted'``.
    """
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            inputs = {name: tensor.to(device) for name, tensor in batch.items()}
            logits = model(**inputs).logits
            batch_predictions = torch.argmax(logits, dim=1)
            predictions.extend(batch_predictions.cpu().tolist())
            # Labels may come as (batch,) or as a (batch, 1) column; flatten
            # them so the targets are 1-D and line up with the predictions
            # instead of accumulating into an (N, 1) array.
            true_labels.extend(batch['labels'].reshape(-1).cpu().tolist())

    predictions, true_labels = np.array(predictions), np.array(true_labels)

    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')
    precision = precision_score(true_labels, predictions, average='weighted')
    recall = recall_score(true_labels, predictions, average='weighted')

    return accuracy, f1, precision, recall

In [4]:

# Pull the raw text and label-name columns out as plain Python lists.
text_list_p = df["text"].tolist()
labels_p = df["label_name"].tolist()

# Drop every row whose text is empty, keeping texts and labels aligned.
kept_rows = [(text, name) for text, name in zip(text_list_p, labels_p) if text != '']
text_list = [text for text, _ in kept_rows]
labels = np.array([name for _, name in kept_rows])

# Encode the string labels as integer class ids.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(labels)

# scikit-learn 1.2 renamed `sparse` to `sparse_output`, and 1.4 removed the
# old keyword. Try the current name first and fall back for older installs.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# The encoder expects a 2-D column, so reshape the ids to (n_samples, 1).
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
num_classes = int(np.max(integer_encoded) + 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)

# Print which one-hot vector each class name maps to.
class_ids = label_encoder.transform(label_encoder.classes_).reshape(num_classes, 1)
mapping = dict(zip(label_encoder.classes_, onehot_encoder.transform(class_ids)))
for key, value in mapping.items():
    print("",key," ----------------------------- ",value)


# Make sure every sample is a plain Python string before tokenization.
text_listp = [str(text) for text in text_list]
text_list = text_listp

# Randomly split the samples into train and test sets. The shuffle is seeded
# so that a fresh kernel run reproduces exactly the same split.
SEED = 42
splitter = 0.6  # fraction of the samples assigned to the training set
rng = np.random.default_rng(SEED)
indices = np.arange(len(text_list))
rng.shuffle(indices)
split_at = int(splitter * len(text_list))
train_indices = indices[:split_at]
test_indices = indices[split_at:]

# Texts, label names and one-hot rows end up as plain Python lists.
texts_array = np.array(text_list)
labels_array = np.array(labels)
onehot_array = np.array(onehot_encoded)
text_list_train = texts_array[train_indices].tolist()
text_list_test = texts_array[test_indices].tolist()
labels_train = labels_array[train_indices].tolist()
labels_test = labels_array[test_indices].tolist()
onehot_encoded_train = onehot_array[train_indices].tolist()
onehot_encoded_test = onehot_array[test_indices].tolist()

# The integer ids stay NumPy arrays with one column per sample.
integer_encoded_train = np.array(integer_encoded)[train_indices]
integer_encoded_test = np.array(integer_encoded)[test_indices]


# Tokenization: load the pretrained 'distilbert-base-uncased' tokenizer.
# TextDataset below reads this module-level `tokenizer` when it is constructed.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class TextDataset(Dataset):
    """Dataset that tokenizes all texts up front and serves per-sample tensors.

    Tokenization uses the notebook-level ``tokenizer`` with truncation,
    padding and a maximum length of 128.
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` once and keep ``labels`` alongside them."""
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoding fields and the label of sample ``idx`` as tensors."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits in datasets and batch them.
train_dataset = TextDataset(text_list_train, integer_encoded_train)
# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

test_dataset = TextDataset(text_list_test, integer_encoded_test)
# The aggregate metrics do not depend on batch order, so keep the test set in
# a fixed order instead of reshuffling it on every evaluation pass.
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)



 anger  -----------------------------  [1. 0. 0. 0. 0. 0.]
 fear  -----------------------------  [0. 1. 0. 0. 0. 0.]
 joy  -----------------------------  [0. 0. 1. 0. 0. 0.]
 love  -----------------------------  [0. 0. 0. 1. 0. 0.]
 sadness  -----------------------------  [0. 0. 0. 0. 1. 0.]
 surprise  -----------------------------  [0. 0. 0. 0. 0. 1.]


In [6]:
# Load pre-trained model with a classification head sized to the number of
# distinct label names.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(set(labels)))

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimizer: the AdamW class shipped with transformers is deprecated in favour
# of torch.optim.AdamW. eps and weight_decay are pinned to the values the
# transformers implementation used by default (1e-6 and 0.0), because the
# PyTorch defaults differ (1e-8 and 0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Training loop
def train(epoch):
    """Run one pass over ``train_dataloader`` and print the mean batch loss.

    Uses the notebook-level ``model``, ``optimizer``, ``device`` and
    ``train_dataloader``. ``epoch`` is only used to label the printed line.
    """
    model.train()
    total_loss = 0.0
    for batch in train_dataloader:
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}
        loss = model(**inputs).loss
        loss.backward()
        optimizer.step()
        # Clear the gradients right after the update so the next batch starts clean.
        optimizer.zero_grad()
        total_loss += loss.item()

    mean_loss = total_loss / len(train_dataloader)
    print(f"Epoch: {epoch}, Loss: {mean_loss}")

# Train the model for three epochs. After each epoch the model is scored with
# evaluate() on test_dataloader, so the figures printed as "Validation
# Metrics" are measured on the held-out test split (no separate validation
# split appears in the visible cells).
for epoch in range(3):  # Number of epochs
    train(epoch)
    accuracy, f1, precision, recall = evaluate(model, test_dataloader, device)
    print(f"Validation Metrics: Accuracy: {accuracy}, F1: {f1}, Precision: {precision}, Recall: {recall}")
    print("-------------------------------------------------------------------------")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0, Loss: 0.4800435796193778
Validation Metrics: Accuracy: 0.92875, F1: 0.9283317845033622, Precision: 0.9285011337164571, Recall: 0.92875
-------------------------------------------------------------------------
Epoch: 1, Loss: 0.15828492659066493
Validation Metrics: Accuracy: 0.9209375, F1: 0.9197349447679253, Precision: 0.9210890995911762, Recall: 0.9209375
-------------------------------------------------------------------------
Epoch: 2, Loss: 0.11085968179162592
Validation Metrics: Accuracy: 0.92828125, F1: 0.928589429983056, Precision: 0.9300843902463825, Recall: 0.92828125
-------------------------------------------------------------------------
