# Initialisation

## Installing necessary components

In [None]:
!pip install transformers
!pip install datasets
!pip install osfclient
!wget https://raw.githubusercontent.com/yjthay/DZ_GenresAndStyle/master/utils.py
import csv
import os
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
import itertools
from sklearn.preprocessing import MultiLabelBinarizer

from datasets import load_dataset, list_datasets
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer

## Load full dataset

In [None]:
# Load the dataset registered under the name 'go_emotions'. The cells below
# read its 'train', 'test' and 'validation' splits from the returned object.
data = load_dataset('go_emotions')

## Analysis of labels

In [None]:
# Gather the label list of every sample across the three splits. Iterating
# (rather than concatenating with `+`) only requires each column to be
# iterable, and always produces a plain Python list.
data_labels_all = [
    labels
    for split in ('train', 'test', 'validation')
    for labels in data[split]['labels']
]
print("Total number of train, test and validation samples is {}".format(len(data_labels_all)))

# Distribution of how many labels are attached to a single sample.
_labels, _counts = np.unique([len(y) for y in data_labels_all], return_counts=True)
plt.bar(_labels, _counts, align='center')
plt.gca().set_xticks(_labels)
# The prints are side effects, so use a plain loop instead of building a
# throwaway list of None values.
for n_labels, n_samples in zip(_labels, _counts):
    print("Number of samples with {} label/labels = {}".format(n_labels, n_samples))
plt.show()

In [None]:
# Emotion names of the 'labels' feature, in class-id order. `label_names` is
# also used by the label-frequency plot further down.
label_names = data['train'].features['labels'].feature.names
# Map each integer class id to its (whitespace-stripped) emotion name.
label_mapping = {idx: emotion.strip() for idx, emotion in enumerate(label_names)}
label_mapping

In [None]:
# Frequency of every emotion label over all samples; a sample contributes one
# count for each label it carries.
all_labels = list(itertools.chain.from_iterable(data_labels_all))
_labels, _counts = np.unique(all_labels, return_counts=True)
fig, ax = plt.subplots()
# Draw on the axes created above and keep `fig` bound to the Figure.
ax.bar(_labels, _counts, align='center')
ax.set_xticks(_labels)
# Look names up by label id: np.unique only returns the ids that actually
# occur, so positional assignment would mislabel bars if an id were missing.
ax.set_xticklabels(
    [label_names[label_id] for label_id in _labels],
    horizontalalignment='right', wrap=True, rotation=45,
)
plt.show()

## Analysis of text

In [None]:
# Collect the raw text of every sample across the three splits as one list.
data_text_all = [
    text
    for split in ('train', 'test', 'validation')
    for text in data[split]['text']
]
BERT_TYPE_LIST = ['bert-base-cased', 'roberta-base']
bert_type = 'bert-base-cased'
# Show a few raw samples (plain loop: printing is a side effect).
for sample in data_text_all[:5]:
    print(sample)
tokenizer = BertTokenizer.from_pretrained(bert_type)
# Pad/truncate every text to max_length=60 so the encoded batch is returned
# as rectangular PyTorch tensors.
tokens = tokenizer(data_text_all, padding='max_length', truncation=True, max_length=60, return_tensors="pt")
# Alternative (pad to the longest text, no truncation):
# tokens = tokenizer(data_text_all, padding=True, return_tensors="pt")

In [None]:
from scipy import stats

# Not used in this cell; kept in case later cells reference them.
max_len = 0
total = 0
# Number of non-zero token ids per sample, computed in one vectorised
# reduction over the (samples, positions) tensor instead of a Python-level
# sum over every element.
# NOTE(review): treats id 0 as padding — confirm if the tokenizer changes.
a = (tokens['input_ids'] != 0).sum(dim=1).tolist()
plt.hist(a)

In [None]:
# Print every sample whose attention mask sums to at least 60 — presumably
# the texts that reached the max_length=60 used at tokenisation.
for token_ids, mask in zip(tokens['input_ids'], tokens['attention_mask']):
    n_attended = mask.sum()  # computed once; avoids shadowing builtin `input`
    if n_attended >= 60:
        print(n_attended)
        print(tokenizer.decode(token_ids))

In [None]:
# from sklearn.manifold import TSNE
# tsne_model = TSNE(perplexity=30,
#                   n_components=2,
#                   n_iter=1000,
#                   random_state=23,
#                   learning_rate=500,
#                   init="pca")
# new_values = tsne_model.fit_transform(tokens['input_ids'])


## Utility functions

In [None]:
from utils import *


In [None]:
# Function for training
def train(model, train_dataset, val_dataset, epochs, lr, batch_size, weight_decay, show_progress=False, save_path=None):
    """Train ``model`` with Adam and binary cross-entropy, validating each epoch.

    Args:
        model: ``torch.nn.Module`` called as ``model(x)``. Its outputs go
            straight into ``torch.nn.BCELoss``, which expects probabilities.
        train_dataset: dataset yielding ``(x, y)`` pairs; batched and shuffled.
        val_dataset: dataset yielding ``(x, y)`` pairs; batched, not shuffled.
        epochs: number of passes over ``train_dataset``.
        lr: Adam learning rate.
        batch_size: batch size for both data loaders.
        weight_decay: Adam weight decay.
        show_progress: if true, print the train/validation loss per epoch.
        save_path: optional path *prefix* for per-epoch checkpoints. The file
            name ``epoch_<n>_<val_loss>.pt`` is appended to it verbatim (no
            separator is inserted). Its parent directory is created if needed.

    Returns:
        ``(train_losses, val_losses)``: two lists holding, per epoch, the mean
        per-batch loss on the training and validation loaders.

    Note:
        The model is switched to training mode for the optimisation pass and
        to evaluation mode for validation, so it is left in evaluation mode
        when the function returns (and in the saved checkpoints).
    """
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Construct data loaders from the training and validation datasets
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # torch.save does not create missing directories; do it once up front.
    if save_path is not None:
        checkpoint_dir = os.path.dirname(save_path)
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)

    train_losses = []
    val_losses = []

    for epoch_idx in range(1, epochs + 1):
        # --- optimisation pass ---
        model.train()  # enable dropout / batch-norm updates, if the model has any
        running_loss = 0.0
        pbar = tqdm(train_loader, position=0, leave=True)
        pbar.set_description("Processing Epoch %d" % epoch_idx)
        for x, y in pbar:
            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # mean training loss per batch
        train_loss = running_loss / len(train_loader)

        # --- validation pass ---
        model.eval()  # deterministic forward pass for a comparable val loss
        running_loss = 0.0
        with torch.no_grad():
            for x_val, y_val in val_loader:
                running_loss += criterion(model(x_val), y_val).item()

        # mean validation loss per batch
        val_loss = running_loss / len(val_loader)

        if show_progress:
            print('\n Epoch = %d, Train loss = %.5f, Val loss = %.5f' % (epoch_idx, train_loss, val_loss))

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        # save the whole model object after each epoch
        if save_path is not None:
            save_path_name = save_path + 'epoch_{}_{:.5f}.pt'.format(epoch_idx, val_loss)
            torch.save(model, save_path_name)

    return train_losses, val_losses


def plot_confusion(genre_dict, title, confusion_matrix):
    """Show ``confusion_matrix`` as an annotated heat-map.

    Args:
        genre_dict: mapping from class name to a sortable value; the names,
            ordered by that value, label both axes.
        title: title placed above the plot.
        confusion_matrix: 2-D array indexed as ``[true, predicted]``; the
            leading ``len(genre_dict)`` rows and columns are annotated.
    """
    class_names = sorted(genre_dict, key=genre_dict.get)
    n_classes = len(class_names)
    tick_positions = np.arange(n_classes)

    figure, axes = plt.subplots()
    axes.imshow(confusion_matrix, cmap='Blues')

    # One tick per class on each axis, labelled with the class name.
    axes.set_xticks(tick_positions)
    axes.set_yticks(tick_positions)
    axes.set_xticklabels(class_names)
    axes.set_yticklabels(class_names)

    # Write each cell's value (two decimals) at the centre of the cell;
    # x is the column (predicted), y is the row (true).
    for row, col in np.ndindex(n_classes, n_classes):
        axes.text(col, row, '%.2f' % confusion_matrix[row, col], ha="center", va="center", color="tab:red", size=15)

    axes.set_title(title)
    axes.set_xlabel('Predicted label')
    axes.set_ylabel('True label')
    figure.tight_layout()
    plt.show()

# Training

In [None]:
# Build the train/validation EmotionsDataset objects and hand them to
# pickle_dump (from utils) so the next cell can reload them via pickle_load.
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# The classes themselves (not instances) are passed on to EmotionsDataset.
model, tokenizer = BertModel, BertTokenizer
BERT_TYPE = 'bert-base-cased'

train_dataset = EmotionsDataset(data['train'], device=DEVICE, Model=model, Tokenizer=tokenizer, bert_type=BERT_TYPE)
val_dataset = EmotionsDataset(data['validation'], device=DEVICE, Model=model, Tokenizer=tokenizer, bert_type=BERT_TYPE)
model_save_path = 'model/'
# Make sure the target directory exists before anything is written into it.
os.makedirs(model_save_path, exist_ok=True)
pickle_dump(train_dataset, model_save_path + BERT_TYPE + '_train_EmotionsDataset.pkl')
pickle_dump(val_dataset, model_save_path + BERT_TYPE + '_val_EmotionsDataset.pkl')

## Training model for various BERTs

In [None]:
# BERT_TYPE_LIST = ['bert-base-cased','bert-base-uncased','distilbert-base-cased', 'roberta-large','roberta-base','distilroberta-base']
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
max_length = 12

print(BERT_TYPE)
model_save_path = 'model/'
# Reload the datasets written by the previous cell.
train_dataset = pickle_load(model_save_path + BERT_TYPE + '_train_EmotionsDataset.pkl')
val_dataset = pickle_load(model_save_path + BERT_TYPE + '_val_EmotionsDataset.pkl')
# Layer widths handed to Layers (from utils): input, hidden, output.
# NOTE(review): 768 is presumably the BERT-base hidden size and 28 the number
# of emotion labels — confirm against utils.Layers / EmotionsDataset.
model = Layers((max_length * 768, 1000, 28)).to(DEVICE)
# Per-epoch checkpoints are written under this prefix; create the directory
# first so saving cannot fail on a missing path.
checkpoint_prefix = 'model/epochs/'
os.makedirs(checkpoint_prefix, exist_ok=True)
train_losses, val_losses = train(model, train_dataset, val_dataset, epochs=15, lr=1e-4, batch_size=16,
                                 weight_decay=0.001, show_progress=True, save_path=checkpoint_prefix)

Processing Epoch 18:  84%|████████▍ | 2279/2714 [19:43<06:18,  1.15it/s]

# Testing 


Below are the confusion matrices for each of the emotions, together with their associated F1 scores.
