# Initialisation

## Installing necessary components

In [None]:
!pip install transformers
!pip install datasets
!pip install osfclient
!pip install sentencepiece
!pip install simpletransformers==0.61.14
from simpletransformers.classification import MultiLabelClassificationModel
# !git clone https://github.com/yjthay/DZ_GenresAndStyle.git
import csv
import os
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
import itertools
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import nltk
import string
from nltk.stem import PorterStemmer
from wordcloud import WordCloud
import seaborn as sns
from scipy import stats

sns.set_theme(style="whitegrid")

from wordcloud import WordCloud
import matplotlib.pyplot as plt

import re

from datasets import load_dataset, list_datasets
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, DistilBertModel, DistilBertTokenizer,RobertaModel, RobertaTokenizer

nltk.download('stopwords')
!wget https://raw.githubusercontent.com/yjthay/DZ_GenresAndStyle/master/utils.py
!wget https://raw.githubusercontent.com/yjthay/DZ_GenresAndStyle/master/data/ekman_mapping.json -P ./data/
!wget https://raw.githubusercontent.com/yjthay/DZ_GenresAndStyle/master/data/sentiment_mapping.json -P ./data/
# %cd /content/DZ_GenresAndStyle/

## Pull OSF Repository

In [None]:
!osf -p cku2b clone /content/

## Utility functions

In [None]:
from utils import *

## Load full dataset

In [None]:
# Load the 'go_emotions' dataset; later cells read its 'train', 'validation'
# and 'test' splits through `data[...]`.
data = load_dataset('go_emotions')

## Analysis of labels

In [None]:
# Pool the label lists of all splits in the order train, test, validation.
data_labels_all = []
for split in ('train', 'test', 'validation'):
    data_labels_all += data[split]['labels']
print("Total number of train, test and validation samples is {}".format(len(data_labels_all)))

# How many samples carry 1, 2, 3, ... labels.
labels_per_sample = [len(sample_labels) for sample_labels in data_labels_all]
_labels, _counts = np.unique(labels_per_sample, return_counts=True)

fig = plt.figure()
axes = fig.gca()
axes.bar(_labels, _counts, align='center')
axes.set_xticks(_labels)
for n_labels, n_samples in zip(_labels, _counts):
    print("Number of samples with {} label/labels = {}".format(n_labels, n_samples))
axes.set_title("Number of samples vs Labels/Sample")
plt.show()
fig.savefig("/Samples vs Labels-per-Sample.png")

In [None]:
# Map every integer class id to its (whitespace-stripped) emotion name, using the
# class names declared on the dataset's 'labels' feature.
label_names = data['train'].features['labels'].feature.names
label_mapping = {class_id: emotion.strip() for class_id, emotion in enumerate(label_names)}
label_mapping

In [None]:
# Flatten the per-sample label lists and count the occurrences of each class id.
all_labels = [label for sample_labels in data_labels_all for label in sample_labels]
_labels, _counts = np.unique(all_labels, return_counts=True)

fig, ax = plt.subplots(constrained_layout=True)
ax.bar(_labels, _counts, align='center')
ax.set_xticks(_labels)
ax.set_xticklabels(label_names, horizontalalignment='right', wrap=True, rotation=45)
ax.set_title("Number of samples per GoEmotion label")
plt.show()
fig.savefig("/number-of-samples-per-goemo.png")

In [None]:
def ratio_of_multi_label(list_of_list, label_mapping):
    """Return, per label name, the share of that label's occurrences found in multi-label samples.

    Args:
        list_of_list: Iterable of per-sample label-id lists.
        label_mapping: Mapping from the label ids ``0 .. len(label_mapping) - 1``
            to label names.

    Returns:
        Dict keyed by label name (in id order). Each value is the number of
        occurrences of the label in samples with more than one label divided
        by the label's total number of occurrences; labels that never occur
        map to 0.
    """
    num = len(label_mapping)
    total_counts = {}
    multi_counts = {}
    for labels in list_of_list:
        is_multi = len(labels) > 1
        for label in labels:
            total_counts[label] = total_counts.get(label, 0) + 1
            if is_multi:
                multi_counts[label] = multi_counts.get(label, 0) + 1

    # Counts are looked up by label id rather than by position in a sorted
    # array of the ids that happen to occur, so a missing id cannot shift
    # the totals of the ids that follow it.
    output = {}
    for i in range(num):
        total = total_counts.get(i, 0)
        output[label_mapping[i]] = multi_counts.get(i, 0) / total if total else 0
    return output
# Render the per-label ratios as a one-column table (labels as rows, a single
# 'Ratio' column) with the project's `create_heatmap` helper from utils.
create_heatmap(pd.DataFrame(ratio_of_multi_label(data_labels_all,label_mapping),['Ratio']).transpose(),
               None,
               "Ratio of Multi-Class labels.png",
               fig_size=(2,8))

## Analysis of text

In [None]:
# Pool the raw texts of all splits in the order train, test, validation.
data_text_all = []
for split in ('train', 'test', 'validation'):
    data_text_all += data[split]['text']

BERT_TYPE_LIST = ['bert-base-cased', 'roberta-base']
bert_type = 'bert-base-cased'

# Show the first five texts.
for sample_text in data_text_all[:5]:
    print(sample_text)

# Tokenise everything to a fixed width of 64 tokens (longer texts are truncated,
# shorter ones padded). Alternative kept for reference: padding=True without
# truncation pads to the longest sample instead.
tokenizer = BertTokenizer.from_pretrained(bert_type)
tokens = tokenizer(
    data_text_all,
    padding='max_length',
    truncation=True,
    max_length=64,
    return_tensors="pt",
)

### Histogram of input sequence length

In [None]:
# Number of non-padding tokens per sample, computed with one vectorised
# reduction over the (samples x 64) id matrix instead of a Python-level sum
# per row.
seq_lengths = (tokens['input_ids'] != tokenizer.pad_token_id).sum(dim=1).tolist()

fig = plt.figure()
plt.hist(seq_lengths)
plt.title("Distribution of sequence length of input text")
fig.savefig("/histogram-of-seq-length.png")

In [None]:
# Sanity check: report any tokenised sample whose attention mask covers more
# than 64 tokens. With the fixed-width tokenisation above nothing should print.
count = 0
# `input_ids` rather than `input`, so the builtin `input` is not shadowed for
# the rest of the notebook.
for input_ids, att in zip(tokens['input_ids'], tokens['attention_mask']):
    n_tokens = att.sum()
    if n_tokens > 64:
        count += 1
        print(n_tokens, count)
        print(tokenizer.decode(input_ids))

### Word Cloud

In [None]:
from nltk.corpus import stopwords

# Re-pool texts and labels of all splits (train, test, validation) so this
# section can run without the label-analysis cells.
data_text_all = data['train']['text'] + data['test']['text'] + data['validation']['text']
data_labels_all = data['train']['labels'] + data['test']['labels'] + data['validation']['labels']
# English stop words, extended with the empty string (produced when splitting on
# consecutive spaces) and the lower-cased '[name]' token.
stop = stopwords.words('english')
stop.append('')
stop.append('[name]')

In [None]:
porter = PorterStemmer()
sentences = [sentence.split(" ") for sentence in data_text_all]

# Set lookup instead of scanning the stop-word list for every word.
stop_words = set(stop)

# Remove stop words and create a list of lists of the stemmed words
sentence_word = [
    [porter.stem(w) for w in sentence if w.lower() not in stop_words]
    for sentence in sentences
]

# Allocate the stemmed words to every label they were seen with.
# Each label gets its OWN list: binding a label directly to
# `sentence_word[idx]` and extending it afterwards would mutate that sentence's
# word list in place (and merge labels that share the same first sample).
words_of_label = {}
corpus = [[] for _ in range(28)]
for idx, multilabel in enumerate(data_labels_all):
    for label in multilabel:
        words_of_label.setdefault(label, []).extend(sentence_word[idx])
        corpus[label].extend(sentence_word[idx])

# One space-joined document per label (index = label id), used as TF-IDF input.
output = [' '.join(label_words) for label_words in corpus]

In [None]:
# TF-IDF over the per-label documents: one row per label id, one column per term.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(output)
# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate `get_feature_names_out`.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
tf_idf = pd.DataFrame(X.toarray(), columns=feature_names)

# One word cloud per label (4 x 7 grid), built from raw stemmed-word frequencies.
f = plt.figure(figsize=(30,15))
f.suptitle("Most Frequent 30 Words",fontsize=20)
for i in range(28):
    wordCloud = WordCloud(max_words=30, width=800, height=800, background_color='lightgrey')
    wordCloud.generate_from_frequencies(nltk.FreqDist(words_of_label[i]))

    ax = f.add_subplot(4,7, i+1)
    ax.set_title(label_mapping[i].capitalize(),fontsize=16)
    ax.imshow(wordCloud, interpolation='bilinear', aspect="auto")
    ax.axis("off")
f.tight_layout()
# Leave room for the figure-level title above the grid.
f.subplots_adjust(top=0.94)
f.savefig("/wc.png")

### GoEmotions TF-IDF Similarity to Neutral

In [None]:
# Dot product between every label's TF-IDF row and row 27 (the section title
# calls this the similarity to "neutral").
neutral_vector = tf_idf.loc[27, :]
tfidf = {label_mapping[i]: tf_idf.loc[i, :] @ neutral_vector for i in range(28)}

similarity_table = pd.DataFrame(tfidf, index=['Dot Product']).transpose()
create_heatmap(similarity_table, None, "TF-IDF Similarity to Neutral", fig_size=(1, 6))

### Similarity of Tokenizer's Vocabulary

In [None]:
# The notebook's import cell never imports AutoTokenizer; import it here so
# this cell does not depend on the name leaking in through `from utils import *`.
from transformers import AutoTokenizer


def replacer(x):
    """Strip the '##' word-piece continuation marker from a vocabulary token."""
    return x.replace('##', "")


bert_cased = AutoTokenizer.from_pretrained('bert-base-cased')
distilbert = AutoTokenizer.from_pretrained('distilbert-base-cased')
roberta = AutoTokenizer.from_pretrained('roberta-base')
t5 = AutoTokenizer.from_pretrained('t5-base')

# Vocabulary tokens of each tokenizer with the continuation marker removed.
bert_cased_woc = list(map(replacer, bert_cased.vocab.keys()))
distilbert_woc = list(map(replacer,distilbert.vocab.keys()))
roberta_woc = list(map(replacer,roberta.vocab.keys()))
t5_woc = list(map(replacer,t5.vocab.keys()))

# Order must match `model_names` below.
wlists=[distilbert_woc, bert_cased_woc, roberta_woc, t5_woc]
output=np.zeros((4,4))
# NOTE(review): every ordered pair is visited, so each matrix cell is written
# twice (once per direction); the result relies on `get_vocab_similarity`
# returning consistent values for swapped arguments - confirm against utils.
for idx_f,wlist_f in enumerate(wlists):
    for idx_s, wlist_s in enumerate(wlists):
        _, f, s = get_vocab_similarity(wlist_f, wlist_s)
        output[idx_s][idx_f], output[idx_f][idx_s]=f,s

model_names=['distilbert','bert','roberta','t5']
create_heatmap(pd.DataFrame(output,index=model_names,columns=model_names),None,'Vocab Token Similarity',fig_size=(5,5))

# Training

In [None]:
# Load the Drive helper and mount
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

# Root folder on the mounted Drive; the training cells build their save paths
# from it.
pre_path='/content/drive/MyDrive/DeepZen/'

## Training model for various BERTs

In [None]:
# Fine-tune one multi-label classifier per encoder on the full GoEmotions data.
DEVICE = 'cuda'  # cuda or cpu
save_path = pre_path+'model/epochs/'
max_length=64

data = load_dataset('go_emotions')
# EmotionsDataset comes from utils; only its `.text` and `.labels` attributes
# are used here.
train_dataset = EmotionsDataset(data['train'], device=DEVICE, max_length=max_length)
val_dataset = EmotionsDataset(data['validation'], device=DEVICE, max_length=max_length)

# simpletransformers is fed DataFrames with a "text" column and a "labels"
# column holding one label list per row.
train_data = pd.DataFrame(list(zip(train_dataset.text, train_dataset.labels.tolist())),columns=["text","labels"])
val_data= pd.DataFrame(list(zip(val_dataset.text, val_dataset.labels.tolist())),columns=["text","labels"])
for name in ['roberta-base', 'distilbert-base-cased', 'bert-base-cased']:
    # Architecture = text before the first '-' ('roberta', 'distilbert', 'bert').
    architecture = re.findall("^(.*?)-",name)[0]
    # Training arguments (including output locations) are produced by
    # `gen_train_args` from utils.
    model = MultiLabelClassificationModel(architecture, name, args=gen_train_args(name, save_path), num_labels = 28)
    model.train_model(train_data, eval_df=val_data)

## Training model for various BERTs - Reduced

In [None]:
# Same training loop as the full-data cell, run on the splits returned by
# `data_reduction` (utils) and saved under 'model/epochs/reduced/'.
DEVICE = 'cuda'  # cuda or cpu
save_path = pre_path+'model/epochs/reduced/'
max_length=64

data = load_dataset('go_emotions')
# val_data = T5Dataset(data['validation'], goemo_ratio=1.0)
train,test,val = data_reduction(data['train']), data_reduction(data['test']), data_reduction(data['validation'])
# NOTE: from here on `data` is a plain dict of the reduced splits, not the
# loaded dataset object.
data = {'train':train, 'validation':val, 'test':test}

train_dataset = EmotionsDataset(data['train'], device=DEVICE, max_length=max_length)
val_dataset = EmotionsDataset(data['validation'], device=DEVICE, max_length=max_length)

# DataFrames with a "text" column and a "labels" column (one label list per row).
train_data = pd.DataFrame(list(zip(train_dataset.text, train_dataset.labels.tolist())),columns=["text","labels"])
val_data= pd.DataFrame(list(zip(val_dataset.text, val_dataset.labels.tolist())),columns=["text","labels"])
for name in ['roberta-base', 'distilbert-base-cased', 'bert-base-cased']:
    # Architecture = text before the first '-' ('roberta', 'distilbert', 'bert').
    architecture = re.findall("^(.*?)-",name)[0]
    model = MultiLabelClassificationModel(architecture, name, args=gen_train_args(name, save_path), num_labels = 28)
    model.train_model(train_data, eval_df=val_data)

## T5 Training - Full Model

In [None]:
# Train one fresh T5 model per `goemo_ratio` on the full dataset.
data = load_dataset('go_emotions')
# NOTE(review): `val_data` is not used again in this cell; `train_T5` receives
# the whole `data` object.
val_data = T5Dataset(data['validation'], goemo_ratio=1.0)
DEVICE = 'cuda'  # cuda or cpu

for ratio in [1.0,0.75,0.5,0.333]:
    save_path = '/content/drive/MyDrive/DeepZen/model/epochs/t5-base/'
    # model_file_path = best_model_filename(pre_path+str(ratio)+'/')
    # tuned_model = torch.load(model_file_path, map_location=torch.device(DEVICE))
    model = T5Model().to(DEVICE)
    # Each ratio is saved into its own sub-folder named after the ratio.
    train_losses, val_losses = train_T5(model, data, goemo_ratio=ratio, epochs=20, lr=2e-5, batch_size=16, show_progress=True, save_path=save_path+"/"+str(ratio)+"/")

## T5 Training - Reduced Model

In [None]:
# Train one fresh T5 model per `goemo_ratio` on the splits returned by
# `data_reduction` (utils).
data = load_dataset('go_emotions')
# val_data = T5Dataset(data['validation'], goemo_ratio=1.0)
train,test,val = data_reduction(data['train']), data_reduction(data['test']), data_reduction(data['validation'])
# NOTE: `data` is now a plain dict of the reduced splits.
data = {'train':train, 'validation':val, 'test':test}
DEVICE = 'cuda'  # cuda or cpu

for ratio in [1.0,0.75, 0.5,0.333]:
    # Unlike the full-data cell, every ratio is saved into the same folder.
    save_path = '/content/drive/MyDrive/DeepZen/model/epochs/reduced/'
    # model_file_path = best_model_filename(pre_path+str(ratio)+'/')
    # tuned_model = torch.load(model_file_path, map_location=torch.device(DEVICE))
    model = T5Model().to(DEVICE)
    train_losses, val_losses = train_T5(model, data, goemo_ratio=ratio, epochs=20, lr=2e-5, batch_size=16, show_progress=True, save_path=save_path)

## T5-0.75 Training - Epochs Per Metric
Extract metrics of every epoch in T5-0.75 training

In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score, hamming_loss, accuracy_score, jaccard_score

def _macro_scores(y_true, y_pred):
    """Return macro F1, Jaccard, precision and recall plus the Hamming loss, in that order."""
    return (f1_score(y_true, y_pred, average="macro"),
            jaccard_score(y_true, y_pred, average='macro'),
            precision_score(y_true, y_pred, average='macro'),
            recall_score(y_true, y_pred, average='macro'),
            hamming_loss(y_true, y_pred))


def temp_train_T5(model, data, goemo_ratio, epochs, lr, batch_size, early_stopping=True, show_progress=False, save_path=None):
    """Train a T5 model and record loss and test/validation metrics after every epoch.

    Args:
        model: Model to train in place; called with ``input_ids``,
            ``attention_mask``, ``lm_labels`` and ``decoder_attention_mask``,
            with the loss taken from the first element of its output.
        data: Mapping providing the 'train', 'validation' and 'test' splits.
        goemo_ratio: Ratio passed to ``T5Dataset`` for the training split
            (validation and test always use 1.0).
        epochs: Number of epochs to run.
        lr: Learning rate for AdamW.
        batch_size: Batch size of the training and validation loaders.
        early_stopping: Accepted for interface compatibility; currently ignored
            (all ``epochs`` are always run).
        show_progress: Print train/validation loss after each epoch.
        save_path: Folder prefix; when given, the model with the lowest
            validation loss is saved there as
            ``ratio_<goemo_ratio>_<best_val_loss>.pt``.

    Returns:
        ``(train_losses, output)`` where ``train_losses`` lists the mean
        training loss per epoch and ``output`` maps the 0-based epoch index to
        a dict with keys 'val_loss', 'train_loss', '<m>_test' and '<m>_val'
        for m in f1, j, p, r, h.
    """
    import copy

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    test_data = EmotionsDataset(data['test'], max_length=64, device=DEVICE)
    test_data_val = EmotionsDataset(data['validation'], max_length=64, device=DEVICE)
    t5_data_set = T5Dataset(data['test'], goemo_ratio=1.0)
    t5_data_set_val = T5Dataset(data['validation'], goemo_ratio=1.0)

    # Ground-truth label matrices do not change between epochs.
    y_true_test = test_data.labels.cpu().numpy()
    y_true_val = test_data_val.labels.cpu().numpy()

    # Construct data loader from the validation dataset
    val_loader = DataLoader(T5Dataset(data['validation'], goemo_ratio=1.0), batch_size=batch_size)
    output = {}
    train_losses = []

    best_model, best_val_loss = None, np.inf
    # Training
    for epoch in range(epochs):
        # The training dataset is rebuilt every epoch.
        train_loader = DataLoader(T5Dataset(data['train'], goemo_ratio=goemo_ratio), batch_size=batch_size,
                                  shuffle=True)
        running_loss = 0.0
        pbar = tqdm(train_loader, position=0, leave=True)
        epoch_idx = int(epoch + 1)
        model.train()
        for x_inputs, x_masks, y_inputs, y_masks, _, _ in pbar:
            pbar.set_description("Processing Epoch %d" % epoch_idx)

            # Replace padding ids in the targets with -100.
            lm_labels = y_inputs
            lm_labels[lm_labels[:, :] == config.TOKENIZER.pad_token_id] = -100

            optimizer.zero_grad()
            outputs = model(input_ids=x_inputs,
                            attention_mask=x_masks,
                            lm_labels=lm_labels,
                            decoder_attention_mask=y_masks)
            loss = outputs[0]
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # calculate training loss
        train_loss = running_loss / len(train_loader)

        # Validation loss must be measured in evaluation mode (dropout off);
        # `model.train()` re-enables training mode at the start of the next epoch.
        model.eval()
        with torch.no_grad():
            running_loss = 0.0
            for x_val_inputs, x_val_masks, y_val_inputs, y_val_masks, x_text, y_labels in val_loader:
                lm_labels = y_val_inputs
                lm_labels[lm_labels[:, :] == config.TOKENIZER.pad_token_id] = -100
                outputs_val = model(input_ids=x_val_inputs,
                                    attention_mask=x_val_masks,
                                    lm_labels=lm_labels,
                                    decoder_attention_mask=y_val_masks)
                running_loss += outputs_val[0].item()
        val_loss = running_loss / len(val_loader)

        # Macro metrics on the test split, then on the validation split.
        f1, j, p, r, h = _macro_scores(y_true_test, predict_t5(model, t5_data_set))
        f1_val, j_val, p_val, r_val, h_val = _macro_scores(y_true_val, predict_t5(model, t5_data_set_val))

        output[epoch] = {'val_loss':val_loss, 'train_loss':train_loss,'f1_test':f1,
                         'j_test':j, 'p_test':p, 'r_test':r,
                         'h_test':h, 'f1_val':f1_val, 'j_val':j_val,
                         'p_val':p_val,'r_val':r_val,'h_val':h_val}

        # print status
        if show_progress:
            print('\n Epoch = %d, Train loss = %.5f, Val loss = %.5f' % (epoch_idx, train_loss, val_loss))

        train_losses.append(train_loss)

        if best_val_loss > val_loss:
            best_val_loss = val_loss
            # Snapshot the weights: keeping a bare reference to `model` would
            # make the "best" model silently follow every later epoch.
            best_model = copy.deepcopy(model)

        pbar.reset()
    # save best model
    if save_path is not None and best_model is not None:
        save_path_name = save_path + 'ratio_{}_{:.5f}.pt'.format(goemo_ratio, best_val_loss)
        torch.save(best_model, save_path_name)

    return train_losses, output

data = load_dataset('go_emotions')
val_data = T5Dataset(data['validation'], goemo_ratio=1.0)
DEVICE = 'cuda'  # cuda or cpu
save_path = pre_path+'model/'

for ratio in [0.75]:
    model = T5Model().to(DEVICE)
    train_losses, output = temp_train_T5(model, data, goemo_ratio=ratio, epochs=20, lr=2e-5, batch_size=16, show_progress=True, save_path=save_path)

# Expand the abbreviated metric keys produced by `temp_train_T5` into the names
# the analysis cells read. The mapping is by key, so it does not depend on the
# insertion order of the per-epoch dicts; unknown keys are kept unchanged.
metric_names = {'val_loss': 'val_loss', 'train_loss': 'train_loss',
                'f1_test': 'f1_test', 'j_test': 'jaccard_test', 'p_test': 'precision_test',
                'r_test': 'recall_test', 'h_test': 'hamming_test',
                'f1_val': 'f1_val', 'j_val': 'jaccard_val', 'p_val': 'precision_val',
                'r_val': 'recall_val', 'h_val': 'hamming_val'}
new_output = {
    epoch: {metric_names.get(key, key): value for key, value in scores.items()}
    for epoch, scores in output.items()
}

# NOTE(review): this writes into `<pre_path>model/`, while the analysis cell
# reads 'osfstorage/model/epochs/epoch-v-metrics.pkl' - confirm the file ends
# up in the uploaded `epochs` folder.
with open(save_path + "epoch-v-metrics.pkl", 'wb') as f:
    pickle.dump(new_output, f)

## Duplicating files into osf storage from Google Drive

In [None]:
!osf init
!osf list
!osf -p cku2b upload -r /content/drive/MyDrive/DeepZen/model/epochs /model/

# Analysis of results

## Epoch per Metric and Validation Loss

In [None]:
# Read the per-epoch metrics written by the training section.
pre_path = 'osfstorage/model/epochs/'
# pre_path = '/content/drive/MyDrive/DeepZen/model/epochs/'

# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# that you produced yourself.
with open(pre_path+"epoch-v-metrics.pkl", 'rb') as f:
    new_output = pickle.load(f)

def get_metric_score(d,metric='val_loss'):
    """Return the values of `metric` for epochs 0 .. max(d), in epoch order.

    `d` maps integer epoch indices to dicts of metric values; every epoch in
    that range must be present.
    """
    last_epoch = max(d.keys())
    return [d[epoch][metric] for epoch in range(last_epoch + 1)]

def p_line(axs, title, ylabel, xlabel, data_x, data_y, line_label=None, line_color=None, marker=None):
    """Draw one line on `axs` and set the axes' title, axis labels and grid.

    `line_label`, `line_color` and `marker` are forwarded to `axs.plot` as
    `label`, `color` and `marker`.
    """
    axs.plot(data_x, data_y, label=line_label, color=line_color, marker=marker)
    axs.grid(True, which='both')
    axs.set_xlabel(xlabel)
    axs.set_ylabel(ylabel)
    axs.set_title(title)

# Metric keys plotted per panel; the two lists are paired position by position
# (test-split key, validation-split key) by the plotting cell.
test_list = ['val_loss','hamming_test','f1_test', 'jaccard_test', 'precision_test', 'recall_test']
val_list = ['val_loss','hamming_val','f1_val', 'jaccard_val', 'precision_val', 'recall_val']

In [None]:
# 3 x 2 grid: one panel per metric pair, each with a dashed red line at the
# epoch where the validation loss is lowest.
fig, ax = plt.subplots(3, 2, figsize=(10, 10), constrained_layout=True)
ax = ax.flatten()
val_loss = get_metric_score(new_output, 'val_loss')

epochs = len(new_output)
epoch_axis = range(epochs)

for idx, (t, v) in enumerate(zip(test_list, val_list)):
    panel = ax[idx]
    metric = re.findall("^(.*?)_", t)[0].capitalize()

    # Per-panel settings: title, y-label, series to draw, legend entries/position.
    if metric == 'Val':
        title, ylabel = "Validation", 'loss'
        series = [(t, 'test', "o")]
        legend_entries, legend_loc = ['Val Loss', 'Min Val'], "upper right"
    elif metric == 'Hamming':
        title, ylabel = 'Hamming Loss', 'loss'
        series = [(t, 'test', "o"), (v, 'val', "^")]
        legend_entries, legend_loc = ['Test', 'Val', 'Min Val'], "upper right"
    else:
        title, ylabel = metric, 'score'
        series = [(t, 'test', "o"), (v, 'val', "^")]
        legend_entries, legend_loc = ['Test', 'Val', 'Min Val'], "lower right"

    for key, label, marker in series:
        p_line(panel, title, ylabel, 'epochs', epoch_axis, get_metric_score(new_output, key),
               line_label=label, marker=marker)
    panel.axvline(x=np.argmin(val_loss), ymin=0, ymax=1, color='red', linestyle="--")
    panel.legend(legend_entries, loc=legend_loc)
    panel.set_xlim(-1, epochs)

fig.savefig("/epochs-v-metrics.png")

## F1 Score of Models (Testing)

### Obtain Metric Scores for BERT Models

In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score, hamming_loss, accuracy_score, jaccard_score

# Load the fine-tuned BERT-family checkpoints and score them on the test split.
model_names = ['distilbert-base-cased', 'bert-base-cased', 'roberta-base']#,'t5-base']
DEVICE = 'cuda' #'cpu'
pre_path = 'osfstorage/model/epochs/'
# pre_path = '/content/drive/MyDrive/DeepZen/model/epochs/'
tuned_models = {}
for name in model_names:
    # Architecture = text before the first '-' ('distilbert', 'bert', 'roberta').
    architecture = re.findall("^(.*?)-",name)[0]
    tuned_models[name] = MultiLabelClassificationModel(architecture, pre_path+name+"/outputs/best_model")
print(tuned_models.keys())

# `test_data`, `t5_data_set` and `threshold` are reused by the following
# metric cells; the four dicts below collect one score vector per model.
test_data = EmotionsDataset(data['test'], max_length=64, device=DEVICE)
t5_data_set = T5Dataset(data['test'], goemo_ratio=1.0)
threshold = 0.3
f1, precision, recall, jaccard = {}, {}, {}, {}
for name in model_names:
    # y_pred_test = predict(torch.load('/content/drive/MyDrive/DeepZen/model/epochs/roberta-base/epoch_2_0.15042.pt', map_location=torch.device(DEVICE)), test_data)
    # Second return value of predict() is thresholded below into 0/1 labels.
    _, y_pred_test = tuned_models[name].predict(test_data.text)
    architecture = re.findall("^(.*?)-",name)[0]
    y_true_test = test_data.labels.cpu().numpy()
    y_pred_test = (y_pred_test>=threshold) * np.ones(y_pred_test.shape)
    # Each vector = per-label scores followed by the macro average.
    f1[architecture] = np.hstack((f1_score(y_true_test, y_pred_test, average=None),
                          f1_score(y_true_test, y_pred_test, average='macro')))
    precision[architecture] = np.hstack((precision_score(y_true_test, y_pred_test, average=None),
                                 precision_score(y_true_test, y_pred_test, average='macro')))
    recall[architecture] = np.hstack((recall_score(y_true_test, y_pred_test, average=None),
                              recall_score(y_true_test, y_pred_test, average='macro')))
    # The Jaccard vector additionally carries the Hamming loss as its last entry.
    jaccard[architecture] = np.hstack((jaccard_score(y_true_test, y_pred_test, average=None),
                                       jaccard_score(y_true_test, y_pred_test, average='macro'),
                                       hamming_loss(y_true_test, y_pred_test)))
    hl = hamming_loss(y_true_test, y_pred_test)
    a = accuracy_score(y_true_test, y_pred_test)
    print("Name: {} yields an Hamming Loss of {} and Accuracy of {}".format(name, hl, a))

### Obtain Metric Scores for BERT Models - Reduced

In [None]:
# Score the BERT-family checkpoints trained on the reduced data. Relies on
# `test_data`, `threshold` and the f1/precision/recall/jaccard dicts created by
# the previous cell; results are stored under '<architecture>\nreduced'.
model_names = ['distilbert-base-cased', 'bert-base-cased', 'roberta-base']
pre_path = 'osfstorage/model/epochs/reduced/'
# pre_path = '/content/drive/MyDrive/DeepZen/model/epochs/reduced/'
tuned_models = {}
for name in model_names:
    architecture = re.findall("^(.*?)-",name)[0]
    tuned_models[name] = MultiLabelClassificationModel(architecture, pre_path+name+"/outputs/best_model")
print(tuned_models.keys())

for name in model_names:
    # y_pred_test = predict(torch.load('/content/drive/MyDrive/DeepZen/model/epochs/roberta-base/epoch_2_0.15042.pt', map_location=torch.device(DEVICE)), test_data)
    # Second return value of predict() is thresholded below into 0/1 labels.
    _, y_pred_test = tuned_models[name].predict(test_data.text)
    architecture = re.findall("^(.*?)-",name)[0]+"\nreduced"
    y_true_test = test_data.labels.cpu().numpy()
    y_pred_test = (y_pred_test>=threshold) * np.ones(y_pred_test.shape)
    # Each vector = per-label scores followed by the macro average.
    f1[architecture] = np.hstack((f1_score(y_true_test, y_pred_test, average=None),
                          f1_score(y_true_test, y_pred_test, average='macro')))
    precision[architecture] = np.hstack((precision_score(y_true_test, y_pred_test, average=None),
                                 precision_score(y_true_test, y_pred_test, average='macro')))
    recall[architecture] = np.hstack((recall_score(y_true_test, y_pred_test, average=None),
                              recall_score(y_true_test, y_pred_test, average='macro')))
    # The Jaccard vector additionally carries the Hamming loss as its last entry.
    jaccard[architecture] = np.hstack((jaccard_score(y_true_test, y_pred_test, average=None),
                                       jaccard_score(y_true_test, y_pred_test, average='macro'),
                                       hamming_loss(y_true_test, y_pred_test)))
    hl = hamming_loss(y_true_test, y_pred_test)
    a = accuracy_score(y_true_test, y_pred_test)
    print("Name: {} yields an Hamming Loss of {} and Accuracy of {}".format(name, hl, a))

### Obtain average Metric Scores for T5 Models

In [None]:
# pre_path = '/content/drive/MyDrive/DeepZen/model/epochs'
pre_path = 'osfstorage/model/epochs/'
ratios=[1.0,0.75,0.5,0.333]
f1_std, precision_std, recall_std, jaccard_std = {},{},{},{}
f1_raw, precision_raw, recall_raw, jaccard_raw = {},{},{},{}

# The ground truth is identical for every checkpoint, so compute it once.
y_true_test = test_data.labels.cpu().numpy()

# Each '<pre_path><ratio>' folder holds the checkpoints of one goemo ratio.
# Every checkpoint is scored; mean and standard deviation over the checkpoints
# are stored per ratio, the raw per-checkpoint vectors in the *_raw dicts.
# (`root` rather than `dir`, so the builtin is not shadowed.)
for root, folders, files in os.walk(pre_path):
    for ratio in ratios:
        path = pre_path + str(ratio)
        # Skip other folders, and ratio folders without checkpoints (otherwise
        # `name` would be unbound or left over from a previous ratio).
        if root != path or not files:
            continue
        f_all, p_all, r_all, j_all = [], [], [], []
        for f in files:
            fname = path + '/' + f
            # Model name from the text between the first and last underscore
            # of the file name.
            name = 't5-' + re.findall('_(.+)_', f)[0]
            y_pred_test = predict_t5(torch.load(fname, map_location=torch.device(DEVICE)), t5_data_set)
            # Per-label scores followed by the macro average ...
            f_all.append(np.hstack((f1_score(y_true_test, y_pred_test, average=None),
                                    f1_score(y_true_test, y_pred_test, average='macro'))))
            p_all.append(np.hstack((precision_score(y_true_test, y_pred_test, average=None),
                                    precision_score(y_true_test, y_pred_test, average='macro'))))
            r_all.append(np.hstack((recall_score(y_true_test, y_pred_test, average=None),
                                    recall_score(y_true_test, y_pred_test, average='macro'))))
            # ... and, for Jaccard only, the Hamming loss as the last entry.
            j_all.append(np.hstack((jaccard_score(y_true_test, y_pred_test, average=None),
                                    jaccard_score(y_true_test, y_pred_test, average='macro'),
                                    hamming_loss(y_true_test, y_pred_test))))
        f1_raw[name], precision_raw[name], recall_raw[name], jaccard_raw[name] = f_all, p_all, r_all, j_all
        f1[name], f1_std[name] = np.mean(f_all, axis=0), np.std(f_all, axis=0)
        precision[name], precision_std[name] = np.mean(p_all, axis=0), np.std(p_all, axis=0)
        recall[name], recall_std[name] = np.mean(r_all, axis=0), np.std(r_all, axis=0)
        jaccard[name], jaccard_std[name] = np.mean(j_all, axis=0), np.std(j_all, axis=0)
        print("Name: {} yields an Hamming Loss of {} and Macro F1 of {}".format(name, jaccard[name][-1], f1[name][-1]))

### Obtain Metric Scores for T5 Models - Reduced

In [None]:
# pre_path = '/content/drive/MyDrive/DeepZen/model/epochs/reduced/'
pre_path = 'osfstorage/model/epochs/reduced/'
ratios=[1.0,0.75,0.5,0.333]

# The ground truth is identical for every checkpoint, so compute it once.
y_true_test = test_data.labels.cpu().numpy()

# Score every checkpoint stored directly inside `pre_path` (sub-folders are
# ignored). (`root` rather than `dir`, so the builtin is not shadowed.)
for root, folders, files in os.walk(pre_path):
    if root != pre_path:
        continue
    for f in files:
        print(f)
        fname = os.path.join(pre_path, f)
        # Key = 't5-' + text between the first and last underscore of the
        # file name + '\nreduced'.
        architecture = 't5-' + re.findall('_(.+)_', f)[0] + "\nreduced"
        y_pred_test = predict_t5(torch.load(fname, map_location=torch.device(DEVICE)), t5_data_set)
        # Per-label scores followed by the macro average ...
        f1[architecture] = np.hstack((f1_score(y_true_test, y_pred_test, average=None),
                                      f1_score(y_true_test, y_pred_test, average='macro')))
        precision[architecture] = np.hstack((precision_score(y_true_test, y_pred_test, average=None),
                                             precision_score(y_true_test, y_pred_test, average='macro')))
        recall[architecture] = np.hstack((recall_score(y_true_test, y_pred_test, average=None),
                                          recall_score(y_true_test, y_pred_test, average='macro')))
        # ... and, for Jaccard only, the Hamming loss as the last entry.
        jaccard[architecture] = np.hstack((jaccard_score(y_true_test, y_pred_test, average=None),
                                           jaccard_score(y_true_test, y_pred_test, average='macro'),
                                           hamming_loss(y_true_test, y_pred_test)))

In [None]:
# Create Index for f1, precision and recall tables
# Rows = values of config.GOEMO_MAPPING followed by 'macro'; the Jaccard table
# gets one extra 'hamming loss' row.
vals = list(config.GOEMO_MAPPING.values())
base_index = np.hstack((vals, 'macro'))
added_index = np.hstack((base_index,'hamming loss'))
# Column order of the result tables; the names match the dict keys filled in
# by the metric cells above.
model_names=['distilbert', 'bert', 'roberta', 't5-1.0',  't5-0.75', 't5-0.5', 't5-0.333',
             'distilbert\nreduced', 'bert\nreduced', 'roberta\nreduced',
             't5-1.0\nreduced',  't5-0.75\nreduced', 't5-0.5\nreduced', 't5-0.333\nreduced']
base_index,added_index

In [None]:
# Use the fully-qualified option name: the bare 'precision' pattern also matches
# 'styler.format.precision' on newer pandas releases and raises an OptionError.
pd.set_option('display.precision', 3)

# Score tables: rows = labels (+ 'macro', and 'hamming loss' for Jaccard),
# columns = models in `model_names` order.
f1_emotions = pd.DataFrame(f1, index=base_index, columns=model_names)
precision_emotions = pd.DataFrame(precision, index=base_index,columns=model_names)
recall_emotions = pd.DataFrame(recall, index=base_index, columns=model_names)
jaccard_emotions = pd.DataFrame(jaccard, index=added_index,columns=model_names)

# Standard deviations exist only for the models present in the *_std dicts;
# the remaining columns are filled with NaN by the DataFrame constructor.
f1_std_ = pd.DataFrame(f1_std, index=base_index, columns=model_names)
precision_std_ = pd.DataFrame(precision_std, index=base_index, columns=model_names)
recall_std_ = pd.DataFrame(recall_std, index=base_index, columns=model_names)
jaccard_std_ = pd.DataFrame(jaccard_std, index=added_index, columns=model_names)

## Violin Plots of T5 Models

In [None]:
metrics = {'f1':f1_raw, 'precision':precision_raw, 'recall':recall_raw, 'jaccard':jaccard_raw}
t5_variants = ['t5-1.0',  't5-0.75', 't5-0.5', 't5-0.333']

# Per metric and T5 variant, collect the macro score of every checkpoint. The
# macro value is the last entry of a score vector, except for Jaccard where
# the Hamming loss comes last and the macro value sits just before it.
plot_output = {}
for metric, raw_scores in metrics.items():
    macro_column = -2 if metric == "jaccard" else -1
    plot_output[metric] = {
        name: list(zip(*raw_scores[name]))[macro_column]
        for name in t5_variants
    }

fig, ax = plt.subplots(2,2,figsize=(8,6), constrained_layout=True)
ax = ax.flatten()

# One panel per metric: violin of the checkpoint scores plus a dashed line
# through the per-variant means.
for panel, (metric, scores) in zip(ax, plot_output.items()):
    score_frame = pd.DataFrame(scores)
    sns.lineplot(y=score_frame.mean(), x=t5_variants, linestyle="--", ax=panel)
    sns.violinplot(data=score_frame, palette="Set3", ax=panel)
    panel.set_title(metric.capitalize())

fig.savefig('/Violin Plots of T5 Models.png', bbox_inches='tight')

## Create Heat Map w Standard Deviation for T5 Models

In [None]:
t5 = ['t5-1.0',  't5-0.75', 't5-0.5', 't5-0.333']

# One heat map per metric: mean scores in `a`, matching standard deviations in `b`.
for mean_table, std_table, title in (
    (jaccard_emotions, jaccard_std_, "T5 - Jaccard Score and Hamming Loss"),
    (recall_emotions, recall_std_, "T5 - Recall Score"),
    (precision_emotions, precision_std_, "T5 - Precision Score"),
    (f1_emotions, f1_std_, "T5 - F1 Score"),
):
    a = mean_table.loc[:, t5]
    b = std_table.loc[:, t5]
    create_heatmap(a, b, title)

## Create Heat Map of BERT and T5 models

In [None]:
# Compare the three BERT-family models with T5 (ratio 1.0), one heat map per metric.
model_names = ['distilbert', 'bert', 'roberta', 't5-1.0']
for score_table, title in (
    (f1_emotions, 'F1 Score'),
    (precision_emotions, 'Precision Score'),
    (recall_emotions, 'Recall Score'),
    (jaccard_emotions, 'Jaccard Score and Hamming Loss'),
):
    create_heatmap(score_table.loc[:, model_names], None, title)

## Create Heat Map of Best-in-Class Counts for T5 vs BERT

In [None]:
# Tally per metric via `groupby_count` (utils), restricted to the BERT-family
# models and T5-1.0.
t5 = ['distilbert', 'bert', 'roberta', 't5-1.0']
score_tables = {"F1": f1_emotions, "Precision": precision_emotions,
                "Recall": recall_emotions, "Jaccard": jaccard_emotions}
summary = {metric: groupby_count(table.loc[:, t5], t5) for metric, table in score_tables.items()}
create_heatmap(pd.DataFrame(summary), None, "Count of Best-in-Class per Metric - T5 vs BERT", fig_size=(4,4))

## Create Heat Map of Best-in-Class Counts for T5

In [None]:
# Tally per metric via `groupby_count` (utils), restricted to the four T5 variants.
t5 = ['t5-1.0',  't5-0.75', 't5-0.5', 't5-0.333']
summary = {}
for metric, table in (("F1", f1_emotions), ("Precision", precision_emotions),
                      ("Recall", recall_emotions), ("Jaccard", jaccard_emotions)):
    summary[metric] = groupby_count(table.loc[:, t5], t5)
create_heatmap(pd.DataFrame(summary), None, "Count of Best-in-Class per Metric - T5", fig_size=(4,4))

## Create Heat Map of Best-in-Class Counts for T5 vs RoBERTa

In [None]:
# Tally per metric via `groupby_count` (utils), restricted to RoBERTa and T5-1.0.
t5 = ['roberta', 't5-1.0']
metric_labels = ("F1", "Precision", "Recall", "Jaccard")
metric_tables = (f1_emotions, precision_emotions, recall_emotions, jaccard_emotions)
summary = dict(
    (label, groupby_count(table.loc[:, t5], t5))
    for label, table in zip(metric_labels, metric_tables)
)
create_heatmap(pd.DataFrame(summary), None, "Count of Best-in-Class per Metric - T5 vs RoBERTa", fig_size=(4,2))

## Highest Precision and Recall Difference and TF-IDF Dot Product
Plots the largest precision and recall score differences together with their corresponding TF-IDF scores (requires the text-analysis cells above to have been run).

In [None]:
# Compare the average T5 score with the average BERT-family score per label and
# show the labels where they differ by more than 0.15 next to the TF-IDF dot
# product from the text-analysis section (`tfidf` must already exist).
t5 = ['t5-1.0',  't5-0.75', 't5-0.5', 't5-0.333']
bert = ['distilbert', 'bert', 'roberta']
# Row-wise means over the T5 variants (p, r) and over the BERT models (p_, r_).
p,r = precision_emotions[t5].mean(axis=1), recall_emotions[t5].mean(axis=1)
p_,r_ = precision_emotions.loc[:,bert].mean(axis=1), recall_emotions.loc[:,bert].mean(axis=1)
p.name, r.name = 'Precision_T5','Recall_T5'
p_.name ,r_.name = 'Precision_Bert','Recall_Bert'
diff_r= r-r_
diff_r.name = 'T5-Bert Recall'
# Rows failing the threshold become NaN via where() and are removed by dropna(),
# as are rows that have no TF-IDF value.
a = pd.concat([diff_r,pd.DataFrame(tfidf,index=['TF-IDF Dot Product']).transpose()], axis=1).where(abs(diff_r)>0.15).dropna()
create_heatmap(a,None,"Recall and TF-IDF",fig_size=(5,2))

diff_p= p-p_
diff_p.name = 'T5-Bert Precision'
b = pd.concat([diff_p,pd.DataFrame(tfidf,index=['TF-IDF Dot Product']).transpose()], axis=1).where(abs(diff_p)>0.15).dropna()
create_heatmap(b,None,"Precision and TF-IDF",fig_size=(5,2))

# Analysis on Reduced vs Full T5

## Create Heat Map of BERT and T5 models - Reduced

In [None]:
# F1 heat map restricted to the models trained on the reduced data; the column
# keys carry the '\nreduced' suffix added by the metric cells.
model_names=['distilbert\nreduced', 'bert\nreduced', 'roberta\nreduced', 't5-1.0\nreduced']
create_heatmap(f1_emotions.loc[:,model_names],None,'Reduced - F1 Score')

## Create Heat Map of F1 score Full vs Reduced of T5-0.75

In [None]:
# F1 of T5-0.75 trained on the full vs. the reduced data, for a hand-picked
# subset of seven emotions.
t5 = ['t5-0.75','t5-0.75\nreduced']
emotions = ['admiration','gratitude','approval','grief','nervousness','pride','relief']
a = f1_emotions.loc[emotions,t5]

# Figure height 15/28*7 = 3.75, i.e. 15/28 per emotion row for the 7 rows shown.
create_heatmap(a,None,"T5-0.75 F1 Score Full vs Reduced", fig_size=(3,15/28.*7))

## Histogram of samples per GoEmotions label

In [None]:
# Same bar chart as "Number of samples per GoEmotion label", computed on the
# splits returned by `data_reduction` (utils). Requires `label_names` from the
# label-analysis section.
data = load_dataset('go_emotions')
# val_data = T5Dataset(data['validation'], goemo_ratio=1.0)
train,test,val = data_reduction(data['train']),data_reduction(data['test']),data_reduction(data['validation'])

# Pool the labels of all reduced splits and count the occurrences per class id.
data_labels_all = train['labels'] + test['labels'] + val['labels']
all_labels = list(itertools.chain.from_iterable(data_labels_all))
_labels, _counts = np.unique(all_labels, return_counts=True)
fig, ax = plt.subplots(constrained_layout=True)
plt.bar(_labels, _counts, align='center')
plt.gca().set_xticks(_labels)
ax.set_xticklabels(label_names, horizontalalignment='right', wrap=True, rotation=45)
plt.title("Reduced - Number of samples per GoEmotion label")
plt.show()
fig.savefig("/reduced-number-of-samples-per-goemo.png")

## Create zip folder of all images and download

In [None]:
from google.colab import files
import shutil
# Move every file that sits directly in the filesystem root (where the figures
# above were saved) into /figures and report how many were moved.
# NOTE(review): this moves ALL files found in "/", not only the saved figures.
count = 0
os.makedirs('/figures', exist_ok=True)

# Only the top-level listing is needed: take the first os.walk() entry instead
# of traversing the whole filesystem.
root, _subdirs, fnames = next(os.walk("/"))
for f in fnames:
    count += 1
    # An explicit destination file lets a re-run replace an earlier copy
    # instead of failing because the file already exists in /figures.
    shutil.move(os.path.join(root, f), os.path.join("/figures", f))
print(count)

In [None]:
!zip -r /content/figures.zip /figures
# Download the archive created by the `zip` command above via the Colab
# `files` helper.
files.download("/content/figures.zip")