In [17]:
#package imports 

import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import pytorch_lightning as pl

from transformers import DistilBertModel, DistilBertTokenizer, AutoTokenizer, AutoModelWithLMHead, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup


from typing import List
import logging
import copy
import os
import sys
import gc
from functools import lru_cache
from argparse import Namespace
from packaging import version
from tqdm.autonotebook import tqdm

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [18]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

In [19]:
# This notebook is partly adapted from the link below:
# https://curiousily.com/posts/multi-label-text-classification-with-bert-and-pytorch-lightning/

In [20]:
train_path = "emotion_data/my_train.txt"
test_path = "emotion_data/my_test.txt"
val_path = "emotion_data/my_val.txt"

In [21]:
#create a dictionary associating each string label to an integer value

labels = [ "sadness", "joy", "anger", "fear"]
label2int = dict(zip(labels, list(range(len(labels)))))

## Now we build a classifier

In [32]:

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [33]:
import pandas as pd
data = pd.read_csv("emotion_data/my_train.txt",sep=";", header=None)
data1 = pd.read_csv("emotion_data/my_val.txt",sep=";", header=None)
data2 = pd.read_csv("emotion_data/my_test.txt",sep=";", header=None)
data.columns = ["text","class"]
data.head()

Unnamed: 0,text,class
0,i gotta say for the first time in a long while...,joy
1,i am pissed,anger
2,im kind of firm as the school year is coming t...,sadness
3,today im really content,joy
4,im not fine my team was disqualified and i los...,anger


## Prepare the dataset

In [34]:
class EmoDataset(Dataset):
  def __init__(
    self,
    path,
    tokenizer: DistilBertTokenizer,
    max_token_len: int = 100
  ):
    self.tokenizer = tokenizer
    self.data_column = "text"
    self.class_column = "class"
    self.data = pd.read_csv(path, sep=";", header=None, names=[self.data_column, self.class_column],
                            engine="python")
    
    self.max_token_len = max_token_len
  
  def __len__(self):
    return len(self.data)
  
  def __getitem__(self, index: int):
    data_row = self.data.iloc[index]
    text = data_row.text
    labels = label2int[data_row["class"]]
    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_token_len,
      return_token_type_ids=False,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    return (encoding["input_ids"].flatten(),encoding["attention_mask"].flatten()), torch.tensor(labels)

In [35]:
train_dataset = EmoDataset(
    train_path,
    tokenizer,
    max_token_len=100
)
sample_item = train_dataset[5]
sample_item

((tensor([ 101, 2026, 4654, 9868, 2026, 2166, 1998, 1045, 2572, 5506,  102,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0]),
  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [36]:
sample_item[1]
# sample_item["input_ids"].shape

tensor(2)

In [37]:
sample_batch = next(iter(DataLoader(train_dataset, batch_size=16)))
sample_batch[0][0].shape, sample_batch[0][1].shape

(torch.Size([16, 100]), torch.Size([16, 100]))

In [38]:
output = model(sample_batch[0][0], sample_batch[0][1])

In [39]:
output

BaseModelOutput(last_hidden_state=tensor([[[ 0.1031,  0.2227,  0.0414,  ...,  0.0191,  0.2575,  0.3722],
         [ 0.3046,  0.2739,  0.1756,  ...,  0.0326,  0.5295,  0.6913],
         [ 1.0214,  0.3050,  0.3467,  ..., -0.0498,  0.2737,  0.9548],
         ...,
         [-0.1791, -0.0794,  0.3395,  ...,  0.0381, -0.1493, -0.2007],
         [-0.2979,  0.0012,  0.2718,  ...,  0.0012, -0.2241,  0.0244],
         [-0.3091,  0.1529,  0.2534,  ..., -0.0766, -0.1674,  0.0195]],

        [[ 0.0606,  0.1232,  0.1519,  ...,  0.0015,  0.3208,  0.3495],
         [ 0.3743,  0.2497,  0.0587,  ..., -0.0790,  0.5911,  0.4906],
         [ 0.0889,  0.5393,  0.5102,  ..., -0.0486,  0.4431,  0.5649],
         ...,
         [ 0.2955,  0.2265,  0.0643,  ...,  0.2148, -0.0127,  0.1539],
         [ 0.3096,  0.2164,  0.0705,  ...,  0.2262, -0.0109,  0.1525],
         [ 0.2522,  0.2623,  0.0570,  ...,  0.1941,  0.0059,  0.1437]],

        [[ 0.0357,  0.3292,  0.1219,  ..., -0.0315,  0.3737,  0.4592],
         [-

## define custom classifcation model for fine-tuning

In [26]:
#using Mish activation function 
#(from https://github.com/digantamisra98/Mish/blob/b5f006660ac0b4c46e2c6958ad0301d7f9c59651/Mish/Torch/mish.py)
@torch.jit.script
def mish(input):
    return input * torch.tanh(F.softplus(input))
  
class Mish(nn.Module):
    def forward(self, input):
        return mish(input)

In [27]:
#define an EmoClassificationModel class to do the actual fine-tuning

class EmoClassificationModel(nn.Module):
    def __init__(self, base_model, n_classes, base_model_output_size=768, dropout=0.05):
        super().__init__()
        self.base_model = base_model
        
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(base_model_output_size, base_model_output_size),
            Mish(),
            nn.Dropout(dropout),
            nn.Linear(base_model_output_size, n_classes)
        )
        
        for layer in self.classifier:
            if isinstance(layer, nn.Linear):
                layer.weight.data.normal_(mean=0.0, std=0.02)
                if layer.bias is not None:
                    layer.bias.data.zero_()

    def forward(self, input_ids, attention_mask, *args):

        hidden_states = self.base_model(input_ids, attention_mask=attention_mask)
        
        return self.classifier(hidden_states[0][:, 0, :])

## prepare lightning module

In [48]:
#use PyTorch Lightning for training.
#we use PyTorch Lighning for training. Lightning methods are defined here

class TrainingModule(pl.LightningModule):
    def __init__(self, hparams):
        super().__init__()
        self.model = EmoClassificationModel(DistilBertModel.from_pretrained('distilbert-base-uncased'), len(labels)) #was "distilroberta-base"
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.max_token_length = 128
        self.loss = nn.CrossEntropyLoss() #cross entropy loss since this is multi-class classification
        # self.save_hyperparameters(hparams)
        self.hparams = hparams
        self.loss_amount = 0.

    def step(self, batch, step_name="train"):
        X, y = batch
        loss = self.loss(self.forward(input_ids=X[0], attention_mask = X[1]), y)
        loss_key = f"{step_name}_loss"
        tensorboard_logs = {loss_key: loss}

        return { ("loss" if step_name == "train" else loss_key): loss, 'log': tensorboard_logs,
               "progress_bar": {loss_key: loss}}

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids, attention_mask)

    def training_step(self, batch, batch_idx):
        return self.step(batch, "train")

    def validation_step(self, batch, batch_idx):
        return self.step(batch, "val")

    def validation_end(self, outputs: List[dict]):
        
        loss = torch.stack([x["val_loss"] for x in outputs]).mean()

        return {"val_loss": loss}
        
    def test_step(self, batch, batch_idx):
        return self.step(batch, "test")
    
    def train_dataloader(self):
        return self.create_data_loader(self.hparams.train_path, shuffle=True)

    def val_dataloader(self):
        return self.create_data_loader(self.hparams.val_path)

    def test_dataloader(self):
        return self.create_data_loader(self.hparams.test_path)
                
    def create_data_loader(self, ds_path: str, shuffle=False):
        return DataLoader(
                    EmoDataset(ds_path, self.tokenizer, self.max_token_length),
                    batch_size=self.hparams.batch_size,
                    shuffle=shuffle,
        )
        
    @lru_cache()
    def total_steps(self):
        return len(self.train_dataloader()) // self.hparams.accumulate_grad_batches * self.hparams.epochs

    def configure_optimizers(self):
        optimizer = AdamW(self.model.parameters(), lr=self.hparams.lr) #we use AdamW as this usually performs well
        lr_scheduler = get_linear_schedule_with_warmup(
                    optimizer,
                    num_warmup_steps=self.hparams.warmup_steps,
                    num_training_steps=self.total_steps(),
        )
        return [optimizer], [{"scheduler": lr_scheduler, "interval": "step"}]
   
    # def save_model(self):
    #     torch.save(self.model.state_dict(), 'emotion_model/BERT_emotion_1ft.pt')

## begin training

In [49]:
hparams = Namespace(
    train_path=train_path,
    val_path=val_path,
    test_path=test_path,
    batch_size=20,
    warmup_steps=100,
    epochs=30,
    lr=2E-06,
    accumulate_grad_batches=1
)
module = TrainingModule(hparams)
#rubbish collection
gc.collect()
torch.cuda.empty_cache()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


                                           

In [50]:
from pytorch_lightning.callbacks import EarlyStopping
early_stop_callback = EarlyStopping(
    monitor="val_loss",  # monitor validation loss
    min_delta=0.001, #to very small change in the monitored quantity to qualify as an improvement
    patience=20, # used to check number of time with no improvement after which training will be stopped
    verbose=False, 
    mode="min" #sed while training will stopped when the quantity monitor has stopped decreasing
    )

In [51]:
#train (using cuda)
trainer = pl.Trainer(gpus=1, max_epochs=hparams.epochs, progress_bar_refresh_rate=10,
                     accumulate_grad_batches=hparams.accumulate_grad_batches,
                     early_stop_callback=early_stop_callback)

trainer.fit(module)

INFO:lightning:GPU available: True, used: True
INFO:lightning:CUDA_VISIBLE_DEVICES: [0]
INFO:lightning:
   | Name                                                   | Type                   | Params
----------------------------------------------------------------------------------------------
0  | model                                                  | EmoClassificationModel | 66 M  
1  | model.base_model                                       | DistilBertModel        | 66 M  
2  | model.base_model.embeddings                            | Embeddings             | 23 M  
3  | model.base_model.embeddings.word_embeddings            | Embedding              | 23 M  
4  | model.base_model.embeddings.position_embeddings        | Embedding              | 393 K 
5  | model.base_model.embeddings.LayerNorm                  | LayerNorm              | 1 K   
6  | model.base_model.embeddings.dropout                    | Dropout                | 0     
7  | model.base_model.transformer                

Epoch 30:  74%|███████▍  | 40/54 [00:11<00:04,  3.46it/s, loss=0.177, train_loss=0.0638, v_num=167]


1

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
with torch.no_grad():
    progress = ["/", "-", "\\", "|", "/", "-", "\\", "|"]
    module.eval().cuda()
    true_y, pred_y = [], []
    for i, batch_ in enumerate(module.test_dataloader()):
        X,y = batch_
        input_ids = X[0]
        attention_mask = X[1]
        print(progress[i % len(progress)], end="\r")
        y_pred = torch.argmax(module(input_ids.to(device), attention_mask.to(device)), dim=1)
        true_y.extend(y.cpu())
        pred_y.extend(y_pred.cpu())
print("\n" + "_" * 80)
print(classification_report(true_y, pred_y, target_names=label2int.keys(), digits=4))

In [None]:
# plot confusion matrix
cm = confusion_matrix(true_y, pred_y, labels=range(len(labels)))
df_cm = pd.DataFrame(cm, index=labels, columns=labels)

plt.rcParams.update({'font.size':12})
plt.figure(figsize=(10,8))
sns.heatmap(df_cm, annot=True, cmap='Greens', fmt='g')

In [28]:

# save model
# module.save_model()

In [29]:
# # load model test
# hparams = Namespace(
#     train_path=train_path,
#     val_path=val_path,
#     test_path=test_path,
#     batch_size=10,
#     warmup_steps=100,
#     epochs=20,
#     lr=2.5E-05,
#     accumulate_grad_batches=1
# )
# device = torch.device('cuda:0')
# model = TrainingModule(hparams)
# model.model.load_state_dict(torch.load('empathy_model\BERT_emotion_1ft.pt'), strict=False)
# model.to(device)