In [1]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

import gc

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yisiang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
## Code below is partly adapted from the link below to fine-tuning for classification task with pytorch lightning:
# https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb#scrollTo=RKNr7fgzcKpZ

In [2]:
def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(22)

In [3]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  def on_validation_end(self, trainer, pl_module):
    logger.info("***** Validation results *****")
    if pl_module.is_logger():
      metrics = trainer.callback_metrics
      # Log results
      for key in sorted(metrics):
        if key not in ["log", "progress_bar"]:
          logger.info("{} = {}\n".format(key, str(metrics[key])))

  def on_test_end(self, trainer, pl_module):
    logger.info("***** Test results *****")

    if pl_module.is_logger():
      metrics = trainer.callback_metrics

      # Log and save results to file
      output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
      with open(output_test_results_file, "w") as writer:
        for key in sorted(metrics):
          if key not in ["log", "progress_bar"]:
            logger.info("{} = {}\n".format(key, str(metrics[key])))
            writer.write("{} = {}\n".format(key, str(metrics[key])))

In [4]:
#Load the tokenizer for the T5-base model
tokenizer = T5Tokenizer.from_pretrained('t5-base')
# tokenizer = AutoTokenizer.from_pretrained("t5-base")

In [5]:
#check how it encodes the labels (they're a tuple)
emotions = [ "sadness", "joy", "anger", "fear"]
for em in emotions:
  print(tokenizer.encode(em))

[24784, 1]
[3922, 1]
[11213, 1]
[2971, 1]


In [6]:
class EmotionDataset(Dataset):
  def __init__(self, tokenizer, data_dir, type_path,  max_len=512):
    self.path = os.path.join(data_dir, type_path + '.txt')

    self.data_column = "text"
    self.class_column = "emotion"
    self.data = pd.read_csv(self.path, sep=";", header=None, names=[self.data_column, self.class_column],
                            engine="python")
    
    self.max_len = max_len
    self.tokenizer = tokenizer
    self.inputs = []
    self.targets = []

    self._build()
  
  def __len__(self):
    return len(self.inputs)
  
  def __getitem__(self, index):
    source_ids = self.inputs[index]["input_ids"].squeeze()
    target_ids = self.targets[index]["input_ids"].squeeze()

    src_mask    = self.inputs[index]["attention_mask"].squeeze()  #might need to squeeze
    target_mask = self.targets[index]["attention_mask"].squeeze()  #might need to squeeze

    return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}
  
  def _build(self):
    for idx in range(len(self.data)):
      input_, target = self.data.loc[idx, self.data_column], self.data.loc[idx, self.class_column]      
      
      input_ = input_ + ' </s>'
      target = target + " </s>"

      # tokenize inputs
      tokenized_inputs = self.tokenizer.batch_encode_plus(
          [input_], max_length=self.max_len, pad_to_max_length=True, return_tensors="pt"
      )
       # tokenize targets
      tokenized_targets = self.tokenizer.batch_encode_plus(
          [target], max_length=2, pad_to_max_length=True, return_tensors="pt"
      )
       #tokenize inputs
      # tokenized_inputs = self.tokenizer.batch_encode_plus(
      #     [input_], max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
      # )
      #  #tokenize targets
      # tokenized_targets = self.tokenizer.batch_encode_plus(
      #     [target], max_length=2, padding="max_length", truncation=True, return_tensors="pt"
      # )

      self.inputs.append(tokenized_inputs)
      self.targets.append(tokenized_targets)

In [7]:
def get_dataset(tokenizer, type_path, args):
  return EmotionDataset(tokenizer=tokenizer, data_dir=args.data_dir, type_path=type_path,  max_len=args.max_seq_length)

In [8]:
train_path = "emotion_data/my_train.txt"
test_path = "emotion_data/my_test.txt"
val_path = "emotion_data/my_val.txt"
dataset = EmotionDataset(tokenizer, 'emotion_data/', 'my_test',  max_len=512)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [9]:
#show one sample
data = dataset[0]
# print(data)
print(tokenizer.decode(data['target_ids']))
print(tokenizer.decode(data['source_ids'],skip_special_tokens=True))
print(tokenizer.decode(data['target_ids'],skip_special_tokens=True))

sadness</s>
not very well my dog died yesterday
sadness


In [10]:
class T5FineTuner(pl.LightningModule):
  def __init__(self, hparams):
    super(T5FineTuner, self).__init__()
    self.hparams = hparams
    # self.save_hyperparameters(hparams) 
    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
  
  def is_logger(self):
    return self.trainer.global_rank  <= 0 #proc_rank rename to global rank in new transformers
  
  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
  ):
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=labels,
    )

  def _step(self, batch):
    labels = batch["target_ids"]
    labels[labels[:, :] == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=labels,
        decoder_attention_mask=batch['target_mask']
    )

    loss = outputs[0]

    return loss

  def training_step(self, batch, batch_idx):
    loss = self._step(batch)

    tensorboard_logs = {"train_loss": loss}
    return {"loss": loss, "log": tensorboard_logs}
  
  def training_epoch_end(self, outputs):
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    tensorboard_logs = {"avg_train_loss": avg_train_loss}
    return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def validation_step(self, batch, batch_idx):
    loss = self._step(batch)
    return {"val_loss": loss}
  
  def validation_epoch_end(self, outputs):
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    "Prepare optimizer and schedule (linear warmup and decay)"

    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    self.opt = optimizer
    return [optimizer]
  
  def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
    # if self.trainer.use_tpu:
    #   xm.optimizer_step(optimizer)
    # else:
    optimizer.step()
    optimizer.zero_grad()
    self.lr_scheduler.step()
  
  def get_tqdm_dict(self):
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="my_train", args=self.hparams)
    dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True)
    t_total = (
        (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="my_val", args=self.hparams)
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size)

In [11]:
#rubbish collection
gc.collect()
torch.cuda.empty_cache()

In [12]:
args_dict = dict(
    data_dir="", # path for data files
    output_dir="", # path to save the checkpoints
    model_name_or_path='t5-base',
    tokenizer_name_or_path='t5-base',
    max_seq_length=512,
    learning_rate=8e-4,
    weight_decay=0.0,
    adam_epsilon=1e-4,
    warmup_steps=0,
    train_batch_size=5,
    eval_batch_size=5,
    num_train_epochs=6,
    gradient_accumulation_steps=2,
    n_gpu=1,
    early_stop_callback=False,
    fp_16=False,
    opt_level='O1',
    max_grad_norm=1.0, 
    seed=22,
)

In [13]:
args_dict.update({'data_dir': 'emotion_data/', 'output_dir': 't5_emotion', 'num_train_epochs':6})
args = argparse.Namespace(**args_dict)
print(args_dict)

{'data_dir': 'emotion_data/', 'output_dir': 't5_emotion', 'model_name_or_path': 't5-base', 'tokenizer_name_or_path': 't5-base', 'max_seq_length': 512, 'learning_rate': 0.0008, 'weight_decay': 0.0, 'adam_epsilon': 0.0001, 'warmup_steps': 0, 'train_batch_size': 5, 'eval_batch_size': 5, 'num_train_epochs': 6, 'gradient_accumulation_steps': 2, 'n_gpu': 1, 'early_stop_callback': False, 'fp_16': False, 'opt_level': 'O1', 'max_grad_norm': 1.0, 'seed': 22}


In [14]:
#load model (uncomment to load)
device = torch.device('cuda:0')
model = T5FineTuner(args)
model.model.load_state_dict(torch.load('t5_emotion/T5emotion_finetuned.pt'), strict=False)
model.to(device)

T5FineTuner(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseReluDense(
                (wi): Linear(in_features=768, out_features=3072, bias=False)
                (wo): Linear(in_featur

## Evaluate on test Set

In [15]:
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics

In [16]:
dataset = EmotionDataset(tokenizer, 'emotion_data', 'my_test', 512)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [17]:
it = iter(loader)
batch = next(it)
batch["source_ids"].shape

torch.Size([32, 512])

In [18]:
outs = model.model.generate(input_ids=batch['source_ids'].cuda(), 
                              attention_mask=batch['source_mask'].cuda(), 
                              max_length=2)

dec = [tokenizer.decode(ids,skip_special_tokens=True) for ids in outs]

texts = [tokenizer.decode(ids,skip_special_tokens=True) for ids in batch['source_ids']]
targets = [tokenizer.decode(ids,skip_special_tokens=True) for ids in batch['target_ids']]

In [19]:
#print out some predictions
for i in range(32):
    c = texts[i]
    lines = textwrap.wrap("text:\n%s\n" % c, width=100)
    print("\n".join(lines))
    print("\nActual sentiment: %s" % targets[i])
    print("predicted sentiment: %s" % dec[i])
    print("=====================================================================\n")

text: i feel like shit

Actual sentiment: sadness
predicted sentiment: anger

text: i am sick and tired of this shit

Actual sentiment: anger
predicted sentiment: anger

text: i am not feeling very confident about today and have a feeling that something bad will happen

Actual sentiment: fear
predicted sentiment: fear

text: not so good i am missing a birthday party because i am ill

Actual sentiment: sadness
predicted sentiment: sadness

text: just pissed off about everything

Actual sentiment: anger
predicted sentiment: anger

text: amazeballs

Actual sentiment: joy
predicted sentiment: joy

text: im feeling very angry today

Actual sentiment: anger
predicted sentiment: anger

text: i am so mad i have just discovered that my husband is cheating on me with a coworker

Actual sentiment: anger
predicted sentiment: anger

text: i am worried that i will run out of money before payday

Actual sentiment: fear
predicted sentiment: fear

text: i am angry with my boss as he always shouts at me

## test metrics

In [23]:
dataset = EmotionDataset(tokenizer, 'emotion_data', 'my_test', 512)
loader = DataLoader(dataset, batch_size=32)
model.model.eval()
outputs = []
targets = []
for batch in tqdm(loader):
  outs = model.model.generate(input_ids=batch['source_ids'].cuda(), 
                              attention_mask=batch['source_mask'].cuda(), 
                              max_length=2)

  dec = [tokenizer.decode(ids,skip_special_tokens=True) for ids in outs]
  target = [tokenizer.decode(ids,skip_special_tokens=True) for ids in batch["target_ids"]]
  
  outputs.extend(dec)
  targets.extend(target)

100%|██████████| 4/4 [00:02<00:00,  1.61it/s]


In [25]:
for i, out in enumerate(outputs):
  if out not in emotions:
    print(i, 'detected invalid prediction')
    #deal with invalid predictions if any:
    del outputs[i]
    del targets[i]

In [26]:
print(metrics.classification_report(targets, outputs, digits=4))

              precision    recall  f1-score   support

       anger     0.8889    0.9143    0.9014        35
        fear     0.9167    0.8800    0.8980        25
         joy     0.9688    0.9394    0.9538        33
     sadness     0.8148    0.8462    0.8302        26

    accuracy                         0.8992       119
   macro avg     0.8973    0.8950    0.8959       119
weighted avg     0.9007    0.8992    0.8997       119

