In [31]:
!pip install lightning

[0m

In [33]:
!pip install transformers

[0m

In [34]:
!pip install torchmetrics

[0m

In [35]:
from torch.utils.data import Dataset, DataLoader
import torchtext
import numpy as np
import pandas as pd
import torch
import lightning.pytorch as pl
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import ModelCheckpoint, Callback
from torchmetrics import Accuracy, F1Score
import torchmetrics
import matplotlib.pyplot as plt 

# utils

In [36]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

## dataset

In [37]:
class EmpathyConversationDataset(Dataset):
    

    FILE_PATH = {'train': "/kaggle/input/dialogue-empathy-detection/train.csv",
                 'test': "/kaggle/input/dialogue-empathy-detection/test.csv",
                 'val': "/kaggle/input/dialogue-empathy-detection/val.csv",}

    def __init__(self, split="train", transforms=None):

        if split.lower() not in self.FILE_PATH.keys():
            raise Exception("must be train or test or val")

        df = pd.read_csv(self.FILE_PATH[split.lower()])
        df = self.conv_preprocess(df)
        
        self.x = df[['utterance']].to_numpy()
        self.y = df[['empathy']].to_numpy()
        self.n_sample = len(df)

        self.transforms = transforms
    
    def conv_preprocess(self, df):
        return df.groupby('conv_id')['utterance'].apply(list).reset_index().\
                  merge(df.groupby('conv_id')['empathy'].max().reset_index(), on='conv_id', how="inner")

    def __getitem__(self, index):
        sample = self.x[index], self.y[index]
        return self._pipline_transforms(sample)
    
    def _pipline_transforms(self, sample):
        if self.transforms:
            for transform in self.transforms:
                sample = transform(sample)
        return sample
    
    def __len__(self):
        return self.n_sample

In [38]:
class TextListCleaner:

    punc = '''!()-[]{.};:'"\,<>/?@#$%^&*_~`|’“”…—–'''

    def __call__(self, sample):
        texts, target = sample
        texts = texts[0]
        new_texts = list()
        
        for text in texts:
            text = text.lower()
            for each in self.punc:
                text = text.replace(each, ' ')
            new_texts.append(text)
        return np.array([new_texts]), target


class ConversationFormater:
    SPECIAL_TOKEN_START_UTTERANCE = "<BOU>"
    SPECIAL_TOKEN_END_UTTERANCE = "<EOU>"  
    
    def __call__(self, sample):
        texts, target = sample
        texts = texts[0]
        
        conversation = str()
        for text in texts:
            conversation += f"{self.SPECIAL_TOKEN_START_UTTERANCE} {text} {self.SPECIAL_TOKEN_END_UTTERANCE} "
        return np.array([conversation]), target

    
class Tokenizer:

    def __init__(self, version="bert-base-uncased", max_len=128, tokenizer=None, new_special_tokens=None):
        self.tokenizer = AutoTokenizer.from_pretrained(version) if tokenizer is None else tokenizer
        
        if new_special_tokens:
            tokenizer.add_special_tokens(new_special_tokens)
            
        self.MAX_LEN = max_len

    def __call__(self, sample):
        text, target = sample
        inputs = self.tokenizer.encode_plus(text[0], add_special_tokens=True, max_length=self.MAX_LEN, padding='max_length', 
                                             return_attention_mask=True, return_token_type_ids=True, truncation=True)
        return inputs['input_ids'], inputs['attention_mask'], inputs['token_type_ids'], target


class ToTensor:
    # Convert ndarrays to Tensors
    def __call__(self, sample):
        return tuple(torch.from_numpy(np.array(each)) for each in sample)
    

class OneHotLabel:
    
    def __init__(self, num_classes):
        self.num_classes = num_classes
    
    def __call__(self, sample):
        target = sample[-1]
        target = torch.squeeze(torch.nn.functional.one_hot(target, num_classes=self.num_classes), dim=0)
        sample = list(sample[:-1]) + [target]
        return tuple(sample)

In [39]:
class HistoryCallback(Callback):

    def __init__(self):
        super().__init__()
        self.metrics = []

    def on_validation_end(self, trainer, pl_module):
        self.metrics.append({key: value.item() for key, value in trainer.callback_metrics.items()})

# DiagGPT

In [40]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")

In [41]:
pip_trans = [TextListCleaner(), 
             ConversationFormater(),
             Tokenizer(tokenizer=tokenizer, new_special_tokens={'additional_special_tokens': [ConversationFormater.SPECIAL_TOKEN_START_UTTERANCE,
                                                                                              ConversationFormater.SPECIAL_TOKEN_END_UTTERANCE],
                                                               'pad_token': '[PAD]'},
                        max_len=512),
             ToTensor()]

In [42]:
tokenizer.all_special_tokens

['<|endoftext|>', '[PAD]', '<BOU>', '<EOU>']

In [43]:
train_dataset = EmpathyConversationDataset(transforms=pip_trans)
test_dataset = EmpathyConversationDataset(split="test", transforms=pip_trans)
val_dataset = EmpathyConversationDataset(split="val", transforms=pip_trans)

In [12]:
train_dataset[0]

(tensor([50257,  1312,   285,  1760,  2282,  1312,  1842,   345,   284,   607,
           780,  1312,   836,   256,   765,   284,  3285,   257,  6486,   736,
           284,   502,   220,  1312,  1254,   523, 13400,   290, 19125,   290,
          9642,  2668,   220, 50258,   220, 50257,  4686,    74,   644,   257,
          2266,  9582,  1724,  3446,   475,   616,  5608,   284,   345,   318,
           611,   673, 13622,   345,   826,   788,  5089,   607,   612,   389,
         13188,   286,   584,  4813,   503,   612,   220,   290,   611,   334,
          1254,   588,   287,  1842,   351,   607,   290,   655,  2666,   607,
           788,  1949,   284,   407,  1337,   290,   345,   423,   284,  2453,
           262,  1109,   326,   617,  4813,   355,   880,   355,   617,  3730,
           220,   481,  6486,  3892,   284,   534,  1986,   329,   812,  1231,
           597, 34081,   220,   616,  5608,   318,  2245, 18088,   878,  1165,
          2739,   220,   220, 50258,   220, 50259, 5

In [44]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=4)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=4)

In [45]:
from transformers import AutoConfig
AutoConfig.from_pretrained('microsoft/DialoGPT-small')

GPT2Config {
  "_name_or_path": "microsoft/DialoGPT-small",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "conversational": {
      "max_length": 1000
    }
  },
  "transformers_version": "4.27.4",
  "use_cache": true,
  "vocab_size": 50257
}

In [46]:
from transformers import AutoModel
from transformers import AutoConfig


class EmpathyDetectionDialoGPTModel(pl.LightningModule):
    
    LOSS = torch.nn.BCEWithLogitsLoss()
    
    def __init__(self, embedding_tokens_len=None, config=AutoConfig.from_pretrained('microsoft/DialoGPT-small')):
        super().__init__()
        self.transformer_model = AutoModel.from_config(config)
        if embedding_tokens_len:
            # when transformer_model.wte.weight.shape[0] != len(tokenizer)
            self.transformer_model.resize_token_embeddings(embedding_tokens_len)
        self.drop = torch.nn.Dropout(0.5)
        self.out = torch.nn.Linear(768, 1)

    def forward(self, ids, mask, token_type_ids):
        x = self.transformer_model(ids, attention_mask=mask, token_type_ids=token_type_ids)[0].mean(dim=1)
        x = self.drop(x)
        output = self.out(x)
        return output
    
    def training_step(self, batch, batch_idx):
        ids, mask, token_type_ids, y = batch
        pred = self(ids, mask, token_type_ids)
        loss = self.LOSS(pred, y.float())
        acc = torchmetrics.functional.classification.binary_accuracy(pred, y.float())
        self.log_dict({"train_loss": loss, "train_accuracy": acc},on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        ids, mask, token_type_ids, y = batch
        pred = self(ids, mask, token_type_ids)
        val_loss = self.LOSS(pred, y.float())
        acc = torchmetrics.functional.classification.binary_accuracy(pred, y.float())
        self.log_dict({"val_loss": val_loss, "val_accuracy": acc}, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
        return val_loss
    
    def test_step(self, batch, batch_idx):
        ids, mask, token_type_ids, y = batch
        pred = self(ids, mask, token_type_ids)
        test_loss = self.LOSS(pred, y.float())
        acc = torchmetrics.functional.classification.binary_accuracy(pred, y.float())
        f1_score = torchmetrics.functional.f1_score(pred, y.float(), task="binary")
        self.log_dict({"test_loss": test_loss, "test_accuracy": acc, "test_f1": f1_score}, 
                      on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
        return test_loss
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=5e-5)
        return optimizer

In [47]:
config = AutoConfig.from_pretrained('microsoft/DialoGPT-small', n_positions=512)

In [48]:
model = EmpathyDetectionDialoGPTModel(embedding_tokens_len=len(tokenizer), config=config)

In [49]:
model_checkpoint = ModelCheckpoint(
    save_top_k=1,
    monitor="val_loss",
    mode="min",
    dirpath="./dialoGPT",
    filename="dialoGPT-empathy-conv-{epoch:02d}-{val_loss:.2f}",
)
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=5)
history_callback = HistoryCallback()
trainer = pl.Trainer(limit_train_batches=100, max_epochs=50, callbacks=[history_callback, early_stop, model_checkpoint], 
                     accelerator='auto', devices=2)

INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs


In [50]:
trainer.fit(model=model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

INFO: Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
INFO: Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
INFO: ----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
INFO: LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]
INFO: 
  | Name              | Type      | Params
------------------------------------------------
0 | transformer_model | GPT2Model | 124 M 
1 | drop              | Dropout   | 0     
2 | out               | Linear    | 769   
------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
496.199   Total estimated model params size (MB

Sanity Checking: 0it [00:00, ?it/s]

  "strategy=ddp_spawn and num_workers=0 may result in data loading bottlenecks."


Training: 0it [00:00, ?it/s]



Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [51]:
# history_df = pd.DataFrame(history_callback.metrics)
# history_df.dropna(inplace=True)

In [52]:
# plt.plot(history_df.train_accuracy)
# plt.plot(history_df.val_accuracy)
# plt.title('model accuracy')
# plt.ylabel('accuracy')
# plt.xlabel('epoch')
# plt.legend(['Train', 'Validation'], loc='upper left')

In [53]:
# plt.plot(history_df.train_loss)
# plt.plot(history_df.val_loss)
# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['Train', 'Validation'], loc='upper left')

In [54]:
trainer.test(model, dataloaders=test_dataloader, ckpt_path='best')

INFO: Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
INFO: Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
INFO: ----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------

INFO: Restoring states from the checkpoint path at /kaggle/working/dialoGPT/dialoGPT-empathy-conv-epoch=04-val_loss=0.60.ckpt
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
INFO: LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]
INFO: Loaded model weights from checkpoint at /kaggle/working/dialoGPT/dialoGPT-empathy-conv-epoch=04-val_loss=0.60.ckpt
  "strategy=ddp_spawn and num_workers=0 may result in data loading bottlenecks."


Testing: 0it [00:00, ?it/s]

[{'test_loss_epoch': 0.6181172728538513,
  'test_accuracy_epoch': 0.6926829218864441,
  'test_f1_epoch': 0.7867595553398132}]