### RoBERTa Tutorial on VPC dataset
https://www.youtube.com/watch?v=vNKIg8rXK6w

In [1]:
import requests
import zipfile
import io
import datetime
import json
import math

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import scipy.stats as stats
import researchpy as rp
import statsmodels.api as sm
from statsmodels.formula.api import ols

from patsy.builtins import *

from sklearn.model_selection import train_test_split

from torch.utils.data import Dataset
import torch
from transformers import AutoTokenizer
import pytorch_lightning as pl
from torch.utils.data import DataLoader

from transformers import AutoModel, AdamW, get_cosine_schedule_with_warmup
import torch.nn as nn
import math
from torchmetrics.functional.classification import auroc
import torch.nn.functional as F



In [2]:
dir_path = os.path.abspath('')

nlp_dir = os.path.join(dir_path, "NPL")

vpc_data = os.path.join(nlp_dir, f"VPC.csv")

vpc_df = pd.read_csv(vpc_data)
vpc_df

Unnamed: 0,messages,VPC
0,"First of all, I think you should talk to your ...",5
1,"Hello Marvelous45, \n\n\n\nI'm sorry to hear ...",5
2,You have 2 options:\n\n\n\n1. You can continue...,3
3,I believe that you should confront the owner a...,5
4,"Hi Marvelous45, I am a college student as well...",5
...,...,...
737,If you are genuinely interested and passionate...,6
738,"Hi Marvelous45, \n\n\n\nThis sounds like a tou...",5
739,Dear Marvelous45:\n\nI am sorry to hear about ...,6
740,"If I were you, I'd be flaming mad. Talk to you...",3


In [3]:
print(f"{len(vpc_df[vpc_df.messages.isna()])} NaN messages removed")
vpc_df = vpc_df[vpc_df.messages.notna()]

11 NaN messages removed


In [4]:
vpc_df['highlevel_VPC'] = (vpc_df.VPC - 1) / 3 + 1
vpc_df = vpc_df.astype({'highlevel_VPC': 'int32'}) # Consider using 3 labels: LPC, MPC, HPC
vpc_df['LPC'] = vpc_df.highlevel_VPC == 1
vpc_df['MPC'] = vpc_df.highlevel_VPC == 2
vpc_df['HPC'] = vpc_df.highlevel_VPC == 3
vpc_df = vpc_df.astype({'LPC': 'int32'})
vpc_df = vpc_df.astype({'MPC': 'int32'})
vpc_df = vpc_df.astype({'HPC': 'int32'})
vpc_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vpc_df['highlevel_VPC'] = (vpc_df.VPC - 1) / 3 + 1


Unnamed: 0,messages,VPC,highlevel_VPC,LPC,MPC,HPC
0,"First of all, I think you should talk to your ...",5,2,0,1,0
1,"Hello Marvelous45, \n\n\n\nI'm sorry to hear ...",5,2,0,1,0
2,You have 2 options:\n\n\n\n1. You can continue...,3,1,1,0,0
3,I believe that you should confront the owner a...,5,2,0,1,0
4,"Hi Marvelous45, I am a college student as well...",5,2,0,1,0
...,...,...,...,...,...,...
737,If you are genuinely interested and passionate...,6,2,0,1,0
738,"Hi Marvelous45, \n\n\n\nThis sounds like a tou...",5,2,0,1,0
739,Dear Marvelous45:\n\nI am sorry to hear about ...,6,2,0,1,0
740,"If I were you, I'd be flaming mad. Talk to you...",3,1,1,0,0


In [5]:
class VPC_Dataset(Dataset):

    def __init__(self, data, tokenizer, attributes, max_token_len: int = 128, sample = 5000):
        self.data = data
        self.tokenizer = tokenizer
        self.attributes = attributes
        self.max_token_len = max_token_len
        self.sample = sample
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        item = self.data.iloc[index]
        messages = str(item.messages)
        attributes = torch.FloatTensor(item[self.attributes])
        tokens = self.tokenizer.encode_plus(messages,
                                            add_special_tokens=True,
                                            return_tensors='pt',
                                            truncation=True,
                                            padding='max_length',
                                            max_length=self.max_token_len,
                                            return_attention_mask = True)
        return {'input_ids': tokens.input_ids.flatten(), 'attention_mask': tokens.attention_mask.flatten(), 'labels': attributes}


In [6]:
attributes = ['LPC','MPC','HPC',]
data = ['messages']

vpc_df[attributes].value_counts()

LPC  MPC  HPC
0    1    0      455
1    0    0      236
0    0    1       40
dtype: int64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(vpc_df[data], vpc_df[attributes], train_size=0.9, random_state=2)
vpc_train_df = X_train.join(y_train)
vpc_test_df = X_test.join(y_test)
print(vpc_train_df[attributes].value_counts())
print(vpc_test_df[attributes].value_counts())

LPC  MPC  HPC
0    1    0      406
1    0    0      220
0    0    1       31
dtype: int64
LPC  MPC  HPC
0    1    0      49
1    0    0      16
0    0    1       9
dtype: int64


In [8]:
print(len(vpc_train_df))
print(len(vpc_test_df))

657
74


In [9]:
print(f"Max length: {vpc_df.messages.str.len().max()}")
lengths = vpc_df["messages"].str.len()
argmax = np.where(lengths == lengths.max())[0]
vpc_df["messages"].iloc[argmax].to_numpy().ravel().tolist()

Max length: 2193


['I had a similar situation when I was looking into what I wanted to do with my degree.\n\n\n\nI started my university as a Psychology major and realized that I didn\'t actually want to do anything related to that field of study once I graduated.  I looked into the majors that were similar to that so I wouldn\'t lose out on all of the courses I\'d taken and then looked further into what I wanted to do when I graduated to decide.  After careful consideration I decided to choose a degree in Communication.\n\n\n\nWhat it sounds like in your situation is that you know what you want to do.  If Philosophy is your thing then I would say you should do it.  Regardless of the job opportunities available, if you put your mind to it, anything is possible.  Plus, Philosophy teaches you how to think in alternative ways that could be helpful in a multitude of work positions.  \n\n\n\nOne possible option you could take is to double major.  It wouldn\'t hurt you any extra to have both of the degrees un

In [10]:
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
vpc_train_ds = VPC_Dataset(vpc_train_df, tokenizer, attributes=attributes, max_token_len=512)

In [12]:
vpc_train_ds.__getitem__(0)['labels'].shape, vpc_train_ds.__getitem__(0)['input_ids'].shape, vpc_train_ds.__getitem__(0)['attention_mask'].shape

(torch.Size([3]), torch.Size([512]), torch.Size([512]))

In [13]:
vpc_train_ds.__getitem__(0)

{'input_ids': tensor([    0,  9502,   160,   939,    74,  3608,    47,     7, 39086,   110,
           499,   624,   209,    80,  1735,    47,    33,   156,    13,  2512,
             4,   114, 10561,    10, 10668,  9683,    47,    32,  2778,  2509,
            11,     8,    47, 33976,  1508,     5,  1690, 15106,  1527,     7,
           465,    10,   633,     6,  3046,   817,    47,  1372,     4,   959,
             6,   114,   418,     8,   110,   157,   145,    16,    55,   505,
             7,    47,    25,   157,    25, 11098,  2992,    47,   538,    88,
          3039,  2866,     4,    47,   189,   192,    24,    25,    22,  2678,
           117,    80,    55,   107,   113,    53,    24,    40,    28,   966,
            24,    11,     5,   251,   422,     4,    80,   107,    40,  3598,
           375,    47,     8,  1010,    47,    40,  5318,     8,   386,   110,
           301,     4, 33976,  6187,    88,   301,    98,  1335,     7,  1086,
          3508,     9,  9499,    10,   

In [14]:
class VPC_Data_Module(pl.LightningDataModule):

    def __init__(self, train_data, test_data, attributes, batch_size: int = 16, max_token_length: int = 128,  model_name='roberta-base'):
        super().__init__()
        self.train_data = train_data
        self.test_data = test_data
        self.attributes = attributes
        self.batch_size = batch_size
        self.max_token_length = max_token_length
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def setup(self, stage = None):
        if stage in (None, "fit"):
            self.train_dataset = VPC_Dataset(self.train_data, attributes=self.attributes, tokenizer=self.tokenizer, max_token_len=self.max_token_length)
            self.test_dataset = VPC_Dataset(self.test_data, attributes=self.attributes, tokenizer=self.tokenizer, sample=None, max_token_len=self.max_token_length)
        if stage == 'predict':
            self.test_dataset = VPC_Dataset(self.test_data, attributes=self.attributes, tokenizer=self.tokenizer, sample=None, max_token_len=self.max_token_length)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size = self.batch_size, num_workers=4, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.test_dataset, batch_size = self.batch_size, num_workers=4, shuffle=False)

    def predict_dataloader(self):
        return DataLoader(self.test_dataset, batch_size = self.batch_size, num_workers=4, shuffle=False)


In [15]:
vpc_data_module = VPC_Data_Module(vpc_train_df, vpc_test_df, attributes=attributes, max_token_length=512)

In [16]:
vpc_data_module.setup()

In [17]:
len(vpc_data_module.train_dataloader())

42

In [18]:
class VPC_Comment_Classifier(pl.LightningModule):

    def __init__(self, config: dict):
        super().__init__()
        self.config = config
        self.pretrained_model = AutoModel.from_pretrained(config['model_name'], return_dict = True)
        self.hidden = torch.nn.Linear(self.pretrained_model.config.hidden_size, self.pretrained_model.config.hidden_size)
        self.classifier = torch.nn.Linear(self.pretrained_model.config.hidden_size, self.config['n_labels'])
        torch.nn.init.xavier_uniform_(self.classifier.weight)
        self.loss_func = nn.BCEWithLogitsLoss(reduction='mean') # set up for multi label classification, consider different loss function
        self.dropout = nn.Dropout()
    
    def forward(self, input_ids, attention_mask, labels=None):
        # roberta layer
        output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = torch.mean(output.last_hidden_state, 1) # consider max pooling or simply taking the first hidden state
        # final logits
        pooled_output = self.dropout(pooled_output)
        pooled_output = self.hidden(pooled_output)
        pooled_output = F.relu(pooled_output)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output) # output of model
        # calculate loss
        loss = 0
        if labels is not None:
            loss = self.loss_func(logits.view(-1, self.config['n_labels']), labels.view(-1, self.config['n_labels']))
        return loss, logits

    def training_step(self, batch, batch_index):
        loss, outputs = self(**batch) # calls forward() due to lightning module
        self.log("train loss ", loss, prog_bar = True, logger=True)
        return {"loss":loss, "predictions":outputs, "labels": batch["labels"]}

    def validation_step(self, batch, batch_index):
        loss, outputs = self(**batch)
        self.log("validation loss ", loss, prog_bar = True, logger=True)
        return {"val_loss": loss, "predictions":outputs, "labels": batch["labels"]}

    def predict_step(self, batch, batch_index):
        loss, outputs = self(**batch)
        return outputs

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.config['lr'], weight_decay=self.config['weight_decay']) # use torch.optim.AdamW
        total_steps = self.config['train_size']/self.config['batch_size']
        warmup_steps = math.floor(total_steps * self.config['warmup'])
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
        return [optimizer],[scheduler]

  # def validation_epoch_end(self, outputs):
  #   losses = []
  #   for output in outputs:
  #     loss = output['val_loss'].detach().cpu()
  #     losses.append(loss)
  #   avg_loss = torch.mean(torch.stack(losses))
  #   self.log("avg_val_loss", avg_loss)
    

In [19]:
config = {
    'model_name': 'distilroberta-base', # smaller model
    'n_labels': len(attributes),
    'batch_size': len(vpc_data_module.train_dataloader()), # consider alternatives
    'lr': 1.5e-6,
    'warmup': 0.2, 
    'train_size': len(vpc_data_module.train_dataloader()),
    'weight_decay': 0.001,
    'n_epochs': 100
}

model = VPC_Comment_Classifier(config)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
idx=0
input_ids = vpc_train_ds.__getitem__(idx)['input_ids']

attention_mask = vpc_train_ds.__getitem__(idx)['attention_mask']
labels = vpc_train_ds.__getitem__(idx)['labels']
model.cpu()
loss, output = model(input_ids.unsqueeze(dim=0), attention_mask.unsqueeze(dim=0), labels.unsqueeze(dim=0))
print(labels.shape, output.shape, output)

torch.Size([3]) torch.Size([1, 3]) tensor([[-0.3104,  0.1536,  1.0572]], grad_fn=<AddmmBackward0>)


# Training

In [None]:
# datamodule
vpc_data_module = VPC_Data_Module(vpc_train_df, vpc_test_df, attributes=attributes, max_token_length=512, batch_size=config['batch_size'])
vpc_data_module.setup()

# model
model = VPC_Comment_Classifier(config)

# trainer and fit
trainer = pl.Trainer(max_epochs=config['n_epochs'], num_sanity_val_steps=10)
trainer.fit(model, vpc_data_module)


In [None]:
print("here")