In [1]:
pip install lightning --quiet

In [2]:
import os
import pandas as pd
import numpy as np

import lightning as L
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch.utilities.memory import garbage_collection_cuda

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset, random_split
from torch.optim import Adam

from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

import seaborn as sns
import matplotlib.pyplot as plt

from torchmetrics import AUROC

from tqdm.auto import tqdm
%matplotlib inline


## Load the Data

In [3]:
os.environ['KAGGLE_CONFIG_DIR'] = '.'

In [4]:
try:
  !kaggle competitions download -c jigsaw-toxic-comment-classification-challenge
  ! unzip jigsaw-toxic-comment-classification-challenge -d data
  print('Files downloaded and added to data folder')
except:
  print('Files present on drive')

Downloading jigsaw-toxic-comment-classification-challenge.zip to /content
 93% 49.0M/52.6M [00:00<00:00, 131MB/s]
100% 52.6M/52.6M [00:00<00:00, 121MB/s]
Archive:  jigsaw-toxic-comment-classification-challenge.zip
  inflating: data/sample_submission.csv.zip  
  inflating: data/test.csv.zip       
  inflating: data/test_labels.csv.zip  
  inflating: data/train.csv.zip      
Files downloaded and added to data folder


In [4]:
raw_df = pd.read_csv('data/train.csv.zip')
test_df = pd.read_csv('data/test.csv.zip')
sub_df = pd.read_csv('data/sample_submission.csv.zip')

## PreProcessing

In [5]:
# Hold out 5% of the labelled rows for validation. The fixed seed keeps the
# split identical across kernel restarts.
train_df, val_df = train_test_split(raw_df, test_size=0.05, random_state=42)

In [6]:
train_df.shape, val_df.shape

((151592, 8), (7979, 8))

In [7]:
LABEL_COLUMNS = raw_df.columns[2:]

In [8]:
train_df[LABEL_COLUMNS].sum()

toxic            14519
severe_toxic      1521
obscene           8021
threat             451
insult            7487
identity_hate     1327
dtype: int64

In [9]:
train_toxic = train_df[train_df[LABEL_COLUMNS].sum(axis=1)>0]

In [10]:
train_toxic[LABEL_COLUMNS].sum()

toxic            14519
severe_toxic      1521
obscene           8021
threat             451
insult            7487
identity_hate     1327
dtype: int64

In [11]:
train_clean = train_df[train_df[LABEL_COLUMNS].sum(axis=1)==0]

In [12]:
train_toxic.shape, train_clean.shape

((15400, 8), (136192, 8))

In [13]:
# Rebalance the training set: keep every toxic comment and down-sample the
# clean comments to 15,000 rows. The fixed seed makes the sample reproducible.
train_df = pd.concat(
    [train_toxic,
    train_clean.sample(15_000, random_state=42)]
)

In [14]:
train_df[LABEL_COLUMNS].sum()

toxic            14519
severe_toxic      1521
obscene           8021
threat             451
insult            7487
identity_hate     1327
dtype: int64

In [15]:
train_df.shape

(30400, 8)

In [16]:
train_df.iloc[46].comment_text

'[[==:Amortias is M(o)ther (fu)k(er)=='

In [17]:
train_df[LABEL_COLUMNS].iloc[46]

toxic            1
severe_toxic     0
obscene          1
threat           0
insult           1
identity_hate    0
Name: 156608, dtype: int64

## Tokenizer

In [18]:
BERT_MODEL_NAME = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
## Check the token-length distribution of the training set

# Iterate over the text column directly rather than `iterrows()`, which builds
# a full Series for every row. Lengths are capped at 512 by truncation.
token_count = [
  len(tokenizer.encode(text, max_length=512, truncation=True))
  for text in tqdm(train_df.comment_text, total=len(train_df))
]

  0%|          | 0/30400 [00:00<?, ?it/s]

In [None]:
# Explicit figure/axes so the plot carries its own title and axis labels.
fig, ax = plt.subplots()
sns.histplot(token_count, ax=ax)
ax.set(title="Tokens per comment (training set)", xlabel="Token count", ylabel="Number of comments")
plt.show()

In [None]:
type(tokenizer)

## Create Dataset

In [None]:
class create_dataset(Dataset):
  """Map-style dataset that tokenizes one comment per item.

  Args:
    df: Frame providing a `comment_text` column and every column named in
      `LABEL_COLUMNS`.
    tokenizer: Tokenizer used to encode the text; defaults to the module-level
      `tokenizer`.
    max_length: Each comment is padded or truncated to this many tokens.
  """

  def __init__(self, df:pd.DataFrame, tokenizer=tokenizer, max_length: int=512)-> None:
    super().__init__()
    self.df = df
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self) -> int:
    """Return the number of rows in the wrapped frame."""
    return len(self.df)

  def __getitem__(self,idx:int) -> dict:
    """Encode row `idx`.

    Returns:
      A dict with the raw `comment_text`, 1-D `input_ids` and `attention_mask`
      tensors of length `max_length`, and a float32 `labels` tensor with one
      entry per column in `LABEL_COLUMNS`.
    """
    # Fetch the row once; both the text and the labels are read from it.
    row = self.df.iloc[idx]
    comment_text = row.comment_text
    # A row taken from a mixed-type frame is an object-dtype Series, so the
    # labels are converted to float32 explicitly before building the tensor.
    labels = torch.tensor(row[LABEL_COLUMNS].to_numpy(dtype=np.float32))

    encoding = self.tokenizer.encode_plus(
        comment_text,
        add_special_tokens =True,
        max_length = self.max_length,
        return_token_type_ids = False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    return dict(
        comment_text = comment_text,
        # return_tensors='pt' yields a leading batch axis of size 1; drop it.
        input_ids = encoding['input_ids'].flatten(),
        attention_mask = encoding['attention_mask'].flatten(),
        labels = labels
    )




In [None]:
ds = create_dataset(train_df)

In [None]:
ds[0]['input_ids'].shape

In [None]:
len(ds)

## Model

In [None]:
bert_model = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)

In [None]:
bert_model.config

## PyTorch DataModule

In [None]:
class create_data_module(L.LightningDataModule):
  """Lightning data module that wraps the train/val/test frames in `create_dataset`.

  Args:
    train_df: Frame used for the training loader.
    val_df: Frame used for the validation loader.
    test_df: Frame used for the test loader.
    tokenizer: Tokenizer handed to every dataset; defaults to the module-level
      `tokenizer`.
    max_length: Token length every comment is padded/truncated to.
    batch_size: Training batch size; the validation loader uses twice this.
    num_workers: Worker processes for the training and validation loaders.

  NOTE(review): `create_dataset` reads the `LABEL_COLUMNS` columns from every
  row, so `test_df` presumably needs those columns for the test loader to
  work - confirm against the test CSV.
  """

  def __init__(self, train_df:pd.DataFrame,val_df:pd.DataFrame,test_df:pd.DataFrame, tokenizer=tokenizer, max_length: int=512, batch_size: int=6, num_workers: int=8)-> None:
    super().__init__()
    self.train_df = train_df
    self.val_df=val_df
    self.test_df=test_df
    self.tokenizer = tokenizer
    self.max_length = max_length
    self.batch_size = batch_size
    self.num_workers = num_workers

  def setup(self, stage=None):
    """Build the three datasets; `stage` is accepted but not used."""
    self.train_ds = create_dataset(self.train_df, self.tokenizer, self.max_length)
    self.val_ds = create_dataset(self.val_df, self.tokenizer, self.max_length)
    self.test_ds = create_dataset(self.test_df, self.tokenizer, self.max_length)

  def train_dataloader(self):
    """Return the shuffled training loader."""
    return DataLoader(
        self.train_ds,
        batch_size=self.batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=self.num_workers
    )

  def val_dataloader(self):
    """Return the validation loader (fixed order, double batch size)."""
    return DataLoader(
        self.val_ds,
        batch_size=self.batch_size*2,
        shuffle=False,
        pin_memory=True,
        num_workers=self.num_workers
    )

  def test_dataloader(self):
    """Return the test loader, one sample per batch."""
    return DataLoader(
        self.test_ds,
        batch_size=1
    )



In [None]:
data_module = create_data_module(train_df,val_df,test_df)

In [None]:
data_module.setup()

## Fine Tuning BERT Model

In [None]:
class FTModel(L.LightningModule):
  """BERT encoder with a linear head for multi-label comment classification.

  Args:
    n_classes: Number of independent binary labels predicted per comment.
    n_training_steps: Total optimizer steps for the linear LR schedule. When
      None, the trainer's `estimated_stepping_batches` is used.
    n_warmup_steps: Number of linear warm-up steps. When None, no warm-up is
      applied.
  """

  def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
    super().__init__()
    self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
    self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.n_training_steps = n_training_steps
    self.n_warmup_steps = n_warmup_steps
    self.criterion = nn.BCEWithLogitsLoss()
    # Per-batch predictions/labels collected during a training epoch.
    self.training_outputs = []

  def forward(self, input_ids, attention_mask, labels=None):
    """Return `(loss, probabilities)`; `loss` is 0 when `labels` is None."""
    output = self.bert(input_ids, attention_mask=attention_mask)
    logits = self.classifier(output.pooler_output)
    loss = 0
    if labels is not None:
      # BCEWithLogitsLoss applies the sigmoid itself, so it must receive the
      # raw logits rather than probabilities.
      loss = self.criterion(logits, labels)
    return loss, torch.sigmoid(logits)

  def _shared_step(self, batch, stage: str):
    """Run a forward pass on `batch` and log the loss as `<stage>_loss`."""
    loss, outputs = self(batch["input_ids"], batch["attention_mask"], batch["labels"])
    self.log(f"{stage}_loss", loss, prog_bar=True, logger=True)
    return loss, outputs

  def training_step(self, batch, batch_idx):
    """Compute the training loss and stash predictions for the epoch-end AUROC."""
    loss, outputs = self._shared_step(batch, "train")
    # Detach and move to CPU so the stored tensors neither keep the autograd
    # graph alive nor accumulate on the accelerator over the epoch.
    self.training_outputs.append({
        "predictions": outputs.detach().float().cpu(),
        "labels": batch["labels"].detach().cpu(),
    })
    return {"loss": loss}

  def validation_step(self, batch, batch_idx):
    """Compute and log the validation loss."""
    loss, _ = self._shared_step(batch, "val")
    return loss

  def test_step(self, batch, batch_idx):
    """Compute and log the test loss."""
    loss, _ = self._shared_step(batch, "test")
    return loss

  def on_train_epoch_end(self):
    """Log one training ROC-AUC scalar per label, then reset the buffer."""
    if not self.training_outputs:
      return

    # `cat` rather than `stack`: the last batch of an epoch may be smaller.
    labels = torch.cat([out["labels"] for out in self.training_outputs]).int()
    predictions = torch.cat([out["predictions"] for out in self.training_outputs])

    for i, name in enumerate(LABEL_COLUMNS):
      # A fresh metric per label so no state carries over between labels.
      class_roc_auc = AUROC(task="binary")(predictions[:, i], labels[:, i])
      self.logger.experiment.add_scalar(f"{name}_roc_auc/Train", class_roc_auc, self.current_epoch)

    self.training_outputs.clear()

  def configure_optimizers(self):
    """Return AdamW with a linear warm-up/decay schedule stepped every batch."""
    # torch's AdamW replaces the deprecated transformers implementation; eps
    # and weight_decay are set to that implementation's documented defaults.
    optimizer = torch.optim.AdamW(self.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

    # Fall back to values derived from the trainer when the caller gave none.
    n_training_steps = self.n_training_steps
    if n_training_steps is None:
      n_training_steps = self.trainer.estimated_stepping_batches
    n_warmup_steps = 0 if self.n_warmup_steps is None else self.n_warmup_steps

    scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=n_warmup_steps,
      num_training_steps=n_training_steps
    )

    return dict(
      optimizer=optimizer,
      lr_scheduler=dict(
        scheduler=scheduler,
        interval='step'
      )
    )



In [None]:
dl = DataLoader(ds, batch_size=4)

In [106]:
# Pull one mini-batch from the loader so the manual forward-pass check in the
# following cells runs on a fresh kernel.
batch = next(iter(dl))
input_ids, attention_mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']

In [107]:
bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
linear = nn.Linear(in_features= bert.config.hidden_size, out_features = len(LABEL_COLUMNS))

In [108]:
output = bert(input_ids, attention_mask)

In [109]:
out = linear(output.pooler_output)

In [110]:
criterion = nn.BCEWithLogitsLoss()

In [111]:
criterion(out, labels)

tensor(0.7827, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

## Train and Evaluate Model

In [None]:
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

In [None]:
# TensorBoard event files are written under lightning_logs/NLPTextClassification.
logger = TensorBoardLogger(save_dir='lightning_logs', name='NLPTextClassification')

# Keep only the single checkpoint with the lowest validation loss.
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    dirpath='checkpoints',
    filename='best-checkpoint',
    verbose=True,
)

# NOTE(review): the Trainer in this notebook is created with max_epochs=3, so a
# patience of 5 validation checks looks unreachable - confirm the intended value.
early_stopping = EarlyStopping(patience=5, monitor='val_loss')

In [None]:
trainer = L.Trainer(
    logger = logger,
    callbacks = [checkpoint_callback,early_stopping],
    max_epochs=3,
    devices=1,
    accelerator='gpu',
    enable_progress_bar=True,
    enable_model_summary=True,
    precision='16-mixed'
)

In [None]:
# The LR scheduler is stepped once per batch (interval='step'), so its horizon
# is counted in batches over all epochs rather than in training rows.
steps_per_epoch = -(-len(train_df) // data_module.batch_size)  # ceil: the final partial batch also steps
total_training_steps = steps_per_epoch * trainer.max_epochs
model = FTModel(n_classes=len(LABEL_COLUMNS), n_warmup_steps=1, n_training_steps=total_training_steps)

In [None]:
trainer.fit(model, data_module)

## Prediction

In [69]:
model.eval()
model.freeze()

In [86]:
comment_text = test_df.iloc[0].comment_text

encoding = tokenizer.encode_plus(
    comment_text,
    add_special_tokens =True,
    max_length = 512,
    return_token_type_ids = False,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)


In [95]:
# Send the inputs to whichever device holds the model weights and skip autograd
# bookkeeping. forward returns (loss, probabilities), so take the last element.
with torch.no_grad():
  preds = model(
      encoding['input_ids'].to(model.device),
      encoding['attention_mask'].to(model.device)
  )[-1].flatten().cpu()

In [None]:
# Map each label name to its predicted probability (plain floats for display).
dict(zip(LABEL_COLUMNS, preds.tolist()))