# Toxic Comment Classification Challenge
Identify and classify toxic online comments  
[jifsaw competition page](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data)

## preparation

In [19]:
# setting up kaggle authentication
!chmod 600 ~/.kaggle/kaggle.json

# download kaggle dataset
# !kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

Downloading jigsaw-toxic-comment-classification-challenge.zip to /work/Kaggle/PJ/jigsaw
 99%|█████████████████████████████████████▌| 52.0M/52.6M [00:03<00:00, 7.68MB/s]
100%|██████████████████████████████████████| 52.6M/52.6M [00:04<00:00, 13.7MB/s]


In [20]:
# importing modules required for file preparation
from glob import glob
import os
from zipfile import ZipFile

In [21]:
# # unzip all zip files
# files = glob("*.zip")
# for file in files:
#     with ZipFile(file) as zip:
#         zip.extractall()

In [14]:
# # delete all zip files
# files = glob("*.zip")        
# for file in files:
#     os.remove(file)

## model building

In [23]:
import pandas as pd
import numpy as np
import transformers
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import Dataset, DataLoader
import multiprocessing as mp
import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers
from torchmetrics import AUROC
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [24]:
data = pd.read_csv("train.csv.zip")

In [25]:
class JigsawDataset(Dataset):
    def __init__(self, data, model_name, max_length, cache_tensors=False):
        self.cache_tensors = cache_tensors
        if self.cache_tensors:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.tokenized = tokenizer(
                list(data["comment_text"].values),
                padding="max_length",
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            )
            self.keys = list(self.tokenized.keys())
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.max_length = max_length
            self.comments = list(data.comment_text.values)
        self.labels = torch.as_tensor(data.values[:, 2:].astype(np.int_))


    def __len__(self):
        return self.labels.shape[0]

    def __getitem__(self, idx):
        if self.cache_tensors:
            inputs = {key: self.tokenized[key][idx] for key in self.keys}
        else:
            comment = self.comments[idx]
            inputs = self.tokenizer(
                comment,
                add_special_tokens=True,
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt"
            )
            # get rid of the batch dimension
            inputs = {k: v[0] for k, v in inputs.items()}
        inputs["labels"] = self.labels[idx, :]
        return inputs

In [26]:
class JigsawModel(pl.LightningModule):
    def __init__(self, model_name, freeze_pretrained=True):
        super().__init__()
        self.hf_model = AutoModel.from_pretrained(model_name)
        if freeze_pretrained:
            for parameter in self.hf_model.parameters():
                parameter.requires_grad = False
        # try to autodetect output size based on last layer
        out_size = list(self.hf_model.named_parameters())[-1][1].shape[0]
        print(f"Detected output size is {out_size}")
        self.head = torch.nn.Sequential(
            torch.nn.Linear(out_size, 6),
        )
        self.loss_fn = torch.nn.BCEWithLogitsLoss()
        self.train_rocauc = AUROC(num_classes=6)
        self.val_rocauc = AUROC(num_classes=6)
    
    def forward(self, inputs):
        out = self.hf_model(**inputs)
        out = self.head(out["pooler_output"])
        return out

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-5)
        return optimizer

    def training_step(self, inputs):
        x = {key: tensor for key, tensor in inputs.items() if key!="labels"}
        y = inputs["labels"]
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat, y.half())
        self.log("train_loss", loss, on_step=True, on_epoch=True)
        self.train_rocauc(y_hat, y)
        self.log('train_rocauc', self.train_rocauc, on_step=True, on_epoch=True)
        return loss
    
    def validation_step(self, inputs, *args):
        x = {key: tensor for key, tensor in inputs.items() if key!="labels"}
        y = inputs["labels"]
        y_hat = self.forward(x)
        self.val_rocauc(y_hat, y)
        self.log('val_rocauc', self.val_rocauc, on_epoch=True)

In [27]:
model_name = "bert-base-cased"

train_data, val_data = train_test_split(data, test_size=0.2)

train_ds = JigsawDataset(train_data, model_name, max_length=256, cache_tensors=True)
val_ds = JigsawDataset(val_data, model_name, max_length=256, cache_tensors=True)

train_dl = DataLoader(
    train_ds,
    batch_size=16,
    shuffle=True,
    num_workers=0,
    pin_memory=True,
    #persistent_workers=True
)
val_dl = DataLoader(
    val_ds,
    batch_size=16,
    shuffle=False,
    num_workers=0,
    pin_memory=True,
    #persistent_workers=True
)

model = JigsawModel(model_name, freeze_pretrained=True)
tb_logger = pl_loggers.TensorBoardLogger(
    "lightning_logs/",
    name="jigsaw",
    version="pooler_freeze"
)
trainer = pl.Trainer(
    max_epochs=500,
    gpus=1,
    precision=16,
    logger=tb_logger,
    enable_checkpointing=False
)
trainer.fit(model, train_dl, val_dl)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Detected output size is 768


MisconfigurationException: You requested GPUs: [0]
 But your machine only has: []