# Toxic comment classification

This notebook is a replicate of this [kaggle notebook](https://www.kaggle.com/code/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert) on the NLP classification task.


It convers the implementation of several classical model architectures.

In [1]:
# download the jigsaw toxic comment classification dataset from kaggle

!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge -p dataset/toxic

# unzip the dataset
!unzip dataset/toxic/jigsaw-toxic-comment-classification-challenge.zip -d dataset/toxic

# remove the zip file
!rm dataset/toxic/jigsaw-toxic-comment-classification-challenge.zip



Downloading jigsaw-toxic-comment-classification-challenge.zip to dataset/toxic
 97%|████████████████████████████████████▊ | 51.0M/52.6M [00:03<00:00, 17.5MB/s]
100%|██████████████████████████████████████| 52.6M/52.6M [00:03<00:00, 17.4MB/s]
Archive:  dataset/toxic/jigsaw-toxic-comment-classification-challenge.zip
  inflating: dataset/toxic/sample_submission.csv.zip  
  inflating: dataset/toxic/test.csv.zip  
  inflating: dataset/toxic/test_labels.csv.zip  
  inflating: dataset/toxic/train.csv.zip  


In [2]:
# download the glove word embeddings

!wget http://nlp.stanford.edu/data/glove.6B.zip -P dataset/glove

# unzip the glove word embeddings
!unzip dataset/glove/glove.6B.zip -d dataset/glove

--2023-05-29 17:22:15--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-05-29 17:22:15--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-05-29 17:22:15--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘dataset/glove/glove

In [24]:
# read data
import pandas as pd

df = pd.read_csv('dataset/toxic/train.csv.zip')

df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [25]:
# to make the training faster, we will only include 12,000 samples

df = df.iloc[:12000]

In [26]:
# max sequence length -- useful for padding

max_seq_len = df['comment_text'].apply(lambda x: len(x.split())).max()
print('max sequence length', max_seq_len)

max sequence length 1403


# Simple RNN model using GRU


This doesn't utilize the pretrained embedding vectors

In [27]:
# we will use simple split by space tokenizer

# dictionary of words and their counts

word_to_index = {}
index_to_word = {}

# definition of special tokens
oov = '<OOV>'
pad = '<PAD>'

# add special tokens to the dictionary
word_to_index[oov] = 0
word_to_index[pad] = 1
index_to_word[0] = oov
index_to_word[1] = pad

# preprocess the text
# remove punctuation and convert to lower case
# replace all non-alphanumeric characters with space
# split by space

df['comment_cleaned'] = df['comment_text'].str.replace(r'[^a-zA-Z0-9]+', ' ', regex=True).str.lower().apply(lambda x: x.split())

# build the dictionary
for index, row in df.iterrows():
    for word in row['comment_cleaned']:
        if word not in word_to_index:
            word_to_index[word] = len(word_to_index)
            index_to_word[len(index_to_word)] = word

print('vocabulary size', len(word_to_index))

# convert text to sequence of indices
df['comment_index'] = df['comment_cleaned'].apply(lambda x: [word_to_index[word] for word in x])

df.head()




vocabulary size 39992


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_cleaned,comment_index
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,"[explanation, why, the, edits, made, under, my...","[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,"[d, aww, he, matches, this, background, colour...","[48, 49, 50, 51, 52, 53, 54, 25, 41, 55, 56, 5..."
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"[hey, man, i, m, really, not, trying, to, edit...","[65, 66, 25, 41, 67, 68, 69, 70, 71, 72, 73, 7..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"[more, i, can, t, make, any, real, suggestions...","[89, 25, 95, 17, 96, 97, 98, 99, 21, 100, 25, ..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"[you, sir, are, my, hero, any, chance, you, re...","[134, 162, 119, 8, 163, 97, 164, 134, 165, 166..."


In [28]:
# train test split
from sklearn.model_selection import train_test_split

test_ratio = 0.2

df_train, df_test = train_test_split(df, test_size=test_ratio, random_state=42, shuffle=True)

print('train size', len(df_train))
print('test size', len(df_test))



train size 9600
test size 2400


In [49]:
# dataset utils 

from torch.utils.data import Dataset, DataLoader
import torch

class ToxicDataset(Dataset):

    def __init__(self, df_train, df_test, max_seq_len, comment_label='comment_index', *label_cols):
        self.df_train = df_train
        self.df_test = df_test
        self.max_seq_len = max_seq_len
        self.mode = 'train'

        self.comment_label = comment_label
        self.label_cols = list(label_cols)

    def set_mode(self, mode):
        if mode not in ['train', 'test']:
            raise ValueError('mode must be either train or test')
        self.mode = mode

    def __len__(self):
        if self.mode == 'train':
            return len(self.df_train)
        else:
            return len(self.df_test)
        
    def __getitem__(self, idx):

        if self.mode == 'train':
            row = self.df_train.iloc[idx]
        else:
            row = self.df_test.iloc[idx]

        comment = row[self.comment_label]
        comment = comment[:self.max_seq_len]

        # pad the sequence
        comment = comment + [word_to_index[pad]] * (self.max_seq_len - len(comment))

        # convert to tensor
        comment = torch.tensor(comment)

        # get the labels
        labels = row[self.label_cols].astype(int).values
        labels = torch.tensor(labels, dtype=torch.float)

        return comment, labels

In [50]:
# test if the dataset is working

dataset = ToxicDataset(
    df_train, df_test,
    max_seq_len,
    'comment_index',
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
)


loader = DataLoader(dataset, batch_size=4, shuffle=True)

batch = next(iter(loader))

print(batch)

[tensor([[  714,   121,    52,  ...,     1,     1,     1],
        [ 1656, 18713,  1902,  ...,     1,     1,     1],
        [  555,  1661,   337,  ...,     1,     1,     1],
        [   25,   712,  1667,  ...,     1,     1,     1]]), tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])]


In [46]:
# model architecture

from torch import nn
import torch.nn.functional as F

class SimpleRNN(nn.Module):
    """
    Simple RNN model
    Architecture:

    Embedding -> RNN -> Linear
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # dropout layer
        self.dropout = nn.Dropout(0.2)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # text shape: [batch_size, seq_len]
        embedded = self.dropout(self.embedding(text))
        # embedded shape: [batch_size, seq_len, embedding_dim]
        output, hidden = self.rnn(embedded)

        # output shape: [batch_size, seq_len, hidden_dim]
        # hidden shape: [1, batch_size, hidden_dim]

        # we only need the last hidden state
        return self.fc(hidden.squeeze(0))
    



In [47]:
# test if the model is working

model = SimpleRNN(
    vocab_size=len(word_to_index),
    embedding_dim=100,
    hidden_dim=128,
    output_dim=6,
    pad_idx=word_to_index[pad]
)

model(batch[0])


tensor([[-0.0352, -0.0996, -0.0632,  0.0620, -0.0314, -0.1151],
        [-0.0352, -0.0996, -0.0632,  0.0620, -0.0314, -0.1151],
        [-0.0352, -0.0996, -0.0632,  0.0620, -0.0314, -0.1151],
        [-0.0352, -0.0996, -0.0632,  0.0620, -0.0314, -0.1151]],
       grad_fn=<AddmmBackward0>)

In [59]:
# loss function
def loss_fn(outputs, targets):
    return F.binary_cross_entropy_with_logits(outputs, targets)


# training loop
def train_epoch(model, optimizer, data_loader, loss_fn, **kwargs):
    # train the model for one epoch

    # set model to training mode
    model.train()

    device = kwargs.get('device', 'cpu')

    # iterate over batches
    for i, batch in enumerate(data_loader):
        # step
        optimizer.zero_grad()

        # get the data
        inputs, targets = batch
        # move data to device
        inputs = inputs.to(device)
        targets = targets.to(device)

        # forward pass
        outputs = model(inputs)

        # calculate loss
        loss = loss_fn(outputs, targets)

        # backward pass
        loss.backward()

        # update parameters
        optimizer.step()

        # log every 100 steps
        if i % 10 == 0:
            print(f'step {i}, loss {loss.item()}')



# evaluation loop
def eval_model(model, data_loader, loss_fn, **kwargs):
    model.eval()
    running_loss = 0.0
    num_correct = 0

    device = kwargs.get('device', 'cpu')

    # we don't need to calculate gradients
    with torch.no_grad():
        # iterate over batches
        for i, batch in enumerate(data_loader):
            # get the data
            inputs, targets = batch
            inputs = inputs.to(device)
            targets = targets.to(device)

            # forward pass
            outputs = model(inputs)

            # calculate loss
            loss = loss_fn(outputs, targets)

            # update running loss
            running_loss += loss.item()

            # get the predictions
            preds = torch.sigmoid(outputs).round()

            # update num_correct

            num_correct += (preds == targets).sum().item()

    # calculate accuracy and loss
    accuracy = num_correct / len(data_loader.dataset)
    avg_loss = running_loss / len(data_loader)

    print(f'accuracy {accuracy}, avg loss {avg_loss}')

    return accuracy, avg_loss





In [60]:
# parameters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

vocab_size = len(word_to_index)
embedding_dim = 100
hidden_dim = 128
output_dim = 6

pad_idx = word_to_index[pad]

num_epochs = 20
learning_rate = 4e-4

batch_size = 128

# create the dataaset and dataloader
dataset = ToxicDataset(
    df_train, df_test,
    max_seq_len,
    'comment_index',
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
)

dataset.set_mode('train')
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# create the model

model = SimpleRNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    pad_idx=pad_idx
)

model = model.to(device)

# create the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# train the model
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    train_epoch(model, optimizer, train_loader, loss_fn, device=device)
    print()

    eval_model(model, train_loader, loss_fn, device=device)



Epoch 1/20
step 0, loss 0.7130800485610962
step 10, loss 0.6808332800865173
step 20, loss 0.6402722597122192
step 30, loss 0.5560530424118042
step 40, loss 0.24920324981212616
step 50, loss 0.15349721908569336
step 60, loss 0.20040611922740936
step 70, loss 0.15604200959205627

accuracy 5.783229166666667, avg loss 0.14350944002469382
Epoch 2/20
step 0, loss 0.16073022782802582
step 10, loss 0.13776260614395142
step 20, loss 0.13208015263080597
step 30, loss 0.11392798274755478
step 40, loss 0.14820009469985962
step 50, loss 0.18179702758789062
step 60, loss 0.16182804107666016
step 70, loss 0.1341179460287094

accuracy 5.783645833333333, avg loss 0.13981592178344726
Epoch 3/20
step 0, loss 0.09953878819942474
step 10, loss 0.1044817790389061
step 20, loss 0.10659607499837875
step 30, loss 0.10189026594161987
step 40, loss 0.1518658995628357
step 50, loss 0.17675289511680603
step 60, loss 0.14469346404075623
step 70, loss 0.08127924054861069

accuracy 5.783645833333333, avg loss 0.13957

KeyboardInterrupt: 

In [61]:
# make the predictions
import numpy as np

dataset.set_mode('test')

test_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

with torch.no_grad():
    preds = []
    for i, batch in enumerate(test_loader):
        inputs, _ = batch
        inputs = inputs.to(device)
        outputs = torch.sigmoid(model(inputs))
        preds.append(outputs.cpu().numpy())

preds = np.concatenate(preds)


In [62]:
# evaluate the model

from sklearn.metrics import roc_auc_score, confusion_matrix

labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# calculate the roc auc score
roc_auc_score(df_test[labels].values, preds)

0.505561819963189

In [66]:
df_train[labels].sum() / len(df_train)


toxic            0.095521
severe_toxic     0.009687
obscene          0.050625
threat           0.003333
insult           0.048542
identity_hate    0.009062
dtype: float64