# Toxic comment classification

This notebook is a replicate of this [kaggle notebook](https://www.kaggle.com/code/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert) on the NLP classification task.


It convers the implementation of several classical model architectures.

In [1]:
# download the jigsaw toxic comment classification dataset from kaggle

!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge -p dataset/toxic

# unzip the dataset
!unzip dataset/toxic/jigsaw-toxic-comment-classification-challenge.zip -d dataset/toxic

# remove the zip file
!rm dataset/toxic/jigsaw-toxic-comment-classification-challenge.zip



Downloading jigsaw-toxic-comment-classification-challenge.zip to dataset/toxic
 93%|███████████████████████████████████▎  | 49.0M/52.6M [00:01<00:00, 47.5MB/s]
100%|██████████████████████████████████████| 52.6M/52.6M [00:01<00:00, 39.1MB/s]
Archive:  dataset/toxic/jigsaw-toxic-comment-classification-challenge.zip
  inflating: dataset/toxic/sample_submission.csv.zip  
  inflating: dataset/toxic/test.csv.zip  
  inflating: dataset/toxic/test_labels.csv.zip  
  inflating: dataset/toxic/train.csv.zip  


In [3]:
# download the glove word embeddings

!wget http://nlp.stanford.edu/data/glove.6B.zip -P dataset/glove


--2023-05-29 19:10:32--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-05-29 19:10:32--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-05-29 19:10:32--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘dataset/glove/glove

In [1]:
# read data
import pandas as pd

df = pd.read_csv('dataset/toxic/train.csv.zip')

df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [2]:
# to make the training faster, we will only include 12,000 samples

df = df.iloc[:12000]

In [3]:
# max sequence length -- useful for padding

max_seq_len = df['comment_text'].apply(lambda x: len(x.split())).max()
print('max sequence length', max_seq_len)

max sequence length 1403


# Simple RNN model using GRU


This doesn't utilize the pretrained embedding vectors

In [4]:
# we will use simple split by space tokenizer

# dictionary of words and their counts

word_to_index = {}
index_to_word = {}

# definition of special tokens
oov = '<OOV>'
pad = '<PAD>'

# add special tokens to the dictionary
word_to_index[oov] = 0
word_to_index[pad] = 1
index_to_word[0] = oov
index_to_word[1] = pad

# preprocess the text
# remove punctuation and convert to lower case
# replace all non-alphanumeric characters with space
# split by space

df['comment_cleaned'] = df['comment_text'].str.replace(r'[^a-zA-Z0-9]+', ' ', regex=True).str.lower().apply(lambda x: x.split())

# build the dictionary
for index, row in df.iterrows():
    for word in row['comment_cleaned']:
        if word not in word_to_index:
            word_to_index[word] = len(word_to_index)
            index_to_word[len(index_to_word)] = word

print('vocabulary size', len(word_to_index))

# convert text to sequence of indices
df['comment_index'] = df['comment_cleaned'].apply(lambda x: [word_to_index[word] for word in x])

df.head()




vocabulary size 39992


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_cleaned,comment_index
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,"[explanation, why, the, edits, made, under, my...","[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,"[d, aww, he, matches, this, background, colour...","[48, 49, 50, 51, 52, 53, 54, 25, 41, 55, 56, 5..."
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"[hey, man, i, m, really, not, trying, to, edit...","[65, 66, 25, 41, 67, 68, 69, 70, 71, 72, 73, 7..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"[more, i, can, t, make, any, real, suggestions...","[89, 25, 95, 17, 96, 97, 98, 99, 21, 100, 25, ..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"[you, sir, are, my, hero, any, chance, you, re...","[134, 162, 119, 8, 163, 97, 164, 134, 165, 166..."


In [5]:
# train test split
from sklearn.model_selection import train_test_split

test_ratio = 0.2

df_train, df_test = train_test_split(df, test_size=test_ratio, random_state=42, shuffle=True)

print('train size', len(df_train))
print('test size', len(df_test))



train size 9600
test size 2400


In [6]:
# dataset utils 

from torch.utils.data import Dataset, DataLoader
import torch

class ToxicDataset(Dataset):

    def __init__(self, df_train, df_test, max_seq_len, comment_label='comment_cleaned', *label_cols):
        self.df_train = df_train
        self.df_test = df_test
        self.max_seq_len = max_seq_len
        self.mode = 'train'

        self.comment_label = comment_label
        self.label_cols = list(label_cols)

    def set_mode(self, mode):
        if mode not in ['train', 'test']:
            raise ValueError('mode must be either train or test')
        self.mode = mode

    def __len__(self):
        if self.mode == 'train':
            return len(self.df_train)
        else:
            return len(self.df_test)
        
    def __getitem__(self, idx):

        if self.mode == 'train':
            row = self.df_train.iloc[idx]
        else:
            row = self.df_test.iloc[idx]

        comment = row[self.comment_label]
        comment = comment[:self.max_seq_len]

        # pad the sequence
        comment = comment + [word_to_index[pad]] * (self.max_seq_len - len(comment))

        # convert to tensor
        comment = torch.tensor(comment)

        # get the labels
        labels = row[self.label_cols].astype(int).values
        labels = torch.tensor(labels)

        return comment, labels

In [7]:
# test if the dataset is working

dataset = ToxicDataset(
    df_train, df_test,
    max_seq_len,
    'comment_index',
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
)


loader = DataLoader(dataset, batch_size=4, shuffle=True)

batch = next(iter(loader))

print(batch)

[tensor([[6588, 3887,  159,  ...,    1,    1,    1],
        [ 608,   52,  159,  ...,    1,    1,    1],
        [ 134,  281,    8,  ...,    1,    1,    1],
        [  86,  245,    4,  ...,    1,    1,    1]]), tensor([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 1, 0],
        [0, 0, 0, 0, 0, 0]])]


In [8]:
# model architecture

from torch import nn
import torch.nn.functional as F

class SimpleRNN(nn.Module):
    """
    Simple RNN model
    Architecture:

    Embedding -> RNN -> Linear
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # dropout layer
        self.dropout = nn.Dropout(0.2)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # text shape: [batch_size, seq_len]
        embedded = self.dropout(self.embedding(text))
        # embedded shape: [batch_size, seq_len, embedding_dim]
        output, hidden = self.rnn(embedded)

        # output shape: [batch_size, seq_len, hidden_dim]
        # hidden shape: [1, batch_size, hidden_dim]

        # we only need the last hidden state
        return self.fc(hidden.squeeze(0))
    



In [9]:
# test if the model is working

model = SimpleRNN(
    vocab_size=len(word_to_index),
    embedding_dim=100,
    hidden_dim=128,
    output_dim=6,
    pad_idx=word_to_index[pad]
)

model(batch[0])


tensor([[ 0.0696,  0.0060, -0.0226, -0.0640, -0.0064,  0.0030],
        [ 0.0696,  0.0060, -0.0226, -0.0640, -0.0064,  0.0030],
        [ 0.0696,  0.0060, -0.0226, -0.0640, -0.0064,  0.0030],
        [ 0.0696,  0.0060, -0.0226, -0.0640, -0.0064,  0.0030]],
       grad_fn=<AddmmBackward0>)

In [10]:
# loss function
def loss_fn(outputs, targets):
    return F.binary_cross_entropy_with_logits(outputs, targets)


# training loop
def train_model(model, optimizer, loss_fn, data_loader, **kwargs):
    """
    Function for training the model
    """

    # set model to training mode
    model.train()

    # initialize every epoch
    running_loss = 0.0

    device = kwargs.get('device', torch.device('cpu'))

    # iterate over data batches
    for i, (inputs, labels) in enumerate(data_loader):
        # send the inputs to device
        inputs = inputs.to(device)
        labels = labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward pass
        outputs = model(inputs)

        # calculate the loss
        loss = loss_fn(outputs, labels.float())

        # backward pass
        loss.backward()

        # update model weights
        optimizer.step()

        # compute loss and accuracy
        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(data_loader.dataset)

    return model, optimizer, epoch_loss


def test_model(model, loss_fn, data_loader, **kwargs):
    """
    Function for testing the model
    """

    # set model to evaluation mode
    model.eval()

    # initialize every epoch
    running_loss = 0.0

    device = kwargs.get('device', torch.device('cpu'))

    # iterate over data batches
    for i, (inputs, labels) in enumerate(data_loader):
        # send the inputs to device
        inputs = inputs.to(device)
        labels = labels.to(device)

        # forward pass
        outputs = model(inputs)

        # calculate the loss
        loss = loss_fn(outputs, labels.float())

        # compute loss and accuracy
        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(data_loader.dataset)

    return model, epoch_loss

In [11]:
# see the training loop works

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = SimpleRNN(
    vocab_size=len(word_to_index),
    embedding_dim=100,
    hidden_dim=128,
    output_dim=6,
    pad_idx=word_to_index[pad]
)

model = model.to(device)

optimizer = torch.optim.Adam(model.parameters())

train_loader = DataLoader(dataset, batch_size=4, shuffle=True)

for epoch in range(2):
    model, optimizer, train_loss = train_model(
        model, optimizer, loss_fn, train_loader, device=device
    )
    print(f'Epoch: {epoch + 1}, Train Loss: {train_loss:.4f}')

KeyboardInterrupt: 

In [11]:
# parameters 

num_epochs = 10
batch_size = 128
learning_rate = 1e-3
log_interval = 100

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# instantiate the model

model = SimpleRNN(
    vocab_size=len(word_to_index),
    embedding_dim=100,
    hidden_dim=128,
    output_dim=6,
    pad_idx=word_to_index[pad]
)

# send the model to device
model.to(device)

# define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# create the dataset and dataloader
dataset = ToxicDataset(
    df_train, df_test,
    max_seq_len,
    'comment_index',
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
)

train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# start training

for epoch in range(num_epochs):

    model, optimizer, train_loss = train_model(
        model, optimizer, loss_fn, train_loader, device=device
    )

    print('Epoch: {}'.format(epoch + 1))
    print('\tTrain Loss: {:.4f}'.format(train_loss))




: 

: 