In [1]:
import os
import cv2
import time
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import re
import torch
import spacy
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from sklearn.metrics import roc_auc_score,accuracy_score

In [2]:
train_df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
train_df.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
# Label values with -1 mean that this row was not used for scoring in the competition

In [4]:
test = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test.csv")
test_labels = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv")
test_df = pd.merge(test, test_labels, left_on='id', right_on='id', how='left')
drop_idxs = test_df[
    (test_df.toxic == -1) | (test_df.severe_toxic == -1) | (test_df.obscene == -1) | 
    (test_df.threat == -1) | (test_df.insult == -1) | (test_df.identity_hate == -1)
].index
test_df = test_df.drop(drop_idxs, axis="rows")

test_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0
7,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0
11,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0
13,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0
14,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0


In [5]:
test_df[test_df.toxic == 1]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
21,00091c35fa9d0465,"== Arabs are committing genocide in Iraq, but ...",1,0,0,0,0,0
48,0013fed3aeae76b7,DJ Robinson is gay as hell! he sucks his dick ...,1,0,1,0,1,1
59,0017d4d47894af05,":Fuck off, you anti-semitic cunt. |",1,0,1,0,1,0
76,001d739c97bc2ae4,How dare you vandalize that page about the HMS...,1,0,0,0,0,0
81,001eff4007dbb65b,"::No, he is an arrogant, self serving, immatur...",1,0,1,0,1,0
...,...,...,...,...,...,...,...,...
152879,ff84f0367ea58abb,am sorry for being a dickhead! I cannae help i...,1,0,1,0,1,0
152908,ff91c3d8a3e34398,NIGEL IS A CRAZY IDIOT!!!,1,0,0,0,1,0
153078,ffdf6854b41d9102,==Fourth Baldrick possibly being cleverer than...,1,0,0,0,0,0
153119,ffebe90c8d5acaba,""" \n\n == IRAN == \n That’s right, Iran. It wa...",1,0,1,0,0,0


# Data Cleaning Pipeline

In [7]:
def preprocess_text(text, lower_case=True, clean_text=True):
    
    if lower_case:
        text = text.lower()
    
    # Remove website links
    template = re.compile(r'https?://\S+|www\.\S+') 
    text = template.sub(r'', text)
    
    # Remove HTML tags
    template = re.compile(r'<[^>]*>') 
    text = template.sub(r'', text)
    
    # Remove none ascii characters
    template = re.compile(r'[^\x00-\x7E]+') 
    text = template.sub(r'', text)
    
    # Replace none printable characters
    template = re.compile(r'[\x00-\x0F]+') 
    text = template.sub(r' ', text)
    
    if clean_text:
        # Remove special characters
        text = re.sub("'s", '', text)
        template = re.compile('["#$%&\'()\*\+-/:;<=>@\[\]\\\\^_`{|}~]') 
        text = template.sub(r' ', text)
        # Replace multiple punctuation 
        text = re.sub('[.!?]{2,}', '.', text)
        text = re.sub(',+', ',', text) 
        # Remove numbers
        text = re.sub('\d+', ' ', text) 
        
    # Remove extra spaces
    text = re.sub('\s+', ' ', text)
    
    # Remove spaces at the beginning and at the end of string
    text = text.strip() 

    return text

Cleaning the comments

In [8]:
train_df["comment_text"] = train_df["comment_text"].map(lambda com : preprocess_text(com))
test_df["comment_text"] = test_df["comment_text"].map(lambda com : preprocess_text(com))

**Distribution of Labels**

In [9]:
train_df[['toxic','severe_toxic','obscene', 'threat', 'insult', 'identity_hate']].sum(axis =0 )

toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64

In [10]:
train_df.shape

(159571, 8)

# Getting word dictionary/Max Sentence Length

In [11]:
def get_dict(df_clean):
    
    reviews = [str(review).split(' ') for review in list(df_clean['comment_text'] )]
    word_freq = Counter([token for review in reviews for token in review]).most_common()

    word_freq = dict(word_freq)
    min_freq = 5
    word_dict = {}


    # sending all the unknowns to 0
    i = 1
    for word in word_freq:
        if word_freq[word] > min_freq:
            word_dict[word] = i
            i += 1
        else:
            word_dict[word] = 0


    max_length = 0
    for idx in tqdm(range(len(df_clean))):
        row = df_clean.iloc[idx]
        length = len(str(row['comment_text']).split(' '))
        if length > max_length:
            max_length = length
            
    return max_length+1, word_dict

In [12]:
max_length, word_dict = get_dict(train_df)

  0%|          | 0/159571 [00:00<?, ?it/s]

In [13]:
LABEL_COLUMNS = train_df.columns.tolist()[2:]
LABEL_COLUMNS

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Creating Dataset

In [14]:
class ToxicComment_Dataset(Dataset):
    def __init__(self, df,word_dict, max_length):
        self.df = df
        self.word_dict = word_dict
        self.max_len = max_length
        self.UNK = max(word_dict.values())
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        labels = row[LABEL_COLUMNS]
        review = str(row['comment_text']).split(' ')
        x = torch.zeros(self.max_len)
        for idx in range(len(review)):
            x[self.max_len - len(review) + idx] = self.word_dict.get(review[idx], self.UNK)
        
        label = labels=torch.FloatTensor(labels)
        #torch.tensor(row_label['is_sarcastic']).float()
        return x.long(), label
    
    

# Loading from Gensim Word Embedding

In [15]:
import gensim.downloader
glove_emb = gensim.downloader.load('glove-wiki-gigaword-100')



In [16]:
weights = glove_emb.get_normed_vectors()
unk = weights.mean(axis = 0)
all_weights = np.vstack((weights, unk))

dict_length, word_dict = all_weights.shape[0], glove_emb.key_to_index

# Creating Dataloader

In [17]:
def create_dataloader(X_train,X_test,word_dict, batch_size, max_length):
    ds_train = ToxicComment_Dataset(X_train,word_dict, max_length)
    dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True)


    ds_test = ToxicComment_Dataset(X_test,word_dict, max_length)
    dl_test = DataLoader(ds_test, batch_size=batch_size, shuffle=False)
    return dl_train, dl_test, ds_train, ds_test
    

In [18]:
batch_size = 1000
dl_train, dl_test, ds_train, ds_test = create_dataloader(train_df,test_df,word_dict, batch_size, max_length)

# Creating Model - GRU

In [25]:
class GRU(nn.Module):
    def __init__(self, weights, embedding_size,hidden_size):
        super(GRU, self).__init__()
        # padding index turns off gradient for unknown tokens
        #self.word_emb = nn.Embedding(dict_length, embedding_size, padding_idx=0)
        self.word_emb = nn.Embedding.from_pretrained(torch.tensor(weights), freeze=True)
        self.gru = nn.GRU(input_size= embedding_size, hidden_size=100, num_layers=1, batch_first=True)
        self.g = nn.Linear(hidden_size, 6)
        
        # PyTorch RNN outputs a sequence of same length as input
        # For many to one, we can either use the final hidden state OR
        # slap a linear layer on the output, taking in all the hidden states
        
    def forward(self, x):
        #print(x.shape)
        x = self.word_emb(x)
        #print(x.shape)
        # RNN layer outputs a tuple, the output and the final hidden state
        # taking the final hidden state as output
        x = self.gru(x)[1]
        #print(x.shape)
        output = self.g(x)
        #output = torch.sigmoid(output)
        #print(x.shape)
        
        return torch.squeeze(output)

In [26]:
x,y = next(iter(dl_test))

# Open pass and ROC

In [27]:
def one_pass(model, dataloader, optimizer, lossFun, backwards=True, print_loss=False):
    
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    
    if backwards == True:
        model.train()
    else:
        model.eval()
    
    total_loss = 0.0    

    all_y = []
    all_y_hat = []
    for x, y in tqdm(dataloader):
        x = x.to(device)
        y = y.to(device)
        y_pred = model(x)
        all_y.append(y)
        all_y_hat.append(y_pred)
        
        loss = lossFun(y_pred, y)
        total_loss += loss.item()
        
        if backwards == True:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    avg_loss = total_loss / len(dataloader)
    
    y = torch.cat(all_y,dim=0)
    y_hat = torch.cat(all_y_hat,dim=0)  
    roc = roc_auc_score(y.cpu(),y_hat.sigmoid().detach().cpu(), average = 'micro')
    if print_loss == True:
        print(avg_loss)
    
    return avg_loss, roc

def one_pass_roc(model, dataloader, num_points):
    model.eval()
    total_incorrect = 0
    all_y = []
    all_y_hat = []
    
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    
    for x, y in dataloader:
        x = x.to(device)
        y = y.to(device)
        y_pred = model(x)
        
        all_y.append(y)
        all_y_hat.append(y_pred)
        
        #y_pred = (torch.sigmoid(model(x)) > 0.5).float()
        
        #total_incorrect += torch.count_nonzero(y - y_pred).item()
    y = torch.cat(all_y,dim=0)
    y_hat = torch.cat(all_y_hat,dim=0)  
    roc = roc_auc_score(y.cpu(),y_hat.sigmoid().detach().cpu(), average = 'micro')
    
    return roc

# Train the Modeel

In [28]:
dict_length = max(word_dict.values()) + 1
#model = GRU(dict_length, 100, 100)
model = GRU(weights, 100, 100)
lossFun = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
model = model.cuda()
criterion = lossFun.cuda()

num_epochs = 5

for epoch in tqdm(range(num_epochs)):
    print('Epoch: ', epoch)
    
    loss_train,roc_train = one_pass(model, dl_train, optimizer, lossFun)
    loss_test, roc_test = one_pass(model, dl_test, optimizer, lossFun)
    
    #roc_train = one_pass_roc(model, dl_train, len(ds_train))
    #roc_test = one_pass_roc(model, dl_test, len(ds_test))
    print(f'Train loss: {loss_train} and Test loss: {loss_test}', )
    print(f'Train ROC: {roc_train} and Test ROC: {roc_test}', )
    #print(f'Train accuracy: {acc_train} and Test accuracy: {acc_test}', )

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch:  0


  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 0.10645055826753377 and Test loss: 0.07667452341411263
Train ROC: 0.8964792767259209 and Test ROC: 0.9615834103601508
Epoch:  1


  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 0.05787590967956931 and Test loss: 0.06858877872582525
Train ROC: 0.9766436580787773 and Test ROC: 0.9710877138123724
Epoch:  2


  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 0.054362450283952056 and Test loss: 0.06581159553024918
Train ROC: 0.9803105828905293 and Test ROC: 0.9741273770639352
Epoch:  3


  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 0.052271723421290515 and Test loss: 0.06412877852562815
Train ROC: 0.9823565777711887 and Test ROC: 0.9758704772350223
Epoch:  4


  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 0.050775234564207496 and Test loss: 0.06392537982901558
Train ROC: 0.9836628804476946 and Test ROC: 0.9759006906216251


# Creating Model - LSTM

In [29]:
class LSTM(nn.Module):
    def __init__(self, weights, embedding_size,hidden_size):
        super(LSTM, self).__init__()
        # padding index turns off gradient for unknown tokens
        #self.word_emb = nn.Embedding(dict_length, embedding_size, padding_idx=0)
        self.word_emb = nn.Embedding.from_pretrained(torch.tensor(weights), freeze=True)
        self.lstm = nn.LSTM(input_size= embedding_size, hidden_size=100, num_layers=1, batch_first=True)
        self.g = nn.Linear(hidden_size, 6)
        
        # PyTorch RNN outputs a sequence of same length as input
        # For many to one, we can either use the final hidden state OR
        # slap a linear layer on the output, taking in all the hidden states
        
    def forward(self, x):
        #print(x.shape)
        x = self.word_emb(x)
        out, hidden = self.lstm(x)
        x = hidden[0]
        #print(x.shape)
        # RNN layer outputs a tuple, the output and the final hidden state
        # taking the final hidden state as output
        #x = self.gru(x)[1]
        #print(x.shape)
        output = self.g(x)
        #output = torch.sigmoid(output)
        #print(x.shape)
        
        return torch.squeeze(output)

# Train LSTM

In [30]:
dict_length = max(word_dict.values()) + 1
#model = GRU(dict_length, 100, 100)
model = LSTM(weights, 100, 100)
lossFun = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.01)
model = model.cuda()
criterion = lossFun.cuda()

num_epochs = 5

for epoch in tqdm(range(num_epochs)):
    print('Epoch: ', epoch)
    
    loss_train,roc_train = one_pass(model, dl_train, optimizer, lossFun)
    loss_test, roc_test = one_pass(model, dl_test, optimizer, lossFun)
    
    #roc_train = one_pass_roc(model, dl_train, len(ds_train))
    #roc_test = one_pass_roc(model, dl_test, len(ds_test))
    print(f'Train loss: {loss_train} and Test loss: {loss_test}', )
    print(f'Train ROC: {roc_train} and Test ROC: {roc_test}', )
    #print(f'Train accuracy: {acc_train} and Test accuracy: {acc_test}', )

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch:  0


  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 0.12958692526444793 and Test loss: 0.10324289824347943
Train ROC: 0.8348207875055529 and Test ROC: 0.9138401761044566
Epoch:  1


  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 0.07810998253989965 and Test loss: 0.08388989174272865
Train ROC: 0.9498484847621959 and Test ROC: 0.9503505116236046
Epoch:  2


  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 0.0684550258796662 and Test loss: 0.07933816441800445
Train ROC: 0.9632077487711888 and Test ROC: 0.9566726602259422
Epoch:  3


  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 0.0673041108297184 and Test loss: 0.08044997637625784
Train ROC: 0.9650035942794367 and Test ROC: 0.9534627018299284
Epoch:  4


  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

Train loss: 0.06246025115251541 and Test loss: 0.07482868700753897
Train ROC: 0.9708958863971333 and Test ROC: 0.9619742398940189
