In [86]:
%%html
<style> table {float:left} </style>

In [None]:
!pip install torch tqdm lazyme nltk gensim
!python -m nltk.downloader punkt

In [2]:
import numpy as np
from tqdm import tqdm

import pandas as pd

from gensim.corpora import Dictionary

import torch
from torch import nn, optim, tensor, autograd
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

In [3]:
try:  # Use the default NLTK tokenizer.
    from nltk import word_tokenize, sent_tokenize
    # Testing whether it works.
    # Sometimes it doesn't work on some machines because of setup issues.
    word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])
except Exception:  # Use a naive sentence tokenizer and toktok.
    # `except Exception` instead of a bare `except:` so that Ctrl-C
    # (KeyboardInterrupt) and SystemExit are not silently swallowed.
    import re
    from nltk.tokenize import ToktokTokenizer

    def sent_tokenize(x):
        """Naively split the string `x` into sentences.

        Splits on runs of spaces that follow a '.' or '?' and precede a
        capital letter. See https://stackoverflow.com/a/25736515/610569

        :param x: Text to split.
        :type x: str
        :return: The sentences, in order.
        :rtype: list(str)
        """
        return re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)

    # Use the toktok tokenizer that requires no dependencies.
    toktok = ToktokTokenizer()
    word_tokenize = toktok.tokenize

# Classifying Toxic Comments

Let's apply what we learnt in a realistic task and **fight cyber-abuse with NLP**!

From https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/

> *The threat of abuse and harassment online means that many people stop <br>*
> *expressing themselves and give up on seeking different opinions. <br>*
> *Platforms struggle to effectively facilitate conversations, leading many <br>*
> *communities to limit or completely shut down user comments.*


The goal of the task is to build a model to detect different types of toxicity:

 - toxic
 - severe toxic
 - threats
 - obscenity
 - insults
 - identity-based hate
 
In this part, you'll be munging the data the way I would do it at work.

Your task is to train a feed-forward network on the toxic comments using the skills we have learned thus far.

## Digging into the data...

If you're using Linux/Mac, you can use these bang commands in the notebook:

```
!pip3 install kaggle
!mkdir -p /content/.kaggle/
!echo '{"username":"<YOUR_KAGGLE_USERNAME>","key":"<YOUR_KAGGLE_API_KEY>"}' > /content/.kaggle/kaggle.json
!chmod 600 /content/.kaggle/kaggle.json
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge
!unzip /content/.kaggle/competitions/jigsaw-toxic-comment-classification-challenge/*
```

Otherwise, download the data from https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/ 

In [4]:
# Read the competition's training split into a DataFrame and preview
# its first five rows.
df_train = pd.read_csv(
    filepath_or_buffer='jigsaw-toxic-comment-classification-challenge/train.csv'
)
df_train.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
# Text of every comment whose `threat` label is set (single .loc lookup
# instead of two chained [] selections).
df_train.loc[df_train['threat'] == 1, 'comment_text']

176       I think that your a Fagget get a oife and burn...
600       I'm also a sock puppet of this account...SUPRI...
802       Fuck you, Smith. Please have me notified when ...
1017      WOULDN'T BE THE FIRST TIME BITCH. FUCK YOU I'L...
                                ...                        
157718    bitch \nyou are a fucking hore. you suck dick ...
158717    stupid head \n\nYOur dumb and you are stupid d...
158856    Hey \n\nhey faggot, are you dead yet? or are y...
159029                                  Death to Musulmans!
159400    Shalom \n\nSemite, get the fuck out of here. I...
Name: comment_text, Length: 478, dtype: object

In [6]:
# Look at the raw text of the comment stored at row position 3712.
df_train['comment_text'].iloc[3712]

'Please stop. If you continue to ignore our policies by introducing inappropriate pages to Wikipedia, you will be blocked.'

In [None]:
# Tokenize every comment into a list of word tokens.
# The (misspelled) column name is kept as-is because later cells read it.
df_train['comment_text_tokenzied'] = df_train['comment_text'].map(word_tokenize)

In [None]:
# Just in case your Jupyter kernel dies, save the tokenized text =)

# Serialize the tokenized column to disk so it can be loaded back later
# without re-running the tokenizer.
import pickle
with open('train_tokenized_text.pkl', 'wb') as pkl_out:
    pickle.dump(df_train['comment_text_tokenzied'], file=pkl_out)


In [16]:
# Restore the tokenized column from the pickle written by the save cell.
# NOTE: unpickling can execute arbitrary code -- only load files you
# created yourself.
import pickle
with open('train_tokenized_text.pkl', 'rb') as pkl_in:
    df_train['comment_text_tokenzied'] = pickle.load(pkl_in)

# How to get a one-hot?

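There are many ways to build the label vectors from the individual columns. (Strictly speaking, these are multi-hot rather than one-hot vectors, because a single comment can carry several labels at once.)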

This is one way:

In [17]:
# Names of the six label columns, in the order used for the label vectors.
label_column_names = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]
# Label matrix as a NumPy array: one row per comment, one column per label.
df_train[label_column_names].values

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [18]:
# Same label matrix as a float tensor (converted at construction time
# rather than through a follow-up .float() call).
torch.tensor(df_train[label_column_names].values, dtype=torch.float)

tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        ...,
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])

In [19]:
# Collapse every label row to the index of its (first) largest entry.
# NOTE: a row of all zeros also yields index 0, and a row with several 1s
# keeps only the first one, so this is lossy for multi-label rows.

print(df_train[label_column_names].values.argmax(axis=1))

[0 0 0 ... 0 0 0]


In [20]:
class ToxicDataset(Dataset):
    """Map-style dataset of tokenized comments and their label rows.

    Every item is a dict holding the comment's token ids right-padded with
    the `<pad>` id (0) up to the length of the longest comment (`'x'`), the
    matching label row (`'y'`) and the number of real, un-padded tokens
    (`'x_len'`).
    """

    def __init__(self, texts, labels):
        """
        :param texts: Tokenized comments, one list of tokens per comment.
        :type texts: iterable(list(str))
        :param labels: Label matrix with one row per comment.
        :type labels: array-like of shape (len(texts), num_labels)
        """
        # Materialise as a list so `__getitem__` always indexes by position,
        # even if a pandas Series with a non-default index (or a one-shot
        # iterator) is passed in.
        self.texts = list(texts)

        # Build the vocabulary once, then reserve ids 0 and 1 for the
        # padding and unknown-word tokens.
        special_tokens = {'<pad>': 0, '<unk>': 1}
        self.vocab = Dictionary(self.texts)
        self.vocab.patch_with_special_tokens(special_tokens)

        self.vocab_size = len(self.vocab)

        # Vectorize labels
        self.labels = torch.tensor(labels)
        # Keep track of how many data points.
        self._len = len(self.texts)

        # Find the longest text in the data.
        self.max_len = max(len(txt) for txt in self.texts)

        self.num_labels = len(labels[0])

    def __getitem__(self, index):
        """
        :param index: Position of the comment to fetch.
        :type index: int
        :return: dict with keys 'x' (padded token ids), 'y' (label row)
                 and 'x_len' (number of tokens before padding).
        :rtype: dict
        """
        vectorized_sent = self.vectorize(self.texts[index])
        # To pad the sentence:
        # Pad left = 0; Pad right = max_len - len of sent.
        pad_dim = (0, self.max_len - len(vectorized_sent))
        # Pad value 0 is the id reserved for '<pad>'.
        padded_vectorized_sent = F.pad(vectorized_sent, pad_dim, 'constant', 0)
        return {'x': padded_vectorized_sent,
                'y': self.labels[index],
                'x_len': len(vectorized_sent)}

    def __len__(self):
        """Number of comments in the dataset."""
        return self._len

    def vectorize(self, tokens):
        """
        :param tokens: Tokens that should be vectorized.
        :type tokens: list(str)
        :return: 1-D tensor of vocabulary ids, one per token.
        :rtype: torch.Tensor
        """
        # See https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2idx
        # Out-of-vocabulary tokens map to the '<unk>' id (1); doc2idx's
        # default of -1 is not a valid row of an embedding table.
        indices = self.vocab.doc2idx(tokens, unknown_word_index=1)
        # Explicit integer dtype so an empty token list does not become a
        # float tensor.
        return torch.tensor(indices, dtype=torch.long)

    def unvectorize(self, indices):
        """
        :param indices: Vocabulary ids to convert back to tokens.
        :type indices: list(int) or 1-D integer torch.Tensor
        :return: The tokens for the given ids.
        :rtype: list(str)
        """
        # int() lets elements of a tensor be used for the lookup as well.
        return [self.vocab[int(i)] for i in indices]

In [21]:
# Label columns (same order as the label vectors) and the dataset built
# from the tokenized comments plus their label matrix.
label_column_names = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]
toxic_data = ToxicDataset(texts=df_train['comment_text_tokenzied'],
                          labels=df_train[label_column_names].values)

In [26]:
# Inspect one example: a dict with the padded token ids ('x'), the label
# row ('y') and the un-padded token count ('x_len').
toxic_data[123]

{'x': tensor([2554, 1465,  586,  ...,    0,    0,    0]),
 'y': tensor([0, 0, 0, 0, 0, 0]),
 'x_len': 4948}

In [27]:
batch_size = 5
# Serve the dataset in mini-batches; shuffle so that every epoch visits
# the comments in a different order.
dataloader = DataLoader(dataset=toxic_data, batch_size=batch_size, shuffle=True)

In [28]:
class FFNet(nn.Module):
    """Feed-forward multi-label classifier over fixed-length token id sequences.

    Token ids are embedded, the embeddings of all positions are concatenated
    into one flat vector, passed through a ReLU hidden layer and projected to
    one sigmoid probability per label.
    """

    def __init__(self, max_len, num_labels, vocab_size, embedding_size, hidden_dim):
        """
        :param max_len: Number of token positions in every (padded) input.
        :param num_labels: Number of output labels.
        :param vocab_size: Number of rows of the embedding table.
        :param embedding_size: Dimension of each token embedding.
        :param hidden_dim: Width of the hidden layer.
        """
        super(FFNet, self).__init__()
        # padding_idx=0: id 0 is the padding token; its embedding is not
        # updated by gradients.
        self.embeddings = nn.Embedding(num_embeddings=vocab_size,
                                       embedding_dim=embedding_size,
                                       padding_idx=0)
        # The no. of inputs to the linear layer is the
        # no. of tokens in each input * embedding_size
        self.linear1 = nn.Linear(embedding_size*max_len, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, num_labels)

    def forward(self, inputs):
        """
        :param inputs: Integer token ids of shape (batch_size, max_len).
        :return: Per-label probabilities of shape (batch_size, num_labels).
        """
        # We want to flatten the inputs so that we get the matrix of shape.
        # batch_size x no. of tokens in each input * embedding_size
        # (The unpacking also rejects inputs that are not 2-D.)
        batch_size, _ = inputs.shape
        embedded = self.embeddings(inputs).view(batch_size, -1)
        hidden = F.relu(self.linear1(embedded))
        # Sigmoid (not softmax): every label is an independent yes/no
        # decision, so each output is its own probability.
        out = torch.sigmoid(self.linear2(hidden))
        return out
        

In [35]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

embedding_size = 100
learning_rate = 0.003
hidden_size = 100

# Binary cross-entropy over per-label probabilities: predict() thresholds
# the model outputs at 0.5, i.e. each output is treated as a probability.
criterion = nn.BCELoss()
# The feed-forward model object.
model = FFNet(toxic_data.max_len,
              len(label_column_names),
              toxic_data.vocab_size,
              embedding_size=embedding_size,
              hidden_dim=hidden_size)
# The batches are moved to `device` below, so the parameters must live
# on the same device.
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

#model = nn.DataParallel(model)

losses = []
num_epochs = 100
for _e in range(num_epochs):
    epoch_loss = []
    for batch in tqdm(dataloader):
        x = batch['x'].to(device)
        # BCELoss requires floating-point targets; the dataset yields
        # integer label rows.
        y = batch['y'].float().to(device)
        # Zero gradient.
        optimizer.zero_grad()
        # Feed forward.
        predictions = model(x)
        loss = criterion(predictions, y)
        # Back propagate the loss
        loss.backward()
        optimizer.step()
        epoch_loss.append(loss.item())
        # NOTE(review): this stops after the first batch of every epoch, so
        # each "epoch" loss below is a single-batch loss. Remove the `break`
        # to train on the full data.
        break
    mean_epoch_loss = sum(epoch_loss)/len(epoch_loss)
    print(mean_epoch_loss)
    losses.append(mean_epoch_loss)
     

  0%|          | 0/31915 [00:02<?, ?it/s]

0.6861270666122437





In [82]:
def predict(text):
    """Predict the toxicity labels of a raw comment.

    :param text: The comment to classify.
    :type text: str
    :return: One 0/1 flag per label, in the order of `label_column_names`.
    :rtype: list(int)
    """
    # Vectorize and Pad.
    vectorized_sent = toxic_data.vectorize(word_tokenize(text))
    pad_dim = (0, toxic_data.max_len - len(vectorized_sent))
    vectorized_sent = F.pad(vectorized_sent, pad_dim, 'constant')
    # Forward Propagation.
    # Run the input on whichever device the model's parameters live on.
    model_device = next(model.parameters()).device
    # Unsqueeze because model is expecting `batch_size` x `sequence_len` shape.
    # no_grad: inference only, no need to record the autograd graph.
    with torch.no_grad():
        outputs = model(vectorized_sent.unsqueeze(0).to(model_device))
    # What happens if you use torch.max instead? =)
    ##return label_column_names[int(torch.max(outputs, dim=1).indices)]
    # To get the boolean output, we check if outputs are > 0.5
    # squeeze(0) drops only the batch dimension, so this also works when
    # there is a single label.
    return [int(l > 0.5) for l in outputs.squeeze(0)]

In [83]:
# Sample comment to run through predict().
text = "This is a nice message."

In [85]:
# Print the label names, then display predict()'s 0/1 flag for each label
# (same order as the names) for the sample text.
print(label_column_names)
predict(text)

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


[0, 0, 0, 0, 1, 1]