In [19]:
from nltk.tokenize import word_tokenize
import nltk
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from gensim.models.wrappers import FastText
from wikipedia2vec import Wikipedia2Vec
import torch
import pickle

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jerem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
documents = pd.read_csv('train.csv')
documents

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [4]:
model = Wikipedia2Vec.load('./enwiki_20180420_300d.pkl')

In [5]:
class Layer(torch.nn.Module):
    def __init__(self, size_in, size_out, activation):
        super(Layer, self).__init__()
        self.weights = torch.nn.Parameter(
            torch.randn(size_in, size_out, requires_grad=True)
        )
        self.bias = torch.nn.Parameter(
            torch.randn(1, size_out, requires_grad=True)
        )  
        self.activation = activation
        
    def forward(self, z_in):
        return self.activation(z_in @ self.weights + self.bias)
    
class LSTM(torch.nn.Module):
    def __init__(self, size_in, size_out, size_long, size_short):
        super().__init__()
        size_cat = size_in + size_short
        self.forget_gate = Layer(size_cat, size_long, torch.sigmoid)
        self.memory_gate = Layer(size_cat, size_long, torch.sigmoid)
        self.memory_tanh = Layer(size_cat, size_long, torch.tanh)
        self.recall_gate = Layer(size_cat, size_short, torch.sigmoid)
        self.recall_tanh = Layer(size_long, size_short, torch.tanh)
        self.output = Layer(size_short, size_out, torch.sigmoid)
        self.size_short = size_short
        self.size_long = size_long
        
    def forward(self, x):
        memory_long  = torch.zeros((1, self.size_long))
        memory_short = torch.zeros((1, self.size_short))
        out = []
        
        for t in range(x.shape[0]):
            x_t = x[[t], :]
            z_t = torch.cat([x_t, memory_short], axis=1)
            
            forget_gate = self.forget_gate(z_t)
            
            memory_long = memory_long * forget_gate
            
            memory_gate = self.memory_gate(z_t)
            memory_tanh = self.memory_tanh(z_t)
            memory = memory_gate * memory_tanh
            
            memory_long = memory_long + memory
            
            recall_gate = self.recall_gate(z_t)
            recall_tanh = self.recall_tanh(memory_long)
            
            memory_short = recall_gate * recall_tanh
            
        y_hat = self.output(memory_short)
        out.append(y_hat)
            
        return torch.cat(out, axis=0)
    
    def Predict(self, x, t=0.5):
        return torch.sigmoid(self.forward(x)) > t

In [6]:
def get_x(comment):
    x = []
    for tok in list(map(str.lower, word_tokenize(comment))):
        try:
            x.append(model.get_word_vector(tok))
        except:
            pass
    return torch.Tensor(x)

In [7]:
def MultiLableLoss(y_hat, y):
    loss = torch.nn.BCEWithLogitsLoss
    return torch.sum(torch.Tensor([loss(pred, targ) for pred,targ in zip(y_hat[0], y[0])]))

In [15]:
lstm = LSTM(300,6,10,10)
loss_func = torch.nn.BCEWithLogitsLoss()
optim = torch.optim.Adam(lstm.parameters(), lr=1e-1, )

In [16]:
iterations=1000
losses = [np.inf]
batch_size = 256
for i in range(iterations):
    print('{:>5} | {:>12.3}\r'.format(i, losses[-1]), end='')
    samp = documents.iloc[:256,:]
    comments = samp['comment_text']
    targets = torch.Tensor(samp.drop(['id','comment_text'], axis=1).values)
    loss = 0
    for comment, target in zip(comments, targets):
        y_hat = lstm(get_x(comment))
        t = torch.reshape(target, [1, target.shape[0]])
        loss = loss_func(y_hat, t)
        loss.backward()
        losses.append(loss.detach())
    optim.step()
    optim.zero_grad()

   10 |        0.694

KeyboardInterrupt: 

In [17]:
x = documents.iloc[0,1]
x = get_x(x)
lstm.Predict(x)

tensor([[True, True, True, True, True, True]])

In [18]:
documents.iloc[0,1]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

As you can see, the model is a genius and flagged this incredibly vulger comment for everything it had. This guy should be ashamed.

In [20]:
with open('lstm.pickle', 'w+b') as file:
      pickle.dump(lstm, file)