In [254]:
import pandas as pd
import torchtext
import random
from torchtext.data import TabularDataset
import numpy as np
import os
from torchtext import data
import torch.nn as nn
import torch
import torch.optim as optim
import time
from sklearn.metrics import roc_auc_score,accuracy_score
import spacy

In [255]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [256]:
data_dir = './data/'
df = pd.read_csv('data/train.csv')

In [257]:
pd.read_csv(cache_dir + 'train.csv')

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
127737,ffe8b9316245be30,The numbers in parentheses are the additional ...,0,0,0,0,0,0
127738,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
127739,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
127740,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [335]:
# hyperparams
TEXT_LENGTH = 100
EMBEDDING_SIZE = 20
BATCH_SIZE = 64
VOCAB_SIZE=20000

In [336]:
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
nlp = spacy.load("en")
def tokenizer(text):
    filtered = ''.join([c if c not in filters else '' for c in text])
    return [token.text for token in nlp.tokenizer(filtered) if not token.is_space]

In [337]:
random.seed(1234)
TEXT = data.Field(lower=True, batch_first=True,fix_length=TEXT_LENGTH, preprocessing=None, tokenize=tokenizer)
LABEL = data.Field(sequential=False,is_target=True, use_vocab=False, pad_token=None, unk_token=None)

datafields = [('id', None),
              ('comment_text', TEXT), 
              ("toxic", LABEL), 
              ("severe_toxic", LABEL),
              ('obscene', LABEL), 
              ('threat', LABEL),
              ('insult', LABEL),
              ('identity_hate', LABEL)]


alldata = TabularDataset(
    path='data/train.csv',
    format='csv',
    skip_header=True,
    fields=datafields)


In [338]:
random.seed(17)
train,dev = alldata.split(split_ratio=0.8, random_state=random.getstate())

In [361]:
# TEXT.build_vocab(train, vectors='glove.6B.300d', max_size=20000, min_freq=5)
TEXT.build_vocab(train, max_size=VOCAB_SIZE, min_freq=5)
LABEL.build_vocab(train)

In [343]:
# generate batch iterators
random.seed(1234)
train_iterator, valid_iterator = data.BucketIterator.splits((train, dev),
                                                            batch_size=BATCH_SIZE,
                                                            device=device,
                                                            shuffle=True,
                                                            sort_key=lambda x: len(x.comment_text))

In [390]:
#vars(train[0])
#TEXT.vocab.vectors.shape
#len(dev)
TEXT.vocab.vectors

In [367]:
class SqueezeModule(nn.Module):
    def forward(self, x):
        return x.squeeze(1)
    
class NNet(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx, embeddings, text_length):
        super().__init__()
        self.layers = nn.Sequential(
            #nn.Embedding.from_pretrained(embeddings, freeze=False, padding_idx=pad_idx),
            nn.Embedding(vocab_size,embedding_dim,padding_idx=pad_idx),
            nn.MaxPool2d((text_length,1)),
            SqueezeModule(),
            nn.Linear(embedding_dim, output_dim),
        )
        
        init_f = lambda m: torch.nn.init.xavier_uniform_(m.weight) if type(m) == nn.Linear else None
        init_bias = lambda m: m.bias.data.zero_() if type(m) == nn.Linear else None
        self.layers.apply(init_f)
            
    def forward(self, text):
        return self.layers(text)

In [382]:
def fit_epoch(iterator, model, optimizer, criterion):
    train_loss = 0
    train_acc = 0
    model.train()
    all_y = []
    all_y_hat = []
    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        y = torch.stack([batch.toxic,
                         batch.severe_toxic,
                         batch.obscene,
                         batch.threat,
                         batch.insult,
                         batch.identity_hate],dim=1).float().to(device)
        y_hat = model(batch.comment_text.to(device))
        loss = criterion(y_hat, y)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        all_y.append(y)
        all_y_hat.append(y_hat)
    y = torch.cat(all_y,dim=0)
    y_hat = torch.cat(all_y_hat,dim=0)
    roc = roc_auc_score(y,y_hat.sigmoid().detach())
    return train_loss / len(iterator.dataset), roc

def test_epoch(iterator, model, criterion):
    train_loss = 0
    train_acc = 0
    model.eval()
    all_y = []
    all_y_hat = []
    for i, batch in enumerate(iterator):
        y = torch.stack([batch.toxic,
                         batch.severe_toxic,
                         batch.obscene,
                         batch.threat,
                         batch.insult,
                         batch.identity_hate],dim=1).float().to(device)
        with torch.no_grad():
            y_hat = model(batch.comment_text.to(device))
        loss = criterion(y_hat, y)
        train_loss += loss.item()
        all_y.append(y)
        all_y_hat.append(y_hat)
    y = torch.cat(all_y,dim=0)
    y_hat = torch.cat(all_y_hat,dim=0)
    roc = roc_auc_score(y,y_hat.sigmoid().detach())
    return train_loss / len(iterator.dataset), roc

In [383]:
OUTPUT_DIM = 6
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
model = NNet(len(TEXT.vocab), EMBEDDING_SIZE, OUTPUT_DIM, PAD_IDX, TEXT.vocab.vectors,TEXT_LENGTH).to(device)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 400,166 trainable parameters


In [384]:
def train_n_epochs(n, lr, wd):

    criterion = nn.BCEWithLogitsLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    for epoch in range(n):
        start_time = time.time()
        train_loss, train_roc = fit_epoch(train_iterator, model, optimizer, criterion)
        valid_loss, valid_roc = test_epoch(valid_iterator, model, criterion)

        secs = int(time.time() - start_time)
        mins = secs / 60
        secs = secs % 60

        print('Epoch: %d' % (epoch + 1), " | time in %d minutes, %d seconds" % (mins, secs))
        print(f'\tLoss: {train_loss:.4f}(train)\t|\troc: {train_roc :.6f} (train)')
        print(f'\tLoss: {valid_loss:.4f}(valid)\t|\troc: {valid_roc:.6f} (valid)')    

In [385]:
train_n_epochs(10,0.001,0)

Epoch: 1  | time in 0 minutes, 30 seconds
	Loss: 0.0027(train)	|	roc: 0.663308 (train)
	Loss: 0.0020(valid)	|	roc: 0.740635%(valid)
Epoch: 2  | time in 0 minutes, 31 seconds
	Loss: 0.0017(train)	|	roc: 0.799886 (train)
	Loss: 0.0016(valid)	|	roc: 0.836839%(valid)
Epoch: 3  | time in 0 minutes, 29 seconds
	Loss: 0.0014(train)	|	roc: 0.867490 (train)
	Loss: 0.0014(valid)	|	roc: 0.875982%(valid)
Epoch: 4  | time in 0 minutes, 29 seconds
	Loss: 0.0013(train)	|	roc: 0.902830 (train)
	Loss: 0.0013(valid)	|	roc: 0.903905%(valid)
Epoch: 5  | time in 0 minutes, 32 seconds
	Loss: 0.0012(train)	|	roc: 0.928639 (train)
	Loss: 0.0012(valid)	|	roc: 0.922580%(valid)
Epoch: 6  | time in 0 minutes, 36 seconds
	Loss: 0.0011(train)	|	roc: 0.944312 (train)
	Loss: 0.0011(valid)	|	roc: 0.933554%(valid)
Epoch: 7  | time in 0 minutes, 32 seconds
	Loss: 0.0010(train)	|	roc: 0.953496 (train)
	Loss: 0.0011(valid)	|	roc: 0.939759%(valid)
Epoch: 8  | time in 0 minutes, 31 seconds
	Loss: 0.0010(train)	|	roc: 0.9599

## playground

In [333]:
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(model.parameters())

all_y = []
all_y_hat = []
start_time = time.time()
for i,batch in enumerate(train_iterator):
    optimizer.zero_grad()
#    print(batch)
    y = torch.stack([batch.toxic,
                      batch.severe_toxic, 
                      batch.obscene,
                      batch.threat, 
                      batch.insult, 
                      batch.identity_hate],dim=1)

    #print(model(batch.comment_text))
    y_hat = model(batch.comment_text)
    #print(y)
    #print(torch.tensor([batch.]))
    loss = cripterion(y_hat,y.float())
    loss.backward()
    optimizer.step()
    #print(loss)
    #if (loss.item() < 0):
    #    print(i)
    #    break
    #all_y.append(y)
    #all_y_hat.append(y_hat)
    #roc_auc_score(y,y_hat.sigmoid().detach())
    if (i == 20):
        pass # break
#y = torch.cat(all_y,dim=0)
#y_hat = torch.cat(all_y_hat,dim=0)
secs = int(time.time() - start_time)
print(secs)

157


In [315]:
len(train_iterator)

1995

In [322]:
train_iterator

<torchtext.data.iterator.BucketIterator at 0x1373c12b0>