In [456]:
import pandas as pd
import torchtext
import random
from torchtext.data import TabularDataset
import numpy as np
import os
from torchtext import data
import torch.nn as nn
import torch
import torch.optim as optim
import time
from sklearn.metrics import roc_auc_score,accuracy_score
import spacy

In [457]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [458]:
data_dir = './data/'
df = pd.read_csv('data/train.csv')

In [459]:
# hyperparams
TEXT_LENGTH = 100
EMBEDDING_SIZE = 20
BATCH_SIZE = 64
VOCAB_SIZE=20000

In [460]:
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
nlp = spacy.load("en")
def tokenizer(text):
    filtered = ''.join([c if c not in filters else '' for c in text])
    return [token.text for token in nlp.tokenizer(filtered) if not token.is_space]

In [465]:
import csv
csv.reader('data/train.csv', delimiter=' ', quotechar='|',encoding = "utf-8")

TypeError: 'encoding' is an invalid keyword argument for this function

In [461]:
random.seed(1234)
TEXT = data.Field(lower=True, batch_first=True,fix_length=TEXT_LENGTH, preprocessing=None, tokenize='spacy')
LABEL = data.Field(sequential=False,is_target=True, use_vocab=False, pad_token=None, unk_token=None)

datafields = [('id', None),
              ('comment_text', TEXT), 
              ("toxic", LABEL), 
              ("severe_toxic", LABEL),
              ('obscene', LABEL), 
              ('threat', LABEL),
              ('insult', LABEL),
              ('identity_hate', LABEL)]


alldata = TabularDataset(
    path='data/train.csv',
    format='csv',
    skip_header=True,
    fields=datafields)

In [443]:
random.seed(17)
train,dev = alldata.split(split_ratio=0.9, random_state=random.getstate())

In [444]:
# TEXT.build_vocab(train, vectors='glove.6B.300d', max_size=20000, min_freq=5)
TEXT.build_vocab(train, max_size=VOCAB_SIZE, min_freq=5)
LABEL.build_vocab(train)

In [445]:
# generate batch iterators
random.seed(1234)
train_iterator, valid_iterator = data.BucketIterator.splits((train, dev),
                                                            batch_size=BATCH_SIZE,
                                                            device=device,
                                                            shuffle=True,
                                                            sort_key=lambda x: len(x.comment_text))

In [446]:
class SqueezeModule(nn.Module):
    def forward(self, x):
        return x.squeeze(1)
    
class NNet(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx, embeddings, text_length):
        super().__init__()
        self.layers = nn.Sequential(
            #nn.Embedding.from_pretrained(embeddings, freeze=False, padding_idx=pad_idx),
            nn.Embedding(vocab_size,embedding_dim,padding_idx=pad_idx),
            nn.MaxPool2d((text_length,1)),
            SqueezeModule(),
            nn.Linear(embedding_dim, output_dim),
        )
        
        init_f = lambda m: torch.nn.init.xavier_uniform_(m.weight) if type(m) == nn.Linear else None
        init_bias = lambda m: m.bias.data.zero_() if type(m) == nn.Linear else None
        self.layers.apply(init_f)
            
    def forward(self, text):
        return self.layers(text)

In [447]:
def fit_epoch(iterator, model, optimizer, criterion):
    train_loss = 0
    train_acc = 0
    model.train()
    all_y = []
    all_y_hat = []
    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
        y = torch.stack([batch.toxic,
                         batch.severe_toxic,
                         batch.obscene,
                         batch.threat,
                         batch.insult,
                         batch.identity_hate],dim=1).float().to(device)
        y_hat = model(batch.comment_text.to(device))
        loss = criterion(y_hat, y)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        all_y.append(y)
        all_y_hat.append(y_hat)
    y = torch.cat(all_y,dim=0)
    y_hat = torch.cat(all_y_hat,dim=0)
    roc = roc_auc_score(y,y_hat.sigmoid().detach())
    return train_loss / len(iterator.dataset), roc

def test_epoch(iterator, model, criterion):
    train_loss = 0
    train_acc = 0
    model.eval()
    all_y = []
    all_y_hat = []
    for i, batch in enumerate(iterator):
        y = torch.stack([batch.toxic,
                         batch.severe_toxic,
                         batch.obscene,
                         batch.threat,
                         batch.insult,
                         batch.identity_hate],dim=1).float().to(device)
        with torch.no_grad():
            y_hat = model(batch.comment_text.to(device))
        loss = criterion(y_hat, y)
        train_loss += loss.item()
        all_y.append(y)
        all_y_hat.append(y_hat)
    y = torch.cat(all_y,dim=0)
    y_hat = torch.cat(all_y_hat,dim=0)
    roc = roc_auc_score(y,y_hat.sigmoid().detach())
    return train_loss / len(iterator.dataset), roc

In [448]:
OUTPUT_DIM = 6
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
model = NNet(len(TEXT.vocab), EMBEDDING_SIZE, OUTPUT_DIM, PAD_IDX, TEXT.vocab.vectors,TEXT_LENGTH).to(device)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 400,166 trainable parameters


In [449]:
def train_n_epochs(n, lr, wd):

    criterion = nn.BCEWithLogitsLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    for epoch in range(n):
        start_time = time.time()
        train_loss, train_roc = fit_epoch(train_iterator, model, optimizer, criterion)
        valid_loss, valid_roc = test_epoch(valid_iterator, model, criterion)

        secs = int(time.time() - start_time)
        mins = secs / 60
        secs = secs % 60

        print('Epoch: %d' % (epoch + 1), " | time in %d minutes, %d seconds" % (mins, secs))
        print(f'\tLoss: {train_loss:.4f}(train)\t|\troc: {train_roc :.6f} (train)')
        print(f'\tLoss: {valid_loss:.4f}(valid)\t|\troc: {valid_roc:.6f} (valid)')    

In [450]:
train_n_epochs(5,0.01,0)

Epoch: 1  | time in 0 minutes, 28 seconds
	Loss: 0.0013(train)	|	roc: 0.889760 (train)
	Loss: 0.0010(valid)	|	roc: 0.954847 (valid)
Epoch: 2  | time in 0 minutes, 29 seconds
	Loss: 0.0009(train)	|	roc: 0.963827 (train)
	Loss: 0.0010(valid)	|	roc: 0.961743 (valid)
Epoch: 3  | time in 0 minutes, 30 seconds
	Loss: 0.0008(train)	|	roc: 0.972883 (train)
	Loss: 0.0010(valid)	|	roc: 0.962056 (valid)
Epoch: 4  | time in 0 minutes, 30 seconds
	Loss: 0.0007(train)	|	roc: 0.977441 (train)
	Loss: 0.0009(valid)	|	roc: 0.962616 (valid)
Epoch: 5  | time in 0 minutes, 30 seconds
	Loss: 0.0007(train)	|	roc: 0.980279 (train)
	Loss: 0.0009(valid)	|	roc: 0.960046 (valid)


In [451]:
train_n_epochs(5,0.001,0)

Epoch: 1  | time in 0 minutes, 28 seconds
	Loss: 0.0006(train)	|	roc: 0.986626 (train)
	Loss: 0.0009(valid)	|	roc: 0.961181 (valid)
Epoch: 2  | time in 0 minutes, 33 seconds
	Loss: 0.0006(train)	|	roc: 0.987181 (train)
	Loss: 0.0009(valid)	|	roc: 0.961261 (valid)
Epoch: 3  | time in 0 minutes, 32 seconds
	Loss: 0.0006(train)	|	roc: 0.987560 (train)
	Loss: 0.0009(valid)	|	roc: 0.960731 (valid)
Epoch: 4  | time in 0 minutes, 35 seconds
	Loss: 0.0006(train)	|	roc: 0.987787 (train)
	Loss: 0.0009(valid)	|	roc: 0.960474 (valid)
Epoch: 5  | time in 0 minutes, 32 seconds
	Loss: 0.0006(train)	|	roc: 0.987973 (train)
	Loss: 0.0010(valid)	|	roc: 0.960147 (valid)


In [452]:
dataFields = [("id", None), ("comment_text", TEXT)]
testDataset= data.TabularDataset(path='./data/test.csv', 
                                            format='csv',
                                            fields=dataFields, 
                                            skip_header=True)

In [453]:
test_iter = data.Iterator(testDataset,
                          batch_size=BATCH_SIZE,
                          device=device,
                          sort=False,
                          sort_within_batch=False,
                          repeat=False,
                          shuffle=False)

In [454]:
model.eval()
all_predictions = []
for batch in test_iter:
    with torch.no_grad():
        y_hat = model(batch.comment_text.to(device))
    pred = torch.sigmoid(y_hat)
    all_predictions.append(pred)
predictions = torch.cat(all_predictions)

In [455]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
sample_df = pd.read_csv('data/sample_submission.csv')
sample_df[list_classes] = predictions
sample_df.to_csv('submission.csv', index=False)