In [36]:
import pandas as pd
import numpy as np
import torch

In [37]:
pd.read_csv("data/train.csv").head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0


In [38]:
pd.read_csv("data/valid.csv").head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,000eefc67a2c930f,Radial symmetry \n\nSeveral now extinct lineag...,0,0,0,0,0,0
1,000f35deef84dc4a,There's no need to apologize. A Wikipedia arti...,0,0,0,0,0,0


In [39]:
pd.read_csv("data/test.csv").head(2)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...


In [40]:
from torchtext.data import Field

In [41]:
tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)

In [42]:
LABEL = Field(sequential=False, use_vocab=False)

In [43]:
#doctrings for Field class can be used for advanced data preprocessing

In [44]:
from torchtext.data import TabularDataset

In [45]:
tv_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                 ("comment_text", TEXT), ("toxic", LABEL),
                 ("severe_toxic", LABEL), ("threat", LABEL),
                 ("obscene", LABEL), ("insult", LABEL),
                 ("identity_hate", LABEL)]
trn, vld = TabularDataset.splits(
               path="data", # the root directory where the data lies
               train='train.csv', validation="valid.csv",
               format='csv',
               skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
               fields=tv_datafields)
tst_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                  ("comment_text", TEXT)]
tst = TabularDataset(
           path="data/test.csv", # the file path
           format='csv',
           skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
           fields=tst_datafields)

In [46]:
TEXT.build_vocab(trn)

In [47]:
TEXT.vocab.freqs.most_common(10)

[('the', 78),
 ('to', 41),
 ('you', 33),
 ('of', 30),
 ('and', 26),
 ('a', 26),
 ('is', 24),
 ('that', 22),
 ('i', 20),
 ('if', 19)]

In [48]:
from torchtext.data import Iterator, BucketIterator

In [49]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iter, val_iter = BucketIterator.splits(
 (trn, vld), # we pass in the datasets we want the iterator to draw data from
 batch_sizes=(64, 64),
 device=device, # if you want to use the GPU, specify the GPU number here
 sort_key=lambda x: len(x.comment_text), # the BucketIterator needs to be told what function it should use to group the data.
 sort_within_batch=False,
 repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)
test_iter = Iterator(tst, batch_size=64, device=device, sort=False, sort_within_batch=False, repeat=False)

In [50]:
batch = next(train_iter.__iter__()); batch


[torchtext.data.batch.Batch of size 25]
	[.comment_text]:[torch.cuda.LongTensor of size 494x25 (GPU 0)]
	[.toxic]:[torch.cuda.LongTensor of size 25 (GPU 0)]
	[.severe_toxic]:[torch.cuda.LongTensor of size 25 (GPU 0)]
	[.threat]:[torch.cuda.LongTensor of size 25 (GPU 0)]
	[.obscene]:[torch.cuda.LongTensor of size 25 (GPU 0)]
	[.insult]:[torch.cuda.LongTensor of size 25 (GPU 0)]
	[.identity_hate]:[torch.cuda.LongTensor of size 25 (GPU 0)]

In [51]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x and y
    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            
            if self.y_vars is not None: # we will concatenate y into a single tensor
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

In [52]:
train_dl = BatchWrapper(train_iter, "comment_text", ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])
valid_dl = BatchWrapper(val_iter, "comment_text", ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])
test_dl = BatchWrapper(test_iter, "comment_text", None)

In [53]:
next(train_dl.__iter__())

(tensor([[354, 606,  15,  ...,  15, 453,  15],
         [ 63, 693,  46,  ...,  97, 523, 657],
         [  4, 584,  10,  ..., 629,  30,  22],
         ...,
         [  1,   1,   1,  ...,   1,   1,   1],
         [  1,   1,   1,  ...,   1,   1,   1],
         [  1,   1,   1,  ...,   1,   1,   1]], device='cuda:0'),
 tensor([[1., 1., 0., 1., 1., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0.

In [54]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [55]:
class BiLSTMBaseline(nn.Module):
    def __init__(self, hidden_dim, embedding_dim = 256, spatial_dropout = 0.05, recurrent_dropout = 0.1, num_linear = 1):
        super(BiLSTMBaseline, self).__init__()
        self.embedding = nn.Embedding(len(TEXT.vocab), embedding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, dropout=recurrent_dropout)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 6)
        
    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
            feature = layer(feature)
        preds = self.predictor(feature)
        return preds

In [84]:
emb_size = 200
nh = 500
nl = 3
model = BiLSTMBaseline(nh, embedding_dim=emb_size, num_linear=nl); model

BiLSTMBaseline(
  (embedding): Embedding(784, 200)
  (encoder): LSTM(200, 500, dropout=0.1)
  (linear_layers): ModuleList(
    (0): Linear(in_features=500, out_features=500, bias=True)
    (1): Linear(in_features=500, out_features=500, bias=True)
  )
  (predictor): Linear(in_features=500, out_features=6, bias=True)
)

In [85]:
model.cuda()

BiLSTMBaseline(
  (embedding): Embedding(784, 200)
  (encoder): LSTM(200, 500, dropout=0.1)
  (linear_layers): ModuleList(
    (0): Linear(in_features=500, out_features=500, bias=True)
    (1): Linear(in_features=500, out_features=500, bias=True)
  )
  (predictor): Linear(in_features=500, out_features=6, bias=True)
)

In [86]:
import tqdm

In [90]:
opt = optim.Adam(model.parameters(), lr=1e-2)
loss_func = nn.BCEWithLogitsLoss()

In [91]:
epochs = 10

In [92]:
for epoch in range(1, epochs+1):
    running_loss = 0.0
    running_corrects = 0
    model.train()
    for x, y in tqdm.tqdm(train_dl):
        opt.zero_grad()
        
        preds = model(x)
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()
        
        running_loss += loss.data.item() * x.size(0)
        
    epoch_loss = running_loss / len(trn)
    
    val_loss = 0.0
    model.eval()
    for x, y in valid_dl:
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.data.item() * x.size(0)
        
    val_loss /= len(vld)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

100%|██████████| 1/1 [00:00<00:00,  8.62it/s]
100%|██████████| 1/1 [00:00<00:00,  9.14it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 439.1548, Validation Loss: 10.8531
Epoch: 2, Training Loss: 31.4503, Validation Loss: 12.2910


100%|██████████| 1/1 [00:00<00:00,  9.16it/s]
100%|██████████| 1/1 [00:00<00:00,  9.27it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Epoch: 3, Training Loss: 31.9995, Validation Loss: 2.6567
Epoch: 4, Training Loss: 4.7682, Validation Loss: 3.2360


100%|██████████| 1/1 [00:00<00:00,  9.35it/s]
100%|██████████| 1/1 [00:00<00:00,  9.21it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Epoch: 5, Training Loss: 4.9998, Validation Loss: 3.3099
Epoch: 6, Training Loss: 4.5301, Validation Loss: 3.1527


100%|██████████| 1/1 [00:00<00:00,  9.37it/s]
100%|██████████| 1/1 [00:00<00:00,  9.28it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Epoch: 7, Training Loss: 3.7907, Validation Loss: 3.1545
Epoch: 8, Training Loss: 3.4970, Validation Loss: 3.5669


100%|██████████| 1/1 [00:00<00:00,  9.40it/s]
100%|██████████| 1/1 [00:00<00:00,  9.26it/s]

Epoch: 9, Training Loss: 3.9164, Validation Loss: 3.8126
Epoch: 10, Training Loss: 3.5134, Validation Loss: 4.3793





In [93]:
test_preds = []
for x, y in tqdm.tqdm(test_dl):
    preds = model(x)
    # if you're data is on the GPU, you need to move the data back to the cpu
    preds = preds.data.cpu().numpy()
    # the actual outputs of the model are logits, so we need to pass these values to the sigmoid function
    preds = 1 / (1 + np.exp(-preds))
    test_preds.append(preds)
test_preds = np.hstack(test_preds)

  import sys
100%|██████████| 1/1 [00:00<00:00, 37.25it/s]


In [94]:
df = pd.read_csv("data/test.csv")
for i, col in enumerate(["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]):
    df[col] = test_preds[:, i]

In [95]:
df.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,0.295567,0.112594,5.840576999999999e-19,0.001824,0.022439,1.1912839999999998e-19
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,0.295567,0.112594,5.840576999999999e-19,0.001824,0.022439,1.1912839999999998e-19
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",0.295567,0.112594,5.840576999999999e-19,0.001824,0.022439,1.1912839999999998e-19
3,00017563c3f7919a,":If you have a look back at the source, the in...",0.295567,0.112594,5.840576999999999e-19,0.001824,0.022439,1.1912839999999998e-19
4,00017695ad8997eb,I don't anonymously edit articles at all.,0.295567,0.112594,5.840576999999999e-19,0.001824,0.022439,1.1912839999999998e-19
