In [217]:
#create a train / test set from train.csv
import pandas as pd
import numpy as np
np.random.seed(42)
df = pd.read_csv("../../data/train.csv")


mask = np.random.rand(len(df)) < 0.8
df[mask].to_csv("../../data/train_val_split/train.csv", index=False)
df[~mask].to_csv("../../data/train_val_split/val.csv", index=False)

In [218]:
pd.read_csv("../../data/train_val_split/train.csv").head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
2,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
3,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
4,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0


In [225]:
from torchtext.data import Field, LabelField, TabularDataset, BucketIterator
from sklearn.metrics import accuracy_score

import torch
import pandas as pd
from model.model import LSTM

device = torch.device("cpu")


TEXT = Field(tokenize = 'basic_english', lower=True, sequential=True)
LABEL = Field(sequential=False, use_vocab=False)

fields = [
    ('id',None),  #ignore id col
    ('comment_text', TEXT), 
    ('toxic', LABEL),
    ('severe_toxic', LABEL),
    ('obscene', LABEL),
    ('threat', LABEL),
    ('insult', LABEL),
    ('identity_hate', LABEL)
]

train_ds, val_ds = TabularDataset.splits(path="../../data/train_val_split",train="train.csv", validation="val.csv",format="csv", skip_header=True, fields=fields)

# print(vars(train_ds[15]))

TEXT.build_vocab(train_ds, vectors="fasttext.simple.300d")
LABEL.build_vocab(train_ds)

# create iterators for train/valid/test datasets
train_iter, val_iter = BucketIterator.splits(
 (train_ds, val_ds), # we pass in the datasets we want the iterator to draw data from
 batch_sizes=(64, 64),
 device=device, # if you want to use the GPU, specify the GPU number here
 sort_key=lambda x: len(x.comment_text), # the BucketIterator needs to be told what function it should use to group the data.
 sort_within_batch=False,
 repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)


In [220]:
batch = next(train_iter.__iter__()); batch


[torchtext.data.batch.Batch of size 64]
	[.comment_text]:[torch.LongTensor of size 461x64]
	[.toxic]:[torch.LongTensor of size 64]
	[.severe_toxic]:[torch.LongTensor of size 64]
	[.obscene]:[torch.LongTensor of size 64]
	[.threat]:[torch.LongTensor of size 64]
	[.insult]:[torch.LongTensor of size 64]
	[.identity_hate]:[torch.LongTensor of size 64]

In [222]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x 

    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper

            if self.y_vars is not None: # we will concatenate y into a single tensor
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            else:
                y = torch.zeros((1))

        yield (x, y)

    def __len__(self):
        return len(self.dl)


In [223]:
train_dl = BatchWrapper(train_iter, "comment_text", ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])
valid_dl = BatchWrapper(val_iter, "comment_text", ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])

In [224]:
next(train_dl.__iter__())

(tensor([[ 213, 3615,  269,  ...,   10,   26,    6],
         [ 443,    5,    4,  ...,  257,   10,    9],
         [ 384, 1372,   36,  ...,   23,  702,  169],
         ...,
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1],
         [   1,    1,    1,  ...,    1,    1,    1]]),
 tensor([[1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [1., 0., 1., 0., 1., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0.],
         [1., 0., 0., 0., 0., 0.],
 

In [237]:
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, hidden_dim, embedding_dim=300,num_classes=6):
        super().__init__()
        self.embedding = nn.Embedding(len(TEXT.vocab), embedding_dim)
        self.encoder = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=1
                          )
        self.linear1 = nn.Linear(hidden_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, num_classes)
    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        out = self.linear1(feature)
        out = self.linear2(out)
        out = self.fc(out)
        return out

model = LSTM(hidden_dim=500, embedding_dim=300)
model

LSTM(
  (embedding): Embedding(209350, 300)
  (encoder): LSTM(300, 500)
  (linear1): Linear(in_features=500, out_features=500, bias=True)
  (linear2): Linear(in_features=500, out_features=500, bias=True)
  (fc): Linear(in_features=500, out_features=6, bias=True)
)

In [235]:
class SimpleLSTMBaseline(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300, num_linear=1):
        super().__init__() # don't forget to call this!
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
            self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 6)

    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
          feature = layer(feature)
          preds = self.predictor(feature)
        return preds

em_sz = 100
nh = 500
nl = 3
model = SimpleLSTMBaseline(nh, emb_dim=em_sz, num_linear=nl)
model

SimpleLSTMBaseline(
  (embedding): Embedding(209350, 100)
  (encoder): LSTM(100, 500)
  (linear_layers): ModuleList(
    (0): Linear(in_features=500, out_features=500, bias=True)
    (1): Linear(in_features=500, out_features=500, bias=True)
  )
  (predictor): Linear(in_features=500, out_features=6, bias=True)
)

In [238]:
model

LSTM(
  (embedding): Embedding(209350, 300)
  (encoder): LSTM(300, 500)
  (linear1): Linear(in_features=500, out_features=500, bias=True)
  (linear2): Linear(in_features=500, out_features=500, bias=True)
  (fc): Linear(in_features=500, out_features=6, bias=True)
)

In [239]:
model = model.to(device)
loss_func = loss_func.to(device)

In [240]:
def train(train_dl):
    loss = 0
    for x, y in train_dl:
        opt.zero_grad()
        preds = model(x)
        loss = loss_func(y, preds)
        loss.backward()
        loss.step()
        loss +=loss.item()
    return lss
def val(val_dl):
    loss = 0
    for x, y in val_dl:
        preds = model(x)
        loss = loss_func(y, preds)
        loss +=loss.item()
    return lss


for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x, y in tqdm.tqdm(train_dl): # thanks to our wrapper, we can intuitively iterate over our data!
        opt.zero_grad()

        preds = model(x)
        loss = loss_func(y, preds)
        loss.backward()
        opt.step()

        running_loss += loss.item() * x.size(0)

    epoch_loss = running_loss / len(train_iter)

    # calculate the validation loss for this epoch
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x, y in valid_dl:
        preds = model(x)
        loss = loss_func(y, preds)
        val_loss += loss.item() * x.size(0)

    val_loss /= len(val_iter)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

  0%|          | 1/1994 [00:34<18:49:26, 34.00s/it]
  0%|          | 0/1994 [00:00<?, ?it/s]Epoch: 1, Training Loss: 0.3321, Validation Loss: 7.6338
  0%|          | 1/1994 [00:34<19:11:30, 34.67s/it]
Epoch: 2, Training Loss: 0.2912, Validation Loss: 7.6338
