In [1]:
from torchtext.data import Field
tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)

LABEL = Field(sequential=False, use_vocab=False)

In [9]:
import pandas as pd
pd.read_csv("data/train.csv")

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [15]:
from torchtext.data import TabularDataset

tv_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                 ("comment_text", TEXT), ("toxic", LABEL),
                 ("severe_toxic", LABEL), ("threat", LABEL),
                 ("obscene", LABEL), ("insult", LABEL),
                 ("identity_hate", LABEL)]
trn, vld = TabularDataset.splits(
               path="data", # the root directory where the data lies
               train='train.csv', validation="valid.csv",
               format='csv',
               skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
               fields=tv_datafields)

tst_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                  ("comment_text", TEXT)]
tst = TabularDataset(
           path="data/test.csv", # the file path
           format='csv',
           skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
           fields=tst_datafields)

In [17]:
tst_datafields

[('id', None), ('comment_text', <torchtext.data.field.Field at 0x1095f5710>)]

In [18]:
vld

<torchtext.data.dataset.TabularDataset at 0x14f276410>

In [19]:
trn

<torchtext.data.dataset.TabularDataset at 0x1573b3dd0>

In [20]:
tv_datafields

[('id', None),
 ('comment_text', <torchtext.data.field.Field at 0x1095f5710>),
 ('toxic', <torchtext.data.field.Field at 0x10960a490>),
 ('severe_toxic', <torchtext.data.field.Field at 0x10960a490>),
 ('threat', <torchtext.data.field.Field at 0x10960a490>),
 ('obscene', <torchtext.data.field.Field at 0x10960a490>),
 ('insult', <torchtext.data.field.Field at 0x10960a490>),
 ('identity_hate', <torchtext.data.field.Field at 0x10960a490>)]

In [21]:
trn[0]

<torchtext.data.example.Example at 0x1573c2810>

In [22]:
trn[0].__dict__.keys()

dict_keys(['comment_text', 'toxic', 'severe_toxic', 'threat', 'obscene', 'insult', 'identity_hate'])

In [24]:
trn[0].comment_text[:3]

['explanation', 'why', 'the']

In [33]:
trn[0].identity_hate

'0'

In [34]:
TEXT.build_vocab(trn)

### Constructing iterator

In [37]:
from torchtext.data import Iterator, BucketIterator

train_iter, val_iter = BucketIterator.splits(
 (trn, vld), # we pass in the datasets we want the iterator to draw data from
 batch_sizes=(64, 64),
 #device=-1, # if you want to use the GPU, specify the GPU number here
 sort_key=lambda x: len(x.comment_text), # the BucketIterator needs to be told what function it should use to group the data.
 sort_within_batch=False,
 repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)
test_iter = Iterator(tst, batch_size=64, sort=False, sort_within_batch=False, repeat=False)

In [38]:
train_iter

<torchtext.data.iterator.BucketIterator at 0x1827eb890>

In [39]:
val_iter

<torchtext.data.iterator.BucketIterator at 0x1825f2110>

In [41]:
test_iter

<torchtext.data.iterator.Iterator at 0x1825f2790>

## Wrap the interator

In [42]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
            self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x 

    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper

            if self.y_vars is None: # we will concatenate y into a single tensor
                    y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            else:
                    y = torch.zeros((1))

            yield (x, y)

    def __len__(self):
            return len(self.dl)

train_dl = BatchWrapper(train_iter, "comment_text", ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])
valid_dl = BatchWrapper(val_iter, "comment_text", ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"])
test_dl = BatchWrapper(test_iter, "comment_text", None)


### Train the model

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

class SimpleLSTMBaseline(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300, num_linear=1):
        super().__init__() # don't forget to call this!
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
            self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 6)

    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
          feature = layer(feature)
          preds = self.predictor(feature)
        return preds

em_sz = 100
nh = 500
nl = 3
model = SimpleBiLSTMBaseline(nh, emb_dim=em_sz)