In [1]:
import os
import spacy
import pandas as pd

import torch
import torch.nn as nn
from torchtext import data

from scene.data.loaders import BatchWrapper

spacy_en = spacy.load('en')

In [2]:
DATAPATH = '/Users/yngtodd/src/kaggle/scene/data/splits/small_val/sans_header'

In [3]:
trainpath = os.path.join(DATAPATH, 'train.csv')
train = pd.read_csv(trainpath)

In [5]:
train[0:5]

Unnamed: 0,id,text,genre,labels
0,26123,OICES coming from the living room. INT. LIVING...,thriller,8
1,15843,n exotic looking container full of dark liquid...,thriller,8
2,7385,"at the board game. DANNY What is this, Staling...",drama,3
3,17645,"ng, sir. I, on the other hand, am in a positio...",thriller,8
4,25175,e how in the count room nobody ever seems to s...,drama,3


In [4]:
def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [5]:
TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True)
LABEL = data.Field(sequential=False, use_vocab=False, postprocessing=data.Pipeline(int))

In [6]:
train, val, test = data.TabularDataset.splits(
    path=DATAPATH, 
    train='train.csv',
    validation='val.csv', 
    test='test.csv', 
    format='csv',
    fields=[
        ('id', None),
        ('text', TEXT),
        ('genre', None),
        ('labels', LABEL)
    ],
    skip_header=True
)

In [7]:
train.fields

{'id': None,
 'text': <torchtext.data.field.Field at 0x1302c14e0>,
 'genre': None,
 'labels': <torchtext.data.field.Field at 0x1302c1518>}

In [8]:
TEXT.build_vocab(train)
LABEL.build_vocab(train)

In [9]:
len(LABEL.vocab)

10

In [22]:
device = 'cpu'

train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train, val, test), 
    sort_key=lambda x: len(x.Text),
    batch_sizes=(32, 128, 128), 
    device=device
)

In [28]:
trainloader = BatchWrapper(train_iter, label='labels')

In [43]:
train_iter

<torchtext.data.iterator.BucketIterator at 0x7f95185ab6a0>

In [24]:
vocab = TEXT.vocab
embed = nn.Embedding(len(vocab), 200)

In [25]:
embed

Embedding(64191, 200)

In [30]:
#batch = next(train_iter.__iter__()); batch
batch = next(trainloader.__iter__()); batch

TypeError: 'NoneType' object cannot be interpreted as an integer

In [49]:
batch.__dict__.keys()

dict_keys(['batch_size', 'dataset', 'fields', 'input_fields', 'target_fields', 'text', 'labels'])

In [47]:
TEXT.vocab.freqs.most_common(5)

[('.', 227756), ('the', 137092), (',', 118420), ('a', 59798), ('to', 55231)]

In [78]:
class BatchWrapper:
    """Convenience wrapper for dataloaders."""
    def __init__(self, dataloader, data="text", label="labels"):
        self.dataloader = dataloader
        self.data = data
        self.label = label
    
    def __iter__(self):
        for batch in self.dataloader:
            x = getattr(batch, self.data)
            
            if self.label is not None:
                y = getattr(batch, self.label)
            else:
                y = torch.zeros((1))

            yield (x, y)
    
    def __len__(self):
        return len(self.dataloader)

In [79]:
trainloader = BatchWrapper(train_iter)

In [80]:
next(trainloader.__iter__())

(tensor([[16104,  2407,   688,  ...,   434, 47015,  6255],
         [    8,   647,    59,  ...,     4,     2,   900],
         [ 4160,     6,    22,  ...,    77,   128,    36],
         ...,
         [    1,     1,     1,  ...,     1,     1,     1],
         [    1,     1,     1,  ...,     1,     1,     1],
         [    1,     1,     1,  ...,     1,     1,     1]]),
 tensor([5, 3, 8, 8, 3, 2, 3, 2, 0, 8, 8, 0, 3, 3, 3, 8, 3, 3, 5, 2, 3, 3, 3, 3,
         8, 8, 3, 0, 0, 3, 2, 2]))

In [81]:
len(trainloader)

424