In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# One seed shared by every random number generator the notebook touches,
# so a fresh "Restart & Run All" produces the same numbers.
SEED = 515
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Ask cuDNN to pick deterministic kernels (only matters when running on GPU).
torch.backends.cudnn.deterministic = True

# `TorchText` after version 0.7
Reference: https://github.com/pytorch/text/releases/tag/v0.7.0-rc3

## Build a Dataset Manually
### Define `Dataset`

In [2]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    """
    Map-style dataset over an in-memory sequence of ``(text, label)`` pairs.

    Items are transformed lazily: ``transforms[0]`` is applied to the raw
    text and ``transforms[1]`` to the raw label every time an item is read.

    Args:
        data: indexable, iterable collection of ``(text, label)`` pairs.
        vocab: vocabulary object, stored as-is and returned by ``get_vocab``.
        transforms: pair ``(text_transform, label_transform)`` of callables.

    Refs:
    https://github.com/pytorch/text/blob/master/torchtext/experimental/datasets/text_classification.py
    """
    def __init__(self, data, vocab, transforms):
        super().__init__()
        self.data = data
        self.vocab = vocab
        self.transforms = transforms  # (text_transform, label_transform)

    def __getitem__(self, i):
        """Return ``(transformed_text, transformed_label)`` for item ``i``."""
        text, label = self.data[i]
        return (self.transforms[0](text), self.transforms[1](label))

    def __len__(self):
        """Return the number of ``(text, label)`` pairs."""
        return len(self.data)

    def get_labels(self):
        """Return the set of distinct transformed labels found in the data."""
        label_transform = self.transforms[1]
        # Only the label is needed here; the text is deliberately not
        # transformed, so this never runs the (possibly slow) text pipeline.
        return {label_transform(label) for _, label in self.data}

    def get_vocab(self):
        """Return the vocabulary object passed at construction time."""
        return self.vocab

### Build Vocabulary

In [3]:
from collections import Counter
import spacy
nlp = spacy.load("en_core_web_sm")

def tokenize(text):
    """Split ``text`` into a list of token strings using the module-level spaCy pipeline ``nlp``."""
    doc = nlp(text)
    return [token.text for token in doc]

# Toy sentiment corpus: each entry is [text, label].
raw_data = [
    ["I like this film.", "pos"],
    ["I hate it.", "neg"],
    ["I have no feelings about it.", "neg"],
    ["It is my best.", "pos"],
    ["My father loves it so much and I do think so.", "pos"],
]

# Frequency tables that the vocabularies are built from further down.
token_counter = Counter()
label_counter = Counter()
for text, label in raw_data:
    token_counter.update(tokenize(text))
    label_counter[label] += 1

print(token_counter.most_common(5), label_counter.most_common(), sep="\n")

[('.', 5), ('I', 4), ('it', 3), ('so', 2), ('like', 1)]
[('pos', 3), ('neg', 2)]


In [4]:
from torchtext.vocab import Vocab, Vectors

# Load the small test vector file, then show its token->index map,
# its index->token list and the vector matrix itself.
vecs = Vectors('test-vecs.txt', cache='assets/vector_cache')
print(vecs.stoi, vecs.itos, vecs.vectors, sep="\n")

{'I': 0, 'love': 1, 'this': 2, 'film': 3, 'very': 4, 'much': 5, '.': 6}
['I', 'love', 'this', 'film', 'very', 'much', '.']
tensor([[-0.1316, -1.2163,  0.3154,  2.2605,  0.4316],
        [-0.4608, -0.9925, -0.2819, -1.6757,  0.3488],
        [ 0.9211, -0.0034, -1.7872, -0.5069, -0.2404],
        [ 0.2009, -2.6882,  0.1634,  0.8077,  0.0838],
        [-0.0382,  0.2052, -1.1867,  0.8228, -0.5860],
        [ 0.7365,  0.3347,  1.6088, -0.4995,  0.4200],
        [-1.1841,  0.9180,  1.0854, -0.3196, -1.1193]])


In [5]:
# Token vocabulary built from the token counts, with the loaded vectors attached.
vocab = Vocab(token_counter, vectors=vecs)
print(vocab.freqs.most_common(5), vocab.itos, sep="\n")

# Label vocabulary: specials=() so that itos holds only the label strings
# (compare the printed itos of the two vocabularies).
label_vocab = Vocab(label_counter, specials=())
print(label_vocab.itos)

[('.', 5), ('I', 4), ('it', 3), ('so', 2), ('like', 1)]
['<unk>', '<pad>', '.', 'I', 'it', 'so', 'It', 'My', 'about', 'and', 'best', 'do', 'father', 'feelings', 'film', 'hate', 'have', 'is', 'like', 'loves', 'much', 'my', 'no', 'think', 'this']
['pos', 'neg']


In [6]:
# Display the vector matrix attached to `vocab`. Judging by the output, rows
# follow the order of `vocab.itos` and tokens absent from `vecs` show up as
# all-zero rows -- confirm against the torchtext Vocab documentation.
vocab.vectors

tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-1.1841,  0.9180,  1.0854, -0.3196, -1.1193],
        [-0.1316, -1.2163,  0.3154,  2.2605,  0.4316],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.2009, -2.6882,  0.1634,  0.8077,  0.0838],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 

### Define `transforms`

In [7]:
from torchtext.experimental.functional import vocab_func, sequential_transforms

# Text pipeline: tokenize, then map the tokens through `vocab`.
# Label pipeline: look the label string up in `label_vocab.stoi`.
dataset = MyDataset(
    data=raw_data,
    vocab=vocab,
    transforms=(
        sequential_transforms(tokenize, vocab_func(vocab)),
        lambda x: label_vocab.stoi[x],
    ),
)

dataset[0]

([3, 18, 24, 14, 2], 0)

### Use `DataLoader`

In [8]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """
    Regroup a batch of ``(text, label)`` samples into two parallel lists.

    Returns ``(texts, labels)``; no padding or tensor conversion is done.
    """
    samples = list(batch)  # materialise once so one-shot iterables work too
    texts = [text for text, _ in samples]
    labels = [label for _, label in samples]
    return texts, labels

train_loader = DataLoader(dataset=dataset, collate_fn=collate_fn, batch_size=2)
for idx, (texts, labels) in enumerate(train_loader):
    break  # only the first batch is needed for inspection

print(texts, labels, sep="\n")

[[3, 18, 24, 14, 2], [3, 15, 4, 2]]
[0, 1]


### Padding

In [9]:
def pad(texts, padding_idx=0, length=None):
    """
    Right-pad every sequence in ``texts`` to a common length.

    Args:
        texts: sequence of lists (e.g. lists of token indices). It is
            traversed twice, so it must not be a one-shot iterator.
        padding_idx: value appended to sequences that are too short.
        length: minimum output length. Nothing is ever truncated: when the
            longest sequence exceeds ``length`` (or ``length`` is None),
            everything is padded to the longest sequence instead.

    Returns:
        A new list of new lists, all of equal length; the inputs are left
        untouched. An empty ``texts`` yields an empty list.
    """
    # default=0 keeps an empty batch from raising ValueError inside max().
    maxlen = max((len(text) for text in texts), default=0)
    target = maxlen if length is None else max(length, maxlen)

    return [text + [padding_idx] * (target - len(text)) for text in texts]

# Pad the batch fetched above to a width of at least 10, using the
# vocabulary's '<pad>' index as filler.
texts = pad(
    texts,
    length=10,
    padding_idx=vocab.stoi['<pad>'],
)
print(texts)

[[3, 18, 24, 14, 2, 1, 1, 1, 1, 1], [3, 15, 4, 2, 1, 1, 1, 1, 1, 1]]


In [10]:
# Once padded, the equal-length lists convert directly to tensors.
print(torch.tensor(texts), torch.tensor(labels), sep="\n")

tensor([[ 3, 18, 24, 14,  2,  1,  1,  1,  1,  1],
        [ 3, 15,  4,  2,  1,  1,  1,  1,  1,  1]])
tensor([0, 1])


### Batching, Padding, To-Tensor in One `collate_fn`

In [11]:
from torch.utils.data import DataLoader

def great_collate_fn(batch):
    """
    Collate ``(text, label)`` samples straight into a pair of tensors.

    Does the batching, the padding (via ``pad``, with the vocabulary's
    '<pad>' index and a minimum width of 10) and the tensor conversion
    in a single step. Returns ``(texts_tensor, labels_tensor)``.
    """
    samples = list(batch)
    token_ids = [text for text, _ in samples]
    targets = [label for _, label in samples]

    padded = pad(token_ids, length=10, padding_idx=vocab.stoi['<pad>'])
    return torch.tensor(padded), torch.tensor(targets)

train_loader = DataLoader(dataset=dataset, collate_fn=great_collate_fn, batch_size=2)
for idx, (texts, labels) in enumerate(train_loader):
    break  # only the first batch is needed for inspection

print(texts, labels, sep="\n")

tensor([[ 3, 18, 24, 14,  2,  1,  1,  1,  1,  1],
        [ 3, 15,  4,  2,  1,  1,  1,  1,  1,  1]])
tensor([0, 1])


## Load a Dataset from `TorchText`

In [12]:
from torchtext.experimental.datasets import IMDB
# Load the IMDB train/test splits through torchtext's experimental dataset API.
# NOTE(review): presumably `root` is the download/cache directory and `ngrams`
# controls n-gram features -- confirm against the torchtext 0.7 documentation.
train_data, test_data = IMDB(ngrams=3, root="assets/data")

25000lines [00:05, 4656.67lines/s]


In [13]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """
    Regroup a batch of ``(label, text)`` samples into two parallel lists.

    Note the item order: the label comes first in each sample here, while
    the returned pair is still ``(texts, labels)``. Defining this rebinds
    the name ``collate_fn`` used earlier in the notebook.
    """
    samples = list(batch)  # materialise once so one-shot iterables work too
    texts = [txt for _, txt in samples]
    labels = [label for label, _ in samples]
    return texts, labels

train_loader = DataLoader(dataset=train_data, collate_fn=collate_fn, batch_size=8)
for idx, (texts, labels) in enumerate(train_loader):
    break  # only the first batch is needed for inspection

In [14]:
# Container type and batch size, then the type and shape of the first three texts.
print(type(texts), len(texts))

for pos in range(3):
    print(type(texts[pos]), texts[pos].size())

<class 'list'> 8
<class 'torch.Tensor'> torch.Size([948])
<class 'torch.Tensor'> torch.Size([759])
<class 'torch.Tensor'> torch.Size([300])


In [15]:
# Container type and batch size, then the type and shape of the first three labels.
print(type(labels), len(labels))

for pos in range(3):
    print(type(labels[pos]), labels[pos].size())

<class 'list'> 8
<class 'torch.Tensor'> torch.Size([])
<class 'torch.Tensor'> torch.Size([])
<class 'torch.Tensor'> torch.Size([])
