In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed every random number generator used in this notebook with the same
# value so that reruns draw the same random numbers.
SEED = 515
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Ask cuDNN to use deterministic kernels.
torch.backends.cudnn.deterministic = True

# `TorchText` after version 0.7
Reference: https://github.com/pytorch/text/releases/tag/v0.7.0-rc3

## Build a Dataset Manually
### Define `Dataset`

In [2]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    """
    Map-style dataset over in-memory ``(text, label)`` pairs.

    Pairs are transformed lazily on access: ``transforms[0]`` is applied to
    the text and ``transforms[1]`` to the label.

    Args:
        data: indexable, sized collection of ``(text, label)`` pairs.
        vocab: vocabulary object; it is only stored and handed back by
            ``get_vocab``.
        transforms: pair ``(text_transform, label_transform)`` of callables.

    Refs:
    https://github.com/pytorch/text/blob/master/torchtext/experimental/datasets/text_classification.py
    """
    def __init__(self, data, vocab, transforms):
        super().__init__()
        self.data = data
        self.vocab = vocab
        self.transforms = transforms  # (text_transform, label_transform)

    def __getitem__(self, i):
        """Return the transformed ``(text, label)`` pair stored at index ``i``."""
        text, label = self.data[i]
        return (self.transforms[0](text), self.transforms[1](label))

    def __len__(self):
        """Return the number of raw ``(text, label)`` pairs."""
        return len(self.data)

    def get_labels(self):
        """Return the set of distinct transformed labels found in the data."""
        label_transform = self.transforms[1]
        return {label_transform(label) for _, label in self.data}

    def get_vocab(self):
        """Return the vocabulary object passed to the constructor."""
        return self.vocab

### Build Vocabulary

In [3]:
from collections import Counter, OrderedDict
import spacy
nlp = spacy.load("en_core_web_sm")

def tokenize(text):
    """Run the spaCy pipeline ``nlp`` over ``text`` and return the token strings."""
    doc = nlp(text)
    return [token.text for token in doc]

# Toy sentiment corpus: every entry is a [text, label] pair.
raw_data = [
    ["I like this film.", "pos"],
    ["I hate it.", "neg"],
    ["I have no feelings about it.", "neg"],
    ["It is my best.", "pos"],
    ["My father loves it so much and I do think so.", "pos"],
]

# Count tokens and labels in corpus order, so ties in `most_common` keep
# first-seen order.
token_counter = Counter(tok for sentence, _ in raw_data for tok in tokenize(sentence))
label_counter = Counter(tag for _, tag in raw_data)

print(token_counter.most_common(5))
print(label_counter.most_common())

[('.', 5), ('I', 4), ('it', 3), ('so', 2), ('like', 1)]
[('pos', 3), ('neg', 2)]


In [4]:
from torchtext.experimental.vocab import Vocab

# `Counter.most_common` returns a list of (token, count) tuples sorted by frequency.
# The special tokens are prepended so that they come first in the ordered
# dict; in the output below they end up at indices 0 and 1.
# NOTE(review): the count 100 looks like an arbitrary placeholder -- confirm
# whether the experimental `Vocab` uses the counts for anything but ordering.
vocab = Vocab(OrderedDict([('<unk>', 100), ('<pad>', 100)] + token_counter.most_common()))

# Look up a token from the corpus and one that never occurs in it.
print(vocab['film'])
print(vocab['a_unk_token'])

# Index-to-string list and string-to-index mapping.
print(vocab.get_itos())
print(vocab.get_stoi())

8
0
['<unk>', '<pad>', '.', 'I', 'it', 'so', 'like', 'this', 'film', 'hate', 'have', 'no', 'feelings', 'about', 'It', 'is', 'my', 'best', 'My', 'father', 'loves', 'much', 'and', 'do', 'think']
{'<unk>': 0, '<pad>': 1, '.': 2, 'I': 3, 'it': 4, 'so': 5, 'like': 6, 'this': 7, 'film': 8, 'hate': 9, 'have': 10, 'no': 11, 'feelings': 12, 'about': 13, 'It': 14, 'is': 15, 'my': 16, 'best': 17, 'My': 18, 'father': 19, 'loves': 20, 'much': 21, 'and': 22, 'do': 23, 'think': 24}


In [5]:
from torchtext.experimental.vectors import Vectors

# Build a small lookup table: one random 5-dimensional vector per token of
# the sentence below.
text = "I love this film very much."
tokens = tokenize(text)
vec_values = torch.randn(len(tokens), 5)
vecs = Vectors(tokens, vec_values)

print(len(vecs))
# A token taken from the sentence returns its stored row ...
print(vecs["film"])
# ... while a token that is not in the table comes back as zeros (see output).
print(vecs["a_unk_token"])

7
tensor([ 2.2844,  1.1930,  0.0323, -0.5821,  0.7174])
tensor([0., 0., 0., 0., 0.])


In [6]:
# Fetch one vector per vocabulary entry, in `get_itos()` order; entries that
# are missing from `vecs` appear as all-zero rows in the output below.
vecs.lookup_vectors(vocab.get_itos())

tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-0.0578, -0.9896, -0.9021,  0.0657, -1.3600],
        [-0.3902,  1.4256,  0.0491, -0.0559, -1.0172],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0143, -0.3039, -0.7228,  0.4080,  0.1864],
        [ 2.2844,  1.1930,  0.0323, -0.5821,  0.7174],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 

In [7]:
# Spot-check two vocabulary indices: print each token, then the vector that
# `vecs` holds for it.
for idx in (2, 4):
    print(vocab.get_itos()[idx])
    print(vecs[vocab.get_itos()[idx]])

.
tensor([-0.0578, -0.9896, -0.9021,  0.0657, -1.3600])
it
tensor([0., 0., 0., 0., 0.])


### Define `transforms`

In [8]:
# Index 0 is taken by the '<pad>' label; the real labels follow in the order
# in which they were first counted.
idx2label = ['<pad>', *label_counter]

# Invert the list into a label -> index mapping.
label2idx = dict(zip(idx2label, range(len(idx2label))))
label2idx

{'<pad>': 0, 'pos': 1, 'neg': 2}

In [9]:
from torchtext.experimental.functional import vocab_func, sequential_transforms

# Text pipeline: tokenize, then pass the tokens through `vocab_func(vocab)`.
# Label pipeline: look the label string up in `label2idx`.
dataset = MyDataset(
    data=raw_data,
    vocab=vocab,
    transforms=(
        sequential_transforms(tokenize, vocab_func(vocab)),
        lambda label: label2idx[label],
    ),
)

dataset[0]

([3, 6, 7, 8, 2], 1)

### Use `DataLoader`

In [10]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """
    Split a batch of ``(text, label)`` samples into two parallel lists.

    No padding or tensor conversion happens here; the result is the tuple
    ``(texts, labels)``.
    """
    samples = list(batch)
    texts = [text for text, _ in samples]
    labels = [label for _, label in samples]
    return texts, labels

train_loader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)
# Only the first batch is needed for the demonstration.
idx, (texts, labels) = next(enumerate(train_loader))

print(texts)
print(labels)

[[3, 6, 7, 8, 2], [3, 9, 4, 2]]
[1, 2]


### Padding

In [11]:
def pad(texts, padding_idx=0, length=None):
    """
    Right-pad every sequence in ``texts`` to a common length.

    Args:
        texts: iterable of lists (e.g. lists of token indices).
        padding_idx: value appended to sequences that are too short.
        length: minimum target length. Sequences are never truncated, so the
            final length is ``max(length, longest sequence)``. ``None`` means
            "pad to the longest sequence".

    Returns:
        A new list of new lists, all of the same length. An empty ``texts``
        yields an empty list.
    """
    # Materialize first: `texts` is traversed twice, which would exhaust a
    # one-shot iterator after the first pass.
    texts = list(texts)
    # `default=0` keeps an empty batch from raising ValueError in `max`.
    maxlen = max((len(text) for text in texts), default=0)
    target = maxlen if length is None else max(length, maxlen)

    return [text + [padding_idx] * (target - len(text)) for text in texts]

# Pad the batch from the previous cell with the vocabulary's '<pad>' index.
# `length=10` acts as a minimum width: `pad` takes the larger of `length` and
# the longest sequence, so nothing is truncated.
texts = pad(texts, padding_idx=vocab['<pad>'], length=10)
print(texts)

[[3, 6, 7, 8, 2, 1, 1, 1, 1, 1], [3, 9, 4, 2, 1, 1, 1, 1, 1, 1]]


In [12]:
# With every row padded to the same length, the nested lists convert
# directly to tensors.
print(torch.tensor(texts))
print(torch.tensor(labels))

tensor([[3, 6, 7, 8, 2, 1, 1, 1, 1, 1],
        [3, 9, 4, 2, 1, 1, 1, 1, 1, 1]])
tensor([1, 2])


### Batching, Padding, To-Tensor in One `collate_fn`

In [13]:
from torch.utils.data import DataLoader

def great_collate_fn(batch):
    """
    Collate ``(text, label)`` samples into a pair of tensors.

    Does batching, padding and tensor conversion in one step: the texts are
    padded with the vocabulary's '<pad>' index to at least 10 positions, then
    texts and labels are each turned into a tensor.
    """
    samples = list(batch)
    labels = [label for _, label in samples]

    # It may be better to register the collate function as a method of Dataset
    padded = pad([text for text, _ in samples], padding_idx=vocab['<pad>'], length=10)
    return torch.tensor(padded), torch.tensor(labels)

train_loader = DataLoader(dataset, batch_size=2, collate_fn=great_collate_fn)
# Only the first batch is needed for the demonstration.
idx, (texts, labels) = next(enumerate(train_loader))

print(texts)
print(labels)

tensor([[3, 6, 7, 8, 2, 1, 1, 1, 1, 1],
        [3, 9, 4, 2, 1, 1, 1, 1, 1, 1]])
tensor([1, 2])


## Load a Dataset from `TorchText`

In [14]:
from torchtext.experimental.datasets import IMDB
# Load the IMDB dataset from torchtext's experimental datasets; the call
# returns two objects, bound here as the train and test data.
# NOTE(review): `ngrams=3` presumably adds n-gram features up to trigrams and
# `root` is presumably the download/cache directory -- confirm against the
# torchtext documentation.
train_data, test_data = IMDB(ngrams=3, root="assets/data")

25000lines [00:05, 4643.70lines/s]


In [15]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """
    Split a batch of ``(label, text)`` samples into ``(texts, labels)`` lists.

    Samples here come label first, the reverse of the ``collate_fn`` defined
    earlier in this notebook, which this definition shadows.
    """
    samples = list(batch)
    texts = [txt for _, txt in samples]
    labels = [label for label, _ in samples]
    return texts, labels

train_loader = DataLoader(train_data, batch_size=8, collate_fn=collate_fn)
# Only the first batch is needed for the demonstration.
idx, (texts, labels) = next(enumerate(train_loader))

In [16]:
# The batch itself is a plain list ...
print(type(texts), len(texts))

# ... so show the type and size of its first three elements.
for pos in range(3):
    print(type(texts[pos]), texts[pos].size())

<class 'list'> 8
<class 'torch.Tensor'> torch.Size([948])
<class 'torch.Tensor'> torch.Size([759])
<class 'torch.Tensor'> torch.Size([300])


In [17]:
# Pad the variable-length tensors with the dataset vocabulary's '<pad>' index
# and stack them into one tensor; `batch_first=True` puts the batch dimension
# first.
nn.utils.rnn.pad_sequence(texts, batch_first=True, padding_value=train_data.vocab['<pad>'])

tensor([[  13, 3857,   13,  ...,    1,    1,    1],
        [  13,  401, 5144,  ...,    1,    1,    1],
        [  63,   89,    8,  ...,    1,    1,    1],
        ...,
        [  13,   85,  471,  ...,    1,    1,    1],
        [6688, 2336,    2,  ...,    1,    1,    1],
        [  75,   13,  121,  ...,    1,    1,    1]])

In [18]:
# The labels also arrive as a plain list ...
print(type(labels), len(labels))

# ... so show the type and size of its first three elements.
for pos in range(3):
    print(type(labels[pos]), labels[pos].size())

<class 'list'> 8
<class 'torch.Tensor'> torch.Size([])
<class 'torch.Tensor'> torch.Size([])
<class 'torch.Tensor'> torch.Size([])
