In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

SEED = 515

# Seed every random number generator the notebook relies on
# (Python's `random`, NumPy and PyTorch) with the same value.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# Request deterministic cuDNN behaviour so GPU runs are repeatable.
torch.backends.cudnn.deterministic = True

# `TorchText` before version 0.7
Reference: https://github.com/pytorch/text/releases/tag/v0.7.0-rc3

## Build a Dataset Manually
Among the main concepts of `TorchText`, `Field` is the one that defines how data should be processed.  
The `Field` class couples tokenization, vocabularies, splitting, batching and sampling, padding, and numericalization all together. 

In [2]:
from torchtext.data import Field, LabelField, Example, Dataset, BucketIterator

# Free text, tokenized with spaCy's `en_core_web_sm` pipeline.
TEXT = Field(
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
)
# Gender is a single value per example, not a token sequence.
GENDER = Field(sequential=False)
# Target label, requested with a float dtype.
LABEL = LabelField(dtype=torch.float)

In [3]:
# Build an `Example` from a dict.
# `fields` maps each raw key to an (attribute name, Field) pair.
fields = dict(T=('text', TEXT),
              G=('gender', GENDER),
              L=('label', LABEL))
raw_ex = dict(T="I like this film.",
              G="f",
              L="pos")

ex = Example.fromdict(raw_ex, fields)
# One value per line: the example itself, then each of its attributes.
print(ex, ex.text, ex.gender, ex.label, sep="\n")

<torchtext.data.example.Example object at 0x00000160AC51ECA0>
['I', 'like', 'this', 'film', '.']
f
pos


In [4]:
# Build an `Example` from a list: values are given in the same order as `fields`.
fields = [
    ('text', TEXT),
    ('gender', GENDER),
    ('label', LABEL),
]
raw_ex = ["I like this film.", "f", "pos"]

ex = Example.fromlist(raw_ex, fields)
# One value per line: the example itself, then each of its attributes.
print(ex, ex.text, ex.gender, ex.label, sep="\n")

<torchtext.data.example.Example object at 0x00000160AB65CB80>
['I', 'like', 'this', 'film', '.']
f
pos


In [5]:
# Create a Dataset from in-memory rows of (text, gender, label).
fields = [
    ('text', TEXT),
    ('gender', GENDER),
    ('label', LABEL),
]
raw_data = [
    ["I like this film.", "f", "pos"],
    ["I hate it.", "f", "neg"],
    ["I have no feelings about it.", "m", "neg"],
    ["It is my best.", "m", "pos"],
    ["My father loves it so much and I do think so.", "f", "pos"],
]

# One `Example` per raw row, all sharing the same field layout.
examples = list(map(lambda row: Example.fromlist(row, fields), raw_data))
data = Dataset(examples, fields)

# Show the dataset, then its second example and that example's attributes.
print(data, data[1], data[1].text, data[1].gender, data[1].label, sep="\n")

<torchtext.data.dataset.Dataset object at 0x00000160AC51EE80>
<torchtext.data.example.Example object at 0x00000160AC51EBB0>
['I', 'hate', 'it', '.']
f
neg


In [6]:
# Build one vocabulary per field from the toy dataset.
for _field in (TEXT, GENDER, LABEL):
    _field.build_vocab(data)

# Vocabulary sizes, in (text, gender, label) order.
tuple(len(f.vocab) for f in (TEXT, GENDER, LABEL))

(25, 3, 2)

In [7]:
BATCH_SIZE = 2

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
iterator = BucketIterator(
    data,
    batch_size=BATCH_SIZE,
    device=device,
    shuffle=True,
)
# NOTE(review): no `sort_key` is passed here and none was given to `Dataset`,
# so the batches are presumably not grouped by length - confirm whether
# bucketing is intended for this demo.

# Print the text tensor of every batch.
for batch in iterator:
    print(batch.text)

tensor([[ 3,  3],
        [15, 16],
        [ 4, 22],
        [ 2, 13],
        [ 1,  8],
        [ 1,  4],
        [ 1,  2]], device='cuda:0')
tensor([[ 7,  3],
        [12, 18],
        [19, 24],
        [ 4, 14],
        [ 5,  2],
        [20,  1],
        [ 9,  1],
        [ 3,  1],
        [11,  1],
        [23,  1],
        [ 5,  1],
        [ 2,  1]], device='cuda:0')
tensor([[ 6],
        [17],
        [21],
        [10],
        [ 2]], device='cuda:0')


## Load Pre-Trained Embeddings Manually

In [8]:
import os

from torchtext.vocab import Vectors, Vocab

# `open(..., 'w')` creates the file but not its parent directories, so make
# sure the cache directory exists before writing (e.g. on a fresh checkout).
os.makedirs('assets/vector_cache', exist_ok=True)

# Write a tiny word-vector file: one line per token, the token followed by
# five space-separated random floats.
text = "I love this film very much ."
with open('assets/vector_cache/test-vecs.txt', 'w', encoding='utf-8') as f:
    for token in text.split():
        vec = ["%.6f" % vi for vi in np.random.randn(5)]
        f.write(" ".join([token] + vec))
        f.write("\n")

# Load the file back as pre-trained vectors and inspect the token<->index
# mappings and the vector matrix.
# NOTE(review): `Vectors` may keep a serialized copy under `cache`; when this
# cell is re-run without restarting the kernel, confirm the freshly written
# values (not a stale cached copy) are what gets loaded.
vecs = Vectors('test-vecs.txt', cache='assets/vector_cache')
print(vecs.stoi)
print(vecs.itos)
print(vecs.vectors)

{'I': 0, 'love': 1, 'this': 2, 'film': 3, 'very': 4, 'much': 5, '.': 6}
['I', 'love', 'this', 'film', 'very', 'much', '.']
tensor([[-5.2200e-01, -1.0764e+00,  8.8916e-01,  2.6261e-01,  7.3789e-01],
        [-4.6000e-05,  1.6979e+00, -2.0411e+00, -2.0144e-01,  9.1392e-01],
        [ 1.3107e+00, -7.9961e-01, -1.4983e+00,  1.4374e+00,  2.1713e-01],
        [ 6.4269e-01, -3.0231e+00,  1.1120e+00, -1.1176e-01,  7.4710e-01],
        [-6.1633e-01, -1.6504e-01, -1.5072e-01,  8.6449e-01, -1.7347e+00],
        [ 1.7514e+00, -9.4005e-01,  1.7255e+00,  6.8504e-01, -1.1134e-01],
        [ 1.1694e+00, -2.1626e+00, -8.9035e-01,  1.2425e+00, -1.4998e-01]])


In [9]:
# With `include_lengths=True` the text field returns the text lengths too.
TEXT = Field(
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    include_lengths=True,
)
GENDER = Field(sequential=False)
LABEL = LabelField(dtype=torch.float)

# Create a Dataset bound to the freshly defined fields.
fields = [
    ('text', TEXT),
    ('gender', GENDER),
    ('label', LABEL),
]
raw_data = [
    ["I like this film.", "f", "pos"],
    ["I hate it.", "f", "neg"],
    ["I have no feelings about it.", "m", "neg"],
    ["It is my best.", "m", "pos"],
    ["My father loves it so much and I do think so.", "f", "pos"],
]

# One `Example` per raw row, all sharing the same field layout.
examples = list(map(lambda row: Example.fromlist(row, fields), raw_data))
data = Dataset(examples, fields)
data

<torchtext.data.dataset.Dataset at 0x160ac51edf0>

In [10]:
# Build one vocabulary per field; no pre-trained vectors are requested here.
for _field in (TEXT, GENDER, LABEL):
    _field.build_vocab(data)

# Without a `vectors` argument the text vocabulary carries no vector matrix.
print(TEXT.vocab.vectors)

None


In [11]:
# Rebuild the text vocabulary, this time attaching the vectors in `vecs`.
# Tokens missing from `vecs` are initialized as zeros, or by `unk_init` if provided.
TEXT.build_vocab(data, vectors=vecs)
print(TEXT.vocab.vectors)

tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 1.1694, -2.1626, -0.8903,  1.2425, -0.1500],
        [-0.5220, -1.0764,  0.8892,  0.2626,  0.7379],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.6427, -3.0231,  1.1120, -0.1118,  0.7471],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 

In [12]:
BATCH_SIZE = 2

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
iterator = BucketIterator(
    data,
    batch_size=BATCH_SIZE,
    device=device,
    shuffle=True,
)

# Because the text field was created with `include_lengths=True`,
# `batch.text` is now a tuple whose second element holds the text lengths.
for batch in iterator:
    print(batch.text)

(tensor([[ 3,  3],
        [15, 16],
        [ 4, 22],
        [ 2, 13],
        [ 1,  8],
        [ 1,  4],
        [ 1,  2]], device='cuda:0'), tensor([4, 7], device='cuda:0'))
(tensor([[ 7,  3],
        [12, 18],
        [19, 24],
        [ 4, 14],
        [ 5,  2],
        [20,  1],
        [ 9,  1],
        [ 3,  1],
        [11,  1],
        [23,  1],
        [ 5,  1],
        [ 2,  1]], device='cuda:0'), tensor([12,  5], device='cuda:0'))
(tensor([[ 6],
        [17],
        [21],
        [10],
        [ 2]], device='cuda:0'), tensor([5], device='cuda:0'))


## Load a Dataset from `TorchText`

In [13]:
import torchtext
from torchtext.data import Field, LabelField, BucketIterator

# `batch_first=True` is set on the text `Field`; lengths are returned as well.
TEXT = Field(
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    include_lengths=True,
    batch_first=True,
)
LABEL = LabelField()

# Load the TREC train/test splits (coarse labels, `fine_grained=False`),
# then split the training portion again to obtain a validation set.
train_data, test_data = torchtext.datasets.TREC.splits(
    TEXT,
    LABEL,
    fine_grained=False,
    root='assets/data',
)
train_data, valid_data = train_data.split()

In [14]:
MAX_VOCAB_SIZE = 25000

# Build the text vocabulary from the training split only, capped at
# `MAX_VOCAB_SIZE` entries, and attach the "glove.6B.100d" vectors.
# `unk_init` provides the initial vectors for tokens missing from them.
TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",
    vectors_cache="assets/vector_cache",
    unk_init=torch.Tensor.normal_,
)

# The label vocabulary also comes from the training split; show its mapping.
LABEL.build_vocab(train_data)
print(LABEL.vocab.stoi)

defaultdict(None, {'ENTY': 0, 'HUM': 1, 'DESC': 2, 'NUM': 3, 'LOC': 4, 'ABBR': 5})


In [15]:
# `Field.preprocess` was already called when the dataset was built;
# it includes any `Field.preprocessing` passed by the user.

# `Field.process` turns a batch of examples into a torch.Tensor: it pads,
# numericalizes, and postprocesses the batch (including any
# `Field.postprocessing` passed by the user).
TEXT.process([train_data[i].text for i in range(3)], device=device)

(tensor([[   4,   24,    3,  297,  552,  106,   18,    7,  286,   14,  552,    2],
         [  26,    5, 3375,  677,    2,    1,    1,    1,    1,    1,    1,    1],
         [  10,   22,   61,  443,   41,  156,   18,  107,   36,   13,  357,    2]],
        device='cuda:0'),
 tensor([12,  5, 12], device='cuda:0'))

In [16]:
ex = train_data[0]
print(ex.text)

# The text field was built with `include_lengths=True`, so `numericalize`
# is given a (batch, lengths) pair and returns a pair as well.
# If `include_lengths=False`, it should be:
# `TEXT.numericalize([ex.text], device=device)`
text, text_lens = TEXT.numericalize(
    ([ex.text], [len(ex.text)]),
    device=device,
)
print(text, text_lens, sep="\n")

['What', 'do', 'the', 'letters', 'D.C.', 'stand', 'for', 'in', 'Washington', ',', 'D.C.', '?']
tensor([[  4,  24,   3, 297, 552, 106,  18,   7, 286,  14, 552,   2]],
       device='cuda:0')
tensor([12], device='cuda:0')


In [17]:
BATCH_SIZE = 128

# One iterator per split, returned in the same order as the datasets given.
# `sort_within_batch=True` is requested for every iterator.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=device,
)

## `BucketIterator.splits`
By default, the iterator over the first dataset (`train_iterator`) is shuffled, while the iterators over the other datasets (`valid_iterator` and `test_iterator`) are not shuffled. 

In [18]:
# Peek at the first ten training batches: the longest text length, then the
# lengths of the first and the last example in the batch.
for i, batch in zip(range(10), train_iterator):
    text, text_lens = batch.text
    print(text_lens.max(), text_lens[0], text_lens[-1])

tensor(11, device='cuda:0') tensor(11, device='cuda:0') tensor(11, device='cuda:0')
tensor(16, device='cuda:0') tensor(16, device='cuda:0') tensor(15, device='cuda:0')
tensor(6, device='cuda:0') tensor(6, device='cuda:0') tensor(5, device='cuda:0')
tensor(9, device='cuda:0') tensor(9, device='cuda:0') tensor(9, device='cuda:0')
tensor(20, device='cuda:0') tensor(20, device='cuda:0') tensor(17, device='cuda:0')
tensor(11, device='cuda:0') tensor(11, device='cuda:0') tensor(11, device='cuda:0')
tensor(10, device='cuda:0') tensor(10, device='cuda:0') tensor(10, device='cuda:0')
tensor(15, device='cuda:0') tensor(15, device='cuda:0') tensor(14, device='cuda:0')
tensor(10, device='cuda:0') tensor(10, device='cuda:0') tensor(10, device='cuda:0')
tensor(14, device='cuda:0') tensor(14, device='cuda:0') tensor(13, device='cuda:0')


In [19]:
# Iterate the training iterator a second time to compare its batch order with
# the previous pass: longest text length, then first and last example lengths.
for i, batch in zip(range(10), train_iterator):
    text, text_lens = batch.text
    print(text_lens.max(), text_lens[0], text_lens[-1])

tensor(14, device='cuda:0') tensor(14, device='cuda:0') tensor(13, device='cuda:0')
tensor(7, device='cuda:0') tensor(7, device='cuda:0') tensor(6, device='cuda:0')
tensor(7, device='cuda:0') tensor(7, device='cuda:0') tensor(7, device='cuda:0')
tensor(11, device='cuda:0') tensor(11, device='cuda:0') tensor(10, device='cuda:0')
tensor(11, device='cuda:0') tensor(11, device='cuda:0') tensor(11, device='cuda:0')
tensor(6, device='cuda:0') tensor(6, device='cuda:0') tensor(5, device='cuda:0')
tensor(16, device='cuda:0') tensor(16, device='cuda:0') tensor(15, device='cuda:0')
tensor(9, device='cuda:0') tensor(9, device='cuda:0') tensor(9, device='cuda:0')
tensor(10, device='cuda:0') tensor(10, device='cuda:0') tensor(9, device='cuda:0')
tensor(12, device='cuda:0') tensor(12, device='cuda:0') tensor(12, device='cuda:0')


In [20]:
# Peek at the first ten validation batches: the longest text length, then the
# lengths of the first and the last example in the batch.
for i, batch in zip(range(10), valid_iterator):
    text, text_lens = batch.text
    print(text_lens.max(), text_lens[0], text_lens[-1])

tensor(5, device='cuda:0') tensor(5, device='cuda:0') tensor(3, device='cuda:0')
tensor(7, device='cuda:0') tensor(7, device='cuda:0') tensor(6, device='cuda:0')
tensor(7, device='cuda:0') tensor(7, device='cuda:0') tensor(7, device='cuda:0')
tensor(8, device='cuda:0') tensor(8, device='cuda:0') tensor(7, device='cuda:0')
tensor(9, device='cuda:0') tensor(9, device='cuda:0') tensor(8, device='cuda:0')
tensor(9, device='cuda:0') tensor(9, device='cuda:0') tensor(9, device='cuda:0')
tensor(10, device='cuda:0') tensor(10, device='cuda:0') tensor(9, device='cuda:0')
tensor(11, device='cuda:0') tensor(11, device='cuda:0') tensor(10, device='cuda:0')
tensor(12, device='cuda:0') tensor(12, device='cuda:0') tensor(11, device='cuda:0')
tensor(13, device='cuda:0') tensor(13, device='cuda:0') tensor(12, device='cuda:0')


In [21]:
# Iterate the validation iterator a second time to compare its batch order with
# the previous pass: longest text length, then first and last example lengths.
for i, batch in zip(range(10), valid_iterator):
    text, text_lens = batch.text
    print(text_lens.max(), text_lens[0], text_lens[-1])

tensor(5, device='cuda:0') tensor(5, device='cuda:0') tensor(3, device='cuda:0')
tensor(7, device='cuda:0') tensor(7, device='cuda:0') tensor(6, device='cuda:0')
tensor(7, device='cuda:0') tensor(7, device='cuda:0') tensor(7, device='cuda:0')
tensor(8, device='cuda:0') tensor(8, device='cuda:0') tensor(7, device='cuda:0')
tensor(9, device='cuda:0') tensor(9, device='cuda:0') tensor(8, device='cuda:0')
tensor(9, device='cuda:0') tensor(9, device='cuda:0') tensor(9, device='cuda:0')
tensor(10, device='cuda:0') tensor(10, device='cuda:0') tensor(9, device='cuda:0')
tensor(11, device='cuda:0') tensor(11, device='cuda:0') tensor(10, device='cuda:0')
tensor(12, device='cuda:0') tensor(12, device='cuda:0') tensor(11, device='cuda:0')
tensor(13, device='cuda:0') tensor(13, device='cuda:0') tensor(12, device='cuda:0')
