References
- A Tutorial on Torchtext
    - [blog post](http://anie.me/On-Torchtext/)
    - [full code](https://github.com/mjc92/TorchTextTutorial/blob/master/01.%20Getting%20started.ipynb)
- [torchtext (GitHub)](https://github.com/pytorch/text)

Define a tokenizer

In [1]:
import spacy
# Load the spaCy English model once; its tokenizer is reused by spacy_tokenize below.
spacy_en = spacy.load('en')
def spacy_tokenize(text):
    """Split ``text`` into a list of token strings using the spaCy tokenizer.

    Args:
        text: the raw string to tokenize.

    Returns:
        A list with the ``.text`` of every token produced by
        ``spacy_en.tokenizer``, in order.
    """
    pieces = []
    for tok in spacy_en.tokenizer(text):
        pieces.append(tok.text)
    return pieces

# Sanity check on a short sentence; as the cell's last expression, the
# resulting token list is displayed as the cell output.
spacy_tokenize('I love ice creams.')

['I', 'love', 'ice', 'creams', '.']

## Defining a Field (a preprocessor class) and a Corpus

Define a Field

In [2]:
from torchtext import data, datasets

# data.Field: a preprocessor class. The options passed here are explained in
# the string at the end of this cell.
CORPUS = data.Field(
    sequential=True,
    tokenize=spacy_tokenize,   # tokenizer function defined earlier in the notebook
    lower=True,                # lowercase all words
    use_vocab=True,            # inputs go through a vocabulary (not raw integer indices)
    init_token='<sos>',        # SOS token
    eos_token='<eos>',         # EOS token
)

# Show every attribute of the Field, including those left at their defaults.
print(vars(CORPUS))

# NOTE: this bare string is the last expression of the cell, so Jupyter echoes
# it as the cell's output in addition to the printed dict above.
'''
use_vocab: if False, integer indices, instead of vocabulary,
is used as inputs
lower: decapitalize all words
init_token, eos_token: SOS, EOS tokens
fix_length: if specified, input lengths are fixed
'''

{'sequential': True, 'use_vocab': True, 'init_token': '<sos>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'fix_length': None, 'tensor_type': <class 'torch.LongTensor'>, 'preprocessing': None, 'postprocessing': None, 'lower': True, 'tokenize': <function spacy_tokenize at 0x7fb786a85598>, 'include_lengths': False, 'batch_first': False, 'pad_token': '<pad>', 'pad_first': False}


'\nuse_vocab: if False, integer indices, instead of vocabulary,\nis used as inputs\nlower: decapitalize all words\ninit_token, eos_token: SOS, EOS tokens\nfix_length: if specified, input lengths are fixed\n'

Define a corpus

In [3]:
# Read the corpus file as a TSV dataset. A single field named `src` is
# declared, and it is preprocessed by the CORPUS field defined above.
corpus = data.TabularDataset(path='../data/wiki.nlp.history.txt',
                             format='tsv',
                             fields=[('src', CORPUS)])

Corpus statistics

In [6]:
# Number of examples (sentences) in the corpus.
print('N(sents): %i' % len(corpus))

# Token list stored under the `src` field of the first example; looked up
# once and reused for both the listing and the token count.
first_sentence = vars(corpus[0])['src']
print('First sentence in the corpus: %s' % first_sentence)
print('N(tokens) in the first sentence: %i' % len(first_sentence))

N(sents): 15
First sentence in the corpus: ['the', 'history', 'of', 'nlp', 'generally', 'started', 'in', 'the', '1950s', ',', 'although', 'work', 'can', 'be', 'found', 'from', 'earlier', 'periods', '.', 'in', '1950', ',', 'alan', 'turing', 'published', 'an', 'article', 'titled', '"', 'computing', 'machinery', 'and', 'intelligence', '"', 'which', 'proposed', 'what', 'is', 'now', 'called', 'the', 'turing', 'test', 'as', 'a', 'criterion', 'of', 'intelligence', '.']
N(tokens) in the first sentence: 49


## Vocab object

In [11]:
# Build a Vocab (vocabulary) for the corpus.
#   max_size: the maximum number of words kept in the vocabulary
#   min_freq: the minimum frequency a word needs in order to be included
# These conditions only apply to CORPUS.vocab.itos and CORPUS.vocab.stoi;
# they are ignored by CORPUS.vocab.freqs.
CORPUS.build_vocab(corpus,
                   max_size=30,
                   min_freq=3)

# Vocabulary size.
# NOTE(review): the saved output shows 34 although max_size=30; presumably the
# 4 extra entries are the special tokens (<unk>, <pad>, <sos>, <eos>) - confirm.
print('Vocabulary size: %i' % len(CORPUS.vocab))
# Sizes of the itos and stoi mappings (identical to the vocabulary size).
print(len(CORPUS.vocab.itos), len(CORPUS.vocab.stoi))

# Frequency table for *all* of the words appearing in the corpus;
# the filtering conditions (max_size, min_freq) are ignored here.
print(len(CORPUS.vocab.freqs))
print('Top 50 frequent words: %s' % CORPUS.vocab.freqs.most_common(50))

# Unknown words are handled automatically: looking up a string that is not in
# the vocabulary and mapping the index back yields the unknown token.
print(CORPUS.vocab.itos[CORPUS.vocab.stoi['sdfUAWBHruhq23']])

Vocabulary size: 34
34 34
385
Top 50 frequent words: [(',', 60), ('the', 39), ('of', 33), ('.', 26), ('-', 21), ('a', 16), ('in', 15), ('and', 15), ('to', 13), ('"', 12), ('(', 12), (')', 12), ('systems', 10), ('for', 9), ('machine', 8), ('data', 8), ('which', 7), ('translation', 7), ('learning', 7), ('more', 6), ('that', 6), ('was', 6), ('research', 6), ('were', 6), ('language', 6), ('many', 6), ('models', 6), ('nlp', 5), ('as', 5), ('into', 5), (']', 5), ('however', 5), ('on', 5), ('has', 5), ('results', 5), ('is', 4), ('real', 4), ('when', 4), ('statistical', 4), ('developed', 4), ('with', 4), ('written', 4), ('are', 4), ('algorithms', 4), ('such', 4), ('input', 4), ('annotated', 4), ('generally', 3), ('be', 3), ('from', 3)]
<unk>


## Using an iterator

Train/Test split

# Using Pretrained Word Embeddings

## Steps
1. Load pretrained word embeddings
2. Build the vocabulary based on the pretrained embeddings

    - Downloads the vectors to *current_folder/.vector_cache* by default, if they are not already cached
        - A different cache directory can be chosen by passing `cache` explicitly.
        - Reference: [1](https://github.com/pytorch/text/blob/master/test/language_modeling.py), [2](https://github.com/pytorch/text/blob/master/torchtext/vocab.py)

In [8]:
from torchtext.vocab import FastText
import os
# Field created with all-default options; it is used as the text field of the
# language-modeling dataset and later receives the pretrained vectors.
FT = data.Field()
# Location of the raw text corpus (relative to the notebook's directory).
corpus_path = '../data/wiki.nlp.history.txt'
# Directory where downloaded word vectors are cached: ~/torchtext_data/.vector_cache.
# Each path component is passed separately so os.path.join inserts the
# platform's own separator instead of relying on a hard-coded '/'.
vector_cache_path = os.path.join(os.path.expanduser('~'),
                                 'torchtext_data',
                                 '.vector_cache')
# Wrap the corpus file as a language-modeling dataset whose text is processed by the FT field.
lm = datasets.LanguageModelingDataset(path = corpus_path, text_field = FT)

Build the vocabulary

In [9]:
# Load the English FastText vectors (using vector_cache_path as the cache
# directory) and build FT's vocabulary from `lm` with those vectors attached.
fasttext_vectors = FastText(language='en', cache=vector_cache_path)
FT.build_vocab(lm, vectors=fasttext_vectors)

# Embedding matrix (shown in the saved output as a torch.FloatTensor)
banner = '*' * 10
print(banner + 'Embeddings' + banner)
print(FT.vocab.vectors)

# The matrix is 2-D: first dimension is the vocabulary size, second the
# embedding dimension.
n, d = FT.vocab.vectors.size()
print('Vocabulary size: %i' % n)
# NOTE(review): 'Embeeding' is a typo in the label; it is kept unchanged so
# the saved cell output still matches what this cell prints.
print('Embeeding dimension: %i' % d)

**********Embeddings**********

 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
-0.0653 -0.0930 -0.0176  ...   0.1664 -0.1308  0.0354
          ...             ⋱             ...          
-0.0131  0.1322 -0.1948  ...  -0.1957  0.3368  0.2303
-0.1493  0.2157  0.1882  ...   0.3233  0.3822 -0.0868
-0.1382  0.2134  0.2215  ...   0.4661  0.2843 -0.1336
[torch.FloatTensor of size 418x300]

Vocabulary size: 418
Embeeding dimension: 300


Copy the pretrained word embeddings into a PyTorch `nn.Embedding` layer

In [10]:
import torch.nn as nn
# Create an embedding layer with the same shape as the pretrained matrix
# (n vocabulary entries x d dimensions).
pretrained = FT.vocab.vectors
n, d = pretrained.size()
emb = nn.Embedding(num_embeddings=n, embedding_dim=d)
# Overwrite the layer's initial weights in place with the pretrained vectors;
# copy_ is the cell's last expression, so the resulting tensor is displayed.
emb.weight.data.copy_(pretrained)


 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
-0.0653 -0.0930 -0.0176  ...   0.1664 -0.1308  0.0354
          ...             ⋱             ...          
-0.0131  0.1322 -0.1948  ...  -0.1957  0.3368  0.2303
-0.1493  0.2157  0.1882  ...   0.3233  0.3822 -0.0868
-0.1382  0.2134  0.2215  ...   0.4661  0.2843 -0.1336
[torch.FloatTensor of size 418x300]