In [1]:
import numpy as np
import torch
import torch.nn
import torchtext.data as data

from src import movie_reviews_dataset

In [2]:
# Field definitions: the label is a single non-sequential value, while the
# review text is lower-cased before being turned into tokens.
label_field = data.Field(sequential=False)
text_field = data.Field(lower=True)

print('loading dataset')
# Load the movie-review data from ./data, holding out a 0.2 fraction as the
# dev split.
train_data, dev_data = movie_reviews_dataset.MR.splits(
    text_field,
    label_field,
    root='./data',
    dev_ratio=0.2,
)

# Report the size of each split.
print('len(train)', len(train_data))
print('len(dev)', len(dev_data))

# Peek at the token list of the first training example.
print(train_data.examples[0].text)

loading dataset


len(train) 8530
len(dev) 2132
['about', 'as', 'original', 'as', 'a', 'gangster', 'sweating', 'bullets', 'while', 'worrying', 'about', 'a', 'contract', 'on', 'his', 'life', '']


Construct the vocabulary

In [3]:
# Build the label and token vocabularies from both splits.
# NOTE(review): because dev_data is included, tokens that occur only in the
# dev split still receive their own vocabulary index — confirm this is the
# intended evaluation setup.
label_field.build_vocab(train_data, dev_data)
text_field.build_vocab(train_data, dev_data)

# Vocabulary sizes (number of entries in each field's vocab).
print('text_field.vocab.size', len(text_field.vocab))
print('label_field.vocab.size', len(label_field.vocab))

text_field.vocab.size 21109
label_field.vocab.size 3


In [4]:
# Batch iterators: mini-batches of 4 examples for training, and the whole dev
# split delivered as one batch. `repeat=False` is passed so the iterators are
# not asked to cycle; `device=-1` is the legacy torchtext value for CPU
# (see the torchtext Iterator documentation).
train_iter, dev_iter = data.Iterator.splits(
    (train_data, dev_data),
    batch_sizes=(4, len(dev_data)),
    device=-1,
    repeat=False,
)

In [5]:
# Draw one mini-batch from the training iterator. A fresh batch is fetched on
# every run of this cell, so the in-place transpose below is applied exactly
# once per batch.
batch = next(iter(train_iter))

# Transpose in place so that each row holds one sample (batch, seq_len).
# Later cells read `batch.text` and rely on this layout.
batch.text.data.t_()

# Decode every row back to tokens. The row count is read from the tensor
# instead of repeating the batch size configured on the iterator, so this
# stays correct if the batch size changes or a batch comes back short.
for i in range(batch.text.data.size(0)):
    sample = batch.text.data[i, :]
    print(i, [text_field.vocab.itos[num] for num in sample])

0 ['this', 'rough', 'trade', 'punch and judy', 'act', "did n't", 'play', 'well', 'then', 'and', 'it', 'plays', 'worse', 'now', '', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
1 ['the', 'film', 'starts', 'promisingly', ',', 'but', 'the', 'ending', 'is', 'all', 'too', 'predictable', 'and', 'far', 'too', 'cliched', 'to', 'really', 'work', '', '<pad>', '<pad>']
2 ['a', 'compassionate', ',', 'moving', 'portrait', 'of', 'an', 'american', '\\(', 'and', 'an', 'america', '\\)', 'always', 'reaching', 'for', 'something', 'just', 'outside', 'his', 'grasp', '']
3 ['while', 'super', 'troopers', 'is', 'above', 'academy', 'standards', ',', 'its', 'quintet', 'of', 'writers', 'could', 'still', 'use', 'some', 'more', 'schooling', '', '<pad>', '<pad>', '<pad>']


In [6]:
# Randomly initialised embedding table: one 300-dimensional row per
# vocabulary entry.
embed = torch.nn.Embedding(
    num_embeddings=len(text_field.vocab),
    embedding_dim=300,
)
print('Embedding input dims: ', batch.text.data.size())

# Look up the embeddings for every token index in the batch.
embedded = embed(batch.text)
print('Embedding dims: ', embedded.size())

# Sanity check: a second lookup of the same indices must give identical values.
print(
    'Repeated embeddings of the same data should be the same: ',
    np.array_equal(
        embedded.data.numpy(),
        embed(batch.text).data.numpy(),
    ),
)

Embedding input dims:  torch.Size([4, 28])
Embedding dims:  torch.Size([4, 28, 300])
Repeated embeddings of the same data should be the same:  True


In [8]:
# Attach pre-trained 300-dimensional glove.6B vectors to the text vocabulary
# (cached under .data_cache).
text_field.vocab.load_vectors(wv_dir='.data_cache', wv_type='glove.6B', wv_dim=300)
print('Initialized GloVE embeddings with dim: ', text_field.vocab.vectors.size())

# Create an embedding layer of matching shape and copy the pre-trained vectors
# into its weight. Copying (rather than deleting the weight and rebinding it
# to a Parameter wrapping `vocab.vectors`) keeps the layer's storage separate
# from the vocabulary, so later weight updates cannot overwrite the vocab's
# vectors; `copy_` also fails loudly if the shapes do not match.
embed_glove = torch.nn.Embedding(num_embeddings=len(text_field.vocab), embedding_dim=300)
embed_glove.weight.data.copy_(text_field.vocab.vectors)

# Embed the same batch with the GloVe-initialised layer.
embedded_glove = embed_glove(batch.text)
print('Embedding w/ GloVE dims: ', embedded_glove.size())

# Sanity check: repeated lookups are identical.
print('Repeated embeddings of the same data should be the same: ', 
      np.array_equal(embedded_glove.data.numpy(), embed_glove(batch.text).data.numpy()))

# Sanity check: the GloVe-initialised output differs from the randomly
# initialised `embed` output computed in the earlier cell.
print('Embedded tensors should be different between GloVE and un-initialized embeddings: ', 
      not np.array_equal(embedded_glove.data.numpy(), embedded.data.numpy()))

# Sanity check: the layer's output for the first token of the first sample
# equals the vocabulary's vector at that token's index.
print('Returned embedded value is the same between input embedding tensor and output of the Embedder', 
      np.array_equal(embedded_glove.data[0, 0].numpy(), text_field.vocab.vectors[batch.text.data[0, 0]].numpy()))

loading word vectors from .data_cache/glove.6B.300d.pt


Initialized GloVE embeddings with dim:  torch.Size([21109, 300])


Embedding w/ GloVE dims:  torch.Size([4, 28, 300])
Repeated embeddings of the same data should be the same:  True
Embedded tensors should be different between GloVE and un-initialized embeddings:  True
Returned embedded value is the same between input embedding tensor and output of the Embedder True
