In [1]:
import numpy as np

from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors, GloVe
import spacy
from spacy.symbols import ORTH

my_tok = spacy.load('en')


import torch

from sklearn.manifold import TSNE

# we'll use the bokeh library to create beautiful plots
# *_notebook functions are needed for correct use in jupyter
from bokeh.plotting import figure, ColumnDataSource
from bokeh.models import HoverTool
from bokeh.io import output_notebook, show, push_notebook
# output_notebook()

In [2]:
# ---- Data processing ----
BATCH_SIZE = 32

# improve using 'spacy'
def tokenize(x):
    return [tok.text for tok in my_tok.tokenizer(x)]

# Define preprocessing pipeline
TEXT = data.Field(lower=True, tokenize=tokenize, sequential=True)

# create splits
train, valid, test = datasets.PennTreebank.splits(TEXT) # loading custom datasets requires passing in the field, but nothing else.

# this takes a long long time (when you need to downloand the vectors (1st time)), but without a vocab I get errors
TEXT.build_vocab(train, max_size=None, vectors=[GloVe(name='6B', dim='300')])

# Create iterators of batch_size = 32
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test),
    batch_size=BATCH_SIZE,
    bptt_len=30, # this is where we specify the sequence length
    device=torch.device('cpu'),
    repeat=False)


In [5]:
# size of embeddings
embedding_dim = TEXT.vocab.vectors.size()[1]
num_embeddings = TEXT.vocab.vectors.size()[0]

print('Embedding dim: {}'.format(embedding_dim))
print('Num Embeddings: {}'.format(num_embeddings))

# copied from http://mlexplained.com/2018/02/15/language-modeling-tutorial-in-torchtext-practical-torchtext-part-2/

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable as V
 
class RNNModel(nn.Module):
    def __init__(self, ntoken, ninp,
                 nhid, nlayers, bsz,
                 dropout=0.5, tie_weights=True):
        super(RNNModel, self).__init__()
        self.nhid, self.nlayers, self.bsz = nhid, nlayers, bsz
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)
        self.init_weights()
        self.hidden = self.init_hidden(bsz) # the input is a batched consecutive corpus
                                            # therefore, we retain the hidden state across batches
 
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)
 
    def forward(self, input):
        emb = self.drop(self.encoder(input))
        output, self.hidden = self.rnn(emb, self.hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1))
 
    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        return (V(weight.new(self.nlayers, bsz, self.nhid).zero_()),
                V(weight.new(self.nlayers, bsz, self.nhid).zero_()))
  
    def reset_history(self):
        self.hidden = tuple(V(v.data) for v in self.hidden)


Embedding dim: 300
Num Embeddings: 9731


In [6]:
# Instantiate model with the pretrained vectors
weight_matrix = TEXT.vocab.vectors
model = RNNModel(weight_matrix.size(0), weight_matrix.size(1), 200, 1, BATCH_SIZE)


model.encoder.weight.data.copy_(weight_matrix)


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0466,  0.2132, -0.0074,  ...,  0.0091, -0.2099,  0.0539],
        ...,
        [ 0.3020,  0.6570,  0.4714,  ...,  0.5794,  0.2269, -0.1504],
        [-0.0879,  0.0287,  0.1615,  ..., -0.3171, -0.2749, -0.3325],
        [-0.0446,  0.2063,  0.0142,  ..., -0.1955, -0.2697,  0.6638]])

In [7]:
# Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.7, 0.99))
n_tokens = weight_matrix.size(0)

In [10]:
# Training loop
from tqdm import tqdm 
def train_epoch(epoch):
    """One epoch of a training loop"""
    epoch_loss = 0
    for batch in tqdm(train_iter):
    # reset the hidden state or else the model will try to backpropagate to the
    # beginning of the dataset, requiring lots of time and a lot of memory
         model.reset_history()
 
    optimizer.zero_grad()
 
    text, targets = batch.text, batch.target
    prediction = model(text)
    # pytorch currently only supports cross entropy loss for inputs of 2 or 4 dimensions.
    # we therefore flatten the predictions out across the batch axis so that it becomes
    # shape (batch_size * sequence_length, n_tokens)
    # in accordance to this, we reshape the targets to be
    # shape (batch_size * sequence_length)
    loss = criterion(prediction.view(-1, n_tokens), targets.view(-1))
    loss.backward()
 
    optimizer.step()
 
    epoch_loss += loss.data[0] * prediction.size(0) * prediction.size(1)
 
    epoch_loss /= len(train.examples[0].text)
 
    # monitor the loss
    val_loss = 0
    model.eval()
    for batch in valid_iter:
        model.reset_history()
        text, targets = batch.text, batch.target
        prediction = model(text)
        loss = criterion(prediction.view(-1, n_tokens), targets.view(-1))
        val_loss += loss.data[0] * text.size(0)
    val_loss /= len(valid.examples[0].text)
 
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))


In [None]:
# train
n_epochs = 50
for epoch in range(1, n_epochs + 1):
    train_epoch(epoch)

100%|██████████| 1131/1131 [00:00<00:00, 5599.53it/s]
100%|██████████| 1131/1131 [00:00<00:00, 6042.42it/s]

Epoch: 1, Training Loss: 0.0016, Validation Loss: 0.2844



  0%|          | 0/1131 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 0.0016, Validation Loss: 0.2832


100%|██████████| 1131/1131 [00:00<00:00, 4338.04it/s]
100%|██████████| 1131/1131 [00:00<00:00, 6210.64it/s]

Epoch: 3, Training Loss: 0.0016, Validation Loss: 0.2816



100%|██████████| 1131/1131 [00:00<00:00, 5981.68it/s]

Epoch: 4, Training Loss: 0.0016, Validation Loss: 0.2793



100%|██████████| 1131/1131 [00:00<00:00, 5803.37it/s]

Epoch: 5, Training Loss: 0.0015, Validation Loss: 0.2757



100%|██████████| 1131/1131 [00:00<00:00, 5735.97it/s]

Epoch: 6, Training Loss: 0.0015, Validation Loss: 0.2693



  0%|          | 1/1131 [00:00<03:06,  6.07it/s]

Epoch: 7, Training Loss: 0.0014, Validation Loss: 0.2592


100%|██████████| 1131/1131 [00:00<00:00, 5732.47it/s]
100%|██████████| 1131/1131 [00:00<00:00, 5932.07it/s]

Epoch: 8, Training Loss: 0.0013, Validation Loss: 0.2496



100%|██████████| 1131/1131 [00:00<00:00, 6002.07it/s]

Epoch: 9, Training Loss: 0.0012, Validation Loss: 0.2424



100%|██████████| 1131/1131 [00:00<00:00, 6013.47it/s]

Epoch: 10, Training Loss: 0.0011, Validation Loss: 0.2372



100%|██████████| 1131/1131 [00:00<00:00, 5913.72it/s]

Epoch: 11, Training Loss: 0.0010, Validation Loss: 0.2338



100%|██████████| 1131/1131 [00:00<00:00, 5926.73it/s]

Epoch: 12, Training Loss: 0.0009, Validation Loss: 0.2329



100%|██████████| 1131/1131 [00:00<00:00, 5997.91it/s]

Epoch: 13, Training Loss: 0.0009, Validation Loss: 0.2334



  0%|          | 1/1131 [00:00<03:39,  5.15it/s]

Epoch: 14, Training Loss: 0.0008, Validation Loss: 0.2345


100%|██████████| 1131/1131 [00:00<00:00, 5058.30it/s]
  0%|          | 1/1131 [00:00<03:33,  5.29it/s]

Epoch: 15, Training Loss: 0.0008, Validation Loss: 0.2362


100%|██████████| 1131/1131 [00:00<00:00, 5159.95it/s]
