In [1]:
# credit: https://github.com/pytorch/examples/tree/master/word_language_model

In [2]:
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

torch.manual_seed(1)

<torch._C.Generator at 0x7f7226412ed0>

# Data: WikiText-2
WikiText-2 is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia (https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/)

The raw text data is in ```data/wikitext-2``` divided into 3 files: ```train.txt, valid.txt, text.txt``` each containing training / validation /test split of the data.

In order to process data, we have to 
1. Build a dictionary that maps word to id and viceversa (word <-> id)
2. Tokenize the text using this dictionary 

We will create two abstract classes for this purpose.

In [3]:
import os
import io

class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text file."""
        assert os.path.exists(path)
        # Add words to the dictionary
        with io.open(path, 'r', encoding="utf8") as f:
            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Tokenize file content
        with io.open(path, 'r', encoding="utf8") as f:
            ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1

        return ids

In [4]:
corpus = Corpus('data/wikitext-2')

In [5]:
corpus.train[1000:1100]

tensor([ 74,  17, 417, 418, 182, 151,  17, 419, 403,  37, 420, 300, 160, 421,
         13, 212,  78, 422, 423,  22,  17, 424,  13, 425,  35, 293, 426,  13,
          9,   9,  13, 427,  61, 428, 429,  15,  61,  83, 430, 236, 195,  78,
          9, 351, 431,  13, 147, 432, 433, 434, 435,  16, 436,  73, 437,  22,
        438, 439, 440, 441,   9, 365,  13,  27, 442, 443, 367, 444, 445,  73,
        446, 447,  80,  17,   2,  73, 448, 361, 449, 440,  37,   9, 450,   9,
         13,  27, 451,   9, 452, 453,  73,  26, 454,  27, 455,  16,  17,   2,
         15, 456])

In [6]:
print(' '.join([corpus.dictionary.idx2word[id] for id in corpus.train[1000:1100]]))

perform the most dangerous missions that the Regular Army and Militia will not do , they are nevertheless up to the task , exemplified by their motto , <unk> <unk> , meaning " Always Ready . " The three main characters are <unk> Kurt Irving , an army officer falsely accused of treason who wishes to redeem himself ; Ace <unk> Imca , a female Darcsen heavy weapons specialist who seeks revenge against the Valkyria who destroyed her home ; and <unk> Riela <unk> , a seemingly <unk> young woman who is unknowingly a descendant of the Valkyria . Together


# Define a RNN model
We use a LSTM model with dropout applied on input, LSTM, and output. 
This is an autoregressive model: 
## \begin{align}
p(w_1, ..., w_T) = \prod_{i=1}^T p(w_i | w_{i-1}, ..., w_{1})
\end{align}


In [7]:
class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)
        self.nhid = nhid
        self.nlayers = nlayers

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters())
        return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                weight.new_zeros(self.nlayers, bsz, self.nhid))

In [8]:
model = RNNModel(10, 10, 10, 10, 0.5)

In [9]:
next(model.named_parameters())

('encoder.weight', Parameter containing:
 tensor([[-1.5256, -0.7502, -0.6540, -1.6095, -0.1002, -0.6092, -0.9798, -1.6091,
          -0.7121,  0.3037],
         [-0.7773, -0.2515, -0.2223,  1.6871,  0.2284,  0.4676, -0.6970, -1.1608,
           0.6995,  0.1991],
         [ 0.8657,  0.2444, -0.6629,  0.8073,  1.1017, -0.1759, -2.2456, -1.4465,
           0.0612, -0.6177],
         [-0.7981, -0.1316,  1.8793, -0.0721,  0.1578, -0.7735,  0.1991,  0.0457,
           0.1530, -0.4757],
         [-0.1110,  0.2927, -0.1578, -0.0288,  2.3571, -1.0373,  1.5748, -0.6298,
          -0.9274,  0.5451],
         [ 0.0663, -0.4370,  0.7626,  0.4415,  1.1651,  2.0154,  0.1374,  0.9386,
          -0.1860, -0.6446],
         [ 1.5392, -0.8696, -3.3312, -0.7479, -0.0255, -1.0233, -0.5962, -1.0055,
          -0.2106, -0.0075],
         [ 1.6734,  0.0103, -0.7040, -0.1853, -0.9962, -0.8313, -0.4610, -0.5601,
           0.3956, -0.9823],
         [-0.5065,  0.0998, -0.6540,  0.7317,  1.3851, -0.8138, -0.9276

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
# batch processing.

def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

In [12]:
batch_size = 100
train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, batch_size)
test_data = batchify(corpus.test, batch_size)

In [13]:
train_data.size()

torch.Size([20886, 100])

In [14]:
embed_size = 200
hidden_size = 200
nlayers = 2
dropout = 0.2

ntokens = len(corpus.dictionary)
model = RNNModel(ntokens, embed_size, hidden_size, nlayers, dropout).to(device)

In [15]:
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach() # h.data
    else:
        return tuple(repackage_hidden(v) for v in h)

In [18]:
# get_batch subdivides the source data into chunks of length bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivison of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.
bptt = 30

def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

In [19]:
import math

def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)


In [20]:
optimizer = optim.SGD(model.parameters(), lr=20.0)
criterion = nn.CrossEntropyLoss()

def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), 0.25)
        optimizer.step()

        total_loss += loss.item()

In [21]:
# Run training
import time

scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=1, 
                                                 factor=0.25, verbose=True)

for epoch in range(1, 11):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(val_data)
    scheduler.step(val_loss)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
          val_loss, math.exp(val_loss)))

| end of epoch   1 | time: 25.16s | valid loss  5.85 | valid ppl   347.18
| end of epoch   2 | time: 25.25s | valid loss  5.56 | valid ppl   261.03
| end of epoch   3 | time: 25.57s | valid loss  5.43 | valid ppl   227.70
| end of epoch   4 | time: 26.35s | valid loss  5.36 | valid ppl   213.34
| end of epoch   5 | time: 26.71s | valid loss  5.29 | valid ppl   198.02
| end of epoch   6 | time: 26.92s | valid loss  5.24 | valid ppl   187.84
| end of epoch   7 | time: 27.08s | valid loss  5.21 | valid ppl   183.06
| end of epoch   8 | time: 26.43s | valid loss  5.19 | valid ppl   179.58
| end of epoch   9 | time: 27.31s | valid loss  5.16 | valid ppl   174.06
| end of epoch  10 | time: 27.42s | valid loss  5.12 | valid ppl   167.53


In [22]:
# Test
test_loss = evaluate(test_data)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(test_loss, math.exp(test_loss)))

| End of training | test loss  5.06 | test ppl   157.18


In [27]:
temperature = 10.0
ntokens = len(corpus.dictionary)
hidden = model.init_hidden(1)
model.cuda()
input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
generated = []

with torch.no_grad():  # no tracking history
    for i in range(200):
        output, hidden = model(input, hidden)
        word_weights = output.squeeze().div(temperature).exp().cpu()
        word_idx = torch.multinomial(word_weights, 1)[0]
        input.fill_(word_idx)
        word = corpus.dictionary.idx2word[word_idx]
        generated.append(word)
        
    print(' '.join(generated))


soldiers issuance everybody longitudinal growing carnival Antietam kg cannon feast Found Shigeru liner originating ECAHA contributing architect Far unjust chiefs spider 1127 verify southeastern Yellowstone games Antonius informed coasts Everett 1939 Lithuanians GHQ prerequisite insurgency predicted Kapoor rat Hero Act Spence Election approximately piston Architects Tina heroism hastily biomolecules Lites commissioning Infidelity Shaun Queensland dependent McCartney keys Wales agonistic transcendent somber Haitian depicts undergo fable portico Exercise Hayes Ruby Canis Money lawsuit dripping rested ‑ Louisiana Awali relegated excimer Ansem Plužine Call shaft psalter sweep WYO Matheson surveillance Manohar grind Handel boxing trustees election 162 Picard magistrate convinced Sonny bombed quest Jarrah Scale McLean fatigue fragmentary Jung guitars Idaho Mathews corner specialising Michał Eduardo breech Stones pluralism tenth Assembly Honolulu Asnelles loading woodpeckers Beaufort Jim Sansk

In [24]:
device

device(type='cuda')