In [1]:
# data processing - dictionary and input iterator
import os
import torch


class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
        
    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]
    
    def __len__(self):
        return len(self.word2idx)
    
    
class Corpus(object):
    def __init__(self, data_dir='wikitext-2/'):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(
            data_dir, 'wiki.train.tokens'))
        self.test = self.tokenize(os.path.join(
            data_dir, 'wiki.test.tokens'))
        self.valid = self.tokenize(os.path.join(
            data_dir, 'wiki.valid.tokens'))
        
    def tokenize(self, fn):
        assert os.path.isfile(fn)
        with open(fn, 'r') as f:
            n_tokens = 0
            tokens = []
            for line in f:
                words = line.strip().split()
                if not words:
                    continue
                words += ['<eos>']
                n_tokens += len(words)
                for word in words:
                    tokens.append(self.dictionary.add_word(word))
        tokens = torch.LongTensor(tokens)
        return tokens

In [2]:
SEED = 1
torch.manual_seed(SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [3]:
corpus = Corpus(data_dir='wikitext-2/')

In [4]:
print("Number of tokens:")
print("Train: ", len(corpus.train))
print("Valid: ", len(corpus.valid))
print("Test:  ", len(corpus.test))

Number of tokens:
Train:  2075677
Valid:  216347
Test:   244102


In [5]:
corpus.test[:10]

tensor([   0, 1144,    9,    0,    4, 1144,    9,   26,  147,  858])

In [6]:
print(corpus.dictionary.word2idx['<eos>'])
print(corpus.dictionary.word2idx['<unk>'])

4
9


In [7]:
def batchify(source, bsz):
    nbatch = source.size(0) // bsz
    source = source.narrow(0, 0, nbatch * bsz)
    source = source.view(bsz, -1).t().contiguous()
    return source.to(device)

In [8]:
eval_batch_size = 10
train_data = batchify(corpus.train, 20)
val_data = batchify(corpus.valid, 10)
test_data = batchify(corpus.test, 10)

In [9]:
train_data.shape, test_data.shape, val_data.shape

(torch.Size([103783, 20]), torch.Size([24410, 10]), torch.Size([21634, 10]))

In [36]:
# model
import torch.nn as nn

class RNNModel(nn.Module):
    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers,
                 dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers,
                                             dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}
                nonlinearity = nonlinearity[rnn_type]
            except KeyError:
                raise ValueError( """An invalid option for `--model`
                                  was supplied, options are
                                  ['LSTM', 'GRU', 'RNN_TANH',
                                  'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers,
                              nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)
        
        if tie_weights:
            assert nhid == ninp
            self.decoder.weight = self.encoder.weight
        
        self.rnn_type = rnn_type
        self.nhid = nhid
        self.ninp = ninp
        self.nlayers = nlayers
        
        self.init_weights()
        
    def forward(self, input, hidden):
        print(input.shape)
        emb = self.drop(self.encoder(input))
        print(emb.shape)
        output, hidden = self.rnn(emb, hidden)
        print(output.shape, '\t', len(hidden))
        output = self.drop(output)
        #print(output.shape)
        decoded_output = self.decoder(
            output.view(output.size(0) * output.size(1), output.size(2)))
        print(decoded_output.shape)
        return decoded_output, hidden
    
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)
        
    def init_hidden(self, bsz):
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                    weight.new_zeros(self.nlayers, bsz, self.nhid))
        else:
            return weight.new_zeros(self.nlayers, bsz, self.nhid)

In [37]:
# model parameters
ntokens = len(corpus.dictionary.word2idx)
EMB_SIZE = 200
NHID = 300
NLAYERS = 2
DROPOUT = 0.2
LR = 20
CLIP = 0.25
EPOCHS = 10
BATCH_SIZE = 20
EVAL_BATCH_SIZE = 10
BPTT = 35
TIED = False
SEED = 1
LOG_INTERVAL = 200
SAVE = 'model.pt'
ONNX_EXPORT = ''
print('# tokens: ', ntokens)

# tokens:  33278


In [38]:
model = RNNModel('LSTM', ntokens, EMB_SIZE, NHID, NLAYERS, DROPOUT,
                 TIED).to(device)

criterion = nn.CrossEntropyLoss()

In [39]:
model

RNNModel(
  (drop): Dropout(p=0.2, inplace=False)
  (encoder): Embedding(33278, 200)
  (rnn): LSTM(200, 300, num_layers=2, dropout=0.2)
  (decoder): Linear(in_features=300, out_features=33278, bias=True)
)

In [40]:
def repackage_hidden(h):
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [41]:
hidden = model.init_hidden(BATCH_SIZE)
hidden[0].shape, hidden[1].shape

(torch.Size([2, 20, 300]), torch.Size([2, 20, 300]))

In [42]:
hidden = repackage_hidden(hidden)

In [43]:
def get_batch(source, bptt, i):
    seq_len = min(bptt, len(source) - 1 - i)
    #print(seq_len)
    x = source[i: i+seq_len]
    y = source[i+1: i+1+seq_len].view(-1)
    return x, y

In [44]:
x, y = get_batch(train_data, BPTT, 0)

In [45]:
x.shape, y.shape

(torch.Size([35, 20]), torch.Size([700]))

In [46]:
x

tensor([[    0,    13, 10525,   664,  5764,    13,    17,   209, 14628,     9,
          2989,   246,  2705, 22197,   361,  1699,  1129,  1345,   119,  2143],
        [    1, 10606,    43,   448,    13,    37, 22199,    61,  4037,    13,
            16,   935,    37,  3840,  1340,    15,    23,     6, 32652,    13],
        [    2,   664,  6936,    37,   766, 23079,  1237,  1100, 20894,    17,
            83,   168,    88,    15,     9,  5103,  6318,  1862,   151,   162],
        [    3,  1829,   440,   765, 16388,    13,  1794,    16,    13, 18453,
         19900,   631,    17,   860,    13,   284,    15,   639,    17,  8276],
        [    0,    22,    35, 18750,    37,    37,    16,  6659,    17,    16,
            13,    16,  4597,    17, 27572,  2790,  3084,   147, 28182,  2313],
        [    4,   704,  1756,   664,  8944,   293,    17,  4318,  2576, 27981,
            27,    46,   370, 11199,  1036,    17,    13, 24897,   168,    46],
        [    5,   333,    13,  1270,  9172,  1

In [47]:
y

tensor([    1, 10606,    43,   448,    13,    37, 22199,    61,  4037,    13,
           16,   935,    37,  3840,  1340,    15,    23,     6, 32652,    13,
            2,   664,  6936,    37,   766, 23079,  1237,  1100, 20894,    17,
           83,   168,    88,    15,     9,  5103,  6318,  1862,   151,   162,
            3,  1829,   440,   765, 16388,    13,  1794,    16,    13, 18453,
        19900,   631,    17,   860,    13,   284,    15,   639,    17,  8276,
            0,    22,    35, 18750,    37,    37,    16,  6659,    17,    16,
           13,    16,  4597,    17, 27572,  2790,  3084,   147, 28182,  2313,
            4,   704,  1756,   664,  8944,   293,    17,  4318,  2576, 27981,
           27,    46,   370, 11199,  1036,    17,    13, 24897,   168,    46,
            5,   333,    13,  1270,  9172,  1227,   289,    61,   131,  5437,
         4242,    37,  1193, 24300,   492,  7404,    17,    15,  5132,   119,
            6,    17,    17,    15,  1978,  2135,    13,    15, 

In [48]:
output, hidden = model(x, hidden)

torch.Size([35, 20])
torch.Size([35, 20, 200])
torch.Size([35, 20, 300]) 	 2
torch.Size([700, 33278])


In [65]:
output.shape, hidden[0].shape, hidden[1].shape

(torch.Size([700, 33278]), torch.Size([2, 20, 300]), torch.Size([2, 20, 300]))

In [67]:
loss = criterion(output, y)
loss

tensor(10.4171, grad_fn=<NllLossBackward>)

In [77]:
import time
import math


def train():
    model.train()
    total_loss = 0
    hidden = model.init_hidden(BATCH_SIZE)
    start_time = time.time()
    
    for batch, i in enumerate(range(0, train_data.size(0)-1, BPTT)):
        data, targets = get_batch(train_data, BPTT, i)
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output, targets)
        loss.backward()
        
        nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-LR)
            
        total_loss += loss.item()
        
        if batch % LOG_INTERVAL == 0 and batch > 0:
            cur_loss = total_loss / LOG_INTERVAL
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // BPTT, LR,
                elapsed * 1000 / LOG_INTERVAL, cur_loss,
                math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
            

def evaluate(data_source):
    model.eval()
    total_loss = 0
    hidden = model.init_hidden(BATCH_SIZE)
    
    with torch.no_grad():
        for i in enumerate(range(0, data_source.size(0)-1, BPTT)):
            data, targets = get_batch(data_source, BPTT, i)
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
            loss = criterion(output, targets)
            total_loss += len(data) * loss.item()
    return total_loss / data_source.size(0)

In [80]:
import sys

best_val_loss = sys.maxsize

try:
    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(
                    epoch, (time.time() - epoch_start_time),
                    val_loss, math.exp(val_loss)))
        print('-' * 89)
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
        else:
            lr /= 4.0
            
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

| epoch   1 |   200/ 2965 batches | lr 20.00 | ms/batch 623.36 | loss  7.62 | ppl  2042.86
| epoch   1 |   400/ 2965 batches | lr 20.00 | ms/batch 605.04 | loss  6.91 | ppl  1004.86
| epoch   1 |   600/ 2965 batches | lr 20.00 | ms/batch 606.00 | loss  6.53 | ppl   684.63
| epoch   1 |   800/ 2965 batches | lr 20.00 | ms/batch 604.76 | loss  6.33 | ppl   559.94
| epoch   1 |  1000/ 2965 batches | lr 20.00 | ms/batch 624.70 | loss  6.16 | ppl   474.45
| epoch   1 |  1200/ 2965 batches | lr 20.00 | ms/batch 664.46 | loss  6.08 | ppl   438.24
| epoch   1 |  1400/ 2965 batches | lr 20.00 | ms/batch 634.43 | loss  5.95 | ppl   382.94
| epoch   1 |  1600/ 2965 batches | lr 20.00 | ms/batch 647.06 | loss  5.94 | ppl   379.38
| epoch   1 |  1800/ 2965 batches | lr 20.00 | ms/batch 634.25 | loss  5.79 | ppl   326.08
| epoch   1 |  2000/ 2965 batches | lr 20.00 | ms/batch 683.86 | loss  5.74 | ppl   311.42
| epoch   1 |  2200/ 2965 batches | lr 20.00 | ms/batch 679.16 | loss  5.63 | ppl   278.02