In [1]:
import argparse
import os
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import Tensor
from torchtext import data as d
from torchtext import datasets
from torchtext.vocab import GloVe

In [2]:
is_cuda = torch.cuda.is_available()
is_cuda

True

In [3]:
TEXT = d.Field(lower=True, batch_first=True,)

In [4]:
# make splits for data
train, valid, test = datasets.WikiText2.splits(TEXT,root='data')

downloading wikitext-2-v1.zip


wikitext-2-v1.zip: 100%|██████████| 4.48M/4.48M [00:06<00:00, 704kB/s] 


extracting


In [5]:
batch_size=20
bptt_len=30
clip = 0.25
lr = 20
log_interval = 400

In [6]:
(len(valid[0].text)//batch_size)*batch_size

217640

In [7]:
len(valid[0].text)

217646

In [8]:
train[0].text = train[0].text[:(len(train[0].text)//batch_size)*batch_size]
valid[0].text = valid[0].text[:(len(valid[0].text)//batch_size)*batch_size]
test[0].text = test[0].text[:(len(valid[0].text)//batch_size)*batch_size]


In [9]:
len(valid[0].text)

217640

In [10]:
# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0])['text'][0:10])

train.fields {'text': <torchtext.data.field.Field object at 0x7fa8202efcf8>}
len(train) 1
vars(train[0]) ['<eos>', '=', 'valkyria', 'chronicles', 'iii', '=', '<eos>', '<eos>', 'senjō', 'no']


In [11]:
TEXT.build_vocab(train)

In [12]:
print('len(TEXT.vocab)', len(TEXT.vocab))

len(TEXT.vocab) 28913


In [13]:
train_iter, valid_iter, test_iter = d.BPTTIterator.splits((train, valid, test), batch_size=batch_size, bptt_len=bptt_len, device=0,repeat=False)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [14]:
class RNNModel(nn.Module):
    def __init__(self,ntoken,ninp,nhid,nlayers,dropout=0.5,tie_weights=False):
        super().__init__()
        self.drop = nn.Dropout()
        self.encoder = nn.Embedding(ntoken,ninp)
        self.rnn = nn.LSTM(ninp,nhid,nlayers,dropout=dropout)
        self.decoder = nn.Linear(nhid,ntoken)
        if tie_weights:
            self.decoder.weight = self.encoder.weight
        
        self.init_weights()
        self.nhid = nhid
        self.nlayers = nlayers
        
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange,initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange,initrange)
        
    def forward(self,input,hidden): 
        
        emb = self.drop(self.encoder(input))
        output,hidden = self.rnn(emb,hidden)
        output = self.drop(output)
        s = output.size()
        decoded = self.decoder(output.view(s[0]*s[1],s[2]))
        return decoded.view(s[0],s[1],decoded.size(1)),hidden
    
    def init_hidden(self,bsz):
        weight = next(self.parameters()).data
        return(Variable(weight.new(self.nlayers,bsz,self.nhid).zero_()),Variable(weight.new(self.nlayers,bsz,self.nhid).zero_()))
    

In [15]:
criterion = nn.CrossEntropyLoss()

In [16]:
len(valid_iter.dataset[0].text)


217640

In [17]:
emsize = 200
nhid=200
nlayers=2
dropout = 0.2

ntokens = len(TEXT.vocab)
lstm = RNNModel(ntokens, emsize, nhid,nlayers, dropout, 'store_true')
if is_cuda:
    lstm = lstm.cuda()

In [18]:
def repackage_hidden(h):
    """Wraps hidden states in new Variables, to detach them from their history."""
    if type(h) == Tensor:
        return h.detach().cuda()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [19]:

def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    lstm.eval()
    total_loss = 0   
    hidden = lstm.init_hidden(batch_size)
    for batch in data_source:        
        data, targets = batch.text,batch.target.view(-1)
        output, hidden = lstm(data.cuda(), hidden)
        output_flat = output.view(-1, ntokens)
        
        if is_cuda :
            targets = targets.cuda()
        
        total_loss += len(data) * criterion(output_flat, targets).data
        hidden = repackage_hidden(hidden)
    return total_loss.item()/(len(data_source.dataset[0].text)//batch_size) 


In [20]:
def trainf():
    # Turn on training mode which enables dropout.
    lstm.train()
    total_loss = 0
    start_time = time.time()
    hidden = lstm.init_hidden(batch_size)
    for  i,batch in enumerate(train_iter):
        data, targets = batch.text,batch.target.view(-1)
        if is_cuda :
            data = data.cuda()
            targets = targets.cuda()
        
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        lstm.zero_grad()
        output, hidden = lstm(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm(lstm.parameters(), clip)
        for p in lstm.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.data

        if i % log_interval == 0 and i > 0:
            cur_loss = total_loss.item() / log_interval
            elapsed = time.time() - start_time
            (print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'.format(epoch, i, len(train_iter), lr,elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss))))
            total_loss = 0
            start_time = time.time()

In [21]:
# Loop over epochs.
best_val_loss = None
epochs = 40

for epoch in range(1, epochs+1):
    epoch_start_time = time.time()
    trainf()
    val_loss = evaluate(valid_iter)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
        'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                   val_loss, math.exp(val_loss)))
    print('-' * 89)
    if not best_val_loss or val_loss < best_val_loss:
        best_val_loss = val_loss
    else:
        # Anneal the learning rate if no improvement has been seen in the validation dataset.
        lr /= 4.0



| epoch   1 |   400/ 3481 batches | lr 20.00 | ms/batch 14.04 | loss  7.09 | ppl  1201.76
| epoch   1 |   800/ 3481 batches | lr 20.00 | ms/batch 12.61 | loss  6.29 | ppl   536.94
| epoch   1 |  1200/ 3481 batches | lr 20.00 | ms/batch 13.00 | loss  6.08 | ppl   436.18
| epoch   1 |  1600/ 3481 batches | lr 20.00 | ms/batch 12.85 | loss  5.96 | ppl   386.89
| epoch   1 |  2000/ 3481 batches | lr 20.00 | ms/batch 12.62 | loss  5.88 | ppl   357.64
| epoch   1 |  2400/ 3481 batches | lr 20.00 | ms/batch 12.61 | loss  5.80 | ppl   329.48
| epoch   1 |  2800/ 3481 batches | lr 20.00 | ms/batch 12.64 | loss  5.71 | ppl   300.89
| epoch   1 |  3200/ 3481 batches | lr 20.00 | ms/batch 12.63 | loss  5.68 | ppl   293.41
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 46.20s | valid loss  5.44 | valid ppl   229.87
-----------------------------------------------------------------------------------------
| epoch   2 |   400/ 3481 

| epoch   9 |  2400/ 3481 batches | lr 20.00 | ms/batch 12.66 | loss  5.10 | ppl   164.16
| epoch   9 |  2800/ 3481 batches | lr 20.00 | ms/batch 12.70 | loss  5.03 | ppl   152.71
| epoch   9 |  3200/ 3481 batches | lr 20.00 | ms/batch 12.68 | loss  5.07 | ppl   158.49
-----------------------------------------------------------------------------------------
| end of epoch   9 | time: 45.97s | valid loss  4.96 | valid ppl   142.84
-----------------------------------------------------------------------------------------
| epoch  10 |   400/ 3481 batches | lr 20.00 | ms/batch 13.64 | loss  5.10 | ppl   163.39
| epoch  10 |   800/ 3481 batches | lr 20.00 | ms/batch 12.70 | loss  5.00 | ppl   148.93
| epoch  10 |  1200/ 3481 batches | lr 20.00 | ms/batch 12.71 | loss  5.04 | ppl   154.91
| epoch  10 |  1600/ 3481 batches | lr 20.00 | ms/batch 12.67 | loss  5.07 | ppl   159.11
| epoch  10 |  2000/ 3481 batches | lr 20.00 | ms/batch 12.68 | loss  5.10 | ppl   163.20
| epoch  10 |  2400/ 3481 

| epoch  18 |   400/ 3481 batches | lr 5.00 | ms/batch 13.45 | loss  4.91 | ppl   135.28
| epoch  18 |   800/ 3481 batches | lr 5.00 | ms/batch 12.64 | loss  4.81 | ppl   122.31
| epoch  18 |  1200/ 3481 batches | lr 5.00 | ms/batch 12.64 | loss  4.84 | ppl   127.03
| epoch  18 |  1600/ 3481 batches | lr 5.00 | ms/batch 12.61 | loss  4.87 | ppl   130.49
| epoch  18 |  2000/ 3481 batches | lr 5.00 | ms/batch 12.70 | loss  4.89 | ppl   132.33
| epoch  18 |  2400/ 3481 batches | lr 5.00 | ms/batch 12.75 | loss  4.87 | ppl   130.78
| epoch  18 |  2800/ 3481 batches | lr 5.00 | ms/batch 12.64 | loss  4.79 | ppl   119.85
| epoch  18 |  3200/ 3481 batches | lr 5.00 | ms/batch 12.65 | loss  4.82 | ppl   123.66
-----------------------------------------------------------------------------------------
| end of epoch  18 | time: 45.82s | valid loss  4.79 | valid ppl   119.70
-----------------------------------------------------------------------------------------
| epoch  19 |   400/ 3481 batches 

| epoch  26 |  2800/ 3481 batches | lr 5.00 | ms/batch 12.70 | loss  4.72 | ppl   111.73
| epoch  26 |  3200/ 3481 batches | lr 5.00 | ms/batch 12.69 | loss  4.75 | ppl   116.01
-----------------------------------------------------------------------------------------
| end of epoch  26 | time: 45.99s | valid loss  4.75 | valid ppl   116.10
-----------------------------------------------------------------------------------------
| epoch  27 |   400/ 3481 batches | lr 1.25 | ms/batch 13.60 | loss  4.84 | ppl   126.02
| epoch  27 |   800/ 3481 batches | lr 1.25 | ms/batch 12.68 | loss  4.74 | ppl   114.14
| epoch  27 |  1200/ 3481 batches | lr 1.25 | ms/batch 12.67 | loss  4.78 | ppl   118.64
| epoch  27 |  1600/ 3481 batches | lr 1.25 | ms/batch 12.68 | loss  4.79 | ppl   120.54
| epoch  27 |  2000/ 3481 batches | lr 1.25 | ms/batch 12.65 | loss  4.80 | ppl   122.10
| epoch  27 |  2400/ 3481 batches | lr 1.25 | ms/batch 12.69 | loss  4.79 | ppl   120.21
| epoch  27 |  2800/ 3481 batches 

| epoch  35 |   800/ 3481 batches | lr 0.31 | ms/batch 12.68 | loss  4.70 | ppl   110.14
| epoch  35 |  1200/ 3481 batches | lr 0.31 | ms/batch 12.70 | loss  4.74 | ppl   114.62
| epoch  35 |  1600/ 3481 batches | lr 0.31 | ms/batch 12.68 | loss  4.76 | ppl   117.16
| epoch  35 |  2000/ 3481 batches | lr 0.31 | ms/batch 12.69 | loss  4.77 | ppl   118.46
| epoch  35 |  2400/ 3481 batches | lr 0.31 | ms/batch 12.79 | loss  4.76 | ppl   117.19
| epoch  35 |  2800/ 3481 batches | lr 0.31 | ms/batch 12.70 | loss  4.66 | ppl   106.09
| epoch  35 |  3200/ 3481 batches | lr 0.31 | ms/batch 12.69 | loss  4.71 | ppl   110.55
-----------------------------------------------------------------------------------------
| end of epoch  35 | time: 46.03s | valid loss  4.69 | valid ppl   109.18
-----------------------------------------------------------------------------------------
| epoch  36 |   400/ 3481 batches | lr 0.31 | ms/batch 13.54 | loss  4.78 | ppl   119.53
| epoch  36 |   800/ 3481 batches 