# Recurrent Neural Network

In [1]:
import torch
import torch.nn as nn
import prepare_data
import math
import time

device = torch.device("cuda")

In [2]:
data_folder='./data/penn'

corpus = prepare_data.Corpus(data_folder)

In [3]:
def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    data = data.cuda()
    return data

In [4]:
batch_size=20

train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, batch_size)
test_data = batchify(corpus.test, batch_size)

print(train_data.size())
print(test_data.size())


torch.Size([46479, 20])
torch.Size([4121, 20])


In [5]:
class RNNModel(nn.Module):

    def __init__(self, ntoken, ninp, nhid):
        super(RNNModel, self).__init__()
        
       
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = nn.LSTM(ninp, nhid)
        self.decoder = nn.Linear(nhid, ntoken)


        self.init_weights()
        self.nhid = nhid
       

    def init_weights(self):
        initrange=0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)
        
#    def init_weights(self): 
#        initrange=0.07
#        for p in self.parameters():
#            p.data.uniform_(-initrange, initrange)
        
        

    def forward(self, input, h , c ):
        emb = self.encoder(input)
        output, (h,c)  = self.rnn( emb , (h,c) )
        decoded = self.decoder(    output.view(output.size(0)*output.size(1), output.size(2))    )
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), h , c


In [6]:
ntokens = len(corpus.dictionary)
print(ntokens)
dim=228
T=35

net = RNNModel( ntokens, dim, dim).to(device)

criterion = nn.CrossEntropyLoss()

10000


In [7]:
print(net)

RNNModel(
  (encoder): Embedding(10000, 228)
  (rnn): LSTM(228, 228)
  (decoder): Linear(in_features=228, out_features=10000, bias=True)
)


In [8]:
def repackage_hidden(h,c):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    
    return h.detach(), c.detach()

In [9]:
def get_batch(source, i):
    seq_len = min(T, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

In [10]:
def normalize_gradient():

    grad_norm_sq=0

    for p in net.parameters():
        grad_norm_sq += p.grad.data.norm()**2

    grad_norm=math.sqrt(grad_norm_sq)
   
    if grad_norm<1e-4:
        net.zero_grad()
        print('grad norm close to zero')
    else:    
        for p in net.parameters():
             p.grad.data.div_(grad_norm)

    return grad_norm

In [11]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    net.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    h = torch.zeros(1, batch_size, dim).to(device)
    c = torch.zeros(1, batch_size, dim).to(device)
    batch_num = 0

    for i in range(0, data_source.size(0) - 1, T):
        batch_num += 1
        data, targets = get_batch(data_source, i)
        output, h, c = net(data, h, c)
        loss = criterion(output.view(-1, ntokens), targets)   
        total_loss += loss.item()
        h, c = repackage_hidden(h,c)
        
    print('test perplexity = ', math.exp(total_loss/batch_num)  )
        
        
       
        


In [12]:
def train(lr):
    
    net.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    
    h = torch.zeros(1, batch_size, dim).to(device)
    c = torch.zeros(1, batch_size, dim).to(device)
    
    for batch, i in enumerate(  range(0, train_data.size(0) - 1, T)  ):
        
        data, targets = get_batch(train_data, i)
        
        h, c = repackage_hidden(h,c)
        net.zero_grad()
        output, h, c  = net(data, h , c)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        normalize_gradient()
        for p in net.parameters():
            p.data.add_(-lr, p.grad.data)
            
            
        total_loss += loss.item()
        
    print('train perplexity =', math.exp(total_loss/batch)  )

In [13]:
lr=5
for epoch in range(10):
    print(' ')
    print('epoch', epoch)
    train(lr)
    evaluate(test_data)
    if epoch>0:
        lr=lr/3

 
epoch 0
train perplexity = 285.83651236408525
test perplexity =  179.67117363344266
 
epoch 1
train perplexity = 133.57437930144232
test perplexity =  137.08147943370054
 
epoch 2
train perplexity = 87.66661345338127
test perplexity =  116.67657735379308
 
epoch 3
train perplexity = 73.47555696066618
test perplexity =  112.18801797218802
 
epoch 4
train perplexity = 68.64621754937146
test perplexity =  110.40403389764018
 
epoch 5
train perplexity = 66.92337744917191
test perplexity =  109.49112221401364
 
epoch 6
train perplexity = 66.28481524968983
test perplexity =  109.1025446845099
 
epoch 7


KeyboardInterrupt: 