# RNN -- DEMO

In [1]:
import torch
import torch.nn as nn
import prepare_data
import math
import time
import utils

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU so the
# notebook can still run on machines without a supported GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Folder holding the dataset files, relative to the notebook's working directory
# (presumably the Penn Treebank corpus, judging by the path -- confirm).
data_folder='./data/penn'

# Build the corpus with the project-local prepare_data module. Later cells read
# corpus.train, corpus.valid, corpus.test and corpus.dictionary from this object.
corpus = prepare_data.Corpus(data_folder)

In [3]:
def batchify(data, bsz, device=None):
    """Reshape a 1-D token stream into ``bsz`` parallel columns.

    The stream is cut into ``bsz`` equally long contiguous pieces (any leftover
    tail elements are dropped) and each piece becomes one column of the result,
    so row ``t`` holds the ``t``-th token of every piece.

    Args:
        data: 1-D tensor of token ids.
        bsz: number of columns (parallel streams) to produce.
        device: where to place the result. When omitted, the GPU is used if
            CUDA is available and the CPU otherwise.

    Returns:
        A contiguous tensor of shape ``(data.size(0) // bsz, bsz)`` on ``device``.
    """
    # Number of rows each of the bsz columns will receive.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Lay the pieces out as rows, then transpose so that time runs along dim 0.
    # The explicit nbatch (rather than -1) keeps the reshape valid even when the
    # stream is shorter than bsz and the result is empty.
    data = data.view(bsz, nbatch).t().contiguous()
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return data.to(device)

In [4]:
# Number of parallel token streams; `bs` is the short alias used by later cells.
batch_size = 20
bs = batch_size

# Number of distinct words known to the corpus dictionary.
vocab_size = len(corpus.dictionary)

# Reshape each split of the corpus into (time, batch_size) form.
train_data, val_data, test_data = (
    batchify(split, batch_size)
    for split in (corpus.train, corpus.valid, corpus.test)
)

# Show the resulting shapes of the training and test tensors, one per line.
print(train_data.size(), test_data.size(), sep="\n")

# Number of time steps (rows) available in each of those two splits.
train_length, test_length = train_data.size(0), test_data.size(0)


torch.Size([46479, 20])
torch.Size([4121, 20])


    Found GPU1 Quadro K600 which is of cuda capability 3.0.
    PyTorch no longer supports this GPU because it is too old.
    


### With or without GPU?

In [5]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU so the
# remaining cells still run on machines without a usable GPU.
# To force CPU execution, replace the next line with: device = torch.device("cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Make a recurrent net class

In [6]:
class recurrent_net(nn.Module):
    """Word-level language model: embedding -> single-layer LSTM -> linear scores.

    The attribute names (``encoder``, ``rnn``, ``decoder``) and the three-value
    return of ``forward`` match how the rest of the notebook uses the network
    (``net.encoder.weight``, ``scores, h, c = net(x, h, c)``).

    Args:
        hidden_size: width of the word embeddings and of the LSTM state.
        num_tokens: vocabulary size. When omitted, the notebook-level
            ``vocab_size`` is used.
    """

    def __init__(self, hidden_size, num_tokens=None):
        super().__init__()

        # Fall back to the vocabulary size computed earlier in the notebook.
        if num_tokens is None:
            num_tokens = vocab_size

        self.encoder = nn.Embedding(num_tokens, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size)
        self.decoder = nn.Linear(hidden_size, num_tokens)

    def forward(self, word_seq, h_init, c_init):
        """Score the next word at every position of ``word_seq``.

        Args:
            word_seq: integer tensor of word ids, shape ``(seq_len, batch)``.
            h_init: initial hidden state, shape ``(1, batch, hidden_size)``.
            c_init: initial cell state, shape ``(1, batch, hidden_size)``.

        Returns:
            ``(score_seq, h_final, c_final)`` where ``score_seq`` has shape
            ``(seq_len, batch, num_tokens)`` and the two states keep the shape
            of the initial states.
        """
        g_seq = self.encoder(word_seq)
        h_seq, (h_final, c_final) = self.rnn(g_seq, (h_init, c_init))
        score_seq = self.decoder(h_seq)

        return score_seq, h_final, c_final


# Keep the name this cell originally defined (typo included) usable.
thre_layer_recurrent_net = recurrent_net


### Build the net. Choose the hidden size to be 200. How many parameters in total?

In [7]:
# Size passed to the network constructor; later cells also use it as the last
# dimension of the initial h and c state tensors.
hidden_size=200

net = recurrent_net( hidden_size )

# Show the layer structure of the freshly built model.
print(net)

# Project-local helper; the recorded output shows it reports the parameter count.
utils.display_num_param(net)

recurrent_net(
  (encoder): Embedding(10000, 200)
  (rnn): LSTM(200, 200)
  (decoder): Linear(in_features=200, out_features=10000, bias=True)
)
There are 4331600 (4.33 million) parameters in this neural network


### Send the weights of the network to the GPU

In [8]:
# Move the model's parameters to the device selected earlier in the notebook.
net = net.to(device)

### Manually initialize the weights of the encoder and decoder

In [9]:
# Re-draw the embedding and output-projection weights uniformly from [-0.1, 0.1).
# nn.init.uniform_ performs the same in-place draw with gradient tracking disabled.
nn.init.uniform_(net.encoder.weight, -0.1, 0.1)
nn.init.uniform_(net.decoder.weight, -0.1, 0.1)

# Ends the cell with a statement so the weight tensor is not echoed as cell output.
print('')




### Choose the criterion, as well as the important hyperparameters: 
* batch size = 20
* initial learning rate = 5
* sequence length = 35

In [10]:
# Loss used for both training and evaluation: cross-entropy over word scores.
criterion = nn.CrossEntropyLoss()

# Hyperparameters, in order: batch size, initial learning rate, and the number
# of time steps in each training/evaluation window.
bs, my_lr, seq_length = 20, 5, 35

### Function to evaluate the network on the test set

In [11]:
def eval_on_test_set():
    """Print the perplexity of ``net`` on ``test_data``.

    Walks the test tensor in consecutive windows of ``seq_length`` time steps,
    carrying the recurrent state (h, c) from one window to the next, averages
    the per-window cross-entropy and prints ``exp`` of that average.

    Reads the notebook-level names ``net``, ``criterion``, ``test_data``,
    ``test_length``, ``seq_length``, ``hidden_size``, ``vocab_size`` and
    ``device``. Returns nothing.

    Raises:
        ValueError: if the test set is too short to form a single window.
    """
    # Take the batch width from the data itself instead of the global `bs`,
    # which the text-generation cell overwrites with 1.
    batch = test_data.size(1)

    running_loss = 0
    num_batches = 0

    # Start from an all-zero recurrent state on the compute device.
    h = torch.zeros(1, batch, hidden_size, device=device)
    c = torch.zeros(1, batch, hidden_size, device=device)

    # Evaluation never calls backward(), so skip building the autograd graph.
    with torch.no_grad():
        for count in range(0, test_length - 1 - seq_length, seq_length):

            # Inputs are one window; labels are the same window shifted by one step.
            minibatch_data = test_data[count:count + seq_length].to(device)
            minibatch_label = test_data[count + 1:count + seq_length + 1].to(device)

            scores, h, c = net(minibatch_data, h, c)

            # Flatten (time, batch) into one large batch for the criterion.
            scores = scores.view(batch * seq_length, vocab_size)
            minibatch_label = minibatch_label.view(batch * seq_length)

            running_loss += criterion(scores, minibatch_label).item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("test set is too short to form a single window of seq_length steps")

    total_loss = running_loss / num_batches
    print('test perplexity = ', math.exp(total_loss)  )
        

### Do 8 passes through the training set.

In [12]:
# Train for 8 epochs with plain SGD on consecutive windows of seq_length time
# steps. The recurrent state (h, c) is carried from one window to the next but
# detached after every step, so gradients only flow within a single window.
# Reference time for the cumulative "time=" value printed after every epoch.
start=time.time()

for epoch in range(8):

    # keep the initial learning rate for epochs 0 and 1, then divide it by 3
    # at the start of every later epoch
    # (note: this rebinds the notebook-level my_lr, so re-running the cell
    # continues from the already-reduced value)
    if epoch >= 2:
        my_lr = my_lr / 3

    # create a new optimizer at the beginning of each epoch: give the current learning rate.
    optimizer=torch.optim.SGD( net.parameters() , lr=my_lr )

    # set the running quantities to zero at the beginning of the epoch
    running_loss=0
    num_batches=0

    # set the initial h and c to be the zero vector
    h = torch.zeros(1, bs, hidden_size)
    c = torch.zeros(1, bs, hidden_size)

    # send them to the selected device
    h=h.to(device)
    c=c.to(device)

    for count in range( 0 , train_length-seq_length-1 ,  seq_length):

        # Set the gradients to zeros
        optimizer.zero_grad()

        # create a minibatch: the labels are the inputs shifted by one time step
        minibatch_data =  train_data[ count   : count+seq_length   ]
        minibatch_label = train_data[ count+1 : count+seq_length+1 ]

        # send them to the selected device
        minibatch_data=minibatch_data.to(device)
        minibatch_label=minibatch_label.to(device)

        # tell Pytorch to start tracking all operations that will be done on h and c
        h=h.requires_grad_()
        c=c.requires_grad_()

        # forward the minibatch through the net
        scores, h, c  = net( minibatch_data, h , c)

        # reshape the scores and labels to huge batch of size bs*seq_length
        scores          =            scores.view(  bs*seq_length , vocab_size)
        minibatch_label =   minibatch_label.view(  bs*seq_length )

        # Compute the average of the losses of the data points in this huge batch
        loss = criterion(  scores ,  minibatch_label )

        # backward pass: compute the gradient of the loss w.r.t. every parameter
        loss.backward()

        # adjust the gradients with the project-local utils helper (its exact
        # effect is not visible here -- presumably rescaling/clipping, TODO confirm),
        # then do one step of stochastic gradient descent
        utils.normalize_gradient(net)
        optimizer.step()

        # prevent from backpropagating all the way to the beginning
        h=h.detach()
        c=c.detach()

        # update the running loss
        running_loss += loss.item()
        num_batches += 1



    # compute stats for the full training set
    total_loss = running_loss/num_batches
    elapsed = time.time()-start

    # note: the value printed under 'loss=' is exp(mean cross-entropy),
    # i.e. the training perplexity, comparable to the test perplexity below
    print('')
    print('epoch=',epoch, '\t time=', elapsed,'\t lr=', my_lr, '\t loss=',  math.exp(total_loss))
    eval_on_test_set()



epoch= 0 	 time= 24.20134663581848 	 lr= 5 	 loss= 285.92059756946037
test perplexity =  183.81680528935263

epoch= 1 	 time= 48.9482638835907 	 lr= 5 	 loss= 135.49374076235077
test perplexity =  141.53843569456083

epoch= 2 	 time= 73.8051323890686 	 lr= 1.6666666666666667 	 loss= 89.8852607294775
test perplexity =  118.5896965718748

epoch= 3 	 time= 98.58819055557251 	 lr= 0.5555555555555556 	 loss= 75.81924739254758
test perplexity =  113.7814865221798

epoch= 4 	 time= 123.43047738075256 	 lr= 0.1851851851851852 	 loss= 71.02879274726945
test perplexity =  112.06782514215153

epoch= 5 	 time= 148.29536271095276 	 lr= 0.0617283950617284 	 loss= 69.30741699769199
test perplexity =  111.32622582768332

epoch= 6 	 time= 173.05349683761597 	 lr= 0.0205761316872428 	 loss= 68.68876516757364
test perplexity =  110.93527339260221

epoch= 7 	 time= 197.9068443775177 	 lr= 0.006858710562414266 	 loss= 68.46669010908256
test perplexity =  110.72453037047049


### Generate text using the trained language model

In [61]:
# Generate one word at a time, so the batch size is 1.
# NOTE: this overwrites the notebook-level `bs`; re-run the hyperparameter cell
# before re-running any cell that reads `bs`.
bs=1

# initialize h and c to zero
h = torch.zeros(1, bs, hidden_size)
c = torch.zeros(1, bs, hidden_size)
h=h.to(device)
c=c.to(device)


# pick the first word uniformly at random over the whole vocabulary
# (the upper bound of randint is exclusive, so every index 0..vocab_size-1 is possible)
word_idx = torch.randint(0, vocab_size, (1, 1))
word_idx=word_idx.to(device)


# Sampling needs no gradients; without no_grad the autograd graph would keep
# growing across all 500 steps because h and c are never detached.
with torch.no_grad():
    for i in range(500):

        # compute the scores used to predict what is the next word
        score , h, c = net(word_idx , h, c)

        # turn the scores into unnormalized sampling weights and draw one word;
        # dividing by 1 is a no-op placeholder where a sampling temperature could go
        word_weights = score.squeeze().div(1).exp().cpu()
        word_idx = torch.multinomial(word_weights, 1)[0]

        # convert the word number to the actual string it corresponds to
        word = corpus.dictionary.idx2word[word_idx.item()]
        print(word, end=' ')

        # put back in the (seq_len=1, batch=1) format expected by the net
        word_idx=word_idx.view(1,1).to(device)
    


of the nation 's regulations <eos> the survey said it might grabbed the carrier 's primary pilot to intergroup the worry that parsow and incest were out of the past year <eos> western tennessee has proposing its <unk> british <unk> to william steinhardt and phillips owner related to construction customers <eos> the preamble 's borrowing will rise against many movies in the first soviet and europe <eos> and seven weeks later despite the gain of N N are expected to rise N N in N <eos> for the week began tuesday morning with a settlement of the hong kong capcom division to idle farmers in time students an ounce to a$ N a <eos> these revenue can not be replaced until the national <unk> market 's record and tenders of payments to market items that the series says remains brian <unk> by the end of the year <eos> in addition <unk> mr. dingell said it 's straight age and uncertain the turnaround purchasing for differences in between next and pacific leaves <eos> michael <unk> inc. said it woul