# Lab 01: Vanilla RNN - demo

In [6]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
import time
import utils

PTB dataset missing - generating...


### With or without GPU?

It is recommended to run this code on GPU:<br> 
* Time for 1 epoch on CPU : 153 sec ( 2.55 min)<br> 
* Time for 1 epoch on GPU : 8.4 sec w/ GeForce GTX 1080 Ti <br>

In [7]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs end to end (only slower) on a machine without CUDA.
# To force a device, replace the expression below with torch.device("cuda")
# or torch.device("cpu").
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Download Penn Tree Bank

The tensor train_data consists of 20 columns of 46,479 words.<br>
The tensor test_data consists of 20 columns of 4,121 words.

In [8]:
from utils import check_ptb_dataset_exists

# The helper returns the directory prefix under which the 'ptb/' folder lives.
data_path = check_ptb_dataset_exists()

# Load the pre-tokenised training and test tensors from disk.
train_data = torch.load(data_path + 'ptb/train_data.pt')
test_data = torch.load(data_path + 'ptb/test_data.pt')

# Show both tensor sizes, one per line.
print(train_data.size(), test_data.size(), sep='\n')

torch.Size([46479, 20])
torch.Size([4121, 20])


### Some constants associated with the data set

In [9]:
# bs: batch size, i.e. the number of columns of train_data / test_data.
# vocab_size: number of distinct word indices the model embeds and scores.
bs, vocab_size = 20, 10000

### Make a recurrent net class

In [51]:
class three_layer_recurrent_net(nn.Module):
    """Word-level language model: embedding -> vanilla RNN -> linear scores.

    Args:
        hidden_size: Width of the word embeddings and of the RNN hidden state.
        num_words: Number of distinct words, i.e. the number of rows of the
            embedding table and the number of scores produced per position.
            When omitted, the notebook-level ``vocab_size`` is used.
    """

    def __init__(self, hidden_size, num_words=None):
        super().__init__()

        if num_words is None:
            # Backward-compatible default: the notebook-wide vocabulary size.
            num_words = vocab_size

        # Word index -> dense vector of size hidden_size.
        self.layer1 = nn.Embedding(num_words, hidden_size)

        # nn.RNN(input_size, hidden_size) owns an input-to-hidden matrix
        # (hidden x input), a hidden-to-hidden matrix (hidden x hidden) and
        # TWO bias vectors (b_ih and b_hh), each of length hidden.
        # With input = hidden = 150 that is 150*150 + 150*150 + 2*150 = 45300
        # parameters. The output projection is not part of nn.RNN; it is
        # layer3 below.
        self.layer2 = nn.RNN(hidden_size, hidden_size)

        # Hidden state -> one score per word of the vocabulary.
        self.layer3 = nn.Linear(hidden_size, num_words)

    def forward(self, word_seq, h_init):
        """Score the next word at every position of a batch of sequences.

        Args:
            word_seq: Integer tensor of word indices, shape (seq_len, batch).
            h_init: Initial hidden state, shape (1, batch, hidden_size).

        Returns:
            A pair ``(score_seq, h_final)`` where ``score_seq`` has shape
            (seq_len, batch, num_words) and ``h_final`` is the hidden state
            after the last position, shape (1, batch, hidden_size).
        """
        g_seq = self.layer1(word_seq)
        h_seq, h_final = self.layer2(g_seq, h_init)
        score_seq = self.layer3(h_seq)

        return score_seq, h_final


### Build the net. Choose the hidden size to be 150. How many parameters in total?

In [53]:
# Width of the embeddings and of the RNN hidden state.
hidden_size = 150

# Instantiate the model, print its layer summary, then report how many
# parameters it holds.
net = three_layer_recurrent_net(hidden_size)
print(net)
utils.display_num_param(net)

three_layer_recurrent_net(
  (layer2): RNN(150, 150)
)
There are 45300 (0.05 million) parameters in this neural network


### Send the weights of the network to the GPU

In [18]:
# Move every parameter of the model onto the selected device; the inputs and
# the hidden state are moved to the same device before each forward pass.
net = net.to(device)

### Manually initialize the weights of the Embedding and Linear modules

In [19]:
# Re-draw the embedding table and the output-projection weights uniformly in
# [-0.1, 0.1]. nn.init.uniform_ fills the tensor in place with autograd
# recording disabled, so it does not become part of any graph.
nn.init.uniform_(net.layer1.weight, -0.1, 0.1)
nn.init.uniform_(net.layer3.weight, -0.1, 0.1)

# Ending the cell with a print keeps the tensor repr out of the cell output.
print('')




### Choose the criterion, as well as the following important hyperparameters: 
* initial learning rate = 1
* sequence length = 35

In [20]:
# Cross-entropy between the predicted word scores and the true next word.
criterion = nn.CrossEntropyLoss()

# my_lr: initial SGD learning rate (decayed by the training loop).
# seq_length: number of consecutive words fed to the RNN per minibatch.
my_lr, seq_length = 1, 35

### Function to evaluate the network on the test set

In [21]:
def eval_on_test_set():
    """Print exp(mean cross-entropy) of ``net`` over ``test_data``.

    The test tensor is walked in consecutive windows of ``seq_length`` rows.
    The labels of a window are the same rows shifted by one position, so each
    word is scored against the word that follows it. The hidden state is
    carried from one window to the next, starting from zeros.

    Reads the notebook globals ``net``, ``test_data``, ``bs``, ``hidden_size``,
    ``seq_length``, ``vocab_size``, ``criterion`` and ``device``.
    Returns nothing; the result is printed.
    """
    running_loss = 0
    num_batches = 0

    h = torch.zeros(1, bs, hidden_size)
    h = h.to(device)

    # Last row index of the test tensor. Derived from the data rather than
    # hard-coded, so the function also works on a test set of another length.
    last_row = test_data.size(0) - 1

    # No gradients are needed for evaluation: skipping autograd bookkeeping
    # saves memory and time and makes detaching the hidden state unnecessary.
    with torch.no_grad():
        for count in range(0, last_row - seq_length, seq_length):

            minibatch_data = test_data[count: count + seq_length]
            minibatch_label = test_data[count + 1: count + seq_length + 1]

            minibatch_data = minibatch_data.to(device)
            minibatch_label = minibatch_label.to(device)

            scores, h = net(minibatch_data, h)

            # Flatten (seq_length, bs, ...) into one batch of bs*seq_length.
            minibatch_label = minibatch_label.view(bs * seq_length)
            scores = scores.view(bs * seq_length, vocab_size)

            loss = criterion(scores, minibatch_label)

            running_loss += loss.item()
            num_batches += 1

    total_loss = running_loss / num_batches
    print('test: exp(loss) = ', math.exp(total_loss)  )
        

### Do 10 passes through the training set (100 passes would reach exp(loss) ≈ 135 on the test set)

In [22]:
start = time.time()

# Last row index of the training tensor. Derived from the data rather than
# hard-coded, so the loop also works on a training set of another length.
last_train_row = train_data.size(0) - 1

for epoch in range(10):

    # Keep the learning rate at its initial value during the first 4 epochs,
    # then divide it by 1.1 at every epoch.
    if epoch >= 4:
        my_lr = my_lr / 1.1

    # Create a new optimizer with the current learning rate.
    optimizer = torch.optim.SGD(net.parameters(), lr=my_lr)

    # Reset the running quantities at the beginning of the epoch.
    running_loss = 0
    num_batches = 0

    # Start every epoch from a zero hidden state, placed on the same device
    # as the network.
    h = torch.zeros(1, bs, hidden_size)
    h = h.to(device)

    for count in range(0, last_train_row - seq_length, seq_length):

        # Set the gradients to zero.
        optimizer.zero_grad()

        # Create a minibatch: the labels are the inputs shifted by one row,
        # i.e. the next word at every position.
        minibatch_data = train_data[count: count + seq_length]
        minibatch_label = train_data[count + 1: count + seq_length + 1]

        # Send them to the device.
        minibatch_data = minibatch_data.to(device)
        minibatch_label = minibatch_label.to(device)

        # Detach the hidden state so backpropagation stops at the start of
        # this window instead of running back to the beginning of the epoch,
        # then mark it as a fresh leaf that autograd tracks.
        h = h.detach()
        h = h.requires_grad_()

        # Forward the minibatch through the net.
        scores, h = net(minibatch_data, h)

        # Reshape the scores and labels into one big batch of bs*seq_length.
        scores = scores.view(bs * seq_length, vocab_size)
        minibatch_label = minibatch_label.view(bs * seq_length)

        # Average loss over all positions of this big batch.
        loss = criterion(scores, minibatch_label)

        # Backward pass: compute the gradient of the loss with respect to
        # every parameter of the network.
        loss.backward()

        # Post-process the gradients with the helper from utils (presumably
        # a gradient-norm rescaling against exploding gradients; confirm in
        # utils.normalize_gradient), then take one SGD step.
        utils.normalize_gradient(net)
        optimizer.step()

        # Update the running loss.
        running_loss += loss.item()
        num_batches += 1

    # Compute stats for the full training set.
    total_loss = running_loss / num_batches
    elapsed = time.time() - start

    print('')
    print('epoch=',epoch, '\t time=', elapsed,'\t lr=', my_lr, '\t exp(loss)=',  math.exp(total_loss))
    eval_on_test_set()



epoch= 0 	 time= 7.427802562713623 	 lr= 1 	 exp(loss)= 524.3818693205285
test: exp(loss) =  340.84088626450574

epoch= 1 	 time= 15.053393840789795 	 lr= 1 	 exp(loss)= 270.36501405559807
test: exp(loss) =  245.98539701803836

epoch= 2 	 time= 22.62943148612976 	 lr= 1 	 exp(loss)= 210.21103129826412
test: exp(loss) =  215.5687713356677

epoch= 3 	 time= 30.193374395370483 	 lr= 1 	 exp(loss)= 180.24771561879214
test: exp(loss) =  198.76079284285197

epoch= 4 	 time= 37.75776267051697 	 lr= 0.9090909090909091 	 exp(loss)= 159.03400924079358
test: exp(loss) =  182.7379972033081

epoch= 5 	 time= 45.340163707733154 	 lr= 0.8264462809917354 	 exp(loss)= 144.1562339657559
test: exp(loss) =  174.56807243262304

epoch= 6 	 time= 52.88029503822327 	 lr= 0.7513148009015777 	 exp(loss)= 133.40008842962203
test: exp(loss) =  168.83114148420105

epoch= 7 	 time= 60.41352105140686 	 lr= 0.6830134553650705 	 exp(loss)= 125.25672247045354
test: exp(loss) =  162.78393364152205

epoch= 8 	 time= 68.

### Choose one sentence (taken from the test set)

In [27]:
# Candidate prompts; the network predicts the word that follows the prompt.
sentence1 = "some analysts expect oil prices to remain relatively"
sentence2 = "over the next days and weeks they say investors should look for stocks to"
sentence3 = "prices averaging roughly $ N a barrel higher in the third"
sentence4 = "i think my line has been very consistent mrs. hills said at a news"
sentence5 = "this appears particularly true at gm which had strong sales in"

# A custom prompt. Rules: lower case only, no punctuation, and every word
# must belong to the allowed vocabulary.
sentence6 = "he was very"

# SELECT THE SENTENCE HERE
mysentence = sentence6

### Convert the sentence into a vector, then send to GPU

In [28]:
# Encode the chosen sentence with the utils helper and move the result to
# the same device as the network.
minibatch_data = utils.sentence2vector(mysentence).to(device)

print(minibatch_data)

tensor([[225],
        [ 54],
        [176]], device='cuda:0')


### Set the initial hidden state to zero, then run the RNN.

In [29]:
# A single sentence is a batch of size 1, so the zero initial hidden state
# has shape (1, 1, hidden_size); it lives on the same device as the network.
h = torch.zeros(1, 1, hidden_size).to(device)

# Run the whole sentence through the network in one call.
scores, h = net(minibatch_data, h)

### Display the network prediction for the next word

In [30]:
# Echo the prompt followed by an ellipsis and a blank line.
print(mysentence, '... \n')

# Let the utils helper display the candidate next words computed from the
# scores returned by the network.
utils.show_next_word(scores)

he was very ... 

6.9%	 <unk>
5.8%	 good
5.4%	 much
2.9%	 difficult
2.0%	 clear
1.7%	 important
1.6%	 high
1.5%	 positive
1.3%	 well
1.2%	 little
1.1%	 few
1.1%	 very
1.0%	 the
0.9%	 a
0.9%	 strong
0.8%	 vulnerable
0.8%	 bad
0.8%	 complicated
0.7%	 low
0.6%	 more
0.6%	 hard
0.6%	 expensive
0.6%	 long
0.5%	 likely
0.5%	 serious
0.5%	 expected
0.5%	 surprising
0.5%	 too
0.5%	 successful
0.5%	 part
