# LSTM -- SOLUTION

In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
import time
import utils

### With or without GPU?

In [2]:
# Use the GPU when one is available; otherwise fall back to the CPU so that
# every later `.to(device)` call still works on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Load the Penn Tree Bank data (the tensor train_data should consist of 20 columns of ~50,000 words)

In [3]:
# Read the pre-tokenised training and test tensors from disk.
train_data = torch.load('../../data/PTB/data/train_data.pt')
test_data = torch.load('../../data/PTB/data/test_data.pt')

# Show both shapes, one per line.
print(train_data.size(), test_data.size(), sep='\n')

torch.Size([46479, 20])
torch.Size([4121, 20])


### Some constants associated with the data set

In [4]:
# bs:         batch size used for the LSTM state and for reshaping minibatches
# vocab_size: number of word indices (embedding rows / output scores)
bs, vocab_size = 20, 10000


### Make a recurrent net class

In [5]:
class three_layer_recurrent_net(nn.Module):
    """Word-level language model: Embedding -> LSTM -> Linear.

    Args:
        hidden_size: width of the word embeddings and of the LSTM state.
        n_vocab: number of distinct word indices, which is also the number of
            scores produced per position. When omitted, the notebook-level
            ``vocab_size`` is used, so ``three_layer_recurrent_net(hidden_size)``
            keeps working as before.
    """

    def __init__(self, hidden_size, n_vocab=None):
        super(three_layer_recurrent_net, self).__init__()

        # Fall back to the notebook-level constant only when the caller did not
        # choose a vocabulary size explicitly.
        if n_vocab is None:
            n_vocab = vocab_size

        self.layer1 = nn.Embedding(n_vocab, hidden_size)
        self.layer2 = nn.LSTM(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, n_vocab)

    def forward(self, word_seq, h_init, c_init):
        """Score the next word at every position of ``word_seq``.

        Args:
            word_seq: integer tensor of word indices; this notebook passes it
                as (seq_len, batch).
            h_init: initial LSTM hidden state; this notebook passes
                (1, batch, hidden_size).
            c_init: initial LSTM cell state, same shape as ``h_init``.

        Returns:
            Tuple ``(score_seq, h_final, c_final)``: the unnormalised scores
            over the vocabulary for every position, and the LSTM state after
            the last position (to be fed into the next call).
        """
        g_seq = self.layer1(word_seq)
        h_seq, (h_final, c_final) = self.layer2(g_seq, (h_init, c_init))
        score_seq = self.layer3(h_seq)

        return score_seq, h_final, c_final


### Build the net. Choose the hidden size to be 300. How many parameters in total?

In [6]:
# Width shared by the embedding vectors and the LSTM state.
hidden_size = 300

net = three_layer_recurrent_net(hidden_size=hidden_size)

# Show the layer structure, then the total number of parameters.
print(net)
utils.display_num_param(net)

three_layer_recurrent_net(
  (layer1): Embedding(10000, 300)
  (layer2): LSTM(300, 300)
  (layer3): Linear(in_features=300, out_features=10000, bias=True)
)
There are 6732400 (6.73 million) parameters in this neural network


### Send the weights of the networks to the GPU

In [7]:
# Move the network's parameters to the device selected earlier in the notebook.
net = net.to(device)

### Manually initialize the weights of the Embedding and Linear modules

In [8]:
# Re-draw the embedding table and the output-projection weights from U(-0.1, 0.1).
# Only these two weight matrices are touched; the in-place fill is done with
# autograd disabled so it is not recorded as part of any graph.
with torch.no_grad():
    net.layer1.weight.uniform_(-0.1, 0.1)
    net.layer3.weight.uniform_(-0.1, 0.1)

# End the cell on a print so no tensor is echoed as the cell output.
print('')




### Choose the criterion, as well as the following important hyperparameters: 
* initial learning rate = 5
* sequence length = 35

In [9]:
# Number of consecutive rows fed to the network per minibatch.
seq_length = 35

# Initial learning rate; the training cell lowers it as epochs go by.
my_lr = 5

# Cross-entropy between the predicted scores and the true next word.
criterion = nn.CrossEntropyLoss()

### Function to evaluate the network on the test set

In [10]:
def eval_on_test_set():
    """Print the perplexity of ``net`` on ``test_data``.

    The test tensor is read in consecutive windows of ``seq_length`` rows; the
    label for each row is the row that follows it. The LSTM state is carried
    from one window to the next, the cross-entropy of every window is averaged,
    and ``exp(mean loss)`` is printed.

    Uses the notebook-level names ``net``, ``test_data``, ``criterion``,
    ``seq_length``, ``hidden_size``, ``vocab_size`` and ``device``.
    Takes no arguments and returns nothing.
    """
    # Sizes come from the tensor itself rather than from hard-coded numbers or
    # the notebook-level `bs`, so evaluation still works after `bs` has been
    # rebound (the text-generation cell sets it to 1).
    num_rows, batch_size = test_data.size()

    # Labels are the inputs shifted by one row, so the last row can only be a label.
    last_input_row = num_rows - 1

    running_loss = 0
    num_batches = 0

    # Start from a zero state, created directly on the target device.
    h = torch.zeros(1, batch_size, hidden_size, device=device)
    c = torch.zeros(1, batch_size, hidden_size, device=device)

    # No parameter update happens here, so skip building the autograd graph.
    with torch.no_grad():
        for count in range(0, last_input_row - seq_length, seq_length):

            minibatch_data = test_data[count:count + seq_length].to(device)
            minibatch_label = test_data[count + 1:count + seq_length + 1].to(device)

            scores, h, c = net(minibatch_data, h, c)

            # Flatten (seq, batch) into one large batch for the criterion.
            minibatch_label = minibatch_label.view(batch_size * seq_length)
            scores = scores.view(batch_size * seq_length, vocab_size)

            loss = criterion(scores, minibatch_label)

            running_loss += loss.item()
            num_batches += 1

    total_loss = running_loss / num_batches
    print('test: exp(loss) = ', math.exp(total_loss)  )
        

### Do 8 passes through the training set.

In [11]:
start = time.time()

for epoch in range(8):

    # Keep the initial learning rate for the first two epochs (0 and 1), then
    # divide it by 3 at the start of every later epoch.
    # NOTE: this rebinds the notebook-level `my_lr`, so re-running this cell
    # without re-running the hyperparameter cell starts from the decayed rate.
    if epoch >= 2:
        my_lr = my_lr / 3

    # create a new optimizer at the beginning of each epoch: give the current learning rate.
    optimizer = torch.optim.SGD(net.parameters(), lr=my_lr)

    # set the running quantities to zero at the beginning of the epoch
    running_loss = 0
    num_batches = 0

    # set the initial h and c to be the zero vector, directly on the target device
    h = torch.zeros(1, bs, hidden_size, device=device)
    c = torch.zeros(1, bs, hidden_size, device=device)

    # Labels are the inputs shifted by one row, so the last row of train_data
    # (index train_data.size(0) - 1) can only be used as a label.
    for count in range(0, train_data.size(0) - 1 - seq_length, seq_length):

        # Set the gradients to zeros
        optimizer.zero_grad()

        # create a minibatch and send it to the device
        minibatch_data = train_data[count:count + seq_length].to(device)
        minibatch_label = train_data[count + 1:count + seq_length + 1].to(device)

        # Detach the carried state so backpropagation stops at the start of
        # this window instead of reaching back to the beginning of the epoch.
        # The state itself does not need requires_grad: the network parameters
        # already require gradients, so the forward pass is recorded anyway.
        h = h.detach()
        c = c.detach()

        # forward the minibatch through the net
        scores, h, c = net(minibatch_data, h, c)

        # reshape the scores and labels to huge batch of size bs*seq_length
        scores = scores.view(bs * seq_length, vocab_size)
        minibatch_label = minibatch_label.view(bs * seq_length)

        # Compute the average of the losses of the data points in this huge batch
        loss = criterion(scores, minibatch_label)

        # backward pass: gradients of the loss with respect to the net's parameters
        loss.backward()

        # Post-process the gradients with the project helper before the update
        # (presumably a gradient-norm rescaling; see utils.normalize_gradient),
        # then do one step of stochastic gradient descent.
        utils.normalize_gradient(net)
        optimizer.step()

        # update the running loss
        running_loss += loss.item()
        num_batches += 1

    # compute stats for the full training set
    total_loss = running_loss / num_batches
    elapsed = time.time() - start

    print('')
    print('epoch=',epoch, '\t time=', elapsed,'\t lr=', my_lr, '\t exp(loss)=',  math.exp(total_loss))
    eval_on_test_set()



epoch= 0 	 time= 13.378747701644897 	 lr= 5 	 exp(loss)= 278.80329535959396
test: exp(loss) =  176.10091439623466

epoch= 1 	 time= 27.119251489639282 	 lr= 5 	 exp(loss)= 127.2893813062593
test: exp(loss) =  133.56880609067883

epoch= 2 	 time= 41.00815463066101 	 lr= 1.6666666666666667 	 exp(loss)= 81.5507055656398
test: exp(loss) =  114.37825619482055

epoch= 3 	 time= 54.876781940460205 	 lr= 0.5555555555555556 	 exp(loss)= 67.34177944883491
test: exp(loss) =  110.70630542720009

epoch= 4 	 time= 69.02088618278503 	 lr= 0.1851851851851852 	 exp(loss)= 62.48224239962807
test: exp(loss) =  109.20213748226952

epoch= 5 	 time= 83.68946361541748 	 lr= 0.0617283950617284 	 exp(loss)= 60.721349522223306
test: exp(loss) =  108.41556901626859

epoch= 6 	 time= 98.10883212089539 	 lr= 0.0205761316872428 	 exp(loss)= 60.07901541829639
test: exp(loss) =  107.99166749728593

epoch= 7 	 time= 112.47274374961853 	 lr= 0.006858710562414266 	 exp(loss)= 59.84645040000237
test: exp(loss) =  107.73

### Generate text using the trained language model

In [12]:
idx2word  =  torch.load('../../data/PTB/data/idx2word.pt')

# Generate one sequence at a time.
# NOTE: this rebinds the notebook-level `bs` (set to 20 in the constants cell);
# re-run that cell before re-running the training cell.
bs=1

# initialize h and c to zero, directly on the target device
h = torch.zeros(1, bs, hidden_size, device=device)
c = torch.zeros(1, bs, hidden_size, device=device)

# pick the first word at random
# random_(from, to) draws from [from, to - 1], so the upper bound must be
# vocab_size for the last word index (vocab_size - 1) to be eligible.
word_idx=torch.LongTensor(1,1).random_(0,vocab_size)
word_idx=word_idx.to(device)

# Sampling needs no gradients; without no_grad the autograd graph would keep
# growing through h and c over all 500 steps.
with torch.no_grad():
    for i in range(500):

        # compute the scores used to predict what is the next word
        scores , h, c = net(word_idx , h, c)

        # use a softmax to get a probability distribution over the vocabulary
        probs=F.softmax(scores,dim=2)

        # sample
        probs=probs.squeeze().cpu()
        word_idx = torch.multinomial(probs, 1)

        # sample with temperature
        #  word_weights = scores.squeeze().div(1).exp().cpu()
        #  word_idx = torch.multinomial(word_weights, 1)

        # convert the word number to the actual string it corresponds to
        word = idx2word[ word_idx.item() ]
        print(word, end=' ')

        # put back in the right format
        word_idx=word_idx.view(1,1).to(device)
    


krenz asserts they began <unk> by exceeds two firms in parliament <eos> walter <unk> an official at texas air corp. and nbc intelligence union finance in metromedia said that manufactured competition anti-abortion <unk> <unk> increases rate on medical expenses and unemployment <eos> that happy authority runs <unk> members than the expense should be plays snapped into the south sea <eos> diet partner <eos> white house plans for most leveraged ventures all moscow wo n't boost <unk> laws if a local party will <unk> their portions of funds to cover those extent for their bills <eos> as a result the japanese government is now paying up an injunction in hutchinson stations in california that some figures are trying to cope with other currencies but said that price caused by the radio guard and very heavy use of <unk> would garden noriega shortages if any democrats cut upper <unk> admits <eos> in national over-the-counter trading yesterday new york the $ <unk> central agreement on the interna