# Lab 01: Language Modeling with Transformers - Demo

In [1]:
# For Google Colaboratory
import sys, os
if 'google.colab' in sys.modules:
    # Running inside Google Colab: mount Google Drive so the lab folder is reachable.
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_file = '/content/gdrive/My Drive/ACE_NLP_Jul23_codes/codes/labs_lecture09/lab01_language_model/'
    print(path_to_file)
    # Work from inside the lab folder (presumably so that the `utils` import
    # and the dataset paths used by later cells resolve -- confirm on your Drive layout).
    os.chdir(path_to_file)
    # Pure-Python equivalent of the `!pwd` shell escape: keeps the cell valid
    # Python when the notebook is exported as a script.
    print(os.getcwd())

Mounted at /content/gdrive
/content/gdrive/My Drive/ACE_NLP2022_notebooks/labs_lecture09/lab01_language_model/
/content/gdrive/My Drive/ACE_NLP2022_notebooks/labs_lecture09/lab01_language_model


In [2]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
import time
import utils

### GPU

It is recommended to run this code on GPU:<br> 
* Time for 1 epoch on GPU : 48 sec w/ Google Colab Tesla P100-PCIE-16GB <br>

In [3]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so that
# the later `.to(device)` calls do not fail on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

if device.type == "cuda":
    print('cuda available with GPU:',torch.cuda.get_device_name(0))

cuda
cuda available with GPU: Tesla P100-PCIE-16GB


### Download Penn Tree Bank

The tensor train_data consists of 20 columns of 46,479 words.<br>
The tensor test_data consists of 20 columns of 4,121 words.

In [4]:
from utils import check_ptb_dataset_exists

# The helper returns the path prefix under which the `ptb/` folder is expected
# (presumably it also fetches the dataset when missing -- see utils).
data_path = check_ptb_dataset_exists()

# Load the two splits as tensors.
train_data = torch.load(data_path+'ptb/train_data.pt')
test_data = torch.load(data_path+'ptb/test_data.pt')

# One size per line: train first, then test.
print(train_data.size(), test_data.size(), sep='\n')

torch.Size([46479, 20])
torch.Size([4121, 20])


### Some constants associated with the data set

In [5]:
# Batch size: number of columns of train_data / test_data.
bs = 20
# Vocabulary size: number of rows of the embedding table and of output scores per word.
vocab_size = 10_000

### Make an attention net class

In [6]:

def generate_positional_encoding(seq_length, dim):
    """Build a sinusoidal positional-encoding table.

    Args:
        seq_length: number of positions (rows of the table).
        dim: encoding width (columns of the table); must be even.

    Returns:
        Float tensor of size (seq_length, dim). Even columns hold
        sin(position * freq) and odd columns hold cos(position * freq), with
        freq = exp(-k * ln(10000) / dim) for k = 0, 2, 4, ...

    Raises:
        AssertionError: if `dim` is odd.
    """
    assert dim % 2 == 0 # sin/cos columns come in pairs
    positions = torch.arange(0, seq_length, dtype=torch.float).unsqueeze(1) # size=(seq_length, 1)
    freqs = torch.exp(torch.arange(0, dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / dim)) # size=(dim/2,)
    angles = positions * freqs # size=(seq_length, dim/2)
    table = torch.zeros(seq_length, dim)
    table[:, 0::2] = torch.sin(angles)
    table[:, 1::2] = torch.cos(angles)
    return table
        
    
class Transformer_decoder(nn.Module):
    """One attention block applied to the second half of a sequence with a sliding window.

    Word embeddings are concatenated with their positional encodings and mixed
    by a linear layer. Every position t in [seq_length//2, seq_length) then
    queries the seq_length//2 positions (t - seq_length//2, t], i.e. itself and
    the words just before it. A residual connection + LayerNorm, a two-layer
    MLP and a second residual connection + LayerNorm follow.

    Args:
        hidden_size: feature width; must be divisible by `nb_heads`.
        nb_heads: number of attention heads.
    """

    def __init__(self, hidden_size, nb_heads):
        super(Transformer_decoder, self).__init__()
        assert hidden_size == nb_heads* (hidden_size//nb_heads) # check if hidden_size is divisible by nb_heads
        self.MHA = nn.MultiheadAttention(hidden_size, nb_heads)
        self.LLcat = nn.Linear(2*hidden_size, hidden_size)
        self.LL1 = nn.Linear(hidden_size, hidden_size)
        self.LL2 = nn.Linear(hidden_size, hidden_size)
        self.LN1 = nn.LayerNorm(hidden_size)
        self.LN2 = nn.LayerNorm(hidden_size)

    def forward(self, g_seq , pos):
        """Compute hidden states for the positions seq_length//2 ... seq_length-1.

        Args:
            g_seq: word embeddings, size=(seq_length, bs, hidden_dim).
            pos: positional encodings, size=(seq_length, hidden_dim).

        Returns:
            Tensor of size (seq_length - seq_length//2, bs, hidden_dim) with
            every size-1 dimension squeezed out (so bs == 1 yields a 2-D
            tensor). The squeeze is inherited from the original
            implementation and kept because callers may rely on it.
        """
        seq_length = g_seq.size(0)
        bs = g_seq.size(1)
        half = seq_length//2
        pos = pos.unsqueeze(dim=1).expand(-1, bs, -1) # size=(seq_length, bs, hidden_dim), broadcast view (no copy)
        h_cat = self.LLcat(torch.cat( (g_seq, pos), dim=2 )) # size=(seq_length, bs, hidden_dim)
        # All queries are processed in one attention call instead of one call
        # per time step: the keys/values are projected once rather than
        # seq_length/2 times, and the result is the same.
        query = h_cat[half:] # size=(seq_length-half, bs, hidden_dim)
        t_idx = torch.arange(half, seq_length, device=g_seq.device).unsqueeze(1) # query positions, size=(seq_length-half, 1)
        k_idx = torch.arange(seq_length, device=g_seq.device).unsqueeze(0) # key positions, size=(1, seq_length)
        # True=no_attention. The query at position t may only attend to keys in
        # (t-half, t]: future words and words older than the window are masked.
        attn_mask = (k_idx > t_idx) | (k_idx <= t_idx - half) # size=(seq_length-half, seq_length)
        h_MHA, _ = self.MHA(query, h_cat, h_cat, attn_mask=attn_mask) # size=(seq_length-half, bs, hidden_dim)
        h_MHA_seq = h_MHA.squeeze()
        h_seq = query.squeeze()
        h = self.LN1( h_seq + h_MHA_seq ) # residual + norm, size=(seq_length-half, bs, hidden_dim) before squeezing
        h_MLP = self.LL2(torch.relu(self.LL1(h))) # same size as h
        h_seq = self.LN2( h + h_MLP ) # same size as h
        return h_seq
    
    
class ANN(nn.Module):
    """Attention network body: a thin wrapper around one `Transformer_decoder`.

    Args:
        hidden_size: feature width passed to the decoder.
        nb_heads: number of attention heads passed to the decoder.
    """

    def __init__(self, hidden_size, nb_heads):
        super().__init__()
        self.decoder = Transformer_decoder(hidden_size, nb_heads)

    def forward(self, g_seq , pos ):
        """Return the decoder output for embeddings `g_seq` and positional encodings `pos`."""
        return self.decoder(g_seq, pos)
    

class attention_net(nn.Module):
    """Language model: word embedding -> attention block -> linear scores over the vocabulary.

    The vocabulary size is read from the module-level `vocab_size`.

    Args:
        hidden_size: embedding / hidden feature width.
        nb_heads: number of attention heads of the attention block.
    """

    def __init__(self, hidden_size, nb_heads):
        super().__init__()
        self.layer1 = nn.Embedding(vocab_size, hidden_size)
        self.layer2 = ANN(hidden_size, nb_heads)
        self.layer3 = nn.Linear(hidden_size, vocab_size)

    def forward(self, word_seq, pos ):
        """Score the next word for the positions kept by the attention block.

        Args:
            word_seq: word indices, size=(seq_length, bs).
            pos: positional encodings, size=(seq_length, hidden_dim).

        Returns:
            Scores whose last dimension is vocab_size. The attention block only
            emits the positions seq_length//2 ... seq_length-1, so the leading
            dimension is seq_length - seq_length//2, not seq_length.
        """
        embedded = self.layer1(word_seq) # size=(seq_length, bs, hidden_dim)
        hidden = self.layer2(embedded, pos) # second half of the sequence only
        return self.layer3(hidden)


### Function to evaluate the network on the test set

In [7]:
def eval_on_test_set():
    """Print the perplexity exp(mean loss) of `net` on `test_data`.

    Reads the module-level `net`, `test_data`, `seq_length`, `hidden_size`,
    `vocab_size`, `criterion` and `device`. Windows of `seq_length` words are
    taken every `seq_length//2` words; only the second half of each window is
    scored. The train/eval mode that `net` had on entry is restored on exit.
    """
    was_training = net.training
    net.eval()

    running_loss=0
    num_batches=0

    # The positional encoding depends only on (seq_length, hidden_size):
    # build it and move it to the device once instead of once per window.
    pos = generate_positional_encoding(seq_length, hidden_size).to(device)

    # Last usable start index: the label window is shifted by one word, so one
    # word must remain after the data window (4120 for the 4121-row test set).
    last_index = test_data.size(0) - 1

    # No gradients are needed for evaluation; skipping the autograd graph saves memory and time.
    with torch.no_grad():
        for count in range( 0 , last_index-seq_length ,  seq_length//2) :

            minibatch_data = test_data[ count   : count+seq_length   ].to(device)
            minibatch_label = test_data[ count+1 : count+seq_length+1 ].to(device)

            scores = net( minibatch_data, pos )

            # keep the predictions / labels of the second half of the window
            scores = scores[-seq_length//2:,:,:]
            minibatch_label = minibatch_label[-seq_length//2:,:]

            # flatten (positions, batch) into one big batch; reshape(-1) avoids
            # hard-coding bs*seq_length//2, which is wrong for odd seq_length
            minibatch_label = minibatch_label.reshape(-1)
            scores = scores.reshape(-1, vocab_size)

            loss = criterion(scores, minibatch_label)

            running_loss += loss.item()
            num_batches += 1

    # give the network back in the mode the caller had it in
    net.train(was_training)

    total_loss = running_loss/num_batches
    print('test: exp(loss) = ', math.exp(total_loss)  )


### Build the net. Choose the hidden size to be 128 and the number of heads to be 16. 
### How many parameters in total?

In [8]:
# Model width and number of attention heads (128 is divisible by 16, as the decoder requires).
hidden_size, nb_heads = 128, 16

net = attention_net(hidden_size=hidden_size, nb_heads=nb_heads)
print(net)
utils.display_num_param(net)

attention_net(
  (layer1): Embedding(10000, 128)
  (layer2): ANN(
    (decoder): Transformer_decoder(
      (MHA): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (LLcat): Linear(in_features=256, out_features=128, bias=True)
      (LL1): Linear(in_features=128, out_features=128, bias=True)
      (LL2): Linear(in_features=128, out_features=128, bias=True)
      (LN1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (LN2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    )
  )
  (layer3): Linear(in_features=128, out_features=10000, bias=True)
)
There are 2702480 (2.70 million) parameters in this neural network


### Send the network to the GPU

In [9]:
# Move the network to the device selected earlier in the notebook.
net = net.to(device)

### Choose the loss to be the cross-entropy and the optimizer to be Adam, as well as the following important hyperparameters: 
* initial learning rate = 0.001
* sequence length = 30

In [10]:
# Hyperparameters: initial learning rate and length of each training window.
my_lr = 0.001
seq_length = 30

# Cross-entropy over the vocabulary, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=net.parameters(), lr=my_lr)

### Do 10 passes through the training set
### Observe the train perplexity and the test perplexity

In [11]:

start=time.time()

# The positional encoding depends only on (seq_length, hidden_size): build it
# and send it to the device once instead of at every training step.
pos = generate_positional_encoding(seq_length, hidden_size).to(device) # size=(seq_length, hidden_dim)

for epoch in range(10):

    # eval_on_test_set() switches the network to eval mode at the end of each
    # epoch, so put it back in training mode before training again
    net.train()

    # from the third epoch on (epoch >= 2), divide the learning rate by 1.1
    if epoch >= 2:
        optimizer.param_groups[0]['lr'] /= 1.1
        my_lr = optimizer.param_groups[0]['lr']

    # set the running quantities to zero at the beginning of the epoch
    running_loss=0
    num_batches=0
    # windows start every seq_length//2 words; the labels are shifted by one
    # word, hence the "- 1" (46478 for the 46479-row training set)
    for count in range( 0 , train_data.size(0)-1-seq_length ,  seq_length//2):

        # Set the gradients to zeros
        optimizer.zero_grad()

        # create a minibatch (labels = inputs shifted by one word) and send it to the device
        minibatch_data = train_data[ count   : count+seq_length   ].to(device)
        minibatch_label = train_data[ count+1 : count+seq_length+1 ].to(device)

        # forward the minibatch through the net
        scores = net( minibatch_data, pos ) # size=(seq_length/2, bs, vocab_size)

        # select the predicted words that used a window of attention of seq_length//2
        scores = scores[-seq_length//2:,:,:]
        minibatch_label = minibatch_label[-seq_length//2:,:]

        # reshape the scores and labels to one huge batch of size bs*seq_length/2
        scores = scores.reshape(-1, vocab_size) # size=(seq_length/2.bs, vocab_size)
        minibatch_label = minibatch_label.reshape(-1) # size=(seq_length/2.bs,)

        # Compute the average of the losses of the data points in this huge batch
        loss = criterion(scores, minibatch_label)

        # backward pass to compute the gradient of the loss w.r.t. every parameter
        loss.backward()

        # do one Adam update of the parameters
        optimizer.step()

        # update the running loss
        running_loss += loss.item()
        num_batches += 1

    # compute stats for the full training set
    total_loss = running_loss/num_batches
    elapsed = time.time()-start

    print('')
    print('epoch=',epoch, '\t time=', elapsed,'\t lr=', my_lr, '\t exp(loss)=',  math.exp(total_loss))
    eval_on_test_set() 



epoch= 0 	 time= 49.26979398727417 	 lr= 0.001 	 exp(loss)= 276.40929708125935
test: exp(loss) =  198.14782446824466

epoch= 1 	 time= 100.38009977340698 	 lr= 0.001 	 exp(loss)= 148.85428102508214
test: exp(loss) =  168.56028629306493

epoch= 2 	 time= 151.7474946975708 	 lr= 0.0009090909090909091 	 exp(loss)= 112.01972940235004
test: exp(loss) =  159.53060590431687

epoch= 3 	 time= 202.7228238582611 	 lr= 0.0008264462809917355 	 exp(loss)= 92.53003794418727
test: exp(loss) =  158.29914484150208

epoch= 4 	 time= 253.9203450679779 	 lr= 0.0007513148009015777 	 exp(loss)= 80.46591845356971
test: exp(loss) =  159.69302774158947

epoch= 5 	 time= 305.2703056335449 	 lr= 0.0006830134553650705 	 exp(loss)= 72.19765247455692
test: exp(loss) =  162.3745335948197

epoch= 6 	 time= 356.46911120414734 	 lr= 0.0006209213230591549 	 exp(loss)= 66.11349120495728
test: exp(loss) =  165.57647032396972

epoch= 7 	 time= 407.6614124774933 	 lr= 0.0005644739300537772 	 exp(loss)= 61.41710343891095
te

### Choose one sentence (taken from the test set)

In [14]:
# Candidate prompts (lower-case, space-separated tokens); the network is asked to predict the word that follows.
sentence1 = "some analysts expect oil prices to remain relatively"

sentence2 = "over the next days and weeks they say investors should look for stocks to"

sentence3 = "prices averaging roughly $ N a barrel higher in the third"

sentence4 = "i think my line has been very consistent mrs. hills said at a news"

sentence5 = "this appears particularly true at gm which had strong sales in"

# Or write your own sentence: no capital letters, no punctuation, and every word must belong to the vocabulary.
sentence6= "he was very"

# Select here the sentence used by the prediction cell below.
mysentence = sentence1

### Convert the sentence into a vector, then send it to the GPU, and display the network prediction for the next word

In [15]:
# Encode the sentence as a tensor of word indices
# (assumed to be a column of size (n_words, 1) -- TODO confirm against utils.sentence2vector).
minibatch_data = utils.sentence2vector(mysentence)
# The network only emits predictions for the second half of its input, each
# with an attention window of half the input length. Feeding the sentence
# twice therefore gives every word of the sentence a window of n_words.
minibatch_data = torch.cat((minibatch_data, minibatch_data), dim=0) # copy-paste the test sequence to use the same attention window size for each word
pos = generate_positional_encoding(minibatch_data.size(0), hidden_size)

minibatch_data = minibatch_data.to(device)
pos = pos.to(device)

net.eval()
# Inference only: skip building the autograd graph.
with torch.no_grad():
    scores = net( minibatch_data, pos )
scores = scores[-1,:] # select the last score vector for the prediction of the next word from the input sequence

print(mysentence, '... \n')
utils.show_next_word(scores.unsqueeze(0).unsqueeze(0))


some analysts expect oil prices to remain relatively ... 

20.5%	 flat
10.2%	 strong
7.7%	 narrow
3.5%	 <unk>
3.2%	 healthy
3.0%	 sluggish
2.6%	 quiet
2.0%	 genetic
1.7%	 fixed
1.4%	 revised
1.3%	 profit
1.3%	 soft
1.3%	 active
1.3%	 high
1.2%	 positive
1.1%	 stable
1.0%	 big
1.0%	 car
1.0%	 bad
0.9%	 close
0.8%	 heavy
0.7%	 cautious
0.7%	 brisk
0.6%	 poor
0.6%	 increasingly
0.5%	 short
0.5%	 low
0.5%	 full
0.5%	 late
0.5%	 thin
