# Lab 02: Sequence-To-Sequence with Transformers - Demo

### The task is to learn to memorize an input  sequence of length 100 and output the same sequence of length 100 but shifted by one word in the future.
For example, the input sequence is "some analysts expect oil prices to remain relatively"<br>
and the output sequence is "analysts expect oil prices to remain relatively high".

In [50]:
# For Google Colaboratory
import sys, os
if 'google.colab' in sys.modules:
    # mount google drive
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_file = '/content/gdrive/My Drive/ACE_NLP_Jul23_codes/codes/labs_lecture09/lab02_translation/'
    print(path_to_file)
    # change current path to the folder containing "file_name"
    os.chdir(path_to_file)
    !pwd

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/ACE_NLP2022_notebooks/labs_lecture09/lab02_translation/
/content/gdrive/My Drive/ACE_NLP2022_notebooks/labs_lecture09/lab02_translation


In [51]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
import time
import utils

### GPU

It is recommended to run this code on GPU:<br> 
* Time for 1 epoch on GPU : 1.5 sec w/ Google Colab Tesla P100-PCIE-16GB <br>

In [52]:
device= torch.device("cuda")
#device= torch.device("cpu")
print(device)

if torch.cuda.is_available():
    print('cuda available with GPU:',torch.cuda.get_device_name(0))

cuda


### Download Penn Tree Bank

The tensor train_data consists of 20 columns of 46,479 words.<br>
The tensor test_data consists of 20 columns of 4,121 words.

In [53]:
from utils import check_ptb_dataset_exists
data_path=check_ptb_dataset_exists()

train_data  =  torch.load(data_path+'ptb/train_data.pt')
test_data   =  torch.load(data_path+'ptb/test_data.pt')

print(  train_data.size()  )
print(  test_data.size()   )

torch.Size([46479, 20])
torch.Size([4121, 20])


### Extract a small part of PTB

In [54]:
doc_len = 501
train_data = train_data[:doc_len,:]
print(  train_data.size()  )

torch.Size([501, 20])


### Some constants associated with the data set

In [55]:
bs = 20
vocab_size = 10000

### Make an attention net class

In [56]:

def generate_positional_encoding(seq_length, dim):
    assert dim == 2* (dim//2) # check if dim is divisible by 2
    pe = torch.zeros(seq_length, dim)
    position = torch.arange(0, seq_length, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / dim))
    pe[:,0::2] = torch.sin(position * div_term)
    pe[:,1::2] = torch.cos(position * div_term)
    return pe        
        
    
class Transformer_decoder(nn.Module):
    
    def __init__(self, hidden_size, nb_heads):
        super(Transformer_decoder, self).__init__()
        assert hidden_size == nb_heads* (hidden_size//nb_heads) # check if hidden_size is divisible by nb_heads
        self.MHA_selfatt = nn.MultiheadAttention(hidden_size, nb_heads)
        self.MHA = nn.MultiheadAttention(hidden_size, nb_heads)
        self.LLcat = nn.Linear(2*hidden_size, hidden_size)
        self.LL1 = nn.Linear(hidden_size, hidden_size)
        self.LL2 = nn.Linear(hidden_size, hidden_size)
        self.LN1 = nn.LayerNorm(hidden_size)
        self.LN2 = nn.LayerNorm(hidden_size)
        
    def forward(self, g_seq , pos, h_enc_seq):  
        seq_length = g_seq.size(0)
        bs = g_seq.size(1)
        pos = pos.unsqueeze(dim=1).repeat_interleave(bs,dim=1) # size=(seq_length, bs, hidden_dim) 
        h_cat = self.LLcat(torch.cat( (g_seq, pos), dim=2 )) # size=(seq_length, bs, hidden_dim) 
        mask_visited_nodes = torch.ones(1, seq_length, device=g_seq.device).bool() # True=no_attention # size=(1, seq_length) 
        h_seq = []
        h_MHA_seq = []
        for t in range(seq_length): 
            # prepare masks of attention
            mask_visited_nodes = mask_visited_nodes.clone()
            mask_visited_nodes[0,t] = False # allow attention to current word indexed by t
            mask_visited_nodes_selfatt = mask_visited_nodes.clone()
            mask_visited_nodes_selfatt = mask_visited_nodes_selfatt.repeat_interleave(seq_length,dim=0)
            # MHA for masked self-attention 
            h_MHA_selfatt, _ = self.MHA_selfatt(h_cat, h_cat, h_cat, attn_mask=mask_visited_nodes_selfatt) # size=(1, bs, hidden_dim)
            h_selfatt = self.LN1( h_cat + h_MHA_selfatt ) # size=(1, bs, hidden_dim)
            # MHA for cross-attention layer
            query = h_selfatt[t].unsqueeze(0) # size=(1, bs, hidden_dim) 
            h_MHA, _ = self.MHA(query, h_enc_seq, h_enc_seq, attn_mask=mask_visited_nodes) # size=(1, bs, hidden_dim)
            h_MHA_seq.append(h_MHA)
            h_seq.append(query)
            
        # cross-attention layer
        h_MHA_seq = torch.stack(h_MHA_seq).squeeze() # size=(seq_length, bs, hidden_dim)  
        h_seq = torch.stack(h_seq).squeeze() # size=(seq_length, bs, hidden_dim)  
        h = self.LN1( h_seq + h_MHA_seq ) # size=(1, bs, hidden_dim)
        h_MLP = self.LL2(torch.relu(self.LL1(h))) # size=(1, bs, hidden_dim) 
        h_seq = self.LN2( h + h_MLP ) # size=(1, bs, hidden_dim)   
        return h_seq
    

class Transformer_encoder(nn.Module):
    
    def __init__(self, hidden_size, nb_heads):
        super(Transformer_encoder, self).__init__()
        assert hidden_size == nb_heads* (hidden_size//nb_heads) # check if hidden_size is divisible by nb_heads
        self.MHA = nn.MultiheadAttention(hidden_size, nb_heads)
        self.LLcat = nn.Linear(2*hidden_size, hidden_size)
        self.LL1 = nn.Linear(hidden_size, hidden_size)
        self.LL2 = nn.Linear(hidden_size, hidden_size)
        self.LN1 = nn.LayerNorm(hidden_size)
        self.LN2 = nn.LayerNorm(hidden_size)
        
    def forward(self, g_seq , pos):  
        seq_length = g_seq.size(0)
        bs = g_seq.size(1)
        pos = pos.unsqueeze(dim=1).repeat_interleave(bs,dim=1) # size=(seq_length, bs, hidden_dim) 
        h_cat = self.LLcat(torch.cat( (g_seq, pos), dim=2 )) # size=(seq_length, bs, hidden_dim) 
        h_MHA_seq, _ = self.MHA(h_cat, h_cat, h_cat) # size=(seq_length, bs, hidden_dim)
        h = self.LN1( h_cat + h_MHA_seq ) # size=(1, bs, hidden_dim) 2
        h_MLP = self.LL2(torch.relu(self.LL1(h))) # size=(1, bs, hidden_dim) 
        h_seq = self.LN2( h + h_MLP ) # size=(1, bs, hidden_dim) 
        return h_seq
    
    
class ANN(nn.Module):
    
    def __init__(self, hidden_size, nb_heads):
        super(ANN, self).__init__()
        self.encoder = Transformer_encoder(hidden_size, nb_heads)
        self.decoder = Transformer_decoder(hidden_size, nb_heads)
    
    def forward(self, g_seq , pos ):
        h_enc_seq = self.encoder( g_seq , pos ) # size=(seq_length, bs, hidden_dim) 
        h_dec_seq = self.decoder( g_seq , pos, h_enc_seq ) # size=(seq_length, bs, hidden_dim) 
        return h_dec_seq 
    

class attention_net(nn.Module):

    def __init__(self, hidden_size, nb_heads):
        super(attention_net, self).__init__()  
        self.layer1 = nn.Embedding( vocab_size  , hidden_size  )
        self.layer2 = ANN(hidden_size, nb_heads)
        self.layer3 = nn.Linear(    hidden_size , vocab_size   )

    def forward(self, word_seq, pos ):
        g_seq     =   self.layer1( word_seq ) # size=(seq_length, bs, hidden_dim) 
        h_seq     =   self.layer2( g_seq , pos ) # size=(seq_length, bs, hidden_dim) 
        score_seq =   self.layer3( h_seq ) # size=(seq_length, bs, vocab_size)
        return score_seq 


### Build the net. Choose the hidden size to be 128 and the number of heads to be 16. 
### How many parameters in total?

In [69]:
hidden_size = 128 
nb_heads = 16

net = attention_net(hidden_size, nb_heads)
print(net)
utils.display_num_param(net)

attention_net(
  (layer1): Embedding(10000, 128)
  (layer2): ANN(
    (encoder): Transformer_encoder(
      (MHA): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (LLcat): Linear(in_features=256, out_features=128, bias=True)
      (LL1): Linear(in_features=128, out_features=128, bias=True)
      (LL2): Linear(in_features=128, out_features=128, bias=True)
      (LN1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (LN2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): Transformer_decoder(
      (MHA_selfatt): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (MHA): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
      )
      (LLcat): Linear(in_features=256, out_features=128, bias=True)
      (LL1): Linear(in_featur

### Send the weights of the networks to the GPU

In [70]:
net = net.to(device)

### Choose the loss to be the cross-entropy and the optimizer to be Adam, as well as the hyperparameters: 
* initial learning rate = 0.001
* sequence length = 100

In [71]:
criterion = nn.CrossEntropyLoss()

my_lr = 0.001
seq_length = 100
optimizer = torch.optim.Adam(net.parameters(), lr=my_lr)

### Do 50 passes through the training set
### Observe the train perplexity

In [72]:
start=time.time()
for epoch in range(50):
    
    # set the running quantities to zero at the beginning of the epoch
    running_loss=0
    num_batches=0    
    doc_len = train_data.size(0)
    for count in range( 0 , doc_len-seq_length ,  seq_length): 
        
        # Set the gradients to zeros
        optimizer.zero_grad()
        
        # create a minibatch
        minibatch_data = train_data[ count   : count+seq_length   ]
        minibatch_label = train_data[ count+1 : count+seq_length+1 ]    
        pos = generate_positional_encoding(seq_length, hidden_size) # size=(seq_length, hidden_dim) 
        
        # send them to the gpu
        minibatch_data = minibatch_data.to(device)
        minibatch_label = minibatch_label.to(device)
        pos = pos.to(device)
        
        # forward the minibatch through the net        
        scores = net( minibatch_data, pos ) # size=(seq_length, bs, vocab_size)

        # reshape the scores and labels to huge batch of size bs*seq_length
        scores = scores.view(  bs*seq_length , vocab_size) # size=(seq_length.bs, vocab_size)
        minibatch_label = minibatch_label.view(  bs*seq_length ) # size=(seq_length.bs, vocab_size)
       
        # Compute the average of the losses of the data points in this huge batch
        loss = criterion(scores, minibatch_label)
        
        # backward pass to compute dL/dR, dL/dV and dL/dW
        loss.backward()

        # do one step of stochastic gradient descent: R=R-lr(dL/dR), V=V-lr(dL/dV), ...
        optimizer.step()
        
        # update the running loss  
        running_loss += loss.item()
        num_batches += 1
        
    # compute stats for the full training set
    total_loss = running_loss/num_batches
    elapsed = time.time()-start
    
    if not epoch%10:
      print('epoch=',epoch, '\t time=', elapsed,'\t lr=', my_lr, '\t exp(loss)=',  math.exp(total_loss))


epoch= 0 	 time= 1.2681910991668701 	 lr= 0.001 	 exp(loss)= 6883.244457877534
epoch= 10 	 time= 13.575231075286865 	 lr= 0.001 	 exp(loss)= 260.02380432357336
epoch= 20 	 time= 25.824889659881592 	 lr= 0.001 	 exp(loss)= 77.53207331566317
epoch= 30 	 time= 37.98508954048157 	 lr= 0.001 	 exp(loss)= 23.98241062630039
epoch= 40 	 time= 50.14222478866577 	 lr= 0.001 	 exp(loss)= 6.672884276588563


### Check if the network was successful 

In [73]:
minibatch_data = train_data[ 0:0+seq_length, 1   ].unsqueeze(1)
minibatch_label = train_data[ 0+1:0+seq_length+1, 1].unsqueeze(1)
print('Input sequence:', minibatch_data[:,0])
print('Expected output sequence:', minibatch_label[:,0])
pos = generate_positional_encoding(seq_length, hidden_size) # size=(seq_length, hidden_dim) 
             
minibatch_data = minibatch_data.to(device)
minibatch_label = minibatch_label.to(device)
pos = pos.to(device)

scores = net( minibatch_data, pos ) 
seq = scores.argmax(dim=1)
print('Predicted output sequence:', seq)

Input sequence: tensor([  93,  718,  590, 1569,   35, 4979,   95,   87,  507,   93,   78,  718,
          26, 2966,  467,   35, 4979,  119, 2862,   64, 1177, 2640,  861, 1449,
          26,  956, 5130,   98,   24,   32, 2361,   34,   78,   54,  461,  229,
          32,  523,  823, 1328,   48, 2749, 1977,  718,  746,   32, 2749,  798,
          95,   64,   27,  363,   64,  926,  138,  924,   42,   27,   27,  467,
          79,   42,   32,  935,  108,  128, 4955,   24,   32,  798,  718,  220,
         971,   64,  660,   32, 1515,  531,   32,  462,   40,  124,   64, 2273,
        4586,   35, 4979,  590,   93, 2966,  718,  220,   26,  109, 2862,   64,
        1177,  718,  220,  709])
Expected output sequence: tensor([ 718,  590, 1569,   35, 4979,   95,   87,  507,   93,   78,  718,   26,
        2966,  467,   35, 4979,  119, 2862,   64, 1177, 2640,  861, 1449,   26,
         956, 5130,   98,   24,   32, 2361,   34,   78,   54,  461,  229,   32,
         523,  823, 1328,   48, 2749, 1977,  