[View in Colaboratory](https://colab.research.google.com/github/vbipin/nlp/blob/master/pytorch_nmt_with_attn.ipynb)

In [2]:
# Reload the autoreload extension and set mode 2 so imported modules are
# re-imported before each cell runs.
%reload_ext autoreload
%autoreload 2

In [3]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

# True when CUDA is usable; read by the initHidden methods of EncoderRNN and DecoderRNN.
use_cuda = torch.cuda.is_available()

In [4]:
# Device that models and tensors are moved to: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
#This notebook is adapted from
##http://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

In [6]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
#for monitoring
from time import time
#for parsing the data filename
import re

In [6]:
#here we prepare the data directly from the web link. It is useful in Colab notebooks
#to convert to script
#jupyter nbconvert --to script [YOUR_NOTEBOOK].ipynb

In [7]:
#we need the data from : http://www.manythings.org/anki/fra-eng.zip
#import requests
#import gzip
#import io
#import zipfile

#get the contents from the website
#r = requests.get('http://www.manythings.org/anki/fra-eng.zip')

#this is ugly code, but I need the text from a zip file at a URL :(((
#https://stackoverflow.com/questions/37704836/how-do-i-read-a-csv-file-thats-gzipped-from-url-python
#https://codeyarns.com/2013/10/03/how-to-read-contents-of-zip-file-in-python/
#https://docs.python.org/2/library/zipfile.html


#with zipfile.ZipFile( io.BytesIO(r.content), mode='r' ) as zip_file :
#  print (zip_file.namelist())
#  lines = zip_file.read('fra.txt').strip().split(b'\n')
#  lines = [ str(l, 'utf-8') for l in lines ]
#  print(len(lines))

In [8]:
#if we have the lines from a file, create them here.

In [57]:
#Data class using torchtext
#from: https://github.com/pytorch/text/blob/master/test/translation.py
import re
import itertools

import spacy

from torchtext import datasets
from torchtext import data
from torchtext.data import Field

# spaCy pipelines whose .tokenizer is called by tokenize_de / tokenize_en below.
# NOTE(review): the 'de' / 'en' shortcut names presumably need the matching
# spaCy models (or shortcut links) installed — confirm for the spaCy version in use.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

# Matches a <url>...</url> span; the tokenizers replace each match with '@URL@'.
url = re.compile('(<url>.*</url>)')

def tokenize_de(text):
    """Split German ``text`` into a list of token strings.

    Every match of the module-level ``url`` pattern is replaced with the
    literal ``@URL@`` before the text is passed to the ``spacy_de`` tokenizer.
    """
    cleaned = url.sub('@URL@', text)
    tokens = []
    for token in spacy_de.tokenizer(cleaned):
        tokens.append(token.text)
    return tokens


def tokenize_en(text):
    """Split English ``text`` into a list of token strings.

    Every match of the module-level ``url`` pattern is replaced with the
    literal ``@URL@`` before the text is passed to the ``spacy_en`` tokenizer.
    """
    cleaned = url.sub('@URL@', text)
    tokens = []
    for token in spacy_en.tokenizer(cleaned):
        tokens.append(token.text)
    return tokens

class Data :
    """German -> English translation data loaded through torchtext.

    Reads the ``train`` / ``val`` files with ``.de`` / ``.en`` extensions from
    ``data/multi30k/``, builds one vocabulary per language from the training
    split, and exposes generators that yield ``(src, trg)`` index tensors.

    Attributes:
        train, val, test: the values returned by ``TranslationDataset.splits``.
        src_lang: the German ``Field`` (vocabulary keeps tokens seen >= 3 times).
        trg_lang: the English ``Field`` (vocabulary capped at 50000 entries).
        train_iter, val_iter: the iterators created by the most recent call to
            ``train_batch`` / ``val_batch``.
    """
    def __init__(self, batch_size=1) :
        # NOTE: `batch_size` is accepted for backward compatibility but is not
        # used here; the batch size is chosen per call to train_batch/val_batch.
        DE = data.Field(tokenize=tokenize_de, init_token="<sos>", eos_token="<eos>" )
        EN = data.Field(tokenize=tokenize_en, init_token="<sos>", eos_token="<eos>" )
        self.train, self.val,self.test = datasets.TranslationDataset.splits(
                path='data/multi30k/', train='train',
                validation='val', exts=('.de', '.en'),
                fields=(DE, EN))
        DE.build_vocab(self.train.src,  min_freq=3) #specials=['<sos>','<eos>','<unk>','<pad>'],
        EN.build_vocab(self.train.trg,  max_size=50000) #specials=['<sos>','<eos>','<unk>','<pad>'],

        self.src_lang = DE
        self.trg_lang = EN

    def _batches(self, use_validation, batch_size) :
        """Rebuild both iterators and yield one pass of batch-first (src, trg) pairs."""
        self.train_iter, self.val_iter = data.BucketIterator.splits((self.train,self.val), batch_size=batch_size)
        iterator = self.val_iter if use_validation else self.train_iter
        # islice bounds the loop to one pass even if the iterator repeats.
        for batch in itertools.islice(iterator, 0, len(iterator)) :
            # The Fields above keep torchtext's default layout, (seq_len, batch).
            # Transposing gives (batch, seq_len) without mixing tokens of
            # different sentences, which view(batch_size, -1) did for
            # batch_size > 1 (and it failed on a smaller final batch).
            # For batch_size == 1 the result is identical to the old view().
            yield batch.src.t().contiguous(), batch.trg.t().contiguous()

    def train_batch(self, batch_size=1) :
        """Yield ``(src, trg)`` tensors of shape (batch, seq_len) from the training split."""
        for pair in self._batches(False, batch_size) :
            yield pair

    def val_batch(self, batch_size=1) :
        """Yield ``(src, trg)`` tensors of shape (batch, seq_len) from the validation split."""
        for pair in self._batches(True, batch_size) :
            yield pair
        
     

In [58]:
# Load the dataset and build both vocabularies (reads files under data/multi30k/).
multi30k_data = Data()

In [13]:
##################################################################


Transformer
The plan is to code the encoder part first.   
    
We need to have 

    Scaled Dot Product Attention
    Multi head attention
    Position wise feed forward
    Positional Encoding



In [13]:
#################################################
#Scaled DotProduct attention

class ScaledDotProductAttention(nn.Module) :
    """Scaled dot-product attention: softmax(Q K^T / scale) V.

    Args:
        d_model: dimension of the query/key vectors; scores are divided by
            sqrt(d_model).
    """

    def __init__( self, d_model ) : #d_model is the dimension of the input embeddings
        super(ScaledDotProductAttention, self).__init__()

        # Stored as a plain Python float so the division below does not depend
        # on NumPy-scalar / tensor interop.
        self.scale = float( np.sqrt( d_model ) )

    def forward( self, Q, K, V, attn_mask=None ) :
        """Attend from each query over all keys.

        Args:
            Q: (batch, n_queries, d_model)
            K: (batch, N, d_model)
            V: (batch, N, d_value)
            attn_mask: optional boolean tensor of shape (batch, n_queries, N);
                True entries are excluded from attention.

        Returns:
            out:  (batch, n_queries, d_value) weighted sum of V.
            attn: (batch, n_queries, N) weights; each row sums to 1 over N.
        """
        scores = torch.bmm( Q, K.transpose(1,2) ) / self.scale
        if attn_mask is not None :
            assert attn_mask.size() == scores.size()
            # -inf scores become exactly 0 after the softmax. Out-of-place and
            # without `.data`, so autograd tracks the masking.
            scores = scores.masked_fill( attn_mask, float('-inf') )

        # Normalise over the key axis (last dim): every query gets a
        # distribution over the N keys. dim=1 normalised across queries.
        attn = F.softmax( scores, dim=-1 )
        out = torch.bmm( attn, V )

        return out, attn

#################################################
#Multi head Attention

class MultiHeadAttention(nn.Module) :
    """Multi-head attention built from ``h`` independent projection triples.

    Args:
        d_model: width of the incoming Q/K/V vectors and of the output.
        h: number of heads.
        d_k: width of each head's projected queries and keys.
        d_v: width of each head's projected values.
    """

    def __init__( self, d_model, h, d_k, d_v  ) :
        super(MultiHeadAttention, self).__init__()

        # ModuleList (not a plain list) so the projections are registered:
        # they show up in .parameters() for the optimizer, follow .to(device)
        # and are saved in state_dict().
        self.qkv_linear = nn.ModuleList([
            nn.ModuleList([ nn.Linear(d_model, d_k), nn.Linear(d_model, d_k), nn.Linear(d_model, d_v) ])
            for _ in range(h) ]) #h triples of weights for q, k, and v
        # Each head attends over d_k-wide queries/keys, so scale by sqrt(d_k).
        self.attn = ScaledDotProductAttention( d_k )
        self.out_linear = nn.Linear( h*d_v, d_model )

    def forward( self, Q, K, V, attn_mask=None ) :
        """Run every head and merge the results.

        Args:
            Q: (batch, n_queries, d_model)
            K, V: (batch, N, d_model)
            attn_mask: optional mask passed unchanged to every head.

        Returns:
            out: (batch, n_queries, d_model)
            attn: list of ``h`` attention tensors, one per head.
        """
        heads = []
        attn = []
        for wq, wk, wv in self.qkv_linear :
            head, weights = self.attn( wq(Q), wk(K), wv(V), attn_mask )
            heads.append( head )
            attn.append( weights )

        concat_head = torch.cat( heads, -1 ) #(batch, n_queries, h*d_v)
        out = self.out_linear( concat_head )

        return out, attn
        
        
class PositionWiseFFN( nn.Module ) :
    """Position-wise feed-forward network: linear -> ReLU -> linear.

    Args:
        d_model: width of the input and of the output.
        d_hidden: width of the intermediate layer.
    """

    def __init__( self, d_model, d_hidden ) :
        super(PositionWiseFFN, self).__init__()

        self.linear1 = nn.Linear( d_model, d_hidden )  # expand
        self.linear2 = nn.Linear( d_hidden, d_model )  # project back

    def forward( self, x ) :
        """Apply the two linear maps, with a ReLU in between, to the last dim of ``x``."""
        hidden = self.linear1( x )
        activated = F.relu( hidden )
        return self.linear2( activated )

    
#from: https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py
def position_encoding_init(n_position, d_pos_vec):
    ''' Build the sinusoid position encoding table as a float32 tensor.

    Row 0 is all zeros (kept for the padding position). For pos >= 1, even
    columns hold sin(pos / 10000^(2i/d)) and odd columns hold
    cos(pos / 10000^(2i/d)), where i = column // 2 and d = d_pos_vec.

    Returns a CPU FloatTensor of shape (n_position, d_pos_vec).
    '''
    # One divisor per column; columns 2i and 2i+1 share the same divisor.
    divisors = [np.power(10000, 2 * (col // 2) / d_pos_vec) for col in range(d_pos_vec)]

    rows = []
    for pos in range(n_position):
        if pos == 0:
            rows.append(np.zeros(d_pos_vec))
        else:
            rows.append([pos / div for div in divisors])
    table = np.array(rows)

    even_cols = table[1:, 0::2]
    odd_cols = table[1:, 1::2]
    table[1:, 0::2] = np.sin(even_cols) # dim 2i
    table[1:, 1::2] = np.cos(odd_cols)  # dim 2i+1
    return torch.from_numpy(table).type(torch.FloatTensor)

        
#We find the input embeddings of the incoming index
# and add the positional encodings
class TransformerEmbedding( nn.Module ) :
    """Token embedding plus a fixed sinusoidal positional encoding.

    Args:
        n_vocab: number of rows in the trainable token embedding.
        n_position: number of rows in the positional table.
        d_model: embedding width shared by both tables.
    """
    def __init__( self, n_vocab, n_position, d_model ) :
        super(TransformerEmbedding, self).__init__()

        # from_pretrained is a classmethod, so no throwaway nn.Embedding is
        # built first. as_tensor keeps this working with either definition of
        # position_encoding_init in this notebook (one returns a tensor, a
        # later redefinition returns a NumPy array).
        position_table = torch.as_tensor( position_encoding_init(n_position, d_model ), dtype=torch.float32 )
        # freeze=True: the sinusoid table is not trained.
        self.positional_enc = nn.Embedding.from_pretrained( position_table, freeze=True )
        self.input_emb = nn.Embedding(n_vocab, d_model )

    def forward( self, input_seq, input_positions ) :
        """Return token embeddings of ``input_seq`` plus encodings of ``input_positions``.

        Both arguments are index tensors; their shapes must be broadcastable
        after the lookups, since the two results are added element-wise.
        """
        emb_input = self.input_emb( input_seq ) + self.positional_enc( input_positions )

        return emb_input
        

In [24]:
#The transformer layer with multihead attention and FFN
class TransformerLayer( nn.Module ) :
    """One encoder layer: self-attention and a feed-forward net, each followed
    by a residual connection and LayerNorm."""
    def __init__( self, d_model, n_head, d_k, d_v, d_hidden ) :
        super(TransformerLayer, self).__init__()

        self.multi_head_attn = MultiHeadAttention( d_model, n_head, d_k, d_v )
        self.ffn = PositionWiseFFN( d_model, d_hidden )

        self.norm1 = nn.LayerNorm( d_model )
        self.norm2 = nn.LayerNorm( d_model )

    def forward( self, input ) :
        """Map ``input`` to a tensor of the same shape; attention weights are discarded."""
        # Self-attention sub-layer: queries, keys and values are all `input`.
        attended, _ = self.multi_head_attn( input, input, input )
        ffn_input = self.norm1( attended + input )

        # Feed-forward sub-layer with its own residual connection.
        layer_out = self.norm2( self.ffn( ffn_input ) + ffn_input )

        return layer_out
    
#TransformerNLayers( n_layers, d_model, n_head, d_k, d_v, d_inner_hid )    
class TransformerNLayers( nn.Module ) :
    """A stack of ``n_layers`` identically configured TransformerLayer modules applied in order."""
    def __init__( self, n_layers, d_model, n_head, d_k, d_v, d_hidden ) :
        super(TransformerNLayers, self).__init__()

        self.layers = nn.Sequential(
            *( TransformerLayer(d_model, n_head, d_k, d_v, d_hidden) for _ in range(n_layers) )
        )

    def forward( self, input ) :
        """Feed ``input`` through every layer and return the last layer's output."""
        stacked_output = self.layers( input )
        return stacked_output
    
    

In [15]:
#Transformer Encoder 

class TransformerEncoder( nn.Module ) :
    """Transformer encoder: embedding with positional encoding, then N layers.

    Args:
        n_src_vocab: size of the source vocabulary.
        n_max_seq: longest supported sequence; the positional table gets
            ``n_max_seq + 1`` rows.
        n_layers, d_model, n_head, d_k, d_v, d_inner_hid: passed to the layer stack.
        d_word_vec, dropout: accepted but not used by this implementation.
    """

    def __init__( self, n_src_vocab, n_max_seq, n_layers=6, d_model=512, n_head=8, d_k=64, d_v=64,
                 d_word_vec=512, d_inner_hid=1024, dropout=0.1) :

        super(TransformerEncoder, self).__init__()

        self.d_model = d_model
        self.n_max_seq = n_max_seq

        # Token + positional embedding; one extra row beyond n_max_seq.
        self.input_emb = TransformerEmbedding( n_src_vocab, n_max_seq + 1, d_model )

        # The stack of N transformer layers.
        self.layers = TransformerNLayers( n_layers, d_model, n_head, d_k, d_v, d_inner_hid )

    def forward( self, input_seq, input_positions) :
        """Embed the token and position indices and run them through all layers."""
        embedded = self.input_emb( input_seq, input_positions )
        return self.layers( embedded )
        

In [63]:
# Take the first (source, target) pair from the validation split and print both shapes.
batch = multi30k_data.val_batch(batch_size=1)
x,y = next(batch)
print (x.shape, y.shape)
#e = TransformerEncoder( data.src.n_words, 10 ).to(device)

torch.Size([1, 5]) torch.Size([1, 6])


In [64]:
# NOTE: hidden_size is not passed to TransformerEncoder below, which keeps its default d_model=512.
hidden_size = 256
# Source vocabulary size, taken from the German field built by Data.
n_src_vocab = len(multi30k_data.src_lang.vocab)
e = TransformerEncoder(n_src_vocab, n_max_seq=50 ).to(device)

In [65]:
# Re-check the shapes of the sample source/target pair.
print (x.shape, y.shape)

torch.Size([1, 5]) torch.Size([1, 6])


In [66]:
# Build position indices 0..len-1 from the actual length of x (shape (1, seq_len)),
# instead of hard-coding five positions, and call the module itself rather than
# .forward so registered hooks also run.
positions = torch.arange(x.shape[1], dtype=torch.long, device=device).view(1, -1)
q = e(x, positions)
q.shape

torch.Size([1, 5, 512])

In [40]:
# Stand-alone embedding module for experiments: 50 positions, width 256.
d_model = 256
input_emb = TransformerEmbedding( n_src_vocab, 50, d_model ).to(device)

In [41]:
#transformer Decoder

In [49]:
# Embed x with positions given as a (5, 1) column.
# NOTE(review): the recorded output shape (5, 1, 256) suggests x was (5, 1) when
# this cell ran; presumably it predates the (batch, seq) layout — confirm.
i = input_emb(x, torch.tensor([[0],[1],[2],[3],[4]], dtype=torch.long).to(device))

In [50]:
# Shape of the embedded sample.
i.shape

torch.Size([5, 1, 256])

In [48]:
# Display the sample source index tensor.
x

tensor([[  2],
        [  5],
        [ 12],
        [  0],
        [  3]], device='cuda:0')

In [51]:
# Small nn.Embedding experiment with padding_idx=0 on a (1, 4) index tensor.
# NOTE: `input` shadows the Python builtin of the same name for the rest of the session.
embedding = nn.Embedding(10, 3, padding_idx=0)
input = torch.LongTensor([[0,2,0,5]])
out = embedding(input)

In [52]:
# Index tensor shape vs. embedded shape.
input.shape, out.shape

(torch.Size([1, 4]), torch.Size([1, 4, 3]))

In [None]:
# The first assignment is overwritten immediately; only the MultiHeadAttention instance stays bound to `a`.
a = ScaledDotProductAttention(10)
a = MultiHeadAttention(h=8, d_model=10, d_k=12, d_v=15 )

In [None]:
# Dummy inputs: batch of 2, one query, 33 key/value positions, width 10 (the d_model of `a`).
Q = torch.ones( 2, 1, 10)
K = torch.ones(2,33,10)
V = torch.ones(2,33,10)
# p is the merged output, q the list of per-head attention weights.
# NOTE: this rebinds `q`, which an earlier cell used for the encoder output.
p,q = a.forward(Q,K,V)

In [None]:
# Shape of the multi-head attention output.
p.shape #, q.shape

In [None]:
# torch.cat along the last dim doubles it: (2, 3, 4) -> (2, 3, 8).
# NOTE: this rebinds `x`, which an earlier cell used for the sample source sentence.
x = torch.ones(2,3,4)
n = torch.cat( [x, x] , -1 )
n.shape

In [None]:
def pe( n_positions, d_model ) :
    """Vectorised sinusoid position-encoding table as a NumPy array.

    Row 0 is all zeros. For pos >= 1, even columns hold
    sin(pos / 10000^(2i/d_model)) and odd columns hold
    cos(pos / 10000^(2i/d_model)), with i = column // 2. This matches
    position_encoding_init.

    Returns an array of shape (n_positions, d_model).
    """
    # Columns 2i and 2i+1 share the exponent 2i/d_model, hence the `// 2`.
    # Using the raw column index doubled the exponent and did not match
    # position_encoding_init.
    exponents = 2.0 * (np.arange(0, d_model) // 2) / d_model
    divisors = np.power( 10000, exponents )
    position_enc = np.array( [pos / divisors for pos in range(n_positions)] ).reshape( -1, d_model )
    # Row 0 is already zero (0 / divisor) and is left untouched.
    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i
    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1
    return position_enc


In [None]:
# Sample table from the vectorised implementation: 9 positions, width 25.
i = pe( 9, 25 )

In [None]:
# Shape of the sample table.
i.shape

In [None]:
# Floor-division check (the `j // 2` used in the encoding formula).
11 // 2

In [None]:
# NOTE(review): in top-to-bottom order `a` is the MultiHeadAttention instance
# created earlier, and that class defines no __getitem__; presumably a leftover
# from when `a` was an array — confirm or delete.
a[1:, :]

In [None]:
def position_encoding_init(n_position, d_pos_vec):
    ''' Build the sinusoid position encoding table as a NumPy array.

    NOTE: this shares its name with an earlier definition in this notebook
    that returns a torch FloatTensor; running this cell rebinds the name to
    this array-returning version.

    Row 0 is all zeros (padding position). For pos >= 1, even columns hold
    sin(pos / 10000^(2i/d)) and odd columns cos(pos / 10000^(2i/d)), with
    i = column // 2 and d = d_pos_vec.
    '''
    def raw_angles(pos):
        # Unactivated angles for one position; position 0 stays a zero vector.
        if pos == 0:
            return np.zeros(d_pos_vec)
        return [pos / np.power(10000, 2 * (j // 2) / d_pos_vec) for j in range(d_pos_vec)]

    table = np.array([raw_angles(pos) for pos in range(n_position)])

    table[1:, 0::2] = np.sin(table[1:, 0::2]) # dim 2i
    table[1:, 1::2] = np.cos(table[1:, 1::2]) # dim 2i+1
    return table

In [None]:
# Reference table from the loop-based implementation, same size as the pe() sample.
p = position_encoding_init( 9, 25)

In [None]:
# Convert the table to a torch tensor.
k = torch.Tensor(p)

In [None]:
# Display the tensor version of the table.
k

In [None]:
# Encoding of position 2 from position_encoding_init.
plt.plot(p[2])

In [None]:
# Encoding of position 2 from pe(), for visual comparison with the plot of p[2].
plt.plot(i[2])

In [None]:
class EncoderRNN(nn.Module):
    """GRU encoder over one source sentence of token indices.

    Args:
        src_vocab_size: number of rows in the embedding table.
        hidden_size: GRU hidden width; also used as the embedding width.
        num_layers: number of stacked GRU layers.
    """
    def __init__(self, src_vocab_size, hidden_size, num_layers=1 ):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        # Kept so initHidden can build a state with the right leading dimension.
        self.num_layers = num_layers

        #embedding vector size is fixed as hidden size
        self.enbedding_vector_size = hidden_size
        self.embedding = nn.Embedding(src_vocab_size, self.enbedding_vector_size )
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers)

    def forward(self, input, hidden):
        """Encode a whole sentence in one GRU call.

        Args:
            input: index tensor whose first dimension is the sequence length
                (one sentence, e.g. shape (seq_len, 1)).
            hidden: initial state of shape (num_layers, 1, hidden_size).

        Returns:
            output: (seq_len, 1, hidden_size) outputs of the last GRU layer.
            hidden: final state, (num_layers, 1, hidden_size).
        """
        embedded = self.embedding(input)
        output = embedded.reshape( input.shape[0], 1, -1 ) #seq_length, batch, embedding
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        """Return a zero initial state of shape (num_layers, 1, hidden_size).

        The tensor is created on the same device as the module's own
        parameters, so it matches the model wherever the model was moved.
        """
        return torch.zeros(self.num_layers, 1, self.hidden_size,
                           device=self.embedding.weight.device)

In [None]:
class DecoderRNN(nn.Module):
    """GRU decoder without attention; consumes one target token per call.

    Args:
        hidden_size: GRU hidden width; also used as the embedding width.
        dest_vocab_size: size of the target vocabulary (number of output scores).
    """
    def __init__(self, hidden_size, dest_vocab_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        #embedding vector size is fixed as hidden size
        self.enbedding_vector_size = hidden_size
        self.embedding = nn.Embedding(dest_vocab_size, self.enbedding_vector_size )

        self.gru = nn.GRU(hidden_size, hidden_size)

        self.linear = nn.Linear(hidden_size, dest_vocab_size)

    def forward(self, input, hidden):
        """Advance the decoder by one token.

        Args:
            input: index tensor holding a single token (input.shape[0] is 1).
            hidden: GRU state of shape (1, 1, hidden_size).

        Returns:
            output: (1, dest_vocab_size) log-probabilities.
            hidden: updated GRU state.
        """
        embedded = self.embedding(input).view(-1)
        output = embedded.view( input.shape[0], 1, -1 ) #input shape[0] is 1 as we feed one input at a time.

        output, hidden = self.gru(output, hidden)
        output = self.linear( output.squeeze() )
        # Normalise over the vocabulary axis, which is always the last one.
        # For the single-token case this is the same axis as the former dim=0.
        output = F.log_softmax( output, dim=-1 )
        return output.view(1,-1), hidden #output of shape N,C; here N=1

    def initHidden(self):
        """Return a zero state of shape (1, 1, hidden_size) on the module's own device."""
        return torch.zeros(1, 1, self.hidden_size, device=self.embedding.weight.device)

In [None]:
# Number of encoder positions the attention layer scores; encoder outputs are
# zero-padded to this many rows before being handed to the decoder.
MAX_LENGTH = 50

class Attn(nn.Module) :
    """Attention whose scores come from a linear map of the decoder state alone.

    The decoder hidden state is projected to ``max_length`` scores, softmaxed,
    and used to take a weighted sum of the rows of ``encoder_outputs``.
    """
    def __init__(self, hidden_size, max_length) :
        super(Attn, self).__init__()
        self.hidden_size = hidden_size
        self.max_length = max_length
        self.linear = nn.Linear(hidden_size, max_length)

    def forward(self, hidden, encoder_outputs) :
        """Return ``(context, weights)``.

        ``weights`` is the softmax over dim 2 of ``linear(hidden)``; ``context``
        is the squeezed weights matrix-multiplied with ``encoder_outputs``.
        """
        weights = F.softmax(self.linear(hidden), dim=2)
        context = torch.matmul(weights.squeeze(), encoder_outputs)
        return context, weights
        
        
        
        
class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs after each GRU step.

    Args:
        hidden_size: GRU hidden width; also used as the embedding width.
        output_size: size of the target vocabulary.
        dropout_p: stored on the instance; no dropout layer is applied.
        max_length: number of encoder positions the attention layer scores.
    """
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        #embedding vector size is fixed as hidden size
        self.enbedding_vector_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attn = Attn(hidden_size, max_length)
        self.attn_combine = nn.Linear(hidden_size * 2, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Advance the decoder by one target token.

        Args:
            input: index of the current word (a single token).
            hidden: GRU state of shape (1, 1, hidden_size), i.e.
                (num_layers * num_directions, batch, hidden_size).
            encoder_outputs: encoder states passed on to the attention module.

        Returns:
            output: (1, output_size) unnormalised vocabulary scores.
            hidden: updated GRU state.
            attn_weights: weights returned by the attention module.
        """
        # Look up the word vector and run it through the GRU as a
        # length-1, batch-1 sequence.
        word_vector = self.embedding(input).view(1, 1, -1)
        _, hidden = self.gru(word_vector, hidden)

        # Attention is computed from the *updated* hidden state.
        context, attn_weights = self.attn(hidden, encoder_outputs)

        # h tilde = relu(W [hidden ; context])
        combined = torch.cat((hidden.view(1, -1), context.view(1, -1)), 1)
        h_tilde = F.relu(self.attn_combine(combined))

        # Raw scores; the caller applies softmax / cross-entropy.
        return self.out(h_tilde), hidden, attn_weights

    def initHidden(self):
        """Return a zero GRU state of shape (1, 1, hidden_size) on the global ``device``."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
#debug_list = []
def translate( encoder, decoder, data, input_sentence ) :
    """Greedily translate ``input_sentence`` and return the result as a string.

    Args:
        encoder: module with ``initHidden()``, ``hidden_size`` and a
            ``(input, hidden) -> (outputs, hidden)`` call.
        decoder: module called as ``decoder(word, hidden, encoder_outputs)``
            returning ``(scores, hidden, attn_weights)``.
        data: object providing ``line_to_tensor``, ``index_to_tensor``, ``src``
            and ``dest`` (with ``SOS_token``, ``EOS_token``, ``index2word``).
        input_sentence: the source sentence.

    Returns:
        The predicted target words joined by spaces, without the SOS/EOS markers.
        At most 25 words are generated.
    """
    x = data.line_to_tensor( data.src, input_sentence ).to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad() :
        h = encoder.initHidden().to(device)
        out, h = encoder(x, h)
        g = h

        # Zero-padded (MAX_LENGTH, hidden) matrix for the attention layer.
        # Sources longer than MAX_LENGTH are truncated instead of raising.
        encoder_outputs = torch.zeros(MAX_LENGTH, encoder.hidden_size, device=device)
        n_steps = min(out.shape[0], MAX_LENGTH)
        encoder_outputs[:n_steps] = out[:n_steps, 0]

        #first input is SOS
        next_word = data.index_to_tensor( data.dest.SOS_token ).to(device)
        predicted_target = []
        for _ in range(25) :
            scores, g, attn_w = decoder( next_word, g, encoder_outputs )
            v, next_word = scores.topk(1) #return value and index
            # Record the word just predicted. Previously the *input* word was
            # recorded, which put SOS first and dropped the last prediction.
            if next_word.item() == data.dest.EOS_token :
                break
            predicted_target.append( next_word.item() )

    return " ".join([ data.dest.index2word[i] for i in predicted_target ])

In [None]:
#debug_list = []
def translate2( encoder, decoder, x, eos_index=3, max_steps=25 ) :
    """Greedily decode a translation for the index tensor ``x``.

    Args:
        encoder: module with ``initHidden()``, ``hidden_size`` and a
            ``(input, hidden) -> (outputs, hidden)`` call.
        decoder: module called as ``decoder(word, hidden, encoder_outputs)``
            returning ``(scores, hidden, attn_weights)``.
        x: source token indices for ONE sentence, in any of the layouts
            (seq_len,), (seq_len, 1) or (1, seq_len).
        eos_index: vocabulary index that ends decoding (default 3).
        max_steps: maximum number of decoder steps (default 25).

    Returns:
        List of int token indices. It starts with the first token of ``x``
        (used as the start symbol) and includes ``eos_index`` if it was produced.
    """
    # Normalise to (seq_len, 1), the layout the RNN encoder indexes by. The
    # data iterator yields (1, seq_len), which otherwise breaks the encoder.
    x = x.to(device).reshape(-1, 1)

    # Inference only: no autograd graph is needed.
    with torch.no_grad() :
        h = encoder.initHidden().to(device)
        out, h = encoder(x, h)
        g = h

        # Zero-padded (MAX_LENGTH, hidden) matrix for the attention layer.
        # Sources longer than MAX_LENGTH are truncated instead of raising.
        encoder_outputs = torch.zeros(MAX_LENGTH, encoder.hidden_size, device=device)
        n_steps = min(out.shape[0], MAX_LENGTH)
        encoder_outputs[:n_steps] = out[:n_steps, 0]

        # The first source token doubles as the decoder's start symbol.
        next_word = x[0]
        predicted_target = []
        for _ in range(max_steps) :
            scores, g, attn_w = decoder( next_word, g, encoder_outputs )
            predicted_target.append( next_word.item() )
            if next_word.item() == eos_index :
                break
            #now we make the next_word from current_word
            v, next_word = scores.topk(1) #return value and index

    return predicted_target

In [None]:
# Rows in the zero-padded encoder-output matrix built by train().
# NOTE: the same constant is also assigned in the cell that defines Attn.
MAX_LENGTH = 50
def train(encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, n_data=5000 ) :
    """Run one pass of teacher-forced training, one sentence pair at a time.

    Args:
        encoder, decoder: the seq2seq modules to train.
        encoder_optimizer, decoder_optimizer: their optimizers.
        criterion: loss called as ``criterion(scores, target_index)``.
        n_data: maximum number of sentence pairs to train on.

    Returns:
        List with the summed per-token loss (float) of every processed pair.
        The elapsed wall-clock time in seconds is printed.
    """
    start = time()
    train_iter = multi30k_data.train_batch()

    loss_db = []
    for step, (x, y) in enumerate(train_iter) :
        # Stop after exactly n_data pairs; the old countdown ran n_data + 2.
        if step >= n_data :
            break

        # The iterator yields (1, seq_len); the RNN encoder/decoder index the
        # first dimension as time, so use (seq_len, 1).
        x = x.to(device).reshape(-1, 1)
        y = y.to(device).reshape(-1, 1)
        if len(y) < 2 :
            # No (input, next-token) pair to learn from.
            continue

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        h = encoder.initHidden().to(device)
        out, h = encoder(x, h)
        g = h

        # Zero-padded (MAX_LENGTH, hidden) matrix for the attention layer.
        # Sources longer than MAX_LENGTH are truncated instead of raising.
        encoder_outputs = torch.zeros(MAX_LENGTH, encoder.hidden_size, device=device)
        n_enc = min(out.shape[0], MAX_LENGTH)
        encoder_outputs[:n_enc] = out[:n_enc, 0]

        # Teacher forcing: feed the gold token y[i], score against y[i+1].
        loss = 0
        for i in range(len(y) - 1) :
            scores, g, attn_w = decoder( y[i], g, encoder_outputs )
            loss += criterion(scores, y[i+1] )

        loss.backward()
        loss_db.append( float(loss) )

        decoder_optimizer.step()
        encoder_optimizer.step()

    end = time()
    print (end-start)
    return loss_db

In [None]:
hidden_size = 256
# Vocabulary sizes come from the torchtext fields built by Data. The name
# `data` is the torchtext module in this notebook, so data.src.n_words and
# data.dest.n_words do not exist.
n_src_vocab = len(multi30k_data.src_lang.vocab)
n_trg_vocab = len(multi30k_data.trg_lang.vocab)
encoder = EncoderRNN(n_src_vocab, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, n_trg_vocab).to(device)

print(encoder)
print(decoder)

learning_rate = 0.001
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
# The decoder returns raw scores, so CrossEntropyLoss (log-softmax + NLL) is the matching loss.
#criterion = nn.NLLLoss().to(device)
criterion = nn.CrossEntropyLoss().to(device)

In [None]:
# 100 training rounds; each call to train() is limited by n_data=1000 and
# contributes the mean of its per-sentence losses.
avg_loss = []
for _ in range(100) :
    l = train(encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, n_data=1000 )
    avg_loss.append( np.mean(l))

In [None]:
# Mean training loss per round.
plt.plot(avg_loss)

In [None]:
from torchtext.data import Field
# Create the generator here; this cell used `train_iter` without defining it.
train_iter = multi30k_data.train_batch()
inp, out = next(train_iter)

trg_itos = multi30k_data.trg_lang.vocab.itos
# Flatten to a list of ints so the lookup works for (1, seq) and (seq, 1) tensors alike.
target_output   = " ".join([ trg_itos[i] for i in out.reshape(-1).tolist() ])
tout = translate2( encoder, decoder, inp )

output_sentence = " ".join([ trg_itos[i] for i in tout ])
print(target_output)
#print(multi30k_data__.trg_lines[i])
print(output_sentence)
# NOTE(review): sentence_bleu and SmoothingFunction are not imported anywhere in
# this notebook; presumably they come from nltk.translate.bleu_score — add that
# import to the imports cell before running this line.
print( sentence_bleu( [target_output.split(' ')], output_sentence.split(' ') , smoothing_function=SmoothingFunction().method1) )