In [None]:
import torch
import torch.nn as nn
import math as m
import torch.nn.functional as F
import torch.optim as optim
from torch.nn import TransformerEncoder
from torch.nn import TransformerDecoder
from torch.nn import MultiheadAttention
import time
import pandas as pd
import numpy as np
# Select the compute device once: the GPU when CUDA is available, otherwise the CPU.
# Later cells read this module-level name when moving models and tensors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Uncomment the next line to force CPU execution.
#device = "cpu"
print(device)

In [None]:
from urllib.request import urlopen
from tempfile import NamedTemporaryFile
from shutil import unpack_archive
# Archive that contains data/eng-fra.txt (tab-separated sentence pairs).
zipurl = 'https://download.pytorch.org/tutorial/data.zip'
import zipfile, urllib.request, shutil

file_name = 'myzip.zip'

# Download the archive. The output file has to be closed (and therefore flushed)
# before it is re-opened for extraction; extracting inside this `with` block can
# read a file whose tail is still sitting in the write buffer.
with urllib.request.urlopen(zipurl) as response, open(file_name, 'wb') as out_file:
    shutil.copyfileobj(response, out_file)

with zipfile.ZipFile(file_name) as zf:
    zf.extractall()

# x collects the lower-cased first column of each line, y the lower-cased second column.
x, y = [], []
with open('data//eng-fra.txt', encoding='utf-8') as f:
    lines = f.readlines()
    for l in lines:
        tmp = l.rstrip().split('\t')
        if len(tmp) < 2:
            # Blank or malformed line without a tab-separated second column: skip it
            # instead of failing with an IndexError.
            continue
        x.append(tmp[0].lower())
        y.append(tmp[1].lower())


In [None]:
# Load the English and French BPEmb models with a 5,000-entry vocabulary.
# NOTE(review): the following cell rebinds bpemb_en / bpemb_fr to the vs=25000
# models before anything else reads them, so this cell looks redundant — confirm
# and consider removing it.
from bpemb import BPEmb
bpemb_en = BPEmb(vs = 5000, lang = "en")
bpemb_fr = BPEmb(vs = 5000, lang = "fr")

In [None]:
# Begin- and end-of-sentence ids; BOS and EOS have indexes 1 and 2 in BPE.
bos_idx = 1
eos_idx = 2

from bpemb import BPEmb
bpemb_en = BPEmb(lang="en", vs=25000)
bpemb_fr = BPEmb(lang='fr', vs=25000)

# Encode each sentence pair as [BOS, subword ids...] and track the longest
# encoded sequence seen on either side.
xx, yy = [], []
longest = 0
for en_sentence, fr_sentence in zip(x, y):
    en_ids = [bos_idx, *bpemb_en.encode_ids(en_sentence)]
    fr_ids = [bos_idx, *bpemb_fr.encode_ids(fr_sentence)]

    xx.append(en_ids)
    yy.append(fr_ids)

    longest = max(longest, len(en_ids), len(fr_ids))

# Keep two positions of headroom beyond the longest encoded sequence.
max_length = longest + 2
print(xx[3])
    


In [None]:
class PositionalEncoding(nn.Module):
    """Adds a fixed sinusoidal position signal to a batch-first tensor.

    Args:
        d_model: size of the last (feature) dimension of the inputs.
        max_len: number of positions for which the table is precomputed.
        dropout: dropout probability applied to the sum of input and encoding.
    """

    def __init__(self, d_model, max_len=5000, dropout = 0):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # Even columns hold sines and odd columns hold cosines. Each column uses
        # its own index in the exponent, so a cosine column does not share the
        # frequency of the sine column before it. This is kept as-is so that the
        # table stays identical to the one stored in existing checkpoints.
        div_term1 = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        div_term2 = torch.exp(torch.arange(1, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term1)
        pe[:, 1::2] = torch.cos(position * div_term2)
        # Shape (1, max_len, d_model); registered as a buffer so it follows
        # .to(device) and is saved in the state dict without being trained.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encoding of its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, position, feature). The configured dropout
        is applied to the result; the original code built the dropout module
        but never used it.
        """
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)



class TransformerEnc(nn.Module):
    """Transformer encoder on top of the English BPEmb embedding matrix.

    Reads the module-level names ``bpemb_en`` (embedding vectors), ``max_length``
    (size of the positional table) and ``device``.

    Args:
        d_model: embedding / model width; must match ``bpemb_en.vectors.shape[1]``.
        dict_size: accepted for symmetry with the decoder; not used here.
        nhead: number of attention heads per layer.
        batch: accepted for interface compatibility; not used.
        dim_feedforward: accepted for interface compatibility; NOT forwarded to
            the layer, because forwarding it would change the layer's parameter
            shapes and make existing checkpoints unloadable.
        dropout: dropout probability for the positional encoding and the layers.
        activation: accepted for interface compatibility; not forwarded.
        num_layers: number of stacked encoder layers.
    """

    def __init__(self, d_model, dict_size, nhead, batch = 1, dim_feedforward=512, dropout=0, activation='relu', num_layers=6):
        super().__init__()
        self.posEnc = PositionalEncoding(d_model, max_length, dropout = dropout)
        self.embedding_enc = nn.Embedding.from_pretrained(torch.tensor(bpemb_en.vectors))
        # Forward the requested dropout, as the decoder does; previously the
        # layer silently fell back to the library default.
        self.enc_layer = nn.TransformerEncoderLayer(d_model, nhead, dropout = dropout)
        self.enc = nn.TransformerEncoder(self.enc_layer, num_layers = num_layers)

    def forward(self, x):
        """Encode a batch of token ids.

        Args:
            x: integer ids indexed as (batch, position); a tensor on any device
                or a nested list.

        Returns:
            The encoder output with the first two axes swapped relative to the
            input, i.e. (position, batch, d_model).
        """
        # as_tensor accepts lists as well as CPU/GPU tensors, unlike the legacy
        # torch.LongTensor(...) constructor.
        x = torch.as_tensor(x, dtype=torch.long, device=device)
        x = self.embedding_enc(x)
        # The positional table is indexed batch-first; the encoder stack is
        # then fed position-first.
        x = self.posEnc(x).transpose(0, 1)
        return self.enc(x)
        
        
        
class TransformerDec(nn.Module):
    """Transformer decoder on top of the French BPEmb embedding matrix.

    Reads the module-level names ``bpemb_fr`` (embedding vectors) and ``device``.

    Args:
        d_model: embedding / model width; must match ``bpemb_fr.vectors.shape[1]``.
        dict_size: size of the output vocabulary (width of the final projection).
        max_length: number of positions in the positional table.
        nhead: number of attention heads per layer.
        batch: accepted for interface compatibility; not used.
        dim_feedforward: accepted for interface compatibility; NOT forwarded to
            the layer, because forwarding it would change the layer's parameter
            shapes and make existing checkpoints unloadable.
        dropout: dropout probability for the positional encoding and the layers.
        activation: accepted for interface compatibility; not forwarded.
        num_layers: number of stacked decoder layers.
    """

    def __init__(self, d_model, dict_size, max_length, nhead, batch = 1, dim_feedforward=512, dropout=0, activation='relu', num_layers=6):
        super().__init__()
        self.dec_layer = nn.TransformerDecoderLayer(d_model, nhead, dropout = dropout)
        # Pass dropout through, matching what the encoder does for its table.
        self.posDec = PositionalEncoding(d_model, max_length, dropout = dropout)
        self.embedding_dec = nn.Embedding.from_pretrained(torch.tensor(bpemb_fr.vectors))
        self.dec = nn.TransformerDecoder(self.dec_layer, num_layers = num_layers)
        self.linear = nn.Linear(d_model, dict_size)
        # Kept as an attribute for compatibility; forward() returns raw logits.
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x, y=None, mode='eval'):
        """Score the next-token logits for every position of ``y``.

        Args:
            x: encoder output, position-first.
            y: decoder input ids indexed as (batch, position); required when
                ``mode == 'train'``.
            mode: only ``'train'`` runs the decoder. Any other value (including
                the default) returns ``None``, exactly as before, so inference
                code must also pass ``mode='train'``.

        Returns:
            Logits permuted to (batch, dict_size, position), or ``None``.

        Raises:
            TypeError: if ``mode == 'train'`` and ``y`` is ``None``.
        """
        if mode != 'train':
            return None
        if y is None:
            raise TypeError("y must be provided when mode == 'train'")

        # as_tensor accepts lists as well as CPU/GPU tensors, unlike the legacy
        # torch.LongTensor(...) constructor.
        y = torch.as_tensor(y, dtype=torch.long, device=device)
        y = self.embedding_dec(y)
        y = self.posDec(y).transpose(0, 1)
        # No target mask is passed to the decoder stack here.
        logits = self.linear(self.dec(y, x))
        # Move the class axis to position 1, the layout nn.CrossEntropyLoss
        # expects for sequence targets.
        return logits.permute(1, 2, 0)
        

In [None]:
def get_batch(x, y, max_length, batch = 1):
    """Sample a random training batch of single-step prediction problems.

    For each of the ``batch`` samples a sentence pair ``j`` and a target
    position ``k`` are drawn with ``np.random``. The decoder input is the target
    prefix ``y[j][:k + 1]``; the only supervised position is ``k``, whose label
    is the next token of ``y[j]`` (or ``eos_idx`` after the last token).

    Args:
        x: list of source id sequences.
        y: list of target id sequences, parallel to ``x``.
        max_length: padded width; must exceed the length of every sequence.
        batch: number of samples to draw (with replacement).

    Returns:
        A tuple ``(x_, y_, w_, target_t_)`` of long tensors:
        ``x_`` is (batch, max_length): source ids, then ``eos_idx``, then zeros.
        ``y_``, ``w_`` and ``target_t_`` are (batch, maxk + 1), where ``maxk`` is
        the largest ``k`` drawn: zero-padded prefixes, a 0/1 weight mask with a
        single 1 at position ``k``, and the labels (zero everywhere except ``k``).
    """
    sources, dec_inputs, weights, targets = [], [], [], []
    maxk = 0
    for _ in range(batch):
        # Same draw order as before, so seeded runs select the same samples.
        j = np.random.randint(0, len(x))
        k = np.random.randint(len(y[j]))
        maxk = max(maxk, k)

        src_len, tgt_len = len(x[j]), len(y[j])

        src = torch.zeros(max_length, dtype=torch.long)
        src[:src_len] = torch.tensor(x[j], dtype=torch.long)
        src[src_len] = eos_idx
        sources.append(src)

        # Only the prefix up to and including position k is visible.
        dec_in = torch.zeros(max_length, dtype=torch.long)
        dec_in[:k + 1] = torch.tensor(y[j][:k + 1], dtype=torch.long)
        dec_inputs.append(dec_in)

        tgt = torch.zeros(max_length, dtype=torch.long)
        tgt[k] = y[j][k + 1] if k + 1 < tgt_len else eos_idx
        targets.append(tgt)

        # Build the mask from the position, not from `target > 0`: a genuine
        # label with id 0 would otherwise be treated as padding and ignored.
        weight = torch.zeros(max_length, dtype=torch.long)
        weight[k] = 1
        weights.append(weight)

    x_ = torch.stack(sources)
    y_ = torch.stack(dec_inputs)
    w_ = torch.stack(weights)
    target_t_ = torch.stack(targets)

    return x_, y_[:, :maxk + 1], w_[:, :maxk + 1], target_t_[:, :maxk + 1]


def train(input1, target, encoder, decoder, criterion, encoder_optimizer, decoder_optimizer, max_length, num_iters, batch_size):
    """Run ``num_iters`` optimisation steps on randomly sampled batches.

    Args:
        input1: list of source id sequences.
        target: list of target id sequences, parallel to ``input1``.
        encoder: encoder module, called with the source batch.
        decoder: decoder module, called as ``decoder(enc_out, dec_inp, mode='train')``.
        criterion: loss whose output is multiplied element-wise by the weight
            mask (the caller in this notebook uses ``reduction='none'``).
        encoder_optimizer: optimizer for the encoder parameters.
        decoder_optimizer: optimizer for the decoder parameters.
        max_length: padded sequence width handed to ``get_batch``.
        num_iters: number of batches to train on.
        batch_size: samples per batch; also the loss normaliser.

    Returns:
        The mean per-iteration loss as a detached tensor.
    """
    encoder.train()
    decoder.train()

    epoch_loss = 0.0
    for _ in range(num_iters):
        enc_inp, dec_inp, w, target_t = get_batch(input1, target, max_length, batch = batch_size)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        # Call the modules rather than .forward() so that hooks are honoured.
        enc_out = encoder(enc_inp)
        dec_out = decoder(enc_out, dec_inp, mode = 'train')

        # Mask out every position except the supervised one, then average over
        # the batch.
        masked_loss = criterion(dec_out, target_t.long().to(device)) * w.to(device)
        loss = masked_loss.sum() / batch_size

        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        # Accumulate a detached value: summing the live loss tensor would keep
        # autograd history from every iteration alive.
        epoch_loss += loss.detach()

    return epoch_loss/num_iters

def getPair(i, data):
    """Return the pair ``(data[0][i], data[1][i])``."""
    return tuple(data[side][i] for side in (0, 1))


def trainEpochs(encoder, decoder, num_iters, epochs, learning_rate, max_length, data_x, data_y, batch_size, print_in, multiplier, mstep, scale = 1):
    """Train the encoder/decoder pair for several epochs and log the loss.

    Args:
        encoder: encoder module to train.
        decoder: decoder module to train.
        num_iters: number of batches per epoch.
        epochs: number of epochs.
        learning_rate: initial Adam learning rate for both optimizers.
        max_length: padded sequence width handed to ``train``.
        data_x: list of source id sequences.
        data_y: list of target id sequences, parallel to ``data_x``.
        batch_size: samples per batch.
        print_in: log (and possibly checkpoint) every ``print_in`` epochs.
        multiplier: StepLR ``gamma`` (learning-rate factor).
        mstep: StepLR ``step_size`` in epochs.
        scale: factor applied to the loss in the printed message only.

    Returns:
        A DataFrame with columns ``epochs`` and ``Loss`` for the logged epochs.
    """
    start = time.time()

    encoder_optimizer = optim.Adam(encoder.parameters(), lr = learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr = learning_rate)

    # Per-element losses are required because train() applies a weight mask.
    criterion = nn.CrossEntropyLoss(reduction = 'none')

    scheduler1 = torch.optim.lr_scheduler.StepLR(encoder_optimizer, step_size=mstep, gamma=multiplier)
    scheduler2 = torch.optim.lr_scheduler.StepLR(decoder_optimizer, step_size=mstep, gamma=multiplier)

    epchs, loss_epochs = [], []

    for e in range(epochs):
        # Train on the data passed in, not on the notebook globals.
        loss = train(data_x, data_y, encoder, decoder, criterion, encoder_optimizer, decoder_optimizer, max_length, num_iters, batch_size)

        # Schedulers are stepped after the optimizers have stepped; stepping
        # them first skips the initial value of the schedule.
        scheduler1.step()
        scheduler2.step()

        if e % print_in == 0:
            if e % 30 == 0:
                # Checkpoint the models being trained (the arguments), not the
                # globals that happen to carry the same objects in this notebook.
                torch.save(encoder.state_dict(), 'data//encoder.sd')
                torch.save(decoder.state_dict(), 'data//decoder.sd')
            epchs.append(e)
            loss_epochs.append(loss.item())
            print('Epoch: ', e, 'Loss: ', scale*loss, ' Time from Start: ', time.time() - start)

    history = pd.DataFrame.from_dict({'epochs': epchs, 'Loss': loss_epochs})
    return history

    

        

    


In [None]:
# Model size: attention heads per layer and number of stacked layers.
heads = 10
nlayers = 3
# The model width and the vocabulary size are taken from the shape of the BPEmb
# embedding matrices (columns = width, rows = vocabulary). Both models are moved
# to the selected device.
encoder1 = TransformerEnc(bpemb_en.vectors.shape[1], bpemb_en.vectors.shape[0], heads, num_layers = nlayers).to(device)
decoder1 = TransformerDec(bpemb_fr.vectors.shape[1], bpemb_fr.vectors.shape[0], max_length, heads, num_layers = nlayers).to(device)


In [None]:
# Resume from a previously saved checkpoint when one exists. map_location lets a
# checkpoint written on a GPU machine be restored on a CPU-only one. On a fresh
# run no checkpoint has been written yet, so a missing file is reported instead
# of aborting the notebook.
try:
    encoder_state = torch.load('data//encoder1.sd', map_location=device)
    decoder_state = torch.load('data//decoder1.sd', map_location=device)
except FileNotFoundError:
    print('No saved checkpoint found; continuing with freshly initialised weights.')
else:
    # Load both or neither, so the pair never ends up half-restored.
    encoder1.load_state_dict(encoder_state)
    decoder1.load_state_dict(decoder_state)

In [None]:
# Learning-rate schedule: passed to trainEpochs as StepLR's gamma / step_size.
# NOTE(review): gamma > 1 makes the learning rate grow at every scheduler step —
# confirm this is intended.
gamma = 1.1
gstep = 10
# Samples per batch and batches per epoch.
batch_size = 80
steps = 1000
# Log the loss every `printevery` epochs.
printevery = 1
# Initial Adam learning rate.
lr = 0.000005
# NOTE(review): d_model is not read anywhere in the visible notebook; the models
# above take their width from the BPEmb vectors instead — confirm and remove.
d_model = 61
epochs = 100
# NOTE(review): teacher is not read anywhere in the visible notebook.
teacher = 1
# Multiplies the loss in the printed progress line only; the recorded history is unscaled.
scale = 1000
history = trainEpochs(encoder1, decoder1, steps, epochs, lr, max_length, xx, yy, batch_size, printevery, gamma, gstep, scale)

In [None]:
# Persist the final weights. These are the file names the checkpoint-loading
# cell reads; the periodic checkpoints written inside trainEpochs use
# 'data//encoder.sd' / 'data//decoder.sd' instead.
torch.save(encoder1.state_dict(), 'data//encoder1.sd')
torch.save(decoder1.state_dict(), 'data//decoder1.sd')

In [None]:
s = 'Spare me the grisly details.'  # reference translation: Épargnez-moi les détails macabres !
def translate(s):
    """Greedily decode a French translation of the English sentence ``s``.

    Uses the module-level ``encoder1`` / ``decoder1`` models (switched to eval
    mode), the BPEmb tokenisers, ``bos_idx`` / ``eos_idx`` and ``max_length``.

    Args:
        s: English input sentence.

    Returns:
        The text produced by decoding the generated id sequence with ``bpemb_fr``.
    """
    # Lower-case the input, mirroring what is done to the training sentences.
    ids = bpemb_en.encode_ids_with_bos_eos(s.lower())
    ids = torch.tensor(ids, dtype=torch.long).reshape(1, len(ids))

    encoder1.eval()
    decoder1.eval()

    decoded = [bos_idx]
    # Inference only: skip autograd bookkeeping for every decoding step.
    with torch.no_grad():
        memory = encoder1(ids)
        for t in range(max_length):
            dec_in = torch.tensor(decoded, dtype=torch.long).reshape(1, len(decoded))
            # The decoder only produces output when mode == 'train'; dropout is
            # governed by the .eval() calls above, not by this flag.
            predictions = decoder1(memory, dec_in, mode = 'train').argmax(dim = 1)
            # Position t is the last position of the current prefix.
            next_id = predictions[0, t].item()
            decoded.append(next_id)
            if next_id == eos_idx:
                break

    return bpemb_fr.decode_ids(decoded)

print(translate(s))
