In [33]:
import csv
from bpemb import BPEmb
import torch
import torch.autograd as autograd
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
from torch.nn.init import xavier_uniform_
import copy
from typing import Optional, Any

In [34]:
# Release unused cached GPU memory held by PyTorch's CUDA allocator before the heavy cells run.
torch.cuda.empty_cache()

In [35]:
import re
bpemb_sa = BPEmb(lang="sa", vs=100000,dim=300,add_pad_emb=True)
# Characters that are replaced by a space before BPE encoding.
# Written as a raw string: the original non-raw "\_" is an invalid escape sequence
# (a warning today, an error in future Python versions).
# The original "\_-`" was parsed by the regex engine as the range "_".."`", so a literal
# "-" was never replaced; that behaviour is preserved here.
# NOTE(review): add "\-" to the class if hyphens are meant to be stripped as well.
reg = re.compile(r"[!@#$%^&*,/_`~\n]")
sansList = []
with open('allData.csv', encoding = "utf-8") as csvfile:
    for row in csv.reader(csvfile):
        # The original guard `if i != "SANS"` compared a list with a str and was always
        # true, so every row (including a header row, if the file has one) is kept.
        # NOTE(review): if allData.csv has a header, skip it here AND in the English
        # cell together, otherwise the two lists stop being aligned.
        # Column 1 is encoded after dropping its first 7 and last 4 characters.
        sansList.append(bpemb_sa.encode(reg.sub(" ", row[1][7:][:-4])))

In [36]:
# Number of encoded rows read from allData.csv.
len(sansList)

18300

In [37]:
# bpemb_sa = BPEmb(lang="sa", vs=100000,dim=300,add_pad_emb=True)
# encodedSansList = []
# for i in sansList:
#     encodedSansList.append(autograd.Variable(torch.tensor([[bpemb_sa.encode_ids(i)]])))

In [38]:
# Characters that are replaced by a space before BPE encoding.
# Raw string: the original non-raw "\_" is an invalid escape sequence (a warning today,
# an error in future Python versions). The matched character set is unchanged.
rege = re.compile(r"[!@#$%^&*:,/_`~\n]")
engList = []
bpemb_en = BPEmb(lang="en", vs=100000,dim=300, add_pad_emb= True)
with open('allData.csv', encoding = "utf-8") as csvfile:
    for row in csv.reader(csvfile):
        # The original guard `if i != "ENG"` compared a list with a str and was always
        # true, so every row (including a header row, if the file has one) is kept.
        # NOTE(review): if allData.csv has a header, skip it here AND in the Sanskrit
        # cell together, otherwise the two lists stop being aligned.
        # Column 2 is encoded after dropping its first 7 and last 4 characters.
        engList.append(bpemb_en.encode(rege.sub(" ", row[2][7:][:-4])))

In [39]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the (Sanskrit, English) pairs as a test set. Both lists are split in the
# same call so pairs stay aligned; the fixed random_state makes the split reproducible.
sans_train, sans_test, eng_train, eng_test = train_test_split(sansList,engList,test_size=0.20,random_state=123)

In [40]:
# Number of English training sentences after the 80/20 split.
len(eng_train)

14640

In [41]:
# Assign an integer id to every distinct BPE piece seen in the English training data.
# Ids start at 4; ids 0-3 are taken by the special tokens added in later cells.
eng_vocab = {}
for sentence in eng_train:
    for piece in sentence:
        # setdefault leaves already-seen pieces untouched and gives new ones the next id.
        eng_vocab.setdefault(piece, len(eng_vocab) + 4)

In [42]:
# Assign an integer id to every distinct BPE piece seen in the Sanskrit training data.
# Ids start at 2; ids 0 and 1 are taken by the special tokens added in later cells.
sans_vocab = {}
for sentence in sans_train:
    for piece in sentence:
        # setdefault leaves already-seen pieces untouched and gives new ones the next id.
        sans_vocab.setdefault(piece, len(sans_vocab) + 2)

In [43]:
# Provisional vocabulary sizes: the special tokens are only added in the following cells,
# after which both values are recomputed.
src_vocab_len, tgt_vocab_len = len(sans_vocab), len(eng_vocab)

In [44]:
# Id 0 is the padding id in both vocabularies (the batching cells pad with 0).
sans_vocab["<pad>"] = 0
eng_vocab["<pad>"] = 0

In [45]:
# bpemb_en = BPEmb(lang="en", vs=100000,dim=300, add_pad_emb= True)
# encodedEngList = []
# for i in engList:
#     encodedEngList.append(autograd.Variable(torch.tensor([[bpemb_en.encode_ids(i)]])))

In [46]:
# Register the remaining special tokens: English uses 1/2/3 for <bos>/<eos>/<unk>,
# Sanskrit uses 1 for <unk>. Then recompute the final vocabulary sizes.
eng_vocab.update({"<bos>": 1, "<eos>": 2, "<unk>": 3})
sans_vocab.update({"<unk>": 1})
src_vocab_len = len(sans_vocab)
tgt_vocab_len = len(eng_vocab)
print(src_vocab_len, tgt_vocab_len)

21828 18000


In [47]:
# Reverse lookup: find the English BPE piece that was assigned id 1345.
list(eng_vocab.keys())[list(eng_vocab.values()).index(1345)]

'▁observing'

In [48]:
# Sentences per batch: 4 for the training lists, 1 for the held-out test lists.
batch_size = 4
batch_size_test = 1

In [49]:
# Build the Sanskrit training batches: each entry is a LongTensor of shape
# (longest_sentence_in_batch, sentences_in_batch), right-padded with the <pad> id.
entireSansTensorList = []
pad_id = sans_vocab["<pad>"]
for start in range(0, len(sans_train), batch_size):
    batch = sans_train[start:start + batch_size]
    longest = max(len(sentence) for sentence in batch)
    # Direct indexing instead of .get(): a missing piece fails loudly with a KeyError
    # rather than silently putting None into the row.
    rows = [
        [sans_vocab[piece] for piece in sentence] + [pad_id] * (longest - len(sentence))
        for sentence in batch
    ]
    # torch.tensor already returns an autograd-capable tensor; the deprecated
    # autograd.Variable wrapper is not needed.
    entireSansTensorList.append(torch.tensor(rows, dtype=torch.long).t())

In [50]:
# Build the Sanskrit test batches: each entry is a LongTensor of shape
# (longest_sentence_in_batch, sentences_in_batch), right-padded with the <pad> id.
# Pieces that never occurred in the training data are mapped to <unk>.
entireSansTestTensorList = []
pad_id = sans_vocab["<pad>"]
unk_id = sans_vocab["<unk>"]
for start in range(0, len(sans_test), batch_size_test):
    batch = sans_test[start:start + batch_size_test]
    longest = max(len(sentence) for sentence in batch)
    rows = [
        [sans_vocab.get(piece, unk_id) for piece in sentence] + [pad_id] * (longest - len(sentence))
        for sentence in batch
    ]
    # torch.tensor already returns an autograd-capable tensor; the deprecated
    # autograd.Variable wrapper is not needed.
    entireSansTestTensorList.append(torch.tensor(rows, dtype=torch.long).t())

In [51]:
# Shape check: batches are laid out (longest_sentence_in_batch, sentences_in_batch).
entireSansTensorList[1].size()

torch.Size([26, 4])

In [52]:
# Build the English training batches: every sentence is wrapped as <bos> ... <eos> and
# right-padded with <pad>, giving LongTensors of shape
# (longest_sentence_in_batch + 2, sentences_in_batch).
entireEngTensorList = []
pad_id, bos_id, eos_id = eng_vocab["<pad>"], eng_vocab["<bos>"], eng_vocab["<eos>"]
for start in range(0, len(eng_train), batch_size):
    batch = eng_train[start:start + batch_size]
    longest = max(len(sentence) for sentence in batch)
    # Direct indexing instead of .get(): a missing piece fails loudly with a KeyError
    # rather than silently putting None into the row.
    rows = [
        [bos_id] + [eng_vocab[piece] for piece in sentence] + [eos_id]
        + [pad_id] * (longest - len(sentence))
        for sentence in batch
    ]
    # torch.tensor already returns an autograd-capable tensor; the deprecated
    # autograd.Variable wrapper is not needed.
    entireEngTensorList.append(torch.tensor(rows, dtype=torch.long).t())

In [53]:
# Build the English test batches: every sentence is wrapped as <bos> ... <eos> and
# right-padded with <pad>, giving LongTensors of shape
# (longest_sentence_in_batch + 2, sentences_in_batch).
# Pieces that never occurred in the training data are mapped to <unk>.
entireEngTestTensorList = []
pad_id, bos_id, eos_id = eng_vocab["<pad>"], eng_vocab["<bos>"], eng_vocab["<eos>"]
unk_id = eng_vocab["<unk>"]
for start in range(0, len(eng_test), batch_size_test):
    batch = eng_test[start:start + batch_size_test]
    longest = max(len(sentence) for sentence in batch)
    rows = [
        [bos_id] + [eng_vocab.get(piece, unk_id) for piece in sentence] + [eos_id]
        + [pad_id] * (longest - len(sentence))
        for sentence in batch
    ]
    # torch.tensor already returns an autograd-capable tensor; the deprecated
    # autograd.Variable wrapper is not needed.
    entireEngTestTensorList.append(torch.tensor(rows, dtype=torch.long).t())

In [54]:
# Inspect the first English training batch: transposing gives (sentences, padded_length),
# so size(1) is the padded length including <bos> and <eos>.
# Note: despite its name, `src` holds an English (target-side) batch here.
src = entireEngTensorList[0].transpose(0,1)
print(src.size(1))
#src_mask = src_mask.float().masked_fill(src_mask == 0, float('-inf')).masked_fill(src_mask == 1, float(0.0))
#print(src_mask)

41


In [55]:
# Split the pre-built training batches (whole batches, not single sentences) 90/10 into
# training and validation sets. Both lists are split in one call so each Sanskrit batch
# stays paired with its English batch.
sans_train_tensor, sans_val_tensor, eng_train_tensor, eng_val_tensor = train_test_split(entireSansTensorList,entireEngTensorList,test_size=0.10,random_state=123)

In [56]:
class PositionalEncoding(nn.Module):
    """Scale embeddings by sqrt(d_model) and add fixed sinusoidal position encodings.

    Args:
        d_model: embedding width (even or odd).
        dropout: dropout probability applied to the summed result.
        max_len: number of positions for which encodings are precomputed.

    The encodings are stored in the non-trainable buffer ``pe`` of shape
    (max_len, 1, d_model), which broadcasts over the batch axis of a
    (seq_len, batch, d_model) input.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.d_model = d_model
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine columns;
        # without the slice the assignment fails with a shape mismatch.
        pe[:, 1::2] = torch.cos(angles)[:, :d_model // 2]
        # (max_len, d_model) -> (max_len, 1, d_model)
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return dropout(x * sqrt(d_model) + pe[:seq_len]) for x of shape (seq_len, batch, d_model).

        Raises:
            RuntimeError: if seq_len exceeds the precomputed max_len.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise RuntimeError(
                f"sequence length {seq_len} exceeds the positional-encoding max_len {self.pe.size(0)}"
            )
        x = x * math.sqrt(self.d_model)
        x = x + self.pe[:seq_len, :]
        return self.dropout(x)

class MyTransformer(nn.Module):
    """Encoder-decoder Transformer over token-id sequences.

    Source and target ids are embedded, position-encoded, run through
    ``nn.TransformerEncoder`` / ``nn.TransformerDecoder`` and projected to
    target-vocabulary logits.

    Args:
        d_model: embedding / model width.
        nhead: number of attention heads.
        num_encoder_layers: number of encoder layers.
        num_decoder_layers: number of decoder layers.
        dim_feedforward: hidden width of the feed-forward sub-layers.
        dropout: dropout probability for the layers and the positional encoder.
        activation: feed-forward activation name passed to the PyTorch layers.
        source_vocab_length: size of the source embedding table.
        target_vocab_length: size of the target embedding table and output layer.
    """

    def __init__(self, d_model: int = 512, nhead: int = 8, num_encoder_layers: int = 6,
                 num_decoder_layers: int = 6, dim_feedforward: int = 2048, dropout: float = 0.1,
                 activation: str = "relu",source_vocab_length: int = 60000,target_vocab_length: int = 60000) -> None:
        super(MyTransformer, self).__init__()
        self.source_embedding = nn.Embedding(source_vocab_length, d_model)
        # Forward the configured dropout instead of silently using the encoder's default.
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        encoder_norm = nn.LayerNorm(d_model)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm)
        self.target_embedding = nn.Embedding(target_vocab_length, d_model)
        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, activation)
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm)
        # Project from the model width; the previous hard-coded 512 broke any d_model != 512.
        self.out = nn.Linear(d_model, target_vocab_length)
        self._reset_parameters()
        self.d_model = d_model
        self.nhead = nhead

    def forward(self, src: Tensor, tgt: Tensor, src_mask: Optional[Tensor] = None, tgt_mask: Optional[Tensor] = None,
                memory_mask: Optional[Tensor] = None, src_key_padding_mask: Optional[Tensor] = None,
                tgt_key_padding_mask: Optional[Tensor] = None, memory_key_padding_mask: Optional[Tensor] = None) -> Tensor:
        """Return target-vocabulary logits for every target position.

        ``src`` and ``tgt`` are id tensors whose second dimension is the batch;
        the masks are passed unchanged to the PyTorch encoder / decoder.

        Raises:
            RuntimeError: if ``src`` and ``tgt`` disagree on the batch dimension.
        """
        if src.size(1) != tgt.size(1):
            raise RuntimeError("the batch number of src and tgt must be equal")
        src = self.source_embedding(src)
        src = self.pos_encoder(src)
        memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        tgt = self.target_embedding(tgt)
        tgt = self.pos_encoder(tgt)
        output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
                              tgt_key_padding_mask=tgt_key_padding_mask,
                              memory_key_padding_mask=memory_key_padding_mask)
        output = self.out(output)
        return output

    def _reset_parameters(self):
        r"""Initiate parameters in the transformer model."""
        # Xavier-initialise every parameter with more than one dimension
        # (this includes both embedding tables); biases and norm weights are left alone.
        for p in self.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)

In [57]:
# The decoder side must be sized by the English vocabulary. The original passed
# src_vocab_len for both, which over-allocated the target embedding and output layer.
# Note: checkpoints saved with the old output size will not load into this model.
model = MyTransformer(source_vocab_length=src_vocab_len,target_vocab_length=tgt_vocab_len)
model = model.to("cuda")
optim = torch.optim.AdamW(model.parameters(), lr=0.0000251, betas=(0.9, 0.98), eps=1e-9)
#optim = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

In [74]:
def _causal_mask(size, device):
    """Return a (size, size) float mask: -inf above the diagonal, 0 on and below it."""
    return torch.triu(torch.full((size, size), float('-inf'), device=device), diagonal=1)


def _batch_loss(model, src, trg, device, pad_id=0):
    """Return the summed teacher-forced cross-entropy for one (seq_len, batch) batch pair."""
    src = src.to(device)
    trg = trg.to(device)
    # Right shift: the decoder sees tokens [0, T-1) and is scored against tokens [1, T).
    trg_input = trg[:-1]
    targets = trg[1:].reshape(-1)
    # Key-padding masks are (batch, seq_len) with True on padding positions.
    src_pad_mask = (src == pad_id).transpose(0, 1)
    trg_pad_mask = (trg_input == pad_id).transpose(0, 1)
    # Only the decoder self-attention is causal. The encoder and the decoder's
    # cross-attention see the whole source sentence, so no src_mask / memory_mask is passed.
    preds = model(src, trg_input,
                  src_key_padding_mask=src_pad_mask,
                  tgt_key_padding_mask=trg_pad_mask,
                  memory_key_padding_mask=src_pad_mask,
                  tgt_mask=_causal_mask(trg_input.size(0), device))
    preds = preds.reshape(-1, preds.size(-1))
    return F.cross_entropy(preds, targets, ignore_index=pad_id, reduction='sum')


def train(sans_train, eng_train, sans_test, eng_test, model, optim, num_epochs,use_gpu=True):
    """Train ``model`` and validate it after every epoch.

    Args:
        sans_train, eng_train: equally long lists of source / target batches, each a
            LongTensor laid out (seq_len, batch) and padded with id 0.
        sans_test, eng_test: validation batches in the same layout.
        model: module called as ``model(src, tgt, **mask_kwargs)`` and returning
            logits of shape (tgt_len, batch, vocab).
        optim: optimizer over ``model``'s parameters.
        num_epochs: number of passes over the training batches.
        use_gpu: run on "cuda" when True, otherwise on "cpu"; the model must
            already live on that device.

    Returns:
        (train_losses, valid_losses): one mean per-batch loss per epoch. Each batch
        contributes its summed cross-entropy divided by the source sequence length
        (the normalisation the original training loop used).

    Side effects:
        Prints the running loss after every batch and a summary after every epoch;
        writes "checkpoint_best_epoch.pt" whenever the validation loss improves.

    Raises:
        ValueError: if the batch lists are empty or not pairwise the same length.
    """
    if len(sans_train) != len(eng_train) or len(sans_test) != len(eng_test):
        raise ValueError("source and target batch lists must have the same length")
    if not sans_train or not sans_test:
        raise ValueError("training and validation batch lists must be non-empty")

    device = torch.device("cuda" if use_gpu else "cpu")
    train_losses = []
    valid_losses = []
    for epoch in range(num_epochs):
        train_loss = 0
        valid_loss = 0

        # Train model
        model.train()
        for i, (src, trg) in enumerate(zip(sans_train, eng_train)):
            optim.zero_grad()
            loss = _batch_loss(model, src, trg, device)
            loss.backward()
            optim.step()
            train_loss += loss.item() / src.size(0)
            print("----------- Batch: " + str(i) + " loss: " + str(train_loss))

        # Validate with exactly the same batch handling as training. The original
        # validation loop skipped the transposes and therefore sliced the batch axis.
        model.eval()
        with torch.no_grad():
            for src, trg in zip(sans_test, eng_test):
                loss = _batch_loss(model, src, trg, device)
                valid_loss += loss.item() / src.size(0)

        # Log after each epoch
        print(f'''Epoch [{epoch+1}/{num_epochs}] complete. Train Loss: {train_loss/len(sans_train):.3f}. Val Loss: {valid_loss/len(sans_test):.3f}''')

        # Save best model till now:
        if valid_loss/len(sans_test) < min(valid_losses, default=1e9):
            print("saving state dict")
            torch.save(model.state_dict(), "checkpoint_best_epoch.pt")

        train_losses.append(train_loss/len(sans_train))
        valid_losses.append(valid_loss/len(sans_test))

    return train_losses,valid_losses

In [75]:
# Train for 10 epochs on the 90% batch split, validating on the remaining 10% after each epoch.
train_losses,valid_losses = train(sans_train_tensor, eng_train_tensor, sans_val_tensor ,eng_val_tensor, model, optim, 10)

----------- Batch: 0 loss: 29.46626683285362
----------- Batch: 1 loss: 71.62312718441612
----------- Batch: 2 loss: 102.10975266017734
----------- Batch: 3 loss: 125.64339659463914
----------- Batch: 4 loss: 166.8838913863058
----------- Batch: 5 loss: 199.9909241621847
----------- Batch: 6 loss: 246.57531230821837
----------- Batch: 7 loss: 274.25908139557635


KeyboardInterrupt: 

In [None]:
# Persist the current weights (train() separately writes the best-validation checkpoint).
torch.save(model.state_dict(), f"after_10_epochs.pt")

In [None]:
import numpy as np

def decoder(pred, batch_index=3, vocab=None, bpe=None):
    """Greedy-decode one sentence of a batch of logits back to text.

    Args:
        pred: array-like of shape (seq_len, batch, vocab_size) holding scores.
        batch_index: which sentence of the batch to decode. Defaults to 3, the
            index the original implementation hard-coded.
        vocab: piece -> id mapping; defaults to the notebook-level ``eng_vocab``.
        bpe: object with a ``decode(list_of_pieces)`` method; defaults to the
            notebook-level ``bpemb_en``.

    Returns:
        Whatever ``bpe.decode`` returns for the list of highest-scoring pieces,
        one per position (special tokens are not filtered out).

    Raises:
        IndexError: if ``batch_index`` is outside the batch.
        KeyError: if a predicted id has no piece in ``vocab``.
    """
    vocab = eng_vocab if vocab is None else vocab
    bpe = bpemb_en if bpe is None else bpe

    # Build the id -> piece table once instead of scanning the vocabulary for
    # every position; setdefault keeps the first piece for an id, like list.index did.
    id_to_piece = {}
    for piece, idx in vocab.items():
        id_to_piece.setdefault(idx, piece)

    vals = []
    for step_scores in np.asarray(pred):
        best_id = int(np.argmax(step_scores[batch_index]))
        vals.append(id_to_piece[best_id])
    print(vals)
    return bpe.decode(vals)
    

In [None]:
# Teacher-forced forward pass over the first validation batch.
model.eval()
src_batch = sans_val_tensor[0].to('cuda')
tgt_batch = eng_val_tensor[0].to('cuda')
tgt_len = tgt_batch.size(0)
# Use the same masking as training: a causal mask so a position cannot look at later
# target tokens, plus key-padding masks (True on the padding id 0), shaped (batch, seq_len).
causal_mask = torch.triu(torch.full((tgt_len, tgt_len), float('-inf'), device='cuda'), diagonal=1)
src_pad_mask = (src_batch == 0).transpose(0, 1)
tgt_pad_mask = (tgt_batch == 0).transpose(0, 1)
# no_grad: inference does not need the autograd graph, which only costs GPU memory.
with torch.no_grad():
    out = model(src_batch, tgt_batch, tgt_mask=causal_mask,
                src_key_padding_mask=src_pad_mask,
                tgt_key_padding_mask=tgt_pad_mask,
                memory_key_padding_mask=src_pad_mask)

In [None]:
# Compare the output size with the target batch: len() of the target tensor is its first
# dimension (the padded target length), which should match out's first dimension.
print(out.size())
print(len(eng_val_tensor[0]))

In [None]:
# Move the logits to the CPU as a NumPy array and print the decoded text.
print(decoder(out.to('cpu').detach().numpy()))

In [None]:
# Repeats the preceding cell's decode-and-print call unchanged.
print(decoder(out.to('cpu').detach().numpy()))

In [None]:
# Repeats the preceding cell's decode-and-print call unchanged.
print(decoder(out.to('cpu').detach().numpy()))

In [None]:
# Repeats the preceding cell's decode-and-print call unchanged.
print(decoder(out.to('cpu').detach().numpy()))