In [2]:
import pandas as pd
import torch
from torch import optim
from torch import nn
from torch.utils.data import DataLoader, random_split, Dataset
from torchvision.transforms import ToTensor
import unicodedata
import os
import re

In [3]:
if torch.cuda.is_available():
    device = torch.device('cuda', index=0)
else:
    device = torch.device('cpu', index=0)
    
print(device)

cuda:0


In [4]:
data_dir = "./Datasets/final.csv"
dataset = pd.read_csv(data_dir)
dataset.head()

Unnamed: 0.1,Unnamed: 0,id,og,t
0,0,42928-1500614319216-63344,You do not meet a man but frowns:,Every man you meet these days is frowning.
1,1,42928-1500614326583-89821,our bloods No more obey the heavens than our...,Our bodies are in agreement with the planetar...
2,2,A-63849,But what's the matter?,What's wrong?
3,3,42930-1500614347266-80123,"His daughter, and the heir of's kingdom, whom...","The king wanted his daughter, the only heir to..."
4,4,42930-1500614355280-38326,she's wedded; Her husband banish'd; she impr...,"She's married, her husband is banished, she's..."


In [5]:
pairs = list(zip(dataset['og'], dataset['t']))
pairs[0]

(' You do not meet a man but frowns: ',
 'Every man you meet these days is frowning.')

In [23]:
def normalize_str(s):
    final_str = ""
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) != 'Mn':
            final_str+=ch
    
    final_str = re.sub(r"([.!:?'])", r" \1", final_str)
    final_str = re.sub(r"[^a-zA-Z:!?']+", r" ", final_str)
    
    return final_str.strip()

def creatNormalizedPairs(pairs):
    initpairs = []
    for pair in pairs:
        s1, s2 = pair
        # print(len(s1))
        s1 = normalize_str(s1.lower().strip())
        s2 = normalize_str(s2.lower().strip())
        initpairs.append([s1,s2])
        
    return initpairs

In [7]:
MAX_LENGTH = 25

def filterPairs(initpairs):
    pairs = []
    for pair in initpairs:
        if len(pair[0].split(" ")) <= MAX_LENGTH and len(pair[1].split(" ")) <= MAX_LENGTH:
            pairs.append([pair])
            
    return pair

In [8]:
class Vocab:
    def __init__(self, name):
        self.name = name
        self.word2idx = {'SOS':0, 'EOS':1}
        self.idx2word = {0:'SOS', 1:'EOS'}
        self.wordCnt = {}
        self.nwords = 2
        
    def GenerateVocab(self, s):
        for word in s.split(" "):
            if word not in self.word2idx:
                self.word2idx[word] = self.nwords
                self.idx2word[self.nwords] = word
                self.wordCnt[word] = 1
                self.nwords+=1
            else:
                self.wordCnt[word]+=1    
            

In [9]:
def get_inp_ids(sentence, langobj):
    input_ids = []
    for word in sentence.split(" "):
        input_ids.append(langobj.word2idx[word])
        
    if langobj.name == 'shake':
        input_ids.append(langobj.word2idx['EOS'])
    else:
        input_ids.insert(0, langobj.word2idx['SOS'])
        input_ids.append(langobj.word2idx['EOS'])
        
    return torch.tensor(input_ids)

In [10]:
def ids2sentence(ids, vocab):
    sentence = ""
    print(id.ndim)
    for id in ids.squeeze():
        if id==0:
            continue
        word = vocab.idx2word[id.item()]
        sentence += word + " "
        if id == 1:
            break;
    return sentence

In [11]:
class customDataset(Dataset):
    def __init__(self):
        super(customDataset, self).__init__()
    
    def __len__(self):
        return length
    
    def __getitem__(self, index):
        s, t = pairs[index]
        s_input_ids = torch.zeros(MAX_LENGTH+1, dtype=torch.int64)
        t_input_ids = torch.zeros(MAX_LENGTH+2, dtype=torch.int64)
        s_input_ids[:len(s.splt(" "))+1] = get_inp_ids(s, shake)
        t_input_ids[:len(t.split(" "))+2] = get_inp_ids(t, eng)

In [12]:
class Encoder(nn.Module):
    def __init__(self, input_layer, embed_layer, hidden_layer):
        super().__init__()
        self.embed = nn.Embedding(input_layer, embed_layer)
        self.dropout = nn.Dropout(p=0.1)
        self.gru = nn.GRU(embed_layer, hidden_layer, batch_first=True)
        
    def forward(self, x):
        x = self.embed(x)
        x = self.dropout(x)
        _, hidden = self.gru(x)
        return hidden

In [13]:
class Decoder(nn.Module):
    def __init__(self, output_layer, hidden_layer, embed_layer):
        super().__init__()
        self.embed = nn.Embedding(output_layer, embed_layer)
        self.relu = nn.ReLU()
        self.logsmax = nn.LogSoftmax(dim=-1)
        self.gru = nn.GRU(embed_layer, hidden_layer, batch_first=True)
        self.dense = nn.Linear(hidden_layer, output_layer)
        
    def forward(self, x, hidden_wt):
        x = self.embed(x)
        x = self.relu(x)
        out, hidden = self.gru(x, hidden_wt)
        out = self.dense(out)
        out = self.logsmax(out)
        
        return out, hidden
        

In [14]:
def train_one_epoch():
    encoder.train()
    decoder.train()
    track_loss = 0
    
    for i, (s_ids, t_ids) in enumerate(dataloader):
        s_ids = s_ids.to(device)
        t_ids = t_ids.to(device)
        
        hidden_wt = encoder(s_ids)
        
        pred, _ = decoder(t_ids[:,0:-1], hidden_wt)
        
        gt = t_ids[:,1:]
        pred = pred.view(-1,pred.shape[-1])
        gt = gt.reshape(-1)
        
        loss = loss_fn(pred, gt)
        track_loss+=loss.item()
        
        opte.zero_grad()
        optd.zero_grad()
        
        loss.backward()
        
        opte.step()
        optd.step()
    
    return round(track_loss/len(dataloader), 2)

In [15]:
def eval_one(e, n_epochs):
    encoder.eval()
    decoder.eval()
    track_loss = 0
    
    with torch.no_grad():
        for i, (s_ids, t_ids) in enumerate(test_dataloader):
            s_ids = s_ids.to(device)
            t_ids = t_ids.to(device)
            
            hidden_wt = encoder(s_ids)
            # print(t_ids.ndim)
            input_ids = t_ids[:,0]
            # print(input_ids.ndim)
            
            pred = []
            
            if i+1==n_epochs:
                pred_sentence=""
                
            for j in range(1, MAX_LENGTH+2):
                probs, _ = decoder(input_ids.unsqueeze(1),hidden_wt)
                pred.append(probs)
                _, input_ids = torch.topk(probs, dim=-1)
                # print(input_ids.ndim) 
                input_ids=input_ids.squeeze(1,2)
                # print(input_ids.ndim) 
                if e+1==n_epochs:
                    word=eng.index2word[input_ids.item()]
                    pred_sentence+=word + " "
                if input_ids.item() == 1:
                    break
                
            if e+1 == n_epochs:
                src_sentence = ids2sentence(s_ids, shake)
                gt_sentence = ids2sentence(t_ids, eng)
                
                print("\n-----------------------------------")
                print("Source Sentence:",src_sentence)
                print("GT Sentence:",gt_sentence)
                print("Predicted Sentence:",pred_sentence)
                
            pred_cat = torch.cat(pred, dim=1)
            print(pred_cat.ndim)
            pred_reshaped = pred_cat.view(-1, pred_cat.shape[-1])
            gt=t_ids[:,1:j+1]
            gt=gt.view(-1)
            
            loss=loss_fn(pred_reshaped, gt)
            track_loss+=loss.item()
            
        if e+1==n_epochs:
            print("-----------------------------------")
        return round(track_loss/len(test_dataloader), 2)

In [25]:
pairs = creatNormalizedPairs(pairs)
pairs = filterPairs(pairs)
length = len(pairs)
print(length)

2
