In [1]:
from google.colab import drive 
drive.mount('/content/drive/',force_remount = True) 

Mounted at /content/drive/


In [2]:
from io import open 
import unicodedata
import string
import re
import random
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
import os 
os.chdir('/content/drive/MyDrive/DeepLearningPytorch')

In [4]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
            
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    with open('english_german_translation.txt','r',encoding = 'utf-8') as text_file:
        lines = text_file.readlines()
    # Split every line into pairs and normalize
    pairs = [
        [normalizeString(s).replace('.','').strip() for s in l.split('\t')][0:2] 
        for l in lines]
    
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = [i for i in pairs if 
             len(i[0].split(' '))>3 and 
             len(i[0].split(' '))<6 and 
             len(i[1].split(' '))<6  and 
             len(i[1].split(' '))>3]
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('eng', 'ger', True)

Reading lines...
Read 255817 sentence pairs
Trimmed to 52448 sentence pairs
Counting words...
Counted words:
ger 12661
eng 7980


In [5]:
max_length_input = max([len(pairs[i][0].split()) for i in range(len(pairs))])
max_length_target = max([len(pairs[i][1].split()) for i in range(len(pairs))])
max_length_input,max_length_target

(5, 5)

In [6]:
pairs[10:19]

[['bin ich zu spat ?', 'am i late ?'],
 ['kann ich essen ?', 'can i eat ?'],
 ['konnen wir gehen ?', 'can we go ?'],
 ['habe ich gewonnen ?', 'did i win ?'],
 ['wie ist die lage ?', 'how is it ?'],
 ['ich liebe es !', 'i love it !'],
 ['ich meine es so !', 'i mean it !'],
 ['ich meine es ernst !', 'i mean it !'],
 ['ich wurde es tun', 'i d do it']]

In [7]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence_input(lang, sentence,max_length_input):
    indexes = indexesFromSentence(lang, sentence)
    while len(indexes)<max_length_input+1:
        indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorFromSentence_output(lang, sentence,max_length_target):
    indexes = indexesFromSentence(lang, sentence)
    while len(indexes)<max_length_target+1:
        indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence_input(input_lang, pair[0],max_length_input)
    target_tensor = tensorFromSentence_output(output_lang, pair[1],max_length_target)
    return (input_tensor, target_tensor)

# creating all input/output sentence pairs 
training_pairs = [tensorsFromPair(i) for i in pairs]
max_seq_len_input = max([len(training_pairs[i][0]) for i in range(len(training_pairs))])
max_seq_len_target = max([len(training_pairs[i][1]) for i in range(len(training_pairs))])

In [8]:
training_pairs[4942]

(tensor([[ 48],
         [ 18],
         [ 77],
         [133],
         [  1],
         [  1]], device='cuda:0'), tensor([[102],
         [329],
         [ 72],
         [ 97],
         [  1],
         [  1]], device='cuda:0'))

In [15]:
class dataset(Dataset):
    def __init__(self,training_pairs):
        self.training_pairs = training_pairs
  
    def __getitem__(self,idx):
        return self.training_pairs[idx][0],self.training_pairs[idx][1]
    def __len__(self):
        return len(self.training_pairs)
    
trainset = dataset(training_pairs)
train_loader = DataLoader(trainset,batch_size=64,shuffle=False)

In [16]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size,embedding_dim):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size,batch_first=  True)

    def forward(self, input, hidden):
        embedded = self.embedding(input).squeeze()
        output = embedded
        output, hidden = self.rnn(output, hidden)
        return output, hidden

    def initHidden(self,batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [17]:
class AttentionDecoderRNN(nn.Module):
    def __init__(self,output_size,hidden_size,embedding_dim,input_seq_max_length,target_seq_max):
        super(AttentionDecoderRNN,self).__init__() 
        self.output_size = output_size 
        self.input_seq_max_length = input_seq_max_length 
        self.embedding = nn.Embedding(output_size,embedding_dim) 
        self.attention_weights = nn.Linear(hidden_size+embedding_dim,input_seq_max_length)
        self.attention_combine = nn.Linear(hidden_size+embedding_dim,hidden_size) 
        self.out = nn.Linear(hidden_size,output_size) 
        self.rnn = nn.GRU(hidden_size,hidden_size,batch_first = True) 
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self,x,hidden,encoder_outputs):
        embedded = self.embedding(x) # batch,1,emb_dim
        embedded = embedded.permute(1,0,2)

        emb_hid_cat = torch.cat([embedded.permute,hidden],dim =2)  # 1,batch,emb_dim+hid_dim
        attention_weights = self.attention_weights(emb_hid_cat) # 1,batch,input_seq_max_length 
        attention_weights = attention_weights.permute(1,0,2)
        context_vector = torch.bmm(attention_weights,encoder_outputs)

        # combine attn_applied and embedded
        context_vector = context_vector.permute(1,0,2)
        combine_layer_input = torch.cat([context_vector,embedded],dim = 2)
        combine_out = self.attention_combine(combine_layer_input)
        combine_out = combine_out.permute(1,0,2) 
        rnn_out,rnn_hid = self.rnn(combine_out,hidden)
        final_output= self.softmax(self.out(rnn_out))
        return final_output,rnn_hid,attention_weights

In [18]:
class seq2seq(nn.Module):
    def __init__(self,encoder,decoder,teacher_force_ratio = 0.5):
        super(seq2seq,self).__init__()
        self.encoder = encoder
        self.decoder = decoder 
        self.teacher_force_ratio = teacher_force_ratio 
    def forward(self,source,target):
        # source shape : (2,7,1) : batch_size,source_sequence_length,1
        # target shape : (2,7,1) : batch_size,target_seq_length,1
        
        target_seq_length = target.shape[1]
        source_seq_length = source.shape[1] 
        batch_size = source.shape[0] 
        no_output_tokens = self.decoder.output_size 
        
        encoder_hidden = self.encoder.initHidden(batch_size)
        encoder_outputs,encoder_hidden = self.encoder(source,encoder_hidden) 
        
        x = source[:,source_seq_length-1,:]
        outputs = torch.zeros(target_seq_length,batch_size,1,no_output_tokens)
        for timestep in range(0,target_seq_length):
            decoder_output,decoder_hidden,attention_weights = self.decoder(x,encoder_hidden,encoder_outputs)
            # decoder_output shape : (batch_size,1,no_output_tokens)
            outputs[timestep,:,:,:] = decoder_output
            best_guess = decoder_output.argmax(2)
            x = target[:,timestep,:] if random.random()<self.teacher_force_ratio else best_guess
        return outputs 

In [19]:
torch.manual_seed(101)
num_epochs = 30  
learning_rate = 0.001 
batch_size = 256
embedding_dim = 256 
hidden_size = 128 
learning_rate = 0.001 
encoder = EncoderRNN(input_lang.n_words, hidden_size,embedding_dim)
decoder = AttentionDecoderRNN(output_lang.n_words,hidden_size,embedding_dim,max_seq_len_input,max_seq_len_target)
criterion = nn.CrossEntropyLoss(ignore_index=EOS_token)
model = seq2seq(encoder,decoder).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    epoch_loss = 0 
    model.train() 
    for batch_idx,batch in enumerate(train_loader):
        source = batch[0].to(device)
        target = batch[1].to(device)
        batch_output = model(source,target).to(device)
        batch_output = batch_output.squeeze(2)
        batch_output = batch_output.permute(1,0,2)
        first_dim = batch_output.shape[0]* batch_output.shape[1] 
        batch_output = batch_output.reshape(first_dim,decoder.output_size)
        target = target.squeeze(2)
        target = target.reshape(-1) 

        optimizer.zero_grad() 
        loss = criterion(batch_output,target)
        
        epoch_loss+=loss.item() 
        
        loss.backward() 
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step() 
    print(f'epoch = {epoch} - loss = {epoch_loss}')

epoch = 0 - loss = 3908.020450592041
epoch = 1 - loss = 2798.905596256256
epoch = 2 - loss = 2334.059930205345
epoch = 3 - loss = 2030.199009358883
epoch = 4 - loss = 1808.6228539943695
epoch = 5 - loss = 1635.8568942546844
epoch = 6 - loss = 1490.5327734947205
epoch = 7 - loss = 1380.2000402212143
epoch = 8 - loss = 1293.107670456171
epoch = 9 - loss = 1217.4251962602139
epoch = 10 - loss = 1146.8003385961056
epoch = 11 - loss = 1103.887454777956
epoch = 12 - loss = 1046.780684441328
epoch = 13 - loss = 1015.356229454279
epoch = 14 - loss = 973.1371029317379
epoch = 15 - loss = 951.5205255448818
epoch = 16 - loss = 919.0105409771204
epoch = 17 - loss = 894.6627846509218
epoch = 18 - loss = 871.1435580253601
epoch = 19 - loss = 842.2487032711506
epoch = 20 - loss = 837.5911224484444
epoch = 21 - loss = 820.9534720927477
epoch = 22 - loss = 801.0231860727072
epoch = 23 - loss = 785.9413891583681
epoch = 24 - loss = 782.8735755085945
epoch = 25 - loss = 772.8325341790915
epoch = 26 - los

In [121]:
def evaluate(encoder,decoder,test_pairs,max_length):
    with torch.no_grad():
        testing_pairs = [tensorsFromPair(i) for i in test_pairs]
        batch_size = len(testing_pairs)
        testset = dataset(testing_pairs) 
        test_loader = DataLoader(testset,batch_size,shuffle=False) 

        for batch_id,batch in enumerate(test_loader):
            source = batch[0].to(device)
            source_seq_length = source.shape[1] 
            target = batch[1].to(device)
        
            encoder_hidden = encoder.initHidden(batch_size).to(device)  # batch size = 1 
            encoder_outputs,encoder_hidden = encoder(source,encoder_hidden) 
            decoded_words = torch.zeros(size = (max_length,batch_size,1))
            x = source[:,source_seq_length-1,:]
            for timestep in range(0,max_length):
                decoder_output,decoder_hidden,attention_weights = decoder(x,encoder_hidden,encoder_outputs)
                # decoder_output shape : (batch_size,1,no_output_tokens)
                best_guess = decoder_output.argmax(2)
                x = best_guess
                decoded_words[timestep,:,:] = x
            out_v1 = decoded_words.permute(1,0,2).squeeze(2).tolist() 
            final_predictions = [' '.join([output_lang.index2word[int(i)] for i in out]) for out in out_v1]
            
            for i in range(len(test_pairs)):
                print(f'input -> {test_pairs[i][0]}')
                print(f'actual -> {test_pairs[i][1]}')
                print(f'predicted -> {final_predictions[i]}')
                print('----------------------------------------')

In [125]:
evaluate(encoder,decoder,pairs[585:587],4)

input -> ich war ein idiot
actual -> i was a fool
predicted -> i was an idiot
----------------------------------------
input -> ich war nicht verruckt
actual -> i wasn t mad
predicted -> i wasn t crazy
----------------------------------------
