In [65]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [66]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [67]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [68]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    with open('dataset/english_german_translation.txt','r',encoding = 'utf-8') as text_file:
        lines = text_file.readlines()
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s).replace('.','').strip() for s in l.split('\t')][0:2] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [69]:
# filtering the data 
MAX_LENGTH = 6
eng_prefixes = (
    "i am ",
    "he is", 
    "she is",
    "you are", 
    "we are", 
    "they are"
)
def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [70]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('eng', 'ger', True)
print(random.choice(pairs))

Reading lines...
Read 255817 sentence pairs
Trimmed to 1311 sentence pairs
Counting words...
Counted words:
ger 1090
eng 835
['er ist maler', 'he is a painter']


In [71]:
input_lang.n_words,output_lang.n_words

(1090, 835)

In [72]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [73]:
# creating all input/output sentence pairs 
training_pairs = [tensorsFromPair(i) for i in pairs]

In [74]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.rnn(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [75]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size,dropout_p =0.1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.RNN(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.dropout_p = dropout_p 

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [96]:
def to_chunks(list_, b):
    n = int(len(list_)/b)+1
    return [list_[start::n] for start in range(n)]

In [97]:
training_data = to_chunks(training_pairs,128)

In [98]:
# [len(pair_chunks[i]) for i in range(len(pair_chunks))]

In [99]:
from tqdm import tqdm 

In [100]:
teacher_forcing_ratio = 0.5 
encoder_optimizer = optim.Adam(encoder.parameters(),lr = lr)
decoder_optimizer = optim.Adam(decoder.parameters(),lr =lr)
def train_epoch(training_data,encoder,decoder,criterion,lr,encoder_optimizer,decoder_optimizer):
    for batch in tqdm(range(len(training_data))):
        loss = 0
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        batch_samples = training_data[batch] 
        for sample in batch_samples:
            input_tensor,target_tensor = sample
            input_length = input_tensor.size(0)
            target_length = target_tensor.size(0)
            encoder_hidden = encoder.initHidden()
            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
            decoder_input = torch.tensor([[SOS_token]], device=device)
            decoder_hidden = encoder_hidden
            use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
            if use_teacher_forcing:
                for di in range(target_length):
                    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                    topv, topi = decoder_output.topk(1)
                    pred = topi.squeeze().detach()
                    loss += criterion(decoder_output, target_tensor[di])
                    decoder_input = target_tensor[di]  
            else:
                for di in range(target_length):
                    decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                    topv, topi = decoder_output.topk(1) 
                    decoder_input = topi.squeeze().detach() 
                    pred = topi.squeeze().detach()
                    loss += criterion(decoder_output, target_tensor[di])
                    if decoder_input.item() == EOS_token:
                        break
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()

In [113]:
criterion = nn.NLLLoss()
lr = 0.01
hidden_size = 1280
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)

In [114]:
for epoch in range(1):
    train_epoch(training_data,encoder,decoder,criterion,lr,encoder_optimizer,decoder_optimizer)

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [04:52<00:00, 26.58s/it]


In [118]:
def evaluate(encoder, decoder, sentence, max_length=3):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()
        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],encoder_hidden)
        # print(encoder_hidden)   
        decoder_input = torch.tensor([[SOS_token]], device=device)
        decoder_hidden = encoder_hidden
        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])
            decoder_input = topi.squeeze().detach()

        return decoded_words

In [119]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [120]:
evaluateRandomly(encoder,decoder)

> ihr seid sehr mutig
= you are very brave
< skate crying breathing

> sie ist au er fassung
= she is upset
< logged careless yourself

> ich bin am dosen
= i am dozing
< love about opponent

> er ist immer frohlich
= he is always cheerful
< team about irresistible

> er isst gerade zu mittag
= he is having lunch
< logged colleague butterfly

> wir lernen arabisch
= we are learning arabic
< butterfly daredevil family

> er ist nicht mehr allein
= he isn t alone anymore
< logged driven japan

> sie ist auf diat
= she is on a diet
< butterfly daredevil family

> sie isst gerade
= she is eating
< team about irresistible

> er hat ihr gegenuber vorurteile
= he is prejudiced against her
< butterfly daredevil dieting

