In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

# from torchtext.datasets import TranslationDataset, Multi30k
# from torchtext.data import Field, BucketIterator
# import spacy

import pickle
import random
import math
import os
import time
import nltk

# Prefer the first CUDA GPU when one is available; otherwise run on the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Device:", device)

Device: cpu


### Import the pre-processed data

In [15]:
path = 'data/'

# Each pickle holds one language's pre-processed corpus; later cells read the
# 'train', 'dev' and 'idx2word' entries from these objects.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open(os.path.join(path, 'english_no_pad_sorted_50k.pickle'), 'rb') as handle:
    english = pickle.load(handle)

with open(os.path.join(path, 'german_no_pad_sorted_50k.pickle'), 'rb') as handle:
    german = pickle.load(handle)

In [25]:
# Load the spaCy pipelines used by the tokenizer functions.
# NOTE(review): `spacy` is not imported in the visible notebook (its import is
# commented out in the first cell), so this cell raises NameError on a fresh
# kernel unless that import is restored.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [26]:
def tokenize_de(text):
    """
    Split a German string into token strings and return them in reverse order.

    Tokenization is delegated to the module-level ``spacy_de`` tokenizer.
    """
    tokens = [tok.text for tok in spacy_de.tokenizer(text)]
    tokens.reverse()
    return tokens

def tokenize_en(text):
    """
    Split an English string into a list of token strings (original order).

    Tokenization is delegated to the module-level ``spacy_en`` tokenizer.
    """
    tokens = []
    for tok in spacy_en.tokenizer(text):
        tokens.append(tok.text)
    return tokens

# Source (German) and target (English) fields are configured identically except
# for the tokenizer: the German tokenizer also reverses the token order.
_shared_field_options = {
    'init_token': '<sos>',
    'eos_token': '<eos>',
    'lower': True,
}

SRC = Field(tokenize=tokenize_de, **_shared_field_options)

TRG = Field(tokenize=tokenize_en, **_shared_field_options)

In [27]:
# Build the Multi30k German->English splits using the SRC/TRG fields.
# NOTE(review): presumably x, y, z are the train/validation/test splits -- confirm.
# `Multi30k` is only imported in a commented-out line of the first cell.
x,y,z = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))

In [30]:
# Prints only the default object repr of the first example, not its contents.
print(x.examples[0])

<torchtext.data.example.Example object at 0x00000266D0D69978>


In [16]:
# Convert every sentence (a sequence of token ids) of the train and dev splits
# into a 1-D int64 tensor on `device`. Both languages and both splits go
# through the same code path instead of two copy-pasted loops, and each
# language is converted over its own length rather than German's.
# torch.as_tensor builds the tensor with the requested dtype and device in one
# call, replacing the legacy torch.LongTensor(...).to(device) constructor.
for corpus in (german, english):
    for split in ('train', 'dev'):
        corpus[split] = [
            torch.as_tensor(sentence, dtype=torch.long, device=device)
            for sentence in corpus[split]
        ]

In [17]:
# Display the first German dev sentence (token ids) as a sanity check.
german['dev'][0]

tensor([  2, 865,  11,   3])

### Token-Based Batching Method

Feed in the data from the preprocessed set; it must be sorted from shortest to longest sentence and must not contain any padding. The *get_batches* function creates all of the batches for training. Its output is a list of batches of varying dimensions, where the number of sentences in each batch is determined by the batch size (a budget on the number of tokens per batch). Below is an example.

#### Example, batch-size=14
- Given the first few sentences from the dataset (sorted):
    - [2, 84, 3]      (length 3)
    - [2, 102, 3]     (length 3)
    - [2, 63, 3]      (length 3)
    - [2, 84, 21, 3]  (length 4)
    - [2, 91, 123, 3] (length 4)
    
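- We fill each batch based on the number of tokens in each sentence: sentences are added in order until adding the next one would exceed the batch size. Here the first four sentences contain $3+3+3+4=13$ tokens, and adding the fifth would bring the total to 17, so the first batch (batch_size=14) will look like: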
```
    [[2, 84, 3],
     [2, 102, 3],     
     [2, 63, 3],      
     [2, 84, 21, 3]]
```
- We then zero-pad every sentence in the batch that is shorter than the longest sentence in that batch, so that all sentences in the batch have the same length:

```
    [[2, 84, 3, 0],
     [2, 102, 3, 0],     
     [2, 63, 3, 0],      
     [2, 84, 21, 3]]
```

- Now we have a batch of dimension $N \times L$, where:
    - $N$ is the number of sentences in the batch, and 
    - $L$ is the length (number of tokens, including padding) of each sentence in the batch.
    - It is important to note that the $N$ and $L$ values will vary from batch to batch, but **MUST** be consistent within each batch

In [69]:
"""
# THE FOLLOWING FUNCTION IS DEPRECATED

def get_batches(german, english, b_sz):
    batches = [[]]
    
    # For every sentence in the dataset, add it to a batch, based on the batch size
    # if the sentence + current length is not greater than the batch size, 
    # then add it to the batch otherwise fill the current batch
    for sent in german:
        cur_len = 0
        for b in batches[-1]:
            cur_len += len(b)
    
        if (cur_len + len(sent)) <= b_sz: 
            batches[-1].append(sent)
        else:
            batches.append([])        
            batches[-1].append(sent)
    
    # For every batch within the entire set of batches, add padding to the sentences
    # that are less than the length of the longest sentence within the each batch.
    for b in batches:
        max_len = len(max(b, key=len))
        
        for sent in b:
            dif = max_len - len(sent)
            if dif > 0:
                pad_list = 0 * dif
                sent.append(pad_list)
        
    return batches



batches = get_batches(german['train'], b_sz=20)
"""

"\n# THE FOLLOWING FUNCTION IS DEPRECATED\n\ndef get_batches(german, english, b_sz):\n    batches = [[]]\n    \n    # For every sentence in the dataset, add it to a batch, based on the batch size\n    # if the sentence + current length is not greater than the batch size, \n    # then add it to the batch otherwise fill the current batch\n    for sent in german:\n        cur_len = 0\n        for b in batches[-1]:\n            cur_len += len(b)\n    \n        if (cur_len + len(sent)) <= b_sz: \n            batches[-1].append(sent)\n        else:\n            batches.append([])        \n            batches[-1].append(sent)\n    \n    # For every batch within the entire set of batches, add padding to the sentences\n    # that are less than the length of the longest sentence within the each batch.\n    for b in batches:\n        max_len = len(max(b, key=len))\n        \n        for sent in b:\n            dif = max_len - len(sent)\n            if dif > 0:\n                pad_list = 0 * dif\

In [88]:
"""
# THE FOLLOWING FUNCTION IS DEPRECATED


def get_batches(german, english, b_sz):
    de_batches = [[]]
    
    # For every sentence in the dataset, add it to a batch, based on the batch size
    # if the sentence + current length is not greater than the batch size, 
    # then add it to the batch otherwise fill the current batch
    for sent in german:
        cur_len = 0
        for b in de_batches[-1]:
            cur_len += len(b)
    
        if (cur_len + len(sent)) <= b_sz: 
            de_batches[-1].append(sent)
        else:
            de_batches.append([])        
            de_batches[-1].append(sent)
    
    # For every batch within the entire set of batches, add padding to the sentences
    # that are less than the length of the longest sentence within the each batch.
    for b in de_batches:
        max_len = len(max(b, key=len))
        
        for sent in b:
            dif = max_len - len(sent)
            if dif > 0:
                pad_list = 0 * dif
                sent.append(pad_list)
    
    en_batches = []
    k=0
    for i in range(len(de_batches)):
        tmp_batch = [0]*len(de_batches[i])
        for j in range(len(de_batches[i])):
            tmp_batch[j] = english[k]
            k+=1
            
        en_batches.append(tmp_batch)
        
    batches = []
    for i in range(len(de_batches)):
        dict_batch = []
        for j in range(len(de_batches[i])):
            tmp_dict = {"source": de_batches[i][j],
                       "target": en_batches[i][j]}
            dict_batch.append(tmp_dict)
        batches.append(dict_batch)
        
    return batches

test_batches = get_batches(german['train'], english['train'], b_sz=100)

for i in range(len(test_batches[0])):
    print(test_batches[0][i]['source'])

# print("load the source and target sentences of the 3rd sentence within the 102nd batch:")
# print("Source:", test_batches[102][3]['source'])
# print("Target:", test_batches[102][3]['target'])
"""

[2, 865, 3, 0]
[2, 865, 3, 0]
[2, 10408, 3, 0]
[2, 115146, 3, 0]
[2, 104464, 3, 0]
[2, 61543, 3, 0]
[2, 2858, 3, 0]
[2, 54951, 3, 0]
[2, 6944, 3, 0]
[2, 865, 3, 0]
[2, 865, 3, 0]
[2, 115146, 3, 0]
[2, 35921, 3, 0]
[2, 15689, 3, 0]
[2, 53131, 3, 0]
[2, 3481, 3, 0]
[2, 3211, 3, 0]
[2, 865, 3, 0]
[2, 3174, 3, 0]
[2, 865, 11, 3]
[2, 8403, 3180, 3]
[2, 83114, 11, 3]
[2, 865, 11, 3]
[2, 865, 11, 3]
[2, 865, 11, 3]
[2, 3550, 11, 3]
[2, 865, 11, 3]
[2, 2302, 11, 3]
[2, 865, 11, 3]


In [98]:
def _pad_to_length(sent, length):
    """Return ``sent`` right-padded with zeros so that it holds ``length`` tokens.

    A new tensor is created when padding is required; ``sent`` itself is never
    modified. The padding inherits the dtype and device of ``sent``.
    """
    missing = length - len(sent)
    if missing <= 0:
        return sent
    return torch.cat((sent, sent.new_zeros(missing)), dim=0)


def get_batches(german, english, b_sz):
    """Group parallel sentences into token-budgeted, zero-padded batches.

    Sentences are taken in the order given. A batch is closed as soon as adding
    the next German sentence would push the batch's total number of (unpadded)
    German tokens above ``b_sz``. A sentence that alone exceeds ``b_sz`` is
    placed in a batch of its own.

    Args:
        german: sequence of 1-D token-id tensors (source sentences).
        english: sequence of 1-D token-id tensors; ``english[i]`` is the
            translation of the i-th sentence of ``german``.
        b_sz: maximum number of unpadded German tokens per batch.

    Returns:
        A list of batches. Each batch is a list of dicts with the keys
        ``"source"`` and ``"target"``. Within a batch all source tensors share
        one length and all target tensors share one length (zero-padded on the
        right). An empty ``german`` yields an empty list.

    Raises:
        IndexError: if ``english`` holds fewer sentences than ``german``.

    The input tensors are left untouched: padded sentences are new tensors, so
    the function can be called repeatedly (e.g. with different ``b_sz``
    values) on the same dataset.
    """
    # Pass 1: split the corpus into groups of (source, target) pairs while
    # keeping a running token count, so no batch is re-summed per sentence.
    groups = []
    current, current_tokens = [], 0
    for idx, sent in enumerate(german):
        n_tokens = len(sent)
        # Only close a non-empty batch; this prevents empty batches when a
        # single sentence is longer than the whole budget.
        if current and current_tokens + n_tokens > b_sz:
            groups.append(current)
            current, current_tokens = [], 0
        current.append((sent, english[idx]))
        current_tokens += n_tokens
    if current:
        groups.append(current)

    # Pass 2: pad each side of every batch to that batch's longest sentence.
    batches = []
    for pairs in groups:
        src_len = max(len(src) for src, _ in pairs)
        trg_len = max(len(trg) for _, trg in pairs)
        batches.append([
            {"source": _pad_to_length(src, src_len),
             "target": _pad_to_length(trg, trg_len)}
            for src, trg in pairs
        ])

    return batches

test_batches = get_batches(german['train'], english['train'], b_sz=100)

# Zero-based positions of the example to display. The message is built from
# these indices so that it always describes what is actually printed (the old
# text said "3rd sentence / 102nd batch" while indexing [102][3]).
batch_idx, sent_idx = 102, 3
print("load the source and target sentences of sentence {} within batch {} (zero-based indices):".format(sent_idx, batch_idx))
print("Source:", test_batches[batch_idx][sent_idx]['source'])
print("Target:", test_batches[batch_idx][sent_idx]['target'])

load the source and target sentences of the 3rd sentence within the 102nd batch:
Source: tensor([   2,  814,  895, 3180,    3])
Target: tensor([   2,   63,  757, 2587,    3,    0,    0,    0])


In [78]:
def print_sentence(sent, language):
    """Print the words of a sentence given as token ids, followed by a newline.

    Args:
        sent: iterable of token ids (plain ints or the elements of a 1-D
            integer tensor).
        language: "german" or "english"; selects which global corpus'
            ``idx2word`` lookup is used.

    For any other ``language`` value an explanatory message is printed instead
    of the sentence.
    """
    if language == "german":
        idx2word = german['idx2word']
    elif language == "english":
        idx2word = english['idx2word']
    else:
        idx2word = None
        print("Language should be either 'german' or 'english'")

    if idx2word is not None:
        for w in sent:
            # int() turns 0-dim tensor elements into plain ints, so the lookup
            # works whether idx2word is a list or a dict keyed by int.
            print(idx2word[int(w)], end=' ')

    print("")

# Show one aligned sentence pair as words, then release the demo batches.
for side, lang in (('source', "german"), ('target', "english")):
    print_sentence(test_batches[2984][0][side], language=lang)

del test_batches

<sos> damit löschen sie zeit . <eos> 
<sos> so , in doing so , you 're erasing time . <eos> 
<sos> aaa : danke schön . <eos> 
<sos> aaa : thank you . <eos> 


In [None]:
class Encoder(nn.Module):
    """Embedding + dropout + LSTM encoder that returns the final LSTM state.

    ``params`` must provide: 'source_vocab_size', 'd_emb', 'd_hid',
    'droprate', 'layers' and 'bidirectional'. Each value is also stored as an
    attribute of the same name.
    """

    _PARAM_KEYS = ('source_vocab_size', 'd_emb', 'd_hid',
                   'droprate', 'layers', 'bidirectional')

    def __init__(self, params):
        super().__init__()
        for key in self._PARAM_KEYS:
            setattr(self, key, params[key])

        self.embeddings = nn.Embedding(self.source_vocab_size, self.d_emb)
        self.dropout = nn.Dropout(self.droprate)
        self.LSTM = nn.LSTM(
            self.d_emb,
            self.d_hid,
            num_layers=self.layers,
            dropout=self.droprate,
            bidirectional=self.bidirectional,
        )

    def forward(self, source):
        """Embed ``source`` token ids, apply dropout, run the LSTM.

        Returns the LSTM's final ``(hidden, cell)`` state; the per-step
        outputs are discarded.
        """
        embedded = self.embeddings(source)
        _, final_state = self.LSTM(self.dropout(embedded))
        hidden, cell = final_state
        return hidden, cell

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder: embeds one token per sequence and predicts the next.

    ``params`` must provide: 'target_vocab_size', 'd_emb', 'd_hid', 'droprate'
    and 'layers'. Each value is also stored as an attribute of the same name.
    """

    def __init__(self, params):
        super(Decoder, self).__init__()
        self.target_vocab_size = params['target_vocab_size']
        self.d_emb = params['d_emb']
        self.d_hid = params['d_hid']
        self.droprate = params['droprate']
        self.layers = params['layers']

        # The decoder embeds *target* tokens, so the table is sized by the
        # target vocabulary (the old code read an unset source_vocab_size).
        self.embeddings = nn.Embedding(self.target_vocab_size, self.d_emb)
        self.dropout = nn.Dropout(self.droprate)
        # nn.LSTM's keyword is `num_layers`; the sizes come from the instance
        # attributes rather than undefined bare names.
        self.LSTM = nn.LSTM(self.d_emb, self.d_hid, num_layers=self.layers, dropout=self.droprate)
        self.out = nn.Linear(self.d_hid, self.target_vocab_size)

    def forward(self, inp, hidden, cell):
        """Run one decoding step.

        Args:
            inp: 1-D tensor of token ids, one per sequence in the batch.
            hidden, cell: LSTM state passed in from the previous step.

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` holds one score
            per target-vocabulary entry for each sequence, and ``hidden`` /
            ``cell`` are the updated LSTM state.
        """
        # Add a leading sequence dimension of length 1 for the LSTM.
        inp = inp.unsqueeze(0)

        embed = self.dropout(self.embeddings(inp))

        output, (hidden, cell) = self.LSTM(embed, (hidden, cell))

        # Drop the length-1 sequence dimension before the output projection.
        prediction = self.out(output.squeeze(0))

        return prediction, hidden, cell

In [None]:
class seq2seq(nn.Module):
    """Container pairing an encoder with a decoder.

    TODO: the forward pass is not implemented yet; only the constructor exists.
    """

    def __init__(self, encoder, decoder):
        super(seq2seq, self).__init__()
        # Registering both as sub-modules exposes their parameters through
        # this model's parameters().
        self.encoder = encoder
        self.decoder = decoder

In [None]:
def train(english, german, params, net):
    """Training entry point -- unfinished stub.

    NOTE(review): get_batches is called without arguments although its
    signature requires (german, english, b_sz), so calling train() currently
    raises TypeError. Presumably the training split and a batch size taken
    from `params` should be passed -- confirm the intended values.
    """
    batches = get_batches()
    pass