In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

# from torchtext.datasets import TranslationDataset, Multi30k
# from torchtext.data import Field, BucketIterator
# import spacy

import pickle
import random
import math
import os
import time
import nltk

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda:0


### Import the pre-processed data

In [17]:
path = 'data/'

with open(path + 'english_no_pad_sorted_5k.pickle', 'rb') as handle:
    english = pickle.load(handle)
    
with open(path + 'german_no_pad_sorted_5k.pickle', 'rb') as handle:
    german = pickle.load(handle)   

In [3]:
for i in range(len(german['train'])):
    german['train'][i] = torch.LongTensor(german['train'][i]).to(device)
    english['train'][i] = torch.LongTensor(english['train'][i]).to(device)
    
for i in range(len(german['dev'])):
    german['dev'][i] = torch.LongTensor(german['dev'][i]).to(device)
    english['dev'][i] = torch.LongTensor(english['dev'][i]).to(device)

### Token-Based Batching Method

Feed in the data from the preprocessed set, it should be sorted from shortest sentence to longest sentence without any padding on the sentences. The *get_batches* function will be used to create all of the batches for training, the output is a number of batches of varying dimensions, which is based on the batch size. Below is an example.

#### Example, batch-size=14
- Given the first few sentences from the dataset (sorted):
    - [2, 84, 3]      (length 3)
    - [2, 102, 3]     (length 3)
    - [2, 63, 3]      (length 3)
    - [2, 84, 21, 3]  (length 4)
    - [2, 91, 123, 3] (length 4)
    
- We will fill up the batches based on the number of tokens in each sentence. So for example, the first batch (batch_size=14) will look like:
```
    [[2, 84, 3],
     [2, 102, 3],     
     [2, 63, 3],      
     [2, 84, 21, 3]]
```
- We will then zero-pad all of the sentences in batch that are less than maximum length of the longest sentence in the batch to be the same length as the longest sentence:

```
    [[2, 84, 3, 0],
     [2, 102, 3, 0],     
     [2, 63, 3, 0],      
     [2, 84, 21, 3]]
```

- Now we have a batch of dimension: $N x L$, where:
    - $N$ is the number of sentences in the batch, and 
    - $L$ is the dimensionality (number of words) within a sentence.
    - It is important to note that the $N$ and $L$ values will vary from batch to batch, but **MUST** be consistent within each batch

In [4]:

# # THE FOLLOWING FUNCTION IS DEPRECATED

# def get_batches(german, english, b_sz):
#     batches = [[]]
    
#     # For every sentence in the dataset, add it to a batch, based on the batch size
#     # if the sentence + current length is not greater than the batch size, 
#     # then add it to the batch otherwise fill the current batch
#     for sent in german:
#         cur_len = 0
#         for b in batches[-1]:
#             cur_len += len(b)
    
#         if (cur_len + len(sent)) <= b_sz: 
#             batches[-1].append(sent)
#         else:
#             batches.append([])        
#             batches[-1].append(sent)
    
#     # For every batch within the entire set of batches, add padding to the sentences
#     # that are less than the length of the longest sentence within the each batch.
#     for b in batches:
#         max_len = len(max(b, key=len))
        
#         for sent in b:
#             dif = max_len - len(sent)
#             if dif > 0:
#                 pad_list = 0 * dif
#                 sent.append(pad_list)
        
#     return batches



# batches = get_batches(german['train'], b_sz=20)


In [5]:

# # THE FOLLOWING FUNCTION IS DEPRECATED


# def get_batches(german, english, b_sz):
#     de_batches = [[]]
    
#     # For every sentence in the dataset, add it to a batch, based on the batch size
#     # if the sentence + current length is not greater than the batch size, 
#     # then add it to the batch otherwise fill the current batch
#     for sent in german:
#         cur_len = 0
#         for b in de_batches[-1]:
#             cur_len += len(b)
    
#         if (cur_len + len(sent)) <= b_sz: 
#             de_batches[-1].append(sent)
#         else:
#             de_batches.append([])        
#             de_batches[-1].append(sent)
    
#     # For every batch within the entire set of batches, add padding to the sentences
#     # that are less than the length of the longest sentence within the each batch.
#     for b in de_batches:
#         max_len = len(max(b, key=len))
        
#         for sent in b:
#             dif = max_len - len(sent)
#             if dif > 0:
#                 pad_list = 0 * dif
#                 sent.append(pad_list)
    
#     en_batches = []
#     k=0
#     for i in range(len(de_batches)):
#         tmp_batch = [0]*len(de_batches[i])
#         for j in range(len(de_batches[i])):
#             tmp_batch[j] = english[k]
#             k+=1
            
#         en_batches.append(tmp_batch)
        
#     batches = []
#     for i in range(len(de_batches)):
#         dict_batch = []
#         for j in range(len(de_batches[i])):
#             tmp_dict = {"source": de_batches[i][j],
#                        "target": en_batches[i][j]}
#             dict_batch.append(tmp_dict)
#         batches.append(dict_batch)
        
#     return batches

# test_batches = get_batches(german['train'], english['train'], b_sz=100)

# for i in range(len(test_batches[0])):
#     print(test_batches[0][i]['source'])

# # print("load the source and target sentences of the 3rd sentence within the 102nd batch:")
# # print("Source:", test_batches[102][3]['source'])
# # print("Target:", test_batches[102][3]['target'])


In [4]:
def get_batches(german, english, b_sz):
    de_batches = [[]]
    
    # For every sentence in the dataset, add it to a batch, based on the batch size
    # if the sentence + current length is not greater than the batch size, 
    # then add it to the batch otherwise fill the current batch
    for sent in german:
        cur_len = 0
        for b in de_batches[-1]:
            cur_len += len(b)
    
        if (cur_len + len(sent)) <= b_sz: 
            de_batches[-1].append(sent)
        else:
            de_batches.append([])        
            de_batches[-1].append(sent)
    
    # For every batch within the entire set of batches, add padding to the sentences
    # that are less than the length of the longest sentence within the each batch.
    for b in de_batches:
        max_len = len(max(b, key=len))
        
        for sent in b:
            dif = max_len - len(sent)
            if dif > 0:
                pad_list = [0] * dif
                pad_list = torch.as_tensor(pad_list).to(device)
                sent.data = torch.cat((sent,pad_list), dim=0)

    # Within each batch, load the english sentence that corresponds to each
    # german sentence within that batch to a new english batch.
    en_batches = []
    k=0
    for i in range(len(de_batches)):
        tmp_batch = [0]*len(de_batches[i])
        for j in range(len(de_batches[i])):
            tmp_batch[j] = english[k]
            k+=1
        en_batches.append(tmp_batch)
        
    # Within each english batch, pad all the sentences to be the same length as the 
    # maximum sentence within that batch.
    for b in en_batches:
        max_len = len(max(b, key=len))
        
        for sent in b:
            dif = max_len - len(sent)
            if dif > 0:
                pad_list = [0] * dif
                pad_list = torch.as_tensor(pad_list).to(device)
                sent.data = torch.cat((sent,pad_list), dim=0)
        
    # Define the batches list, which contains an index for each batch. Within each index
    # of the batches list is a batch. Each batch is of length N (number of sentences), 
    # Each index (0..N) within the batch is a dictionary containing the source and target
    # sentence. Therefore, the batches list is a list of lists (essentially a 2D array), 
    # where each element in the list of lists/array is a dictionary containing the source
    # and target tensor.
    batches = []
    for i in range(len(de_batches)):
        dict_batch = []
        for j in range(len(de_batches[i])):
            tmp_dict = {"source": de_batches[i][j],
                       "target": en_batches[i][j]}
            dict_batch.append(tmp_dict)
        batches.append(dict_batch)
        
    return batches

# print(len(get_batches(german['train'],english['train'],1000)))

# test_batches = get_batches(german['train'], english['train'], b_sz=100)

# print("load the source and target sentences of the 3rd sentence within the 102nd batch:")
# print("Source:", test_batches[102][3]['source'])
# print("Target:", test_batches[102][3]['target'])

## This function is used to retrieve the tensors for individual batches. The batchify() function will be used within the training loop 

In [5]:
def batchify(batch): # We need the batches to be of shape [sentence length, batch size]
    source_sent_len = (len(batch[0]['source']))
    target_sent_len = (len(batch[0]['target']))
    batch_size = (len(batch))
    
    source = torch.empty((source_sent_len, batch_size)).long().to(device)    
    target = torch.empty((target_sent_len, batch_size)).long().to(device)
    
    for i in range(len(batch)):
        source[:,i] = batch[i]['source']
        target[:,i] = batch[i]['target']
    
    return source, target


# # # Example:
# from time import time
# t0 = time()
# tr_batches = get_batches(german['train'], english['train'], b_sz=100)
# print(time()-t0)

# for idx, b in enumerate(tr_batches): # TRAINING LOOP
#     source, target = batchify(b) # returns the source and target tensors that are in the correct 
#                                  # shape to push through the model.

In [6]:
def print_sentence(sent, language):
    if language == "german":
        for w in sent:
            print(german['idx2word'][w], end=' ')
    elif language == "english":
        for w in sent:
            print(english['idx2word'][w], end=' ')
    else:
        print("Language should be either 'german' or 'english'")
        
    print("")

# print_sentence(test_batches[2984][0]['source'], language="german")
# print_sentence(test_batches[2984][0]['target'], language="english")

# del test_batches

In [7]:
class Encoder(nn.Module):
    def __init__(self, params):
        super(Encoder, self).__init__()
        self.source_vocab_size = params['source_vocab_size']
        self.d_emb = params['d_emb']
        self.d_hid = params['d_hid']
        self.droprate = params['droprate']
        self.layers = params['layers']
        self.bidirectional = params['bidirectional']
          
        self.embeddings = nn.Embedding(self.source_vocab_size, self.d_emb)
        self.dropout = nn.Dropout(self.droprate)
        self.LSTM = nn.LSTM(self.d_emb, self.d_hid, num_layers=self.layers, dropout=self.droprate, bidirectional=self.bidirectional)
        if self.bidirectional:
            self.fc = nn.Linear(self.d_hid*2, self.d_hid)
        
    def forward(self, source):
        embed = self.dropout(self.embeddings(source))
        
        if self.bidirectional:
            outputs, hidden = self.LSTM(embed)

            x = torch.cat((hidden[-2][-2,:,:], hidden[-1][-1,:,:]), dim=1)
            hidden = torch.tanh(self.fc(x))
            return outputs, hidden.unsqueeze(0)
        else:
            outputs, (hidden, cell) = self.LSTM(embed)
            return hidden, cell        

In [8]:
class Decoder(nn.Module):
    def __init__(self, params):
        super(Decoder, self).__init__()
        self.target_vocab_size = params['target_vocab_size']
        self.d_emb = params['d_emb']
        self.d_hid = params['d_hid']
        self.droprate = params['droprate']
        self.layers = params['layers']
        
        self.embeddings = nn.Embedding(self.target_vocab_size, self.d_emb)
        self.dropout = nn.Dropout(self.droprate)
        self.LSTM = nn.LSTM(self.d_emb, self.d_hid, num_layers=self.layers, dropout=self.droprate)
        self.out = nn.Linear(self.d_hid, self.target_vocab_size)
        
    def forward(self, inp, hidden, cell):
        inp = inp.unsqueeze(0)
        
        embed = self.dropout(self.embeddings(inp))
        
        output, (hidden, cell) = self.LSTM(embed, (hidden, cell))
        
        prediction = self.out(output.squeeze(0))
        
        return prediction, hidden, cell

In [9]:
class DecoderBD(nn.Module):
    def __init__(self, params):
        super(DecoderBD, self).__init__()
        self.target_vocab_size = params['target_vocab_size']
        self.d_emb = params['d_emb']
        self.d_hid = params['d_hid']
        self.droprate = params['droprate']
        self.layers = params['layers']
        
        self.embeddings = nn.Embedding(self.target_vocab_size, self.d_emb)
        self.dropout = nn.Dropout(self.droprate)
        self.rnn = nn.RNN(self.d_emb, self.d_hid, num_layers=self.layers, dropout=self.droprate)
        self.out = nn.Linear(self.d_hid, self.target_vocab_size)
        
    def forward(self, inp, hidden, encoder_outputs):
        inp = inp.unsqueeze(0)
        
        embed = self.dropout(self.embeddings(inp))

        output, hidden = self.rnn(embed, hidden)
        
        prediction = self.out(output.squeeze(0))
        
        return prediction, hidden

In [10]:
class seq2seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(seq2seq, self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.bidirectional = encoder.bidirectional
        
        assert encoder.d_hid == decoder.d_hid, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.layers == decoder.layers, \
            "Encoder and decoder must have equal number of layers!"
    
    def forward(self, source, target, teacher_forcing_ratio = 0.5):
        batch_size = target.shape[1]
        sent_len = target.shape[0]
        target_vocab_size = self.decoder.target_vocab_size
        
        outputs = torch.zeros(sent_len, batch_size, target_vocab_size).to(device)
        
        inp = target[0,:]
        
        if self.bidirectional:
            encoder_outputs, hidden = self.encoder(source)
            for t in range(1, sent_len):
                output, hidden = self.decoder(inp, hidden, encoder_outputs)
                outputs[t] = output
                teacher_force = random.random() < teacher_forcing_ratio
                top = output.max(1)[1]
                inp = target[t] if teacher_force else top
            
        else:
            hidden, cell = self.encoder(source)
            for t in range(1, sent_len):
                output, hidden = self.decoder(inp, hidden, cell)
                outputs[t] = output
                teacher_force = random.random() < teacher_forcing_ratio
                top = output.max(1)[1]
                inp = target[t] if teacher_force else top
        
        return outputs

In [11]:
def train(model, german, english, params):
    model.train()
    
    epoch_loss = 0.0
    
    learningrate = params['learning_rate']
    teacher_force = params['teacher_force']
    batch_size = params['batch_size']
    clip = params['clip']
    
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = optim.Adam(model.parameters(), lr=learningrate)
    
    batches = get_batches(german['train'], english['train'], batch_size)
    print("Batch count:", len(batches))
    random.shuffle(batches)

    for idx, b in enumerate(batches):
        source, target = batchify(b)

        output = model(source, target, teacher_force)

        #trg = [trg sent len, batch size]
        #output = [trg sent len, batch size, output dim]

        output = output[1:].view(-1, output.shape[-1])
        target = target[1:].view(-1)

        #trg = [(trg sent len - 1) * batch size]
        #output = [(trg sent len - 1) * batch size, output dim]

        loss = criterion(output, target)
        
        if idx%500 == 0: print( "batch:", idx, "loss:", loss.item()) 

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        optimizer.zero_grad()

        epoch_loss += loss.item()

    return epoch_loss/len(batches)

In [12]:
def evaluate(model, german, english, params):
    model.eval()
    
    batch_size = params['batch_size']
    epoch_loss = 0.0
    
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    
    batches = get_batches(german['dev'], english['dev'], batch_size)
    random.shuffle(batches)
    
    with torch.no_grad():
        for idx, b in enumerate(batches):
            
            source, target = batchify(b)
            output = model(source, target, teacher_forcing_ratio=0)
            
            output = output[1:].view(-1, output.shape[-1])
            target = target[1:].view(-1)
            
            loss = criterion(output, target)
            epoch_loss += loss.item()
            
    return epoch_loss/len(batches)

In [13]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [14]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

In [15]:
tr_params = {}

tr_params['batch_size'] = 200
tr_params['learning_rate'] = 0.0025
tr_params['teacher_force'] = 0.75
tr_params['epochs'] = 20
tr_params['clip'] = 1

enc_params = {}
dec_params = {}

enc_params['source_vocab_size'] = len(german['idx2word'])
enc_params['d_emb'] = 256
enc_params['d_hid'] = 512
enc_params['droprate'] = 0.5
enc_params['layers'] = 1
enc_params['bidirectional'] = True

dec_params['target_vocab_size'] = len(english['idx2word'])
dec_params['d_emb'] = 256
dec_params['d_hid'] = 512
dec_params['droprate'] = 0.5
dec_params['layers'] = 1

enc = Encoder(enc_params)

if enc_params['bidirectional']:
    dec = DecoderBD(dec_params)
    model = seq2seq(enc, dec).to(device)
else:
    dec = Decoder(dec_params)
    model = seq2seq(enc, dec).to(device)

model.apply(init_weights)

best_valid_loss = float('inf')

for epoch in range(tr_params['epochs']):
    start_time = time.time()
    
    train_loss = train(model, german, english, tr_params)
    valid_loss = evaluate(model, german, english, tr_params)
    
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
#         torch.save(model.state_dict(), 'tut1-model.pt')

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

  "num_layers={}".format(dropout, num_layers))


Batch count: 2265
batch: 0 loss: 10.92887020111084
batch: 500 loss: 6.162031650543213
batch: 1000 loss: 6.170685291290283
batch: 1500 loss: 5.518272876739502
batch: 2000 loss: 5.453644275665283
Epoch: 01 | Time: 8m 13s
	Train Loss: 5.755 | Train PPL: 315.803
	 Val. Loss: 7.867 |  Val. PPL: 2609.765
Batch count: 2266
batch: 0 loss: 5.533724784851074
batch: 500 loss: 5.947695732116699
batch: 1000 loss: 5.603154182434082
batch: 1500 loss: 5.628115177154541
batch: 2000 loss: 6.74653959274292
Epoch: 02 | Time: 8m 33s
	Train Loss: 5.594 | Train PPL: 268.751
	 Val. Loss: 8.219 |  Val. PPL: 3709.259
Batch count: 2266
batch: 0 loss: 5.870950222015381
batch: 500 loss: 4.8925604820251465
batch: 1000 loss: 5.914968013763428
batch: 1500 loss: 5.715311050415039
batch: 2000 loss: 5.583240032196045
Epoch: 03 | Time: 8m 46s
	Train Loss: 5.582 | Train PPL: 265.507
	 Val. Loss: 8.866 |  Val. PPL: 7088.029
Batch count: 2266
batch: 0 loss: 5.89659309387207
batch: 500 loss: 6.319077968597412
batch: 1000 los

In [24]:
torch.save(model, 'C:/Users/evan_/Documents/School/Graduate/Year 1/Spring/CS 690D/Project/Base Model/models/model00.pickle')