In [1]:
import time
import math
import random
import os
from itertools import zip_longest

import numpy as np

import pickle as pkl
import gzip

import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torch.nn.functional as F

from torch import optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils import asMinutes, timeSince, load_zipped_pickle, corpus_bleu, directories
from langUtils import loadLangPairs, langDataset, langCollateFn, initHybridEmbeddings, tensorToList

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import seaborn as sns; sns.set()
sns.set_style("darkgrid")
sns.set_context("paper")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()

In [2]:
data_dir, em_dir = directories()

SPECIAL_SYMBOLS_ID = PAD_ID, UNK_ID, SOS_ID, EOS_ID = 0, 1, 2, 3
NUM_SPECIAL = len(SPECIAL_SYMBOLS_ID)

vi, en = loadLangPairs("vi")

In [67]:
BATCH_SIZE = 64

train_dataset = langDataset([(vi.train_num[i], en.train_num[i]) for i in range(len(vi.train_num)) if (len(vi.train[i]) < vi.max_length) & (len(en.train[i]) < en.max_length)])
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=langCollateFn,
                                           shuffle=True)
dev_dataset = langDataset([(vi.dev_num[i], en.dev_num[i]) for i in range(len(vi.dev_num)) if (len(vi.dev[i]) < vi.max_length) & (len(en.dev[i]) < en.max_length)])
dev_loader = torch.utils.data.DataLoader(dataset=dev_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=langCollateFn,
                                           shuffle=True)

In [68]:
class EncoderRNN(nn.Module):
    def __init__(self, params, raw_emb, learn_ids):
        super(EncoderRNN, self).__init__()
        
        self.hidden_size = params['hidden_size']
        self.num_layers = params['num_layers']
        
        self.embedding = initHybridEmbeddings(raw_emb, learn_ids)
        self.gru = nn.GRU(self.embedding.embedding_dim, params['hidden_size'], self.num_layers, batch_first=True, bidirectional=True)
        
    def forward(self, inp, inp_lens, hidden=None):
        embedded = self.embedding(inp)
        packed = pack_padded_sequence(embedded, inp_lens)
        
        output, self.hidden = self.gru(packed, hidden)
        output, _ = pad_packed_sequence(output)
        output = output[:, :, :self.hidden_size] + output[:, : ,self.hidden_size:]
        return output, self.hidden
    
class DecoderRNN(nn.Module):
    def __init__(self, params, raw_emb, learn_ids):
        super(DecoderRNN, self).__init__()
        self.hidden_size = params['hidden_size']
        self.num_layers = params['num_layers']
        self.output_size = params['output_size']

        self.embedding = initHybridEmbeddings(raw_emb, learn_ids)
        self.gru = nn.GRU(self.embedding.embedding_dim, params['hidden_size'], self.num_layers, batch_first=True, bidirectional=True)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, inp, hidden, encoder_output=None):
        embedded = self.embedding(inp)
        output = F.relu(embedded)
        
        output, self.hidden = self.gru(output, hidden)
        orig = output
        output = output[:, :, :self.hidden_size] + output[:, : ,self.hidden_size:]
        output = torch.exp(self.softmax(self.out(output))).squeeze(0)
        return output, hidden, None

def maskNLLLoss(inp, target, mask):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1).to(device)))
    loss = crossEntropy.masked_select(mask.to(device)).mean()
    loss = loss.to(device)
    return loss, nTotal.item()

In [81]:
class Model():
    def __init__(self, encoder, decoder, encoder_optim, decoder_optim):
        
        self.encoder = encoder
        self.decoder = decoder
        
        self.encoder_optim = encoder_optim
        self.decoder_optim = decoder_optim
        
    def fit(self, train_data, dev_data, teacher_forcing_ratio, n_epoch, print_every, n_grams):
        start = time.time()
        
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.n_epoch = n_epoch
        
        
        print("Initializing...")
        start_epoch = 1
        print_loss_total = 0 
        plot_loss_total = 0
        plot_losses = []
        plot_train_scores = []
        plot_dev_scores = []
        
        for epoch in range(start_epoch, n_epoch):
            for i, (inp, inp_lens, output, out_mask, out_max) in enumerate(train_loader):
                loss = self.trainEpoch(inp, inp_lens, output, out_mask, out_max)
                
                print_loss_total += loss
                plot_loss_total += loss

                if i % print_every == 0:
                    train_score = self.bleuScore(train_loader, n_grams)
                    dev_score = self.bleuScore(dev_loader, n_grams)
                    plot_train_scores.append(train_score)
                    plot_dev_scores.append(dev_score)
                    plot_loss_avg = plot_loss_total / print_every
                    plot_losses.append(plot_loss_avg)
                    plot_loss_total = 0       

                    print_loss_avg = print_loss_total / print_every
                    print_loss_total = 0
                    print("{} ({} {}) Iter: {}/{} | Loss:{:.4} | TrainScore:{} | DevScore:{}".format(timeSince(start, epoch/n_epoch), 
                                                                                            epoch, 
                                                                                            epoch/n_epoch*100, 
                                                                                            i,
                                                                                            len(train_loader),
                                                                                            print_loss_avg, 
                                                                                            train_score,
                                                                                            dev_score))

#                 if i % plot_every == 0:
#                     train_score = self.bleuScore(train_loader, n_grams)
#                     dev_score = self.bleuScore(dev_loader, n_grams)
#                     plot_train_scores.append(train_score)
#                     plot_dev_scores.append(dev_score)
#                     plot_loss_avg = plot_loss_total / plot_every
#                     plot_losses.append(plot_loss_avg)
#                     plot_loss_total = 0       
                
                
        self.plot_losses = plot_losses
        self.plot_train_scores = plot_train_scores
        self.plot_dev_scores = plot_dev_scores
        return "Training Complete!"            
            
    def trainEpoch(self, inp, inp_lens, output, out_mask, out_max):
        self.encoder_optim.zero_grad()
        self.decoder_optim.zero_grad()

        loss, print_losses, n_totals = 0, [], 0
        
        encoder_output, encoder_hidden = self.encoder(inp, inp_lens)
        
        decoder_input = torch.LongTensor([[SOS_ID for _ in range(inp.size(1))]]).to(device)
        decoder_hidden = encoder_hidden[:,-1:,:].contiguous()

        if random.random() < self.teacher_forcing_ratio:
            for t in range(out_max):
                decoder_output, decoder_hidden, _ = self.decoder(decoder_input, decoder_hidden, encoder_output)
                decoder_input = output[t].view(1, -1)
                
                mask_loss, nTotal = maskNLLLoss(decoder_output, output[t], out_mask[t])
                loss += mask_loss
                print_losses.append(mask_loss.item() * nTotal)
                n_totals += nTotal
        else:
            for t in range(out_max):
                decoder_output, decoder_hidden, _ = self.decoder(decoder_input, decoder_hidden, encoder_output)
                _, topi = decoder_output.topk(1)
                decoder_input = torch.LongTensor([[topi[i][0] for i in range(inp.size(1))]]).to(device)
                
                mask_loss, nTotal = maskNLLLoss(decoder_output, output[t], out_mask[t])
                loss += mask_loss
                print_losses.append(mask_loss.item() * nTotal)
                n_totals += nTotal
                
        loss.backward()

        self.encoder_optim.step()
        self.decoder_optim.step()

        return sum(print_losses) / n_totals
    
    def bleuScore(self, data_loader, n_grams):
        with torch.no_grad():
            
            true_outputs = []
            decoder_outputs = []
            
            for i, (inp, inp_lens, output, out_mask, out_max) in enumerate(data_loader):
                if i * BATCH_SIZE > len(dev_loader) * BATCH_SIZE:
                    break
                true_outputs += tensorToList(output)
                
                encoder_output, encoder_hidden = self.encoder(inp, inp_lens)
                
                decoder_input = torch.LongTensor([[SOS_ID for _ in range(inp.size(1))]])
                decoder_hidden = encoder_hidden[:,-1:,:].contiguous()
                
                decoder_batch_outputs = []
                decoder_batch_outputs += decoder_input.tolist()
                
                for t in range(out_max):
                    decoder_output, decoder_hidden, _ = self.decoder(decoder_input, decoder_hidden, encoder_output)
                    _, topi = decoder_output.topk(1)
                    decoder_input = torch.LongTensor([[topi[i][0] for i in range(inp.size(1))]])
                    decoder_batch_outputs += decoder_input.tolist()
                    
                decoder_outputs += tensorToList(torch.tensor(decoder_batch_outputs))
        return corpus_bleu(decoder_outputs, true_outputs, n_grams)

    def showLoss(self):
        plt.figure()
        fig = plt.figure(figsize=(10,6))
        fig_plt = sns.lineplot(x=np.arange(0, self.n_epoch, int(self.n_epoch/len(self.plot_losses))), y=self.plot_losses)
        fig_plt.set_title("Loss Over Time")
        fig_plt.set_ylabel("Loss")
        fig_plt.set_xlabel("Epochs")
        return fig_plt.get_figure()
    
    def showScore(self):
        df = pd.concat([pd.DataFrame({'X':np.arange(0, self.n_epoch, int(self.n_epoch/len(self.plot_losses))), 'Y':self.plot_train_scores, 'Score':'Train'}), 
                        pd.DataFrame({'X':np.arange(0, self.n_epoch, int(self.n_epoch/len(self.plot_losses))), 'Y':self.plot_dev_scores, 'Score':'Dev'})], axis=0)
    
        plt.figure()
        pp = sns.lineplot(data=df, x = 'X', y = 'Y', hue='Score', style="Score", legend= "brief")
        fig_plt.set_title("Score Over Time")
        fig_plt.set_ylabel("Score")
        fig_plt.set_xlabel("Epoch")
        return fig_plt.get_figure()

In [82]:
LEARNING_RATE = 0.001

encoder_params = {'hidden_size':256, 'num_layers':1}
decoder_params = {'hidden_size':encoder_params['hidden_size'], 'num_layers':1, 'output_size':en.n_words}

encoder = EncoderRNN(encoder_params, vi.emb, vi.learn_ids).to(device)
encoder_optim = optim.Adam(encoder.parameters(), lr=LEARNING_RATE)

decoder = DecoderRNN(decoder_params, en.emb, en.learn_ids).to(device)
decoder_optim = optim.Adam(decoder.parameters(), lr=0.001)

In [83]:
model = Model(encoder, decoder, encoder_optim, decoder_optim)
model.fit(train_loader, dev_loader, teacher_forcing_ratio=1.0, n_epoch=10, print_every=400, n_grams=4)

Initializing...
0m 15s (- 2m 15s) (1 10.0) Iter: 0/1713 | Loss:0.01039 | TrainScore:9.301821611466174 | DevScore:10.144865992263254
6m 18s (- 56m 42s) (1 10.0) Iter: 400/1713 | Loss:3.481 | TrainScore:3.9491317896699902 | DevScore:3.4373696280375103
12m 21s (- 111m 10s) (1 10.0) Iter: 800/1713 | Loss:3.195 | TrainScore:2.9839270337976247 | DevScore:2.741219284529995
18m 25s (- 165m 49s) (1 10.0) Iter: 1200/1713 | Loss:3.132 | TrainScore:1.402114597097759 | DevScore:1.476456197084048
24m 30s (- 220m 37s) (1 10.0) Iter: 1600/1713 | Loss:3.099 | TrainScore:1.0247680801474728 | DevScore:0.937463584188316
26m 21s (- 105m 27s) (2 20.0) Iter: 0/1713 | Loss:0.8663 | TrainScore:0.7355134314590411 | DevScore:0.7061211883251046
32m 25s (- 129m 42s) (2 20.0) Iter: 400/1713 | Loss:3.031 | TrainScore:0.6045048829654783 | DevScore:0.8465479049353456
38m 29s (- 153m 59s) (2 20.0) Iter: 800/1713 | Loss:3.027 | TrainScore:0.430588790903775 | DevScore:0.4009448010837915
44m 34s (- 178m 18s) (2 20.0) Iter

'Training Complete!'

In [9]:
for i, (inp, inp_lens, output, out_mask, out_max) in enumerate(train_loader):
    break

In [12]:
encoder.embedding(inp[0])

RuntimeError: cuda runtime error (59) : device-side assert triggered at /opt/conda/conda-bld/pytorch_1532582123400/work/aten/src/THC/generic/THCTensorCopy.cpp:20

In [11]:
encoder.embedding(inp)

RuntimeError: cuda runtime error (59) : device-side assert triggered at /opt/conda/conda-bld/pytorch_1532582123400/work/aten/src/THC/generic/THCTensorCopy.cpp:70