In [2]:
from torchtext import data, datasets
import torch
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os.path
import pdb
import argparse
import utils

In [10]:
import models

In [3]:
# Special tokens that frame every target sentence.
BOS_WORD = '<s>'
EOS_WORD = '</s>'

# Data-pipeline settings.
MAX_LEN = 20     # longest sentence (in tokens) kept by the dataset filter
BATCH_SIZE = 32  # batch size handed to the iterators

# Source-side (.de) field: tokenised only.
DE = data.Field(tokenize=utils.tokenize_de)
# Target-side (.en) field: only the target needs BOS/EOS markers.
EN = data.Field(
    tokenize=utils.tokenize_en,
    init_token=BOS_WORD,
    eos_token=EOS_WORD,
)


In [4]:
# Download dataset, build vocab
MIN_FREQ = 5  # passed as min_freq to both build_vocab calls


def _fits_max_len(example):
    """Return True when both sides of a sentence pair have at most MAX_LEN tokens."""
    return len(example.src) <= MAX_LEN and len(example.trg) <= MAX_LEN


# NOTE: the name `train` is rebound by the `train(...)` function defined in a
# later cell, so this cell must be re-run before the dataset is used again.
train, val, test = datasets.IWSLT.splits(
    exts=('.de', '.en'),
    fields=(DE, EN),
    filter_pred=_fits_max_len,
)

DE.build_vocab(train.src, min_freq=MIN_FREQ)
EN.build_vocab(train.trg, min_freq=MIN_FREQ)

print("Finish build vocab")

# Batches are bucketed by source-sentence length.
train_iter, val_iter = data.BucketIterator.splits(
    (train, val),
    batch_size=BATCH_SIZE,
    device=-1,
    repeat=False,
    sort_key=lambda example: len(example.src),
)

print("Done bucketing data")

Finish build vocab
Done bucketing data


In [7]:
# Pull the first batch out of a fresh pass over the training iterator
# (presumably kept around for interactive inspection — no later cell in view reads it).
batch = next(iter(train_iter))

In [111]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import pdb
# Global device switch: code that consults this flag moves tensors to the GPU with .cuda() when it is True.
USE_CUDA = False

class EncoderRNN(nn.Module):
    """Single-layer LSTM encoder over embedded source tokens.

    Args:
        input_size: number of rows in the embedding table (source vocabulary size).
        hidden_size: width of both the token embeddings and the LSTM state.
    """

    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.embedding_size = hidden_size
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, self.embedding_size)
        self.dropout_p = 0
        self.n_layers = 1
        self.rnn = nn.LSTM(
            self.embedding_size,
            hidden_size,
            num_layers=self.n_layers,
            dropout=self.dropout_p)

    def forward(self, input):
        """Encode a batch of token indices.

        Args:
            input: integer tensor of token indices laid out as (seq_len, batch);
                the batch size is read from dimension 1.

        Returns:
            (output, hidden): the LSTM output for every time step and the
            final (h_n, c_n) state tuple.
        """
        batch_size = input.size(1)
        embedded = self.embedding(input)
        h_0 = self.init_hidden(batch_size)
        output, hidden = self.rnn(embedded, h_0)
        return output, hidden

    def init_hidden(self, batch_size):
        """Build the all-zero initial (h_0, c_0) state for `batch_size` sequences.

        The state is allocated with the device and dtype of the module's own
        parameters, so it stays compatible after `.cuda()`, `.to(...)` or
        `.double()` without depending on a global flag.

        Returns:
            Tuple of two independent zero tensors, each of shape
            (n_layers, batch_size, hidden_size).
        """
        reference = self.embedding.weight
        h_0 = torch.zeros(
            self.n_layers, batch_size, self.hidden_size,
            device=reference.device, dtype=reference.dtype)
        # clone() so that h_0 and c_0 never share storage.
        return (h_0, h_0.clone())

class DecoderRNN(nn.Module):
    """LSTM decoder that maps target tokens plus a recurrent state to vocabulary scores.

    Args:
        hidden_size: width of the token embeddings and of the LSTM state.
        output_size: target vocabulary size (embedding rows and score columns).
    """

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        # Not applied in forward(), which returns raw scores; kept so the
        # attribute remains available to existing code.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Score every vocabulary entry at every input position.

        Args:
            input: integer tensor of token indices; the LSTM uses its default
                (seq_len, batch) layout.
            hidden: (h_0, c_0) state tuple for the LSTM.

        Returns:
            (output, hidden): unnormalised scores with `output_size` entries
            in the last dimension, and the final (h_n, c_n) state.
        """
        output = self.embedding(input)
        output = F.relu(output) # Try other?
        output, hidden = self.rnn(output, hidden)
        output = self.out(output)
        return output, hidden

    def initHidden(self, batch_size=1):
        """Return an all-zero (h_0, c_0) state that `self.rnn` accepts.

        `self.rnn` is an nn.LSTM, which requires a tuple of hidden and cell
        state; a single tensor is rejected. The state is created on the same
        device and dtype as the module's parameters.

        Args:
            batch_size: number of sequences in the batch (defaults to 1).

        Returns:
            Tuple of two zero tensors, each of shape (1, batch_size, hidden_size).
        """
        reference = self.out.weight
        h_0 = torch.zeros(
            1, batch_size, self.hidden_size,
            device=reference.device, dtype=reference.dtype)
        return (h_0, h_0.clone())

class Seq2Seq(nn.Module):
    """Encoder-decoder model: the encoder's final state initialises the decoder.

    Args:
        hidden_size: hidden/embedding width shared by encoder and decoder.
        input_vocab_size: source vocabulary size.
        output_vocab_size: target vocabulary size.
    """

    def __init__(self, hidden_size, input_vocab_size, output_vocab_size):
        super(Seq2Seq, self).__init__()
        self.encoder = EncoderRNN(input_vocab_size,hidden_size)
        self.decoder = DecoderRNN(hidden_size, output_vocab_size)

    def forward(self, source, target):
        """Run the encoder on `source`, then the decoder on `target`.

        Both inputs are moved to the device that holds the model parameters.
        Moving only `source` would leave `target` on a different device from
        the decoder whenever the model lives on the GPU.

        Args:
            source: source token indices, passed to the encoder.
            target: target token indices, passed to the decoder.

        Returns:
            The decoder's output for every target position.
        """
        device = next(self.parameters()).device
        source = source.to(device)
        target = target.to(device)

        # Encode
        output_encoder, hidden_encoder = self.encoder(source)

        # Decode, starting from the encoder's final state.
        output_decoder, hidden_decoder = self.decoder(target, hidden_encoder)

        # Predict
        return output_decoder

In [130]:
# Build the sequence-to-sequence model, sized to the two vocabularies.
model = Seq2Seq(
    hidden_size=50,
    input_vocab_size=len(DE.vocab),
    output_vocab_size=len(EN.vocab),
)

def validate(model, val_iter, criterion, pad_idx=1):
    ''' Calculate perplexity on validation set.

    Args:
        model: callable as model(src, trg), returning scores whose first two
            dimensions line up with those of trg.
        val_iter: iterable of batches exposing .src and .trg.
        criterion: loss taking (scores, targets) with scores flattened to 2-D
            and targets flattened to 1-D.
        pad_idx: target index treated as padding when counting words
            (defaults to 1, the value previously hard-coded here).

    Returns:
        exp of the running average kept by AverageLosses, as a Python float.
    '''
    import math  # local import: math is not among the notebook's top-level imports

    model.eval()

    AL = AverageLosses()

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in val_iter:
            scores = model(batch.src, batch.trg)
            # Remove <s> from beginning of target: the scores at position t
            # are compared with the token at position t + 1.
            targets = batch.trg[1:]
            # Remove the last row of scores bc nothing to predict after </s>.
            scores = scores[:-1]

            # Reshape to (positions, vocab) and (positions,).
            new_scr = scores.reshape(scores.size(0) * scores.size(1), -1)
            new_trg = targets.reshape(new_scr.size(0))

            loss = criterion(new_scr, new_trg)

            # Count number of non-padding elements on target.
            num_words = (new_trg != pad_idx).sum()

            AL.update(loss.data, n_obs=num_words)

    # Perplexity comes from the meter's average, not from the last batch's loss tensor.
    return math.exp(AL.avg)
def train(train_iter, val_iter, model, criterion, optimizer, num_epochs):
    """Train `model` for `num_epochs` passes over `train_iter`.

    Each batch is handed to `train_batch` (defined elsewhere) and its return
    value is fed to a fresh `AverageLosses` meter for the epoch. The running
    average is printed at batch indices 10, 1010, 2010, ...; after each
    epoch the validation perplexity from `validate` is printed.
    """
    progress_msg = '''Epoch [{e}/{num_e}]\t Batch [{b}/{num_b}]\t Loss: {l:.3f}'''
    perplexity_msg = '''Epoch [{e}/{num_e}]\t Perplexity: {ppl:.3f}'''

    for epoch_no in range(1, num_epochs + 1):
        model.train()
        meter = AverageLosses()

        for step, batch in enumerate(train_iter):
            meter.update(train_batch(model, batch, criterion, optimizer))

            # Only report on steps whose index is 10 modulo 1000.
            if step % 1000 != 10:
                continue
            print(progress_msg.format(e=epoch_no, num_e=num_epochs, b=step, num_b=len(train_iter), l=meter.avg))

        ppl = validate(model, val_iter, criterion)
        print(perplexity_msg.format(e=epoch_no, num_e=num_epochs, ppl=ppl))



In [None]:
# NOTE(review): lr=0.5 is far above Adam's default of 1e-3 — confirm this is intentional.
optimizer = optim.Adam(params=model.parameters(), lr=0.5, betas=(0.9, 0.98))

# Single training epoch with an unweighted cross-entropy criterion.
train(
    train_iter=train_iter,
    val_iter=val_iter,
    model=model,
    criterion=nn.CrossEntropyLoss(),
    optimizer=optimizer,
    num_epochs=1,
)

Epoch [1/1]	 Batch [10/3722]	 Loss: 5.880
