In [1]:
import numpy as np
import pickle
import torch
import torch.nn as nn
from torch.autograd import Variable
from torchtext import data
from torchtext import vocab
import torch.backends.cudnn as cudnn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import revtok
from torchtext.data import BucketIterator
import random

In [2]:
# Field definitions for the two CSV columns: `text` for articles (source) and
# `summary` for titles (target). Both use include_lengths=True, so every batch
# attribute is a (token_indices, lengths) pair -- train_model unpacks them that
# way. Only the title field appends an '<eos>' token.
# NOTE(review): tokenize=None defers to ReversibleField's default tokenizer --
# confirm which one that is for the installed torchtext version.
summary = data.ReversibleField(
    sequential=True,
    tokenize=None,
    include_lengths=True,
    eos_token='<eos>',
)
text = data.ReversibleField(
    sequential=True,
    tokenize=None,
    include_lengths=True,
)

# Training and validation datasets, read from CSV files whose header row is
# skipped; columns are mapped to the fields in order (article, title).
train, valid = data.TabularDataset.splits(
    path='./processed_data/',
    train='train_dataset.csv',
    validation='valid_dataset.csv',
    format='csv',
    skip_header=True,
    fields=[('article', text), ('title', summary)],
)

In [3]:
# Number of dimensions of the word embeddings; must match the 300-d vectors
# requested below.
num_dims = 300

# Build both vocabularies from the training split and attach pretrained
# GloVe vectors to them.
# Alternative that was tried and left disabled: separate input/output vectors
# loaded with vocab.Vectors('crawl-300d-2M-input.vec') and
# vocab.Vectors('crawl-300d-2M-output.vec').
text.build_vocab(train, vectors='glove.6B.300d')
summary.build_vocab(train, vectors='glove.6B.300d')

In [4]:
# Training mini-batch size (the validation iterator uses batches of 1).
batch_size = 128

# Bucketing iterators over the training and validation sets.
#  - sort_key groups examples by article length.
#  - sort_within_batch=True orders each batch by descending article length,
#    which EncoderRNN.forward relies on when packing the padded batch.
#  - sorting the whole dataset in ascending length order (sort=True) is
#    deliberately left disabled.
#  - NOTE(review): repeat=True presumably makes the iterators cycle over the
#    data indefinitely (the training loop below has no stopping condition);
#    device=0 presumably selects the first GPU -- confirm against torchtext.
train_iter, valid_iter = BucketIterator.splits(
    (train, valid),
    (batch_size, 1),
    device=0,
    sort_key=lambda example: len(example.article),
    sort_within_batch=True,
    repeat=True,
)

In [5]:
# Row 0 of each vocabulary's vector table is the UNK embedding, which starts
# out as all zeros; overwrite it in place with small Gaussian noise
# (mean 0, std 0.05). The result is assigned to `_` only to keep the cell
# from echoing the tensor.
_ = nn.init.normal(text.vocab.vectors[0], mean=0, std=0.05)
_ = nn.init.normal(summary.vocab.vectors[0], mean=0, std=0.05)

In [17]:
# --- Encoder configuration ---------------------------------------------
# Number of stacked recurrent layers; read as a global by both EncoderRNN
# and DecoderRNN.
num_layers = 2
# Hidden units in each recurrent layer of the encoder
encoder_hidden_size = 512
# Dropout passed to the encoder's nn.LSTM (0 disables it)
encoder_dropout = 0
# Vocabulary of the article field, including its pretrained vectors
input_vocab = text.vocab

class EncoderRNN(nn.Module):
    """Multi-layer LSTM encoder over frozen pretrained word embeddings.

    Reads the module-level globals `num_dims` (embedding size) and
    `num_layers` (number of stacked LSTM layers). All tensors are created on
    the GPU.

    Args:
        vocab: vocabulary object exposing `len()` and a `vectors` tensor that
            is copied into the embedding table.
        hidden_size: number of hidden units per LSTM layer.
        dropout: dropout value handed to `nn.LSTM`.
    """

    def __init__(self, vocab, hidden_size, dropout):
        super(EncoderRNN, self).__init__()

        # Hyperparameters
        self._vocab = vocab
        self._hidden_size = hidden_size
        self._dropout = dropout

        # (hidden, cell) state fed to the LSTM on the next forward pass.
        # Callers are expected to set it (see init_hidden) before calling
        # the model.
        self.hidden = None

        # Embedding table initialised from the vocabulary's vectors and
        # excluded from gradient updates.
        lookup = nn.Embedding(len(vocab), num_dims).cuda()
        lookup.weight.data.copy_(vocab.vectors)
        lookup.weight.requires_grad = False
        self.embed = lookup

        # Stacked LSTM, moved to the GPU after construction.
        self.lstm = nn.LSTM(num_dims, hidden_size,
                            num_layers, dropout=dropout)
        self.lstm.cuda()

    def init_hidden(self, batch_size):
        """Return a fresh all-zero (hidden, cell) pair for `batch_size` sequences.

        Each element is a CUDA float Variable of shape
        (num_layers, batch_size, hidden_size) that does not require gradients.
        """
        def zero_state():
            blank = torch.cuda.FloatTensor(num_layers, batch_size,
                                           self._hidden_size).fill_(0)
            return Variable(blank, requires_grad=False)

        return (zero_state(), zero_state())

    def forward(self, batch_data, sequence_lengths):
        """Encode a padded batch and return the LSTM's final (hidden, cell) state.

        Args:
            batch_data: word indices laid out as (T, b), where T is the length
                of the longest sequence and b is the batch size; the batch
                must be sorted by descending sequence length.
            sequence_lengths: length of every sequence in the batch.

        The returned state is also stored on `self.hidden`.
        """
        embedded = self.embed(batch_data)
        # Pack the T x b x * embeddings so padding positions are skipped.
        lengths = list(sequence_lengths)
        packed = pack_padded_sequence(embedded, lengths)
        _, self.hidden = self.lstm(packed, self.hidden)
        return self.hidden
    
# Instantiate the encoder from the configuration above. `.cuda()` is kept as
# the cell's last expression so the module summary is displayed.
encoder = EncoderRNN(vocab=input_vocab,
                     hidden_size=encoder_hidden_size,
                     dropout=encoder_dropout)
encoder.cuda()

EncoderRNN(
  (embed): Embedding(114618, 300)
  (lstm): LSTM(300, 512, num_layers=2)
)

In [18]:
# --- Decoder configuration ---------------------------------------------
# Hidden units in each LSTM cell of the decoder
decoder_hidden_size = 512
# Decoder dropout (stored on the module; DecoderRNN does not apply it)
decoder_dropout = 0
# Vocabulary of the title field, including its pretrained vectors
output_vocab = summary.vocab

class DecoderRNN(nn.Module):
    """Step-wise decoder: a stack of LSTM cells followed by a vocabulary projection.

    Reads the module-level globals `num_dims` (embedding size) and
    `num_layers` (number of stacked cells).

    The per-layer recurrent state lives in `self.hidden_list`, a plain list
    with one (hidden, cell) tuple per layer. It starts empty: the caller must
    fill it (e.g. from the encoder's final state) before the first call to
    `forward`, and every call overwrites it with the new state.

    Args:
        vocab: vocabulary object exposing `len()` and a `vectors` tensor that
            is copied into the embedding table.
        hidden_size: number of hidden units in every LSTM cell.
        dropout: stored as `_dropout`; not applied anywhere in this class.
    """

    def __init__(self, vocab, hidden_size, dropout):
        super(DecoderRNN, self).__init__()

        # Hyperparameters
        self._vocab = vocab
        self._hidden_size = hidden_size
        self._dropout = dropout

        # Embedding table initialised from the vocabulary's vectors and
        # excluded from gradient updates.
        self.embed = nn.Embedding(len(self._vocab), num_dims).cuda()
        self.embed.weight.data.copy_(self._vocab.vectors)
        self.embed.weight.requires_grad = False

        # Per-layer (hidden, cell) states; populated by the caller.
        self.hidden_list = []

        # The first cell consumes word embeddings, every further cell consumes
        # the hidden state of the cell below it.
        cells = [nn.LSTMCell(num_dims, self._hidden_size).cuda()]
        for _ in range(1, num_layers):
            cells.append(nn.LSTMCell(self._hidden_size, self._hidden_size).cuda())
        # nn.ModuleList registers the cells as submodules so they are visible
        # to .parameters(), .cuda(), etc.
        self.cell_list = nn.ModuleList(cells)

        # Projects the top layer's hidden state onto the vocabulary.
        self.linear_transform = nn.Linear(self._hidden_size, len(self._vocab))

    def forward(self, input):
        """Advance the decoder by one time step.

        Args:
            input: word indices for the current step, one per batch element.
                Both (batch,) and (batch, 1) layouts are accepted.

        Returns:
            Unnormalised scores of shape (batch_size, vocab_size).
        """
        # Flatten the indices so the embeddings are always 2-D
        # (batch, num_dims). top-k based predictions arrive as (batch, 1),
        # which would otherwise hand nn.LSTMCell a 3-D input.
        input = self.embed(input.contiguous().view(-1))
        # Each item in hidden_list is the (hidden, cell) tuple of one layer.
        for layer in range(num_layers):
            self.hidden_list[layer] = self.cell_list[layer](input, self.hidden_list[layer])
            # The hidden state of this layer is the input of the next one.
            input = self.hidden_list[layer][0]
        # output has shape (batch_size, vocab_size)
        output = self.linear_transform(self.hidden_list[num_layers - 1][0])
        return output
    
# Instantiate the decoder from the configuration above. `.cuda()` is kept as
# the cell's last expression so the module summary is displayed.
decoder = DecoderRNN(vocab=output_vocab,
                     hidden_size=decoder_hidden_size,
                     dropout=decoder_dropout)
decoder.cuda()

DecoderRNN(
  (embed): Embedding(43919, 300)
  (cell_list): ModuleList(
    (0): LSTMCell(300, 512)
    (1): LSTMCell(512, 512)
  )
  (linear_transform): Linear(in_features=512, out_features=43919, bias=True)
)

In [19]:
def sample_distribution(distribution):
    """Sample one index from `distribution`.

    `distribution` is assumed to be a sequence of normalized probabilities.
    A uniform random threshold in [0, 1] is drawn and the index at which the
    running sum first reaches it is returned.
    """
    threshold = random.uniform(0, 1)
    cumulative = 0
    for index, probability in enumerate(distribution):
        cumulative += probability
        if cumulative >= threshold:
            return index
    # The running sum never reached the threshold: fall back to the last index.
    return len(distribution) - 1

def predict_by_sampling(output, batch_size):
    """Pick the next input word of every batch element by sampling.

    Args:
        output: decoder scores of shape (batch_size, vocab_size).
        batch_size: number of rows of `output` to sample for.

    Returns:
        A Variable wrapping a CUDA LongTensor of length `batch_size` holding
        one sampled vocabulary index per row.
    """
    # Sampling runs in Python, so normalise the scores on the CPU.
    probabilities = nn.Softmax(dim=1)(output.cpu())
    sampled = torch.cuda.LongTensor(batch_size)
    for row in range(batch_size):
        sampled[row] = sample_distribution(probabilities[row].data)
    return Variable(sampled)

def most_likely(output, batch_size):
    """Return the index of the highest-scoring word for every row of `output`.

    Args:
        output: decoder scores, either (batch_size, vocab_size) or a single
            1-D vector of vocab_size scores.
        batch_size: kept for backward compatibility; the result no longer
            depends on it.

    Returns:
        Indices with a trailing dimension of size 1: shape (batch_size, 1)
        for 2-D input, shape (1,) for 1-D input.
    """
    # Always reduce over the vocabulary (last) dimension. The previous
    # batch_size == 1 branch normalised over dim 0, which on a (1, vocab)
    # tensor turns every entry into 1.0 and makes the arg-max meaningless.
    # Softmax is monotonic, so the arg-max of the raw scores is the same as
    # the arg-max of the probabilities and the normalisation can be skipped.
    _, next_input = torch.topk(output, 1, dim=output.dim() - 1)
    return next_input

In [20]:
# Initial learning rate; adjust_learning_rate() decays from this value.
learning_rate = 0.5

# Only parameters that require gradients are optimised (the embedding tables
# are frozen). Lists are used instead of filter() objects: a filter iterator
# is exhausted once the optimizer has consumed it, which would leave these
# names unusable for anything else afterwards.
encoder_parameters = [p for p in encoder.parameters() if p.requires_grad]
decoder_parameters = [p for p in decoder.parameters() if p.requires_grad]

# Optimizers
encoder_optimizer = torch.optim.SGD(encoder_parameters, lr=learning_rate)
decoder_optimizer = torch.optim.SGD(decoder_parameters, lr=learning_rate)

# Loss function for batches of variable-length sequences:
#  - size_average=False sums the per-token losses instead of averaging them;
#    train_model then divides the accumulated sum by the total number of
#    title tokens in the batch.
#  - ignore_index=1 excludes targets equal to 1, the index of <pad>, so that
#    padding contributes neither to the loss nor to the gradient.
loss_function = nn.CrossEntropyLoss(size_average=False, ignore_index=1).cuda()

def train_model(batch, teacher_forcing=False):
    """Run one optimisation step on `batch` and return its loss.

    Uses the module-level `encoder`, `decoder`, their optimizers,
    `loss_function` and `num_layers`.

    Args:
        batch: a batch whose `article` and `title` attributes are
            (indices, lengths) pairs. The indices are LongTensor Variables of
            shape (max_sequence_length, batch_size) holding word indices from
            the respective vocabularies; the lengths hold the length of every
            sequence in the batch.
        teacher_forcing: when True, the ground-truth title token of step t is
            fed as the decoder input of step t + 1. The default (False) keeps
            the original behaviour of feeding back the decoder's own most
            likely prediction.

    Returns:
        The summed cross-entropy over all title positions divided by the total
        number of title tokens in the batch (still attached to the graph).
    """
    articles, art_lengths = batch.article
    titles, tit_lengths = batch.title
    # Size of *this* batch. The global `batch_size` is only an upper bound: a
    # final, smaller batch would otherwise get hidden states and start tokens
    # of the wrong shape.
    current_batch_size = articles.size(1)

    # Clear model gradients
    encoder.zero_grad()
    decoder.zero_grad()
    # Clear optimizer gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Reset the encoder state and encode the articles.
    encoder.hidden = encoder.init_hidden(current_batch_size)
    hiddenT = encoder(articles, art_lengths)

    # Hand every encoder layer's final (hidden, cell) pair to the matching
    # decoder layer. Assigning a fresh list (instead of appending) guarantees
    # that states left over from an interrupted previous step are discarded.
    decoder.hidden_list = [(hiddenT[0][layer], hiddenT[1][layer])
                           for layer in range(num_layers)]

    # The title field defines no start-of-sequence token, so index 1 (<pad>)
    # is used as the first decoder input.
    input = Variable(torch.cuda.LongTensor(current_batch_size).fill_(1))

    loss = 0
    # Decode one time step at a time up to the longest title in the batch.
    for t in range(torch.max(tit_lengths)):
        output = decoder(input)
        loss += loss_function(output, titles[t])
        if teacher_forcing:
            input = titles[t]
        else:
            # most_likely returns (batch, 1); flatten to (batch,) so the
            # decoder always receives one index per batch element.
            # Sampling alternative: predict_by_sampling(output, current_batch_size)
            input = most_likely(output, current_batch_size).view(-1)

    # Normalise the summed loss by the number of title tokens.
    loss = loss / torch.sum(tit_lengths)
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    # Drop the decoder states so they do not keep this step's graph alive.
    decoder.hidden_list = []

    return loss

In [21]:
def adjust_learning_rate(optimizer, epoch, base_lr=None, decay_factor=0.1, decay_every=3):
    """Set the optimizer's learning rate to a step-decayed value.

    The rate is `base_lr * decay_factor ** (epoch // decay_every)`; with the
    defaults this is the initial LR divided by 10 every 3 epochs.

    Args:
        optimizer: object exposing `param_groups`, a list of dicts whose
            'lr' entry is overwritten.
        epoch: current epoch counter.
        base_lr: learning rate before any decay. Defaults to the module-level
            `learning_rate`.
        decay_factor: multiplicative decay applied at every decay step.
        decay_every: number of epochs between two decay steps.
    """
    if base_lr is None:
        # Backward-compatible default: the notebook-level initial rate.
        base_lr = learning_rate
    lr = base_lr * (decay_factor ** (epoch // decay_every))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [22]:
# Let cuDNN benchmark its algorithms and pick the fastest for the shapes seen.
# The original cell also set `cudnn.fasttest = True`; that misspelled
# attribute is not a flag torch.backends.cudnn reads, so the assignment had
# no effect and has been removed.
cudnn.benchmark = True

encoder.train()
decoder.train()
step = 1

# The loop has no stopping condition of its own; `step` counts every batch
# drawn from the iterator, including skipped ones.
for batch in train_iter:
    # Batches containing an article longer than 80 tokens are skipped.
    if torch.max(batch.article[1]) <= 80:
        loss = train_model(batch)
        if step % 100 == 0:
            print('Average minibatch loss at step %d: %.3f' % (step, loss))
        # From step 2000 on, refresh both learning rates every 1000 steps,
        # using the number of thousands of steps as the "epoch".
        if step % 1000 == 0 and step > 1000:
            adjust_learning_rate(encoder_optimizer, step // 1000)
            adjust_learning_rate(decoder_optimizer, step // 1000)
    step += 1
            

Average minibatch loss at step 100: 8.635
Average minibatch loss at step 200: 7.981
Average minibatch loss at step 300: 8.135
Average minibatch loss at step 400: 7.660
Average minibatch loss at step 500: 7.624
Average minibatch loss at step 600: 7.629
Average minibatch loss at step 700: 7.477
Average minibatch loss at step 800: 7.397
Average minibatch loss at step 900: 7.360
Average minibatch loss at step 1000: 7.571
Average minibatch loss at step 1100: 7.468
Average minibatch loss at step 1200: 7.604
Average minibatch loss at step 1300: 7.458


KeyboardInterrupt: 

# Print the name and raw values of every encoder parameter that is being
# trained; frozen parameters (requires_grad == False) are skipped.
for name, param in encoder.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)