In [1]:
import numpy as np
import pickle
import torch
import torch.nn as nn
from torch.autograd import Variable
from torchtext import data
from torchtext import vocab
import torch.backends.cudnn as cudnn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import revtok
from torchtext.data import BucketIterator
from matplotlib import pylab
from sklearn.manifold import TSNE
import random

In [2]:
# Fields for articles(text) and titles(summary)
text = data.ReversibleField(sequential=True, tokenize=None, include_lengths=True)
summary = data.ReversibleField(sequential=True, tokenize=None, 
                               include_lengths=True, eos_token='<eos>')

# Creating training and validation datasets
train, valid = data.TabularDataset.splits(
    path='./processed_data/', train='train_dataset.csv',
    validation='valid_dataset.csv', skip_header=True, 
    format='csv', fields=[('article', text), ('title', summary)])

In [3]:
# Word embedidngs to build vocabulary
#input_vectors = vocab.Vectors('crawl-300d-2M-input.vec')
#output_vectors = vocab.Vectors('crawl-300d-2M-output.vec')

#text.build_vocab(train, vectors=input_vectors)
text.build_vocab(train, vectors='glove.6B.300d')

#summary.build_vocab(train, vectors=input_vectors)
summary.build_vocab(train, vectors='glove.6B.300d')

# Number of dimensions of embeddings
num_dims = 300

In [4]:
batch_size = 128

# Iterator for generating batches
train_iter, valid_iter = BucketIterator.splits(
    (train, valid), 
    (batch_size, 1),
    device=0,
    #sort=True, # sort in ascending order of lenghts
    sort_key=lambda x: len(x.article),
    sort_within_batch=True, # sort each batch in descending order of article length
    repeat=True)

In [5]:
# Replacing embedding of all zeros for UNK with a random embedding
_ = torch.nn.init.normal(text.vocab.vectors[0], mean=0, std=0.05) 
_ = torch.nn.init.normal(summary.vocab.vectors[0], mean=0, std=0.05)

In [6]:
# Input vocabulary
input_vocab = text.vocab
# Number of hidden units in each hidden layer
encoder_hidden_size = 512
# Number of recurrent layers in encoder
encoder_num_layers = 2
# Encoder dropout
encoder_dropout = 0

class EncoderRNN(nn.Module):
    
    def __init__(self, vocab, hidden_size, num_layers, dropout):
        super(EncoderRNN, self).__init__()
        
        # Hyperparameters
        self._vocab = vocab
        self._hidden_size = hidden_size
        self._num_layers = num_layers
        self._dropout = dropout
        
        # Hidden layer and cell state of model
        # Initialize before calling model
        self.hidden = None
        
        # Lookup table that stores word embeddings
        self.embed = nn.Embedding(len(self._vocab), num_dims).cuda()
        self.embed.weight.data.copy_(self._vocab.vectors)
        self.embed.weight.requires_grad = False
        
        # Pytorch lstm module
        self.lstm = nn.LSTM(num_dims, self._hidden_size, 
                            self._num_layers, dropout=self._dropout)
        self.lstm.cuda()
        
    def init_hidden(self, batch_size):
        return (Variable(torch.cuda.FloatTensor(self._num_layers, batch_size,
                    self._hidden_size).fill_(0), requires_grad=False), 
                Variable(torch.cuda.FloatTensor(self._num_layers, batch_size, 
                    self._hidden_size).fill_(0), 
                         requires_grad=False))
    
    def forward(self, batch_data, sequence_lengths):
        batch_size = len(sequence_lengths)
        # Embedding lookup
        input = self.embed(batch_data)
        # packed_input is of size Txbx*
        # where T is the length of longest sequence
        # b is batch size
        # batch is sorted in descending order of sequence lengths
        packed_input = pack_padded_sequence(input, list(sequence_lengths))
        packed_output, self.hidden = self.lstm(packed_input, self.hidden)
        # Final hidden state
        return self.hidden
    
encoder = EncoderRNN(input_vocab, encoder_hidden_size, 
                     encoder_num_layers, encoder_dropout)
encoder.cuda()

EncoderRNN(
  (embed): Embedding(114618, 300)
  (lstm): LSTM(300, 512, num_layers=2)
)

In [7]:
# Output vocabulary
output_vocab = summary.vocab
# Number of hidden units in each hidden layer
decoder_hidden_size = 512
# Number of recurrent layers in encoder
decoder_num_layers = 2
# Encoder dropout
decoder_dropout = 0

class DecoderRNN(nn.Module):
    
    def __init__(self, vocab, hidden_size, num_layers, dropout):
        super(DecoderRNN, self).__init__()
        
        # Hyperparameters
        self._vocab = vocab
        self._hidden_size = hidden_size
        self._num_layers = num_layers
        self._dropout = dropout
        
        # Lookup table that stores word embeddings
        self.embed = nn.Embedding(len(self._vocab), num_dims).cuda()
        self.embed.weight.data.copy_(self._vocab.vectors)
        self.embed.weight.requires_grad = False
        
        # Pytorch lstm cell for first layer
        self.lstmCell1 = nn.LSTMCell(num_dims, self._hidden_size)
        self.lstmCell1.cuda()
        
        # Pytorch lstm cell for second layer
        self.lstmCell2 = nn.LSTMCell(self._hidden_size, self._hidden_size)
        self.lstmCell2.cuda()
        
        # Linear transformation 
        self.linear_transform = nn.Linear(self._hidden_size, len(self._vocab))
    
    def forward(self, input, hidden1, hidden2):
        input = self.embed(input) 
        hidden1 = self.lstmCell1(input, hidden1) 
        hidden2 = self.lstmCell2(hidden1[0], hidden2)
        output = self.linear_transform(hidden2[0])
        return hidden1, hidden2, output
    
decoder = DecoderRNN(output_vocab, decoder_hidden_size, 
                     decoder_num_layers, decoder_dropout)
decoder.cuda()

DecoderRNN(
  (embed): Embedding(43919, 300)
  (lstmCell1): LSTMCell(300, 512)
  (lstmCell2): LSTMCell(512, 512)
  (linear_transform): Linear(in_features=512, out_features=43919, bias=True)
)

In [8]:
def sample_distribution(distribution):
# Sample one element from a distribution assumed to be an array of normalized probabilities
    r = random.uniform(0, 1)
    s = 0
    for i in range(len(distribution)):
        s += distribution[i]
        if s >= r:
            return i
    return len(distribution) - 1

def predict_by_sampling(output, batch_size):
    output = output.cpu() 
    next_input = torch.cuda.LongTensor(batch_size)
    # output has shape (batch_size, vocab_size)
    softmax = nn.Softmax(dim=1)
    output = softmax(output)
    for b in range(batch_size): 
        next_input[b] = sample_distribution(output[b].data)
    return Variable(next_input)

def most_likely(output, batch_size):
    if batch_size > 1:
        softmax = nn.Softmax(dim=1)
        output = softmax(output)
        _, next_input = torch.topk(output, 1, dim=1)
    else: 
        softmax = nn.Softmax(dim=0)
        output = softmax(output)
        _, next_input = torch.topk(output, 1)
    return next_input

In [None]:
class Beam(object):
    
    def __init__(self, beam_width, batch_size, vocab):
        self._beam_width = beam_width
        self._batch_size = batch_size
        self._vocab = vocab
        self.outputs = torch.FloatTensor(self._beam_width, self._batch_size, len(self._vocab))
        
    def beam_search(self, ):
        
    

In [9]:
learning_rate = 0.5
beam_size = 3

# Filter parameters that do not require gradients
encoder_parameters = filter(lambda p: p.requires_grad, encoder.parameters())
decoder_parameters = filter(lambda p: p.requires_grad, decoder.parameters())
# Optimizers
encoder_optimizer = torch.optim.SGD(encoder_parameters, lr=learning_rate)
decoder_optimizer = torch.optim.SGD(decoder_parameters, lr=learning_rate)
# Loss function
# By default the losses are averaged over observations for each minibatch
# When reduce is False returns a loss per batch element instead  
#loss_function = nn.CrossEntropyLoss(reduce=False)
loss_function = nn.CrossEntropyLoss(size_average=False, ignore_index=1).cuda() # 1 is the index of <pad>

def train_model(batch):
    loss = 0
    # Clear model gradients
    encoder.zero_grad()
    decoder.zero_grad()
    # Clear optimizer gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    # Clear hidden state of LSTM
    encoder.hidden = encoder.init_hidden(batch_size)
    # articles, titles are LongTensor vairables of shape (max_sequence_length, batch_size)
    # containig word indices from the respective vocabs
    # lengths are LongTensor varibles of shape batch_size containing
    # lengths of all the sequences in the batch
    articles, art_lengths = batch.article
    titles, tit_lengths = batch.title
    hiddenT = encoder(articles, art_lengths)
    # Seperate hidden states corresponding to the the two layers of the encoder
    hidden1 = (hiddenT[0][0], hiddenT[1][0])
    hidden2 = (hiddenT[0][1], hiddenT[1][1])
    
    inputs = []
    for i in range(beam_size):
        inputs.append(Variable(torch.cuda.LongTensor(batch_size).fill_(1))) # 1 is the index of <pad> ##error?
    # Initial input is a tensor of all <pad> tokens with zero embeddings
    input = Variable(torch.cuda.LongTensor(batch_size).fill_(1)) # 1 is the index of <pad> ##error?
    # Looping over all the sequences
    for t in range(torch.max(tit_lengths)):
        hidden1, hidden2, output = decoder(input, hidden1, hidden2)
        
        #input = predict_by_sampling(output, batch_size)
        #input = most_likely(output, batch_size)
        
        
        loss += loss_function(output, titles[t])
    
    loss = loss/torch.sum(tit_lengths)
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss

In [10]:
def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by 10 every 3 epochs"""
    lr = learning_rate * (0.1 ** (epoch // 3))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [11]:
cudnn.benchmark = True
cudnn.fasttest = True

encoder.train()
decoder.train()
step = 1

for batch in train_iter:
    if torch.max(batch.article[1]) > 80: 
        step += 1
        continue
    loss = train_model(batch)
    if step % 100 == 0:
        valid_loss = validatio
        print('Average minibatch loss at step %d: %.3f' % (step, loss))
    if step % 1000 == 0 and step > 1000:
        adjust_learning_rate(encoder_optimizer, step // 1000)
        adjust_learning_rate(decoder_optimizer, step // 1000)
    step += 1
            

Average minibatch loss at step 100: 8.445
Average minibatch loss at step 200: 8.056
Average minibatch loss at step 300: 8.104
Average minibatch loss at step 500: 7.692
Average minibatch loss at step 600: 8.008
Average minibatch loss at step 700: 7.761
Average minibatch loss at step 800: 7.610


KeyboardInterrupt: 

In [14]:
for batch in valid_iter:
    encoder.hidden = encoder.init_hidden(1)
    # articles, titles are LongTensor vairables of shape (max_sequence_length, batch_size)
    # containig word indices from the respective vocabs
    # lengths are LongTensor varibles of shape batch_size containing
    # lengths of all the sequences in the batch
    articles, art_lengths = batch.article
    titles, tit_lengths = batch.title
    hiddenT = encoder(articles, art_lengths)
    # Seperate hidden states corresponding to the the two layers of the encoder
    hidden1 = (hiddenT[0][0], hiddenT[1][0])
    hidden2 = (hiddenT[0][1], hiddenT[1][1])
    # Initial input is a tensor of all <pad> tokens with zero embeddings
    input = Variable(torch.cuda.LongTensor(1).fill_(1)) # 1 is the index of <pad>
    # Looping over all the sequences
    for t in range(torch.max(tit_lengths)):
        hidden1, hidden2, output = decoder(input, hidden1, hidden2)
        
        #input = predict_by_sampling(output, batch_size)
        input = predict_by_sampling(output, 1)
        word = summary.vocab.itos[input.data.cpu().numpy()[0]]
        if word == '<eos>': 
            print('eos')
            break
        print(word+' ', end='')

 is   over   iowa   if   fed   leaves   new   giving   merkels   to   trump   a   on   clashes   outlook   texas   deal   new   at   is   groupie   terrapattern   american   university   serious   hardware   one   to   flunky   your   and   young  eos
 two   allowed   board   your   the   eleven   private   democratic   but   horsemen   jarmuschs   dicey  eos
 trump   summit   america   suspected   conway   we   vx   pounds   occupies   law   deal   fundamentals  eos
 rightwing   company   cuomo   in   to   rhetoric   food   of   be   lock   stone   did   is   immigration   obama   remarks   could   girlfriends   storch   woman   unsheltered   nypost   up   home   burundi   street   all  eos
 vetoes   using   bush   worlds   after   administration   weiszs   us   obama   chums   sea   fake   is   of   to   release  eos
 this   wasnt   if   image   freezer   for  eos
 report   gop   zika   be   new   hundred   mcphee   for   in   wrecks   is   government   online   opec   be   been   wi

KeyboardInterrupt: 

for name, param in encoder.named_parameters():
    if param.requires_grad:
        print(name, param.data)