In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.backends.cudnn as cudnn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import pandas as pd
from fastText import load_model
from matplotlib import pylab
from sklearn.manifold import TSNE
from tensorboardX import SummaryWriter
from torch.optim.lr_scheduler import StepLR

writer = SummaryWriter()

In [2]:
# Fasttext embeddings trained on train and val sets
# ./fasttext skipgram -input input_text_file -output output_model -dim 128 (fastText-0.1.0)
fasttext_model = load_model('word_vectors/fasttext_model.bin')
num_dims = 128

In [3]:
# vocab contains frequent words apperaing in the text along with their frequencies
# minimum frequency = 6
vocab_file = open('finished_files/vocab')
# Store appearing words
vocab_words = {}
for line in vocab_file:
    li = line.split()
    if len(li) == 2:
        word, freq = li
        vocab_words[word] = freq
# Final word to id dictionary    
word2id = {}
tokens = ['<pad>', '<unk>', '<eos>']
for token in tokens:
    word2id[token] = len(word2id)
# Retrieve words from fasttext model and keep only those which are also present in 'vocab'
fasttext_words = fasttext_model.get_words()
for word in fasttext_words:
    if word in vocab_words:
        word2id[word] = len(word2id)        
vocab_size = len(word2id)
# Reverse dictionary
id2word = dict(zip(word2id.values(), word2id.keys()))
# Embeddings
embeddings = np.zeros((vocab_size, num_dims))
# <pad> token vector contains all zeros. Rest sampled from a normal distribution
mu, sigma = 0, 0.05
for i in range(1, len(tokens)):
    embeddings[i] = np.random.normal(mu, sigma, num_dims)
# Get word vectors from fasttext model and store in embeddings matrix
for i in range(len(tokens), vocab_size):
    embeddings[i] = fasttext_model.get_word_vector(id2word[i])
    
del fasttext_model, vocab_words

In [4]:
temp = {}
for i in range(10000):
    temp[i] = id2word[i]
id2word = temp
embeddings = embeddings[:10000]
word2id = dict(zip(id2word.values(), id2word.keys()))

vocab_size = len(word2id)

num_points = 500

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
two_d_embeddings = tsne.fit_transform(embeddings[1:num_points+1, :])

def plot(embeddings, labels):
    assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
    pylab.figure(figsize=(15,15))  # in inches
    for i, label in enumerate(labels):
        x, y = embeddings[i,:]
        pylab.scatter(x, y)
        pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    pylab.show()

words = [id2word[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)

In [5]:
batch_size = 64
max_article_size = 50 #400
max_abstract_size = 15 #100
hidden_size = 512

In [6]:
class Batch:
    def __init__(self):
        self.abstract = (None, None)
        self.article = (None, None)

In [7]:
class BatchGenerator: 
    
    def __init__(self, batch_size, dataframe):
        self.batch_size = batch_size
        # train, valid, or test dataframe imported from csv
        self.df = dataframe
        self.generator = self.row_generator()
        
        
    def row_generator(self):
        for row in self.df.itertuples(index=False):
            yield row
            
    def build_batch(self, rows):
        # If number of rows less than batch size, get extra rows from the beginning of the dataframe
        if len(rows) < self.batch_size:
            temp_generator = self.row_generator()
            for i in range(self.batch_size - len(rows)):
                rows.append(self.get_row(temp_generator))
                
        # Get lengths of all the sequences in the batch upto max number of tokens
        # + 1 is for the <eos> token
        abstract_lengths = torch.cuda.LongTensor(
            [len(row.abstract.split()[:max_abstract_size]) for row in rows]) + 1
        article_lengths = torch.cuda.LongTensor(
            [len(row.article.split()[:max_article_size]) for row in rows]) + 1 
        abs_len = torch.max(abstract_lengths)
        art_len = torch.max(article_lengths) 
        
        # Variables containing abstracts and articles of the batch
        abstracts = torch.cuda.LongTensor(abs_len, self.batch_size).fill_(0) # zero padding
        articles = torch.cuda.LongTensor(art_len, self.batch_size).fill_(0) # zero padding
        
        # Sort rows in descending order of sequence (article) lengths
        article_lengths, indices = torch.sort(article_lengths, descending=True)
        rows = [rows[i] for i in indices]
        abstract_lengths = torch.cuda.LongTensor([abstract_lengths[i] for i in indices])
        
        for i in range(self.batch_size):
            # Tokenize abstract and take max_abstract_size number of tokens
            tokens = rows[i].abstract.split()[:max_abstract_size]
            tokens.append('<eos>')
            # Convert each token to word index
            # <unk> token index for unknown words
            token_list = torch.LongTensor([word2id[token] if token in word2id 
                                           else word2id['<unk>'] for token in tokens])
            # Store as column in abstracts variable with zero padding
            abstracts[:,i][:len(token_list)] = token_list
            
            # Same for articles
            tokens = rows[i].article.split()[:max_article_size]
            tokens.append('<eos>')
            token_list = torch.LongTensor([word2id[token] if token in word2id 
                                           else word2id['<unk>'] for token in tokens])
            articles[:,i][:len(token_list)] = token_list
            
        batch = Batch()
        batch.article = (Variable(articles), article_lengths)
        batch.abstract = (Variable(abstracts), abstract_lengths)
        return batch
            
    def get_row(self, generator):
        row = generator.__next__()
        while not isinstance(row.article, str):
            row = generator.__next__()
        return row
        
        
    def get_batch(self):
        rows = []
        for b in range(self.batch_size):
            try: rows.append(self.get_row(self.generator))
            except StopIteration: break
        if rows: return self.build_batch(rows)
        else: raise StopIteration

In [8]:
class Encoder(nn.Module):
    
    def __init__(self):
        super(Encoder, self).__init__()
        
        # Hidden states of the encoder
        self.hidden_states = None
        self.sequence_lengths = None
        
        # Lookup table that stores word embeddings
        self.embed = nn.Embedding(vocab_size, num_dims).cuda()
        self.embed.weight.data.copy_(torch.from_numpy(embeddings))
        self.embed.weight.requires_grad = False
        
        # Pytorch lstm module
        self.lstm = nn.LSTM(num_dims, hidden_size, num_layers=1, bidirectional=True)
        self.lstm.cuda()
        
        # Linear transformation 
        self.linear_transform = nn.Linear(2*hidden_size, vocab_size)
    
    def forward(self, articles, article_lengths):
        # Embedding lookup
        input = self.embed(articles) # (T,B,N)
        # batch is sorted in descending order of sequence lengths
        packed_input = pack_padded_sequence(input, list(article_lengths))
        packed_output, last_hidden = self.lstm(packed_input)
        self.hidden_states, self.sequence_lengths = pad_packed_sequence(packed_output) # hidden_states (T,B,2H)
        
        # Sum hidden states for all time steps for bidirectional lstm
        #self.hidden_states = unpacked[:,:,:hidden_size] + unpacked[:,:,hidden_size:] for summing hidden states
        
        # Concatenate hidden and cell states of last time step for bidirectional lstm
        h_n = torch.cat((last_hidden[0][0], last_hidden[0][1]), dim=1) #(B,2H)
        c_n = torch.cat((last_hidden[1][0], last_hidden[1][1]), dim=1) #(B,2H)
        
        hiddenT = (h_n, c_n)
        output = self.linear_transform(h_n)
        
        # Final hidden state
        return hiddenT, output
    
encoder = Encoder()
encoder.cuda()

Encoder(
  (embed): Embedding(10000, 128)
  (lstm): LSTM(128, 512, bidirectional=True)
  (linear_transform): Linear(in_features=1024, out_features=10000, bias=True)
)

In [9]:
class Decoder(nn.Module):
    
    def __init__(self):
        super(Decoder, self).__init__()
        
        # Lookup table that stores word embeddings
        self.embed = nn.Embedding(vocab_size, num_dims).cuda()
        self.embed.weight.data.copy_(torch.from_numpy(embeddings))
        self.embed.weight.requires_grad = False
    
        self.hidden = None
        self.lstm_cell = nn.LSTMCell(num_dims, 2*hidden_size).cuda()

        # Linear transformations 
        self.output_lt = nn.Linear(2*hidden_size, vocab_size)
        self.context_lt = nn.Linear(2*hidden_size, 2*hidden_size)
        
        #self.W_att = nn.Parameter()
            
    
    def attention(self, h_t, batch_size):
        score = self.score_function(h_t, batch_size)
        #for b in range(batch_size):
            #score[:,b][encoder.sequence_lengths[b]:] = float('-inf')
        softmax = nn.Softmax(dim=0)
        a_t = softmax(score).unsqueeze(2) # (T,B,1)
        c_t = encoder.hidden_states * a_t # c_t (T,B,2H)
        context = torch.sum(c_t, dim=0) # (B,2H)
        return context
        
    def score_function(self, h_t, batch_size):
        seq_len = encoder.hidden_states.shape[0]
        score = Variable(torch.cuda.FloatTensor(seq_len, batch_size))
        for t in range(seq_len):
            score[t] = torch.diag(torch.mm(encoder.hidden_states[t], torch.transpose(h_t, 1, 0)))
        return score #(T,B)
    

    def forward(self, input, batch_size):
        # input is a LongTensor of size B
        input = self.embed(input) #(B,N)

        # Attention mechanism
        context = self.attention(self.hidden[0], batch_size) #(B,2H)
        h_0 = self.context_lt(context) + self.hidden[0] #(B,2H)
        self.hidden = (h_0, self.hidden[1])
        
        self.hidden = self.lstm_cell(input, self.hidden)

        output = self.output_lt(self.hidden[0]) #(B,V)
        return output
    
decoder = Decoder()
decoder.cuda()

Decoder(
  (embed): Embedding(10000, 128)
  (lstm_cell): LSTMCell(128, 1024)
  (output_lt): Linear(in_features=1024, out_features=10000, bias=True)
  (context_lt): Linear(in_features=1024, out_features=1024, bias=True)
)

for name, param in decoder.named_parameters():
    if param.requires_grad:
        print (name, param.data)

In [18]:
def beam_search_decoder():
    beam_size = 3
    
    log_softmax = nn.LogSoftmax(dim=1)
    batch_size = 1
    df = pd.read_csv('datasets/val.csv')
    sample = np.random.randint(0,10000)
    generator = BatchGenerator(batch_size, df[sample:])
    batch = generator.get_batch()
        
    # article, abstract are LongTensor vairables of shape (max_sequence_length, 1)
    # containig word indices from the respective vocabs
    article, article_length = batch.article
    abstract, abstract_length = batch.abstract
    hiddenT, output = encoder(article, article_length)
    
    # Initialize decoder hidden state
    
    decoder.hidden = hiddenT
    
    # First input to the decoder is the predicted word from the last state of encoder
    input = log_softmax(output, batch_size)
    value, input = torch.topk(output, 1, dim=1)
    inputs = [input]*beam_size
    values = torch.cuda.FloatTensor(beam_size, vocab_size).fill_(value)
    
    while True:
        for b in range(beam_size):
            output = decoder(inputs[b], batch_size)
            output = log_softmax(output)
            values[b] = output.data + values[b]
        values = values.view(-1)
        topk, indices = torch.topk(values, beam_size)
        indices = (indices + 1) % vocab_size
        for b in range(beam_size):
            if indices[b] == 0: indices[b] = vocab_size
            values[b] = topk[b]
        indices = indices - 1
        
        

beam_search_decoder()

Variable containing:
   55
   72
 4210
   55
  227
 1571
   73
    4
  246
    1
  561
   10
    4
  104
  124
   15
 9749
  114
 6874
    5
 1866
    5
   25
    4
 1771
    9
  114
   86
 5242
   25
  283
    1
    3
  102
  164
    1
    1
 3674
    6
 8563
 1064
    1
  676
   18
   17
 2874
 7401
   15
   63
   17
    2
[torch.cuda.LongTensor of size 51x1 (GPU 0)]
 
 51
[torch.cuda.LongTensor of size 1 (GPU 0)]



In [11]:
learning_rate = 0.7

# Filter parameters that do not require gradients
encoder_parameters = filter(lambda p: p.requires_grad, encoder.parameters())
decoder_parameters = filter(lambda p: p.requires_grad, decoder.parameters())
# Optimizers
encoder_optimizer = torch.optim.SGD(encoder_parameters, lr=learning_rate)
decoder_optimizer = torch.optim.SGD(decoder_parameters, lr=learning_rate)
# Loss function
# Way to accumulate loss on sequences with variable lengths in batches :
# size_average: By default, the losses are averaged over observations for each minibatch.
# However, if the field size_average is set to False, the losses are instead summed for each minibatch. 
# Ignored if reduce is False.
# Set size_average to False and divide the loss by the number of non-padding tokens.
# ignore_index: Specifies a target value that is ignored and does not contribute to the input gradient. 
# When size_average is True, the loss is averaged over non-ignored targets.
# Set ignore_index to the padding value
loss_function = nn.NLLLoss(size_average=False, ignore_index=0).cuda() # 0 is the index of <pad>

def train_model(batch):
    loss = 0
    # Clear optimizer gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    # articles, abstracts are LongTensor vairables of shape (max_sequence_length, B)
    # containig word indices from the respective vocabs
    # lengths are LongTensors of shape batch_size containing
    # lengths of all the sequences in the batch
    articles, article_lengths = batch.article
    abstracts, abstract_lengths = batch.abstract
    hiddenT, output = encoder(articles, article_lengths)
    
    # Initialize decoder hidden state
    
    decoder.hidden = hiddenT
    
    # First input to the decoder is the predicted word from the last state of encoder
    input = most_likely(output, batch_size)
    # Looping over all the sequences
    for t in range(torch.max(abstract_lengths)):
        output = decoder(input, batch_size)
        input = most_likely(output, batch_size)
        loss += loss_function(output, abstracts[t])
        
    loss = loss/torch.sum(abstract_lengths)
    loss.backward()
    
    #nn.utils.clip_grad_norm(encoder.parameters(), 0.5)
    #nn.utils.clip_grad_norm(decoder.parameters(), 0.5)

    encoder_optimizer.step()
    decoder_optimizer.step()
    
    # Initialize hidden_list for next batch of inputs
    decoder.hidden_list = []
    
    return loss.data[0]

In [12]:
def validation_loss(df):
    batch_size = 1
    generator = BatchGenerator(batch_size, df)
    loss = 0
    step = 0
    while True:
        try:
            batch = generator.get_batch()
            step += 1
        except StopIteration: break
        loss += calc_loss(batch, batch_size)
    loss = loss/step
    return loss

def calc_loss(batch, batch_size):
    loss = 0

    articles, article_lengths = batch.article
    abstracts, abstract_lengths = batch.abstract
    hiddenT, output = encoder(articles, article_lengths)
    
    # Initialize decoder hidden state
    
    decoder.hidden = hiddenT
    
    # First input to the decoder is the predicted word from the last state of encoder
    input = most_likely(output, batch_size)
    # Looping over all the sequences
    for t in range(torch.max(abstract_lengths)):
        output = decoder(input, batch_size)
        input = most_likely(output, batch_size)
        loss += loss_function(output, abstracts[t])
        
    loss = loss/torch.sum(abstract_lengths)
    
    return loss

In [13]:
def most_likely(output, batch_size):
    log_softmax = nn.LogSoftmax(dim=1)
    output = log_softmax(output)
    input = torch.topk(output, 1, dim=1)
    return input

In [14]:
cudnn.benchmark = True
cudnn.fasttest = True
epochs = 100 

encoder.train()
decoder.train()

train_df = pd.read_csv('datasets/train.csv')
val_df = pd.read_csv('datasets/val.csv')
iteration = 0

en_scheduler = StepLR(encoder_optimizer, step_size=20, gamma=0.5)
de_scheduler = StepLR(decoder_optimizer, step_size=20, gamma=0.5)

for epoch in range(epochs):
    en_scheduler.step()
    de_scheduler.step()
    
    generator = BatchGenerator(batch_size, train_df[:10000])
    while True:
        try: 
            batch = generator.get_batch()
        except StopIteration: break
        loss = train_model(batch)
        
        if iteration % 30 == 0:
            print('Average minibatch loss at step %d: %.3f' % (iteration, loss))
            writer.add_scalar('train_loss', loss, iteration)
            writer.export_scalars_to_json("./all_scalars.json")
            
        if iteration % 500 == 0:
            encoder.eval()
            decoder.eval()
            val_loss = validation_loss(val_df[:100]) # truncating validation dataframe
            print('Validation loss: %.3f' % val_loss)

            writer.add_scalar('valid_loss', val_loss, iteration)
            writer.export_scalars_to_json("./all_scalars.json")

            encoder.train()
            decoder.train()
            
        iteration += 1

Average minibatch loss at step 0: 9.217
Validation loss: 9.197
Average minibatch loss at step 30: 6.990
Average minibatch loss at step 60: 6.851
Average minibatch loss at step 90: 6.838
Average minibatch loss at step 120: 6.785
Average minibatch loss at step 150: 6.653
Average minibatch loss at step 180: 6.423
Average minibatch loss at step 210: 6.429
Average minibatch loss at step 240: 6.603
Average minibatch loss at step 270: 6.450
Average minibatch loss at step 300: 6.527
Average minibatch loss at step 330: 6.361
Average minibatch loss at step 360: 6.308
Average minibatch loss at step 390: 6.276
Average minibatch loss at step 420: 6.259
Average minibatch loss at step 450: 6.158
Average minibatch loss at step 480: 6.333
Validation loss: 6.359
Average minibatch loss at step 510: 6.161
Average minibatch loss at step 540: 6.227
Average minibatch loss at step 570: 6.399
Average minibatch loss at step 600: 6.191
Average minibatch loss at step 630: 6.290
Average minibatch loss at step 660:

Average minibatch loss at step 5580: 5.883
Average minibatch loss at step 5610: 5.797
Average minibatch loss at step 5640: 5.704
Average minibatch loss at step 5670: 5.735
Average minibatch loss at step 5700: 5.676
Average minibatch loss at step 5730: 5.717
Average minibatch loss at step 5760: 5.916
Average minibatch loss at step 5790: 5.738
Average minibatch loss at step 5820: 5.795
Average minibatch loss at step 5850: 5.750
Average minibatch loss at step 5880: 5.783
Average minibatch loss at step 5910: 5.622
Average minibatch loss at step 5940: 5.756
Average minibatch loss at step 5970: 5.660
Average minibatch loss at step 6000: 5.711
Validation loss: 6.165
Average minibatch loss at step 6030: 5.762
Average minibatch loss at step 6060: 5.770
Average minibatch loss at step 6090: 5.753
Average minibatch loss at step 6120: 5.618
Average minibatch loss at step 6150: 5.580
Average minibatch loss at step 6180: 5.767
Average minibatch loss at step 6210: 5.640
Average minibatch loss at step 

Average minibatch loss at step 11100: 5.333
Average minibatch loss at step 11130: 5.382
Average minibatch loss at step 11160: 5.306
Average minibatch loss at step 11190: 5.333
Average minibatch loss at step 11220: 5.557
Average minibatch loss at step 11250: 5.361
Average minibatch loss at step 11280: 5.394
Average minibatch loss at step 11310: 5.419
Average minibatch loss at step 11340: 5.445
Average minibatch loss at step 11370: 5.275
Average minibatch loss at step 11400: 5.360
Average minibatch loss at step 11430: 5.285
Average minibatch loss at step 11460: 5.375
Average minibatch loss at step 11490: 5.395
Validation loss: 6.161
Average minibatch loss at step 11520: 5.386
Average minibatch loss at step 11550: 5.411
Average minibatch loss at step 11580: 5.238
Average minibatch loss at step 11610: 5.263
Average minibatch loss at step 11640: 5.392
Average minibatch loss at step 11670: 5.311
Average minibatch loss at step 11700: 5.327
Average minibatch loss at step 11730: 5.270
Average m

cudnn.benchmark = True
cudnn.fasttest = True
epochs = 7000 #7000

encoder.train()
decoder.train()

train_df = pd.read_csv('datasets/train.csv')
val_df = pd.read_csv('datasets/val.csv')
iteration = 1

for epoch in range(epochs):
    
    if epoch % 1500 == 0: #500, 275
        learning_rate = learning_rate / 2 #2
        # Filter parameters that do not require gradients
        encoder_parameters = filter(lambda p: p.requires_grad, encoder.parameters())
        decoder_parameters = filter(lambda p: p.requires_grad, decoder.parameters())
        # Optimizers
        encoder_optimizer = torch.optim.SGD(encoder_parameters, lr=learning_rate)
        decoder_optimizer = torch.optim.SGD(decoder_parameters, lr=learning_rate)
        print('')
        print('learning rate: %f' % learning_rate)
        print('')
        
    generator = BatchGenerator(batch_size, train_df[:64]) #64

    while True:
        try: 
            batch = generator.get_batch()
        except StopIteration: break
        loss = train_model(batch)
        
        if iteration % 2 == 0:
            print('Average minibatch loss at step %d: %.3f' % (iteration, loss))
            writer.add_scalar('train_loss', loss, iteration)
            writer.export_scalars_to_json("./all_scalars.json")
        
        """if iteration % 8 == 0:    
            encoder.eval()
            decoder.eval()
            val_loss = validation_loss(val_df[:8]) # truncating validation dataframe
            print('Validation loss: %.3f' % val_loss)
            
            writer.add_scalar('valid_loss', val_loss, iteration)
            writer.export_scalars_to_json("./all_scalars.json")
            
            encoder.train()
            decoder.train()"""
        iteration += 1

In [15]:
torch.save(encoder.state_dict(), 'encoder')
torch.save(decoder.state_dict(), 'decoder')