In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.backends.cudnn as cudnn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import pandas as pd
from fastText import load_model
from matplotlib import pylab
from sklearn.manifold import TSNE
from tensorboardX import SummaryWriter

writer = SummaryWriter()

In [2]:
# Fasttext embeddings trained on train and val sets
# ./fasttext skipgram -input input_text_file -output output_model -dim 128 (fastText-0.1.0)
fasttext_model = load_model('word_vectors/fasttext_model.bin')
num_dims = 128

In [3]:
# vocab contains frequent words apperaing in the text along with their frequencies
# minimum frequency = 6
vocab_file = open('finished_files/vocab')
# Store appearing words
vocab_words = {}
for line in vocab_file:
    li = line.split()
    if len(li) == 2:
        word, freq = li
        vocab_words[word] = freq
# Final word to id dictionary    
word2id = {}
tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
for token in tokens:
    word2id[token] = len(word2id)
# Retrieve words from fasttext model and keep only those which are also present in 'vocab'
fasttext_words = fasttext_model.get_words()
for word in fasttext_words:
    if word in vocab_words:
        word2id[word] = len(word2id)        
vocab_size = len(word2id)
# Reverse dictionary
id2word = dict(zip(word2id.values(), word2id.keys()))
# Embeddings
embeddings = np.zeros((vocab_size, num_dims))
# <pad> token vector contains all zeros. Rest sampled from a normal distribution
mu, sigma = 0, 0.05
for i in range(1, len(tokens)):
    embeddings[i] = np.random.normal(mu, sigma, num_dims)
# Get word vectors from fasttext model and store in embeddings matrix
for i in range(len(tokens), vocab_size):
    embeddings[i] = fasttext_model.get_word_vector(id2word[i])
    
del fasttext_model, vocab_words

In [4]:
temp = {}
for i in range(10000):
    temp[i] = id2word[i]
id2word = temp
embeddings = embeddings[:10000]
word2id = dict(zip(id2word.values(), id2word.keys()))

vocab_size = len(word2id)

num_points = 500

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
two_d_embeddings = tsne.fit_transform(embeddings[1:num_points+1, :])

def plot(embeddings, labels):
    assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
    pylab.figure(figsize=(15,15))  # in inches
    for i, label in enumerate(labels):
        x, y = embeddings[i,:]
        pylab.scatter(x, y)
        pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    pylab.show()

words = [id2word[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)

In [5]:
batch_size = 64
max_article_size = 50 #400
max_abstract_size = 15 #100
hidden_size = 512
hidden_layers = 2

In [6]:
class Batch:
    def __init__(self):
        self.abstract = (None, None)
        self.article = (None, None)

In [7]:
class BatchGenerator: 
    
    def __init__(self, batch_size, dataframe):
        self.batch_size = batch_size
        # train, valid, or test dataframe imported from csv
        self.df = dataframe
        self.generator = self.row_generator()
        
        
    def row_generator(self):
        for row in self.df.itertuples(index=False):
            yield row
            
    def build_batch(self, rows):
        # If number of rows less than batch size, get extra rows from the beginning of the dataframe
        if len(rows) < self.batch_size:
            temp_generator = self.row_generator()
            for i in range(self.batch_size - len(rows)):
                rows.append(self.get_row(temp_generator))
                
        # Get lengths of all the sequences in the batch upto max number of tokens
        # + 1 is for the <eos> token
        abstract_lengths = torch.cuda.LongTensor(
            [len(row.abstract.split()[:max_abstract_size]) for row in rows]) + 1
        article_lengths = torch.cuda.LongTensor(
            [len(row.article.split()[:max_article_size]) for row in rows]) + 1 
        abs_len = torch.max(abstract_lengths)
        art_len = torch.max(article_lengths) 
        
        # Variables containing abstracts and articles of the batch
        abstracts = torch.cuda.LongTensor(abs_len, self.batch_size).fill_(0) # zero padding
        articles = torch.cuda.LongTensor(art_len, self.batch_size).fill_(0) # zero padding
        
        # Sort rows in descending order of sequence (article) lengths
        article_lengths, indices = torch.sort(article_lengths, descending=True)
        rows = [rows[i] for i in indices]
        abstract_lengths = torch.cuda.LongTensor([abstract_lengths[i] for i in indices])
        
        for i in range(self.batch_size):
            # Tokenize abstract and take max_abstract_size number of tokens
            tokens = rows[i].abstract.split()[:max_abstract_size]
            tokens.append('<eos>')
            # Convert each token to word index
            # <unk> token index for unknown words
            token_list = torch.LongTensor([word2id[token] if token in word2id 
                                           else word2id['<unk>'] for token in tokens])
            # Store as column in abstracts variable with zero padding
            abstracts[:,i][:len(token_list)] = token_list
            
            # Same for articles
            tokens = rows[i].article.split()[:max_article_size]
            tokens.append('<eos>')
            token_list = torch.LongTensor([word2id[token] if token in word2id 
                                           else word2id['<unk>'] for token in tokens])
            articles[:,i][:len(token_list)] = token_list
            
        batch = Batch()
        batch.article = (Variable(articles), article_lengths)
        batch.abstract = (Variable(abstracts), abstract_lengths)
        return batch
            
    def get_row(self, generator):
        row = generator.__next__()
        while not isinstance(row.article, str):
            row = generator.__next__()
        return row
        
        
    def get_batch(self):
        rows = []
        for b in range(self.batch_size):
            try: rows.append(self.get_row(self.generator))
            except StopIteration: break
        if rows: return self.build_batch(rows)
        else: raise StopIteration

In [8]:
class Encoder(nn.Module):
    
    def __init__(self, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        
        # Hidden layer and cell state of model
        # Initialize before calling model
        self.hidden = None
        
        # Lookup table that stores word embeddings
        self.embed = nn.Embedding(vocab_size, num_dims).cuda()
        self.embed.weight.data.copy_(torch.from_numpy(embeddings))
        self.embed.weight.requires_grad = False
        
        # Pytorch lstm module
        self.lstm = nn.LSTM(num_dims, hidden_size, hidden_layers)
        self.lstm.cuda()
        
        # Linear transformation 
        self.linear_transform = nn.Linear(hidden_size, vocab_size)
    
    # Funtion to initialize hidden layers
    def init_hidden(self, batch_size, volatile=False):
        tensor1 = torch.cuda.FloatTensor(hidden_layers, batch_size, hidden_size).fill_(0)
        tensor2 = torch.cuda.FloatTensor(hidden_layers, batch_size, hidden_size).fill_(0)
        return (Variable(tensor1, volatile=volatile), Variable(tensor2, volatile=volatile))
    
    def forward(self, articles, article_lengths):
        # Embedding lookup
        input = self.embed(articles)
        # input to pack_padded_sequence can be of Txbx*
        # where T is the length of longest sequence
        # b is batch size
        # batch is sorted in descending order of sequence lengths
        #packed_input = pack_padded_sequence(input, list(article_lengths))
        #packed_output, self.hidden = self.lstm(packed_input, self.hidden)
        _, self.hidden = self.lstm(input, self.hidden)
        
        output = self.linear_transform(self.hidden[0][hidden_layers - 1])
        
        # Final hidden state
        return self.hidden, output
    
encoder = Encoder(batch_size)
encoder.cuda()

Encoder(
  (embed): Embedding(10000, 128)
  (lstm): LSTM(128, 512, num_layers=2)
  (linear_transform): Linear(in_features=512, out_features=10000, bias=True)
)

In [9]:
class Decoder(nn.Module):
    
    def __init__(self):
        super(Decoder, self).__init__()
        
        # Lookup table that stores word embeddings
        self.embed = nn.Embedding(vocab_size, num_dims).cuda()
        self.embed.weight.data.copy_(torch.from_numpy(embeddings))
        self.embed.weight.requires_grad = False
    
        # Cell and hidden states
        self.cell_list = []
        self.hidden_list = []
    
        # First cell takes word embeddings as input
        self.cell_list.append(nn.LSTMCell(num_dims, hidden_size).cuda())
        for cell in range(1, hidden_layers):
            self.cell_list.append(nn.LSTMCell(hidden_size, hidden_size).cuda())
        # ModlueList Holds submodules in a list. 
        # ModuleList can be indexed like a regular Python list, 
        # but modules it contains are properly registered, 
        # and will be visible by all Module methods.
        self.cell_list=nn.ModuleList(self.cell_list) 
        
        # Linear transformation 
        self.linear_transform = nn.Linear(hidden_size, vocab_size)

    def forward(self, input):
        # input is a LongTensor of size batch_size
        input = self.embed(input) 
        # Each item in hidden list is a tuple of previous cell and hidden states
        for layer in range(hidden_layers):
            self.hidden_list[layer] = self.cell_list[layer](input, self.hidden_list[layer])
            input = self.hidden_list[layer][0]
        # output has shape (batch_size, vocab_size)
        output = self.linear_transform(self.hidden_list[hidden_layers - 1][0])
        return output
    
decoder = Decoder()
decoder.cuda()

Decoder(
  (embed): Embedding(10000, 128)
  (cell_list): ModuleList(
    (0): LSTMCell(128, 512)
    (1): LSTMCell(512, 512)
  )
  (linear_transform): Linear(in_features=512, out_features=10000, bias=True)
)

for name, param in decoder.named_parameters():
    if param.requires_grad:
        print (name, param.data)

In [10]:
learning_rate = 1.5 #2.0

# Filter parameters that do not require gradients
encoder_parameters = filter(lambda p: p.requires_grad, encoder.parameters())
decoder_parameters = filter(lambda p: p.requires_grad, decoder.parameters())
# Optimizers
encoder_optimizer = torch.optim.SGD(encoder_parameters, lr=learning_rate)
decoder_optimizer = torch.optim.SGD(decoder_parameters, lr=learning_rate)
# Loss function
# Way to accumulate loss on sequences with variable lengths in batches :
# size_average: By default, the losses are averaged over observations for each minibatch.
# However, if the field size_average is set to False, the losses are instead summed for each minibatch. 
# Ignored if reduce is False.
# Set size_average to False and divide the loss by the number of non-padding tokens.
# ignore_index: Specifies a target value that is ignored and does not contribute to the input gradient. 
# When size_average is True, the loss is averaged over non-ignored targets.
# Set ignore_index to the padding value
loss_function = nn.CrossEntropyLoss(size_average=False, ignore_index=0).cuda() # 0 is the index of <pad>###

def train_model(batch):
    loss = 0
    # Clear optimizer gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    # Clear hidden state of LSTM
    encoder.hidden = encoder.init_hidden(batch_size)
    # articles, abstracts are LongTensor vairables of shape (max_sequence_length, batch_size)
    # containig word indices from the respective vocabs
    # lengths are LongTensor varibles of shape batch_size containing
    # lengths of all the sequences in the batch
    articles, article_lengths = batch.article
    abstracts, abstract_lengths = batch.abstract
    hiddenT, output = encoder(articles, article_lengths)
    
    # Seperate hidden states corresponding to the the two layers of the encoder
    # and append to hidden state list of decoder as tuples for each layer.
    for layer in range(hidden_layers):
        decoder.hidden_list.append((hiddenT[0][layer], hiddenT[1][layer]))
    #input = Variable(torch.cuda.LongTensor(batch_size).fill_(2)) # 2 is the index of <sos>
    input = most_likely(output, batch_size)

    # Looping over all the sequences
    for t in range(torch.max(abstract_lengths)):
        output = decoder(input)
        input = most_likely(output, batch_size)
        loss += loss_function(output, abstracts[t])
        
    loss = loss/torch.sum(abstract_lengths)
    loss.backward()
    
    #nn.utils.clip_grad_norm(encoder.parameters(), 0.5)
    #nn.utils.clip_grad_norm(decoder.parameters(), 0.5)

    encoder_optimizer.step()
    decoder_optimizer.step()
    
    # Initialize hidden_list for next batch of inputs
    decoder.hidden_list = []
    
    return loss

In [11]:
def validation_loss(df):
    batch_size = 1
    generator = BatchGenerator(batch_size, df)
    loss = 0
    step = 0
    while True:
        try:
            batch = generator.get_batch()
            step += 1
        except StopIteration: break
        loss += calc_loss(batch, batch_size)
    loss = loss/step
    return loss

def calc_loss(batch, batch_size):
    loss = 0
    encoder.hidden = encoder.init_hidden(batch_size, volatile=True)
    articles, article_lengths = batch.article
    abstracts, abstract_lengths = batch.abstract
    
    articles.volatile = True
    abstracts.volatile = True
        
    hiddenT, output = encoder(articles, article_lengths) ###
    for layer in range(hidden_layers):
        decoder.hidden_list.append((hiddenT[0][layer], hiddenT[1][layer])) 
    #input = Variable(torch.cuda.LongTensor(batch_size).fill_(2), volatile=True)
    input = most_likely(output, batch_size)
    
    for t in range(torch.max(abstract_lengths)):
        output = decoder(input)
        input = most_likely(output, batch_size)
        loss += loss_function(output, abstracts[t])
    loss = loss/torch.sum(abstract_lengths)
    decoder.hidden_list = []
    return loss

In [12]:
def most_likely(output, batch_size):
    if batch_size > 1:
        softmax = nn.Softmax(dim=1)
        output = softmax(output)
        _, next_input = torch.topk(output, 1, dim=1)
    else: 
        softmax = nn.Softmax(dim=0)
        output = softmax(output)
        _, next_input = torch.topk(output, 1)
    return next_input

In [13]:
cudnn.benchmark = True
cudnn.fasttest = True
epochs = 7000 #7000

encoder.train()
decoder.train()

train_df = pd.read_csv('datasets/train.csv')
val_df = pd.read_csv('datasets/val.csv')
iteration = 1

for epoch in range(epochs):
    
    if epoch % 1500 == 0: #1500
        learning_rate = learning_rate / 2 #2
        # Filter parameters that do not require gradients
        encoder_parameters = filter(lambda p: p.requires_grad, encoder.parameters())
        decoder_parameters = filter(lambda p: p.requires_grad, decoder.parameters())
        # Optimizers
        encoder_optimizer = torch.optim.SGD(encoder_parameters, lr=learning_rate)
        decoder_optimizer = torch.optim.SGD(decoder_parameters, lr=learning_rate)
        print('')
        print('learning rate: %f' % learning_rate)
        print('')
        
    generator = BatchGenerator(batch_size, train_df[:1000]) 

    while True:
        try: 
            batch = generator.get_batch()
        except StopIteration: break
        loss = train_model(batch)
        
        if iteration % 2 == 0:
            print('Average minibatch loss at step %d: %.3f' % (iteration, loss))
            writer.add_scalar('train_loss', loss, iteration)
            writer.export_scalars_to_json("./all_scalars.json")
        
        if iteration % 8 == 0:    
            encoder.eval()
            decoder.eval()
            val_loss = validation_loss(val_df[:8]) # truncating validation dataframe
            print('Validation loss: %.3f' % val_loss)
            
            writer.add_scalar('valid_loss', val_loss, iteration)
            writer.export_scalars_to_json("./all_scalars.json")
            
            encoder.train()
            decoder.train()
        iteration += 1


learning rate: 0.750000

Average minibatch loss at step 2: 9.200
Average minibatch loss at step 4: 9.160
Average minibatch loss at step 6: 9.117
Average minibatch loss at step 8: 9.076
Validation loss: 9.026
Average minibatch loss at step 10: 9.034
Average minibatch loss at step 12: 8.945
Average minibatch loss at step 14: 8.850
Average minibatch loss at step 16: 8.716
Validation loss: 8.583
Average minibatch loss at step 18: 8.489
Average minibatch loss at step 20: 8.075
Average minibatch loss at step 22: 7.761
Average minibatch loss at step 24: 7.664
Validation loss: 7.492
Average minibatch loss at step 26: 7.619
Average minibatch loss at step 28: 7.358
Average minibatch loss at step 30: 7.514
Average minibatch loss at step 32: 7.222
Validation loss: 7.433
Average minibatch loss at step 34: 7.185
Average minibatch loss at step 36: 7.265
Average minibatch loss at step 38: 7.090
Average minibatch loss at step 40: 7.206
Validation loss: 7.266
Average minibatch loss at step 42: 7.195
Av

Average minibatch loss at step 348: 5.992
Average minibatch loss at step 350: 5.889
Average minibatch loss at step 352: 5.872
Validation loss: 6.471
Average minibatch loss at step 354: 6.667
Average minibatch loss at step 356: 6.160
Average minibatch loss at step 358: 6.040
Average minibatch loss at step 360: 6.372
Validation loss: 6.806
Average minibatch loss at step 362: 6.151
Average minibatch loss at step 364: 5.936
Average minibatch loss at step 366: 6.163
Average minibatch loss at step 368: 5.822
Validation loss: 6.381
Average minibatch loss at step 370: 6.119
Average minibatch loss at step 372: 6.565
Average minibatch loss at step 374: 6.258
Average minibatch loss at step 376: 6.028
Validation loss: 6.358
Average minibatch loss at step 378: 6.105
Average minibatch loss at step 380: 5.938
Average minibatch loss at step 382: 6.640
Average minibatch loss at step 384: 6.496
Validation loss: 6.782
Average minibatch loss at step 386: 6.200
Average minibatch loss at step 388: 6.201
Ave

Average minibatch loss at step 694: 6.079
Average minibatch loss at step 696: 5.840
Validation loss: 6.608
Average minibatch loss at step 698: 5.877
Average minibatch loss at step 700: 5.704
Average minibatch loss at step 702: 5.751
Average minibatch loss at step 704: 5.685
Validation loss: 6.365
Average minibatch loss at step 706: 6.093
Average minibatch loss at step 708: 5.970
Average minibatch loss at step 710: 5.989
Average minibatch loss at step 712: 5.804
Validation loss: 6.397
Average minibatch loss at step 714: 5.904
Average minibatch loss at step 716: 6.086
Average minibatch loss at step 718: 5.824
Average minibatch loss at step 720: 5.505
Validation loss: 6.302
Average minibatch loss at step 722: 6.002
Average minibatch loss at step 724: 6.381
Average minibatch loss at step 726: 5.820
Average minibatch loss at step 728: 5.844
Validation loss: 6.486
Average minibatch loss at step 730: 5.887
Average minibatch loss at step 732: 5.613
Average minibatch loss at step 734: 5.862
Ave

Average minibatch loss at step 1040: 5.422
Validation loss: 7.056
Average minibatch loss at step 1042: 5.702
Average minibatch loss at step 1044: 6.046
Average minibatch loss at step 1046: 5.652
Average minibatch loss at step 1048: 5.627
Validation loss: 6.598
Average minibatch loss at step 1050: 5.656
Average minibatch loss at step 1052: 5.539
Average minibatch loss at step 1054: 5.517
Average minibatch loss at step 1056: 5.535
Validation loss: 6.924
Average minibatch loss at step 1058: 5.595
Average minibatch loss at step 1060: 5.783
Average minibatch loss at step 1062: 5.592
Average minibatch loss at step 1064: 5.598
Validation loss: 6.614
Average minibatch loss at step 1066: 5.677
Average minibatch loss at step 1068: 5.583
Average minibatch loss at step 1070: 5.601
Average minibatch loss at step 1072: 5.404
Validation loss: 6.478
Average minibatch loss at step 1074: 5.606
Average minibatch loss at step 1076: 5.788
Average minibatch loss at step 1078: 5.596
Average minibatch loss at

Average minibatch loss at step 1378: 5.394
Average minibatch loss at step 1380: 5.576
Average minibatch loss at step 1382: 5.437
Average minibatch loss at step 1384: 5.420
Validation loss: 6.646
Average minibatch loss at step 1386: 5.498
Average minibatch loss at step 1388: 5.326
Average minibatch loss at step 1390: 5.300
Average minibatch loss at step 1392: 5.189
Validation loss: 6.800
Average minibatch loss at step 1394: 5.376
Average minibatch loss at step 1396: 5.551
Average minibatch loss at step 1398: 5.411
Average minibatch loss at step 1400: 5.501
Validation loss: 6.786
Average minibatch loss at step 1402: 5.480
Average minibatch loss at step 1404: 5.270
Average minibatch loss at step 1406: 5.293
Average minibatch loss at step 1408: 5.481
Validation loss: 7.002
Average minibatch loss at step 1410: 5.429
Average minibatch loss at step 1412: 5.547
Average minibatch loss at step 1414: 5.413
Average minibatch loss at step 1416: 5.432
Validation loss: 6.722
Average minibatch loss at

Average minibatch loss at step 1718: 5.318
Average minibatch loss at step 1720: 5.319
Validation loss: 6.828
Average minibatch loss at step 1722: 5.399
Average minibatch loss at step 1724: 5.188
Average minibatch loss at step 1726: 5.159
Average minibatch loss at step 1728: 5.046
Validation loss: 6.589
Average minibatch loss at step 1730: 5.323
Average minibatch loss at step 1732: 5.451
Average minibatch loss at step 1734: 5.349
Average minibatch loss at step 1736: 5.319
Validation loss: 6.806
Average minibatch loss at step 1738: 5.505
Average minibatch loss at step 1740: 5.304
Average minibatch loss at step 1742: 5.276
Average minibatch loss at step 1744: 5.186
Validation loss: 7.075
Average minibatch loss at step 1746: 5.315
Average minibatch loss at step 1748: 5.515
Average minibatch loss at step 1750: 5.304
Average minibatch loss at step 1752: 5.326
Validation loss: 6.827
Average minibatch loss at step 1754: 5.393
Average minibatch loss at step 1756: 5.263
Average minibatch loss at

Validation loss: 6.836
Average minibatch loss at step 2058: 5.293
Average minibatch loss at step 2060: 5.214
Average minibatch loss at step 2062: 5.191
Average minibatch loss at step 2064: 5.048
Validation loss: 6.892
Average minibatch loss at step 2066: 5.224
Average minibatch loss at step 2068: 5.432
Average minibatch loss at step 2070: 5.279
Average minibatch loss at step 2072: 5.228
Validation loss: 6.890
Average minibatch loss at step 2074: 5.362
Average minibatch loss at step 2076: 5.142
Average minibatch loss at step 2078: 5.208
Average minibatch loss at step 2080: 4.986
Validation loss: 7.011
Average minibatch loss at step 2082: 5.234
Average minibatch loss at step 2084: 5.383
Average minibatch loss at step 2086: 5.254
Average minibatch loss at step 2088: 5.278
Validation loss: 6.993
Average minibatch loss at step 2090: 5.303
Average minibatch loss at step 2092: 5.168
Average minibatch loss at step 2094: 5.232
Average minibatch loss at step 2096: 5.065
Validation loss: 7.096
Av

Average minibatch loss at step 2396: 5.107
Average minibatch loss at step 2398: 5.050
Average minibatch loss at step 2400: 4.886
Validation loss: 6.912
Average minibatch loss at step 2402: 5.163
Average minibatch loss at step 2404: 5.329
Average minibatch loss at step 2406: 5.325
Average minibatch loss at step 2408: 5.189
Validation loss: 6.995
Average minibatch loss at step 2410: 5.273
Average minibatch loss at step 2412: 5.093
Average minibatch loss at step 2414: 5.028
Average minibatch loss at step 2416: 4.863
Validation loss: 6.918
Average minibatch loss at step 2418: 5.137
Average minibatch loss at step 2420: 5.319
Average minibatch loss at step 2422: 5.182
Average minibatch loss at step 2424: 5.193
Validation loss: 6.982
Average minibatch loss at step 2426: 5.283
Average minibatch loss at step 2428: 5.115
Average minibatch loss at step 2430: 5.122
Average minibatch loss at step 2432: 4.887
Validation loss: 6.970
Average minibatch loss at step 2434: 5.135
Average minibatch loss at

KeyboardInterrupt: 

torch.save(encoder.state_dict(), 'encoder')
torch.save(decoder.state_dict(), 'decoder')