In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.backends.cudnn as cudnn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import pandas as pd
from fastText import load_model
from matplotlib import pylab
from sklearn.manifold import TSNE
from tensorboardX import SummaryWriter

# Summary writer used by the training loop further down to log 'train_loss' scalars
writer = SummaryWriter()

In [2]:
# Fasttext embeddings trained on train and val sets
# ./fasttext skipgram -input input_text_file -output output_model -dim 128 (fastText-0.1.0)
fasttext_model = load_model('word_vectors/fasttext_model.bin')
# Width of one word vector; has to equal the -dim value used in the command above,
# because the model's vectors are later copied into rows of an array this wide
num_dims = 128

In [3]:
# vocab lists the frequent words appearing in the text along with their frequencies,
# one "<word> <frequency>" pair per line
# minimum frequency = 6
# Store appearing words
vocab_words = {}
# The with-block closes the file handle as soon as the vocabulary has been read
with open('finished_files/vocab') as vocab_file:
    for line in vocab_file:
        li = line.split()
        # Lines that do not split into exactly two fields are ignored
        if len(li) == 2:
            word, freq = li
            vocab_words[word] = freq
# Final word to id dictionary
word2id = {}
tokens = ['<pad>', '<unk>', '<eos>']
for token in tokens:
    word2id[token] = len(word2id)
# Retrieve words from fasttext model and keep only those which are also present in 'vocab'
fasttext_words = fasttext_model.get_words()
for word in fasttext_words:
    # Never reassign an id that is already taken: overwriting e.g. '<unk>' would leave
    # len(word2id) unchanged, so the next word would receive the very same id
    if word in vocab_words and word not in word2id:
        word2id[word] = len(word2id)
vocab_size = len(word2id)
# Reverse dictionary
id2word = {index: word for word, index in word2id.items()}
# Embeddings
embeddings = np.zeros((vocab_size, num_dims))
# <pad> token vector contains all zeros. Rest sampled from a normal distribution
mu, sigma = 0, 0.05
for i in range(1, len(tokens)):
    embeddings[i] = np.random.normal(mu, sigma, num_dims)
# Get word vectors from fasttext model and store in embeddings matrix
for i in range(len(tokens), vocab_size):
    embeddings[i] = fasttext_model.get_word_vector(id2word[i])

# The fasttext model and the raw vocabulary are not needed once the matrix is built
del fasttext_model, vocab_words

In [4]:
# Keep only the first max_vocab_size ids (special tokens first, then words in the order
# they were added). min() guards against vocabularies smaller than the cap, for which a
# fixed range(10000) lookup would raise KeyError.
max_vocab_size = 10000
kept = min(max_vocab_size, len(id2word))
id2word = {i: id2word[i] for i in range(kept)}
embeddings = embeddings[:kept]
word2id = dict(zip(id2word.values(), id2word.keys()))

vocab_size = len(word2id)

# Number of embedding rows projected to 2-D for the visual sanity check
num_points = 500

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
# Row 0 is the all-zero <pad> vector, so the projection starts at id 1
two_d_embeddings = tsne.fit_transform(embeddings[1:num_points+1, :])

def plot(embeddings, labels):
    """Draw a labelled scatter plot of 2-D points.

    embeddings: array indexed as embeddings[i, :] yielding an (x, y) pair per row.
    labels: sized sequence of annotation texts; label i is attached to row i.

    Raises AssertionError when there are more labels than rows.
    """
    n_labels = len(labels)
    assert embeddings.shape[0] >= n_labels, 'More labels than embeddings'
    pylab.figure(figsize=(15, 15))  # in inches
    # Text placement shared by every annotation
    annotate_opts = dict(xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    for row, text in enumerate(labels):
        x, y = embeddings[row, :]
        pylab.scatter(x, y)
        pylab.annotate(text, xy=(x, y), **annotate_opts)
    pylab.show()

# Labels for ids 1..num_points, i.e. the same ids whose embedding rows were given to t-SNE
words = [id2word[i] for i in range(1, num_points+1)]
plot(two_d_embeddings, words)

In [5]:
# Number of sequences per minibatch
batch_size = 64
# Articles are cut to this many tokens before <eos> is appended (400 noted as an alternative)
max_article_size = 50 #400
# Abstracts are cut to this many tokens before <eos> is appended (100 noted as an alternative)
max_abstract_size = 15 #100
# Hidden units per direction of the encoder LSTM; the decoder cell uses 2 * hidden_size
hidden_size = 512

In [6]:
class Batch:
    """Plain holder for one minibatch.

    ``article`` and ``abstract`` are each a 2-tuple; both start out as (None, None)
    and are replaced by BatchGenerator.build_batch with (token ids, lengths).
    """

    def __init__(self):
        unset = (None, None)
        self.abstract = unset
        self.article = unset

In [7]:
class BatchGenerator:
    """Produces fixed-size minibatches from a dataframe with ``article`` and ``abstract`` columns.

    Rows are consumed in dataframe order. Rows whose article or abstract is not a string
    (for example NaN from an empty csv cell) are skipped. The final, short batch of a pass
    is topped up with rows taken from the beginning of the dataframe.

    Reads the module-level ``word2id``, ``max_article_size`` and ``max_abstract_size``.
    """

    def __init__(self, batch_size, dataframe):
        self.batch_size = batch_size
        # train, valid, or test dataframe imported from csv
        self.df = dataframe
        self.generator = self.row_generator()

    def row_generator(self):
        """Yield the dataframe rows one at a time as namedtuples (index excluded)."""
        yield from self.df.itertuples(index=False)

    def _token_ids(self, text, max_tokens):
        """Return a LongTensor of word ids for the first max_tokens tokens of text plus <eos>."""
        tokens = text.split()[:max_tokens]
        tokens.append('<eos>')
        # <unk> token index for unknown words
        unk = word2id['<unk>']
        return torch.LongTensor([word2id.get(token, unk) for token in tokens])

    def _fill_rows(self, rows):
        """Append usable rows from the start of the dataframe until rows has batch_size entries.

        Wraps around as often as needed, so a dataframe with fewer usable rows than
        batch_size still yields a full batch instead of leaking StopIteration to the caller.
        Raises ValueError if the dataframe holds no usable row at all.
        """
        refill = self.row_generator()
        while len(rows) < self.batch_size:
            try:
                rows.append(self.get_row(refill))
            except StopIteration:
                # One full pass was not enough: start over from the first row
                refill = self.row_generator()
                try:
                    rows.append(self.get_row(refill))
                except StopIteration:
                    raise ValueError('dataframe has no usable rows to fill the batch') from None

    def build_batch(self, rows):
        """Turn a list of rows into a Batch of padded id matrices on the GPU.

        rows is extended in place when it is shorter than batch_size. The batch is ordered
        by decreasing article length, as required for packing in the encoder.
        Returns a Batch whose fields are (Variable of shape (max_len, batch_size), lengths).
        """
        # If number of rows less than batch size, get extra rows from the beginning of the dataframe
        if len(rows) < self.batch_size:
            self._fill_rows(rows)

        # Get lengths of all the sequences in the batch upto max number of tokens
        # + 1 is for the <eos> token
        abstract_lengths = torch.cuda.LongTensor(
            [len(row.abstract.split()[:max_abstract_size]) for row in rows]) + 1
        article_lengths = torch.cuda.LongTensor(
            [len(row.article.split()[:max_article_size]) for row in rows]) + 1
        abs_len = torch.max(abstract_lengths)
        art_len = torch.max(article_lengths)

        # Variables containing abstracts and articles of the batch
        abstracts = torch.cuda.LongTensor(abs_len, self.batch_size).fill_(0) # zero padding
        articles = torch.cuda.LongTensor(art_len, self.batch_size).fill_(0) # zero padding

        # Sort rows in descending order of sequence (article) lengths
        article_lengths, indices = torch.sort(article_lengths, descending=True)
        rows = [rows[i] for i in indices]
        # Apply the same permutation to the abstract lengths so they stay aligned with rows
        abstract_lengths = torch.cuda.LongTensor([abstract_lengths[i] for i in indices])

        for i in range(self.batch_size):
            # Store each sequence as a column, leaving the zero padding below it
            token_list = self._token_ids(rows[i].abstract, max_abstract_size)
            abstracts[:,i][:len(token_list)] = token_list

            # Same for articles
            token_list = self._token_ids(rows[i].article, max_article_size)
            articles[:,i][:len(token_list)] = token_list

        batch = Batch()
        batch.article = (Variable(articles), article_lengths)
        batch.abstract = (Variable(abstracts), abstract_lengths)
        return batch

    def get_row(self, generator):
        """Return the next row of generator whose article and abstract are both strings.

        Raises StopIteration when the generator runs out of rows.
        """
        row = next(generator)
        # A non-string abstract would crash build_batch on .split(), so it is skipped as well
        while not (isinstance(row.article, str) and isinstance(row.abstract, str)):
            row = next(generator)
        return row

    def get_batch(self):
        """Return the next Batch; raise StopIteration once the dataframe is exhausted."""
        rows = []
        for _ in range(self.batch_size):
            try:
                rows.append(self.get_row(self.generator))
            except StopIteration:
                break
        if not rows:
            raise StopIteration
        return self.build_batch(rows)

In [8]:
class Encoder(nn.Module):
    """Single-layer bidirectional LSTM encoder on top of frozen pretrained embeddings.

    After each forward pass ``hidden_states`` holds the padded per-timestep LSTM outputs
    and ``sequence_lengths`` the lengths returned by unpacking them.
    Reads the module-level ``vocab_size``, ``num_dims``, ``hidden_size`` and ``embeddings``.
    """

    def __init__(self, batch_size):
        super().__init__()
        self.batch_size = batch_size

        # Filled in by forward()
        self.hidden_states = None
        self.sequence_lengths = None

        # Embedding table initialised from the pretrained matrix and excluded from training
        pretrained = torch.from_numpy(embeddings)
        self.embed = nn.Embedding(vocab_size, num_dims).cuda()
        self.embed.weight.data.copy_(pretrained)
        self.embed.weight.requires_grad = False

        # Recurrent layer
        self.lstm = nn.LSTM(num_dims, hidden_size, num_layers=1, bidirectional=True)
        self.lstm.cuda()

        # Projects the concatenated final hidden state onto the vocabulary
        self.linear_transform = nn.Linear(2 * hidden_size, vocab_size)

    def forward(self, articles, article_lengths):
        """Encode a batch that is sorted by decreasing article length.

        articles: word ids, laid out as (T,B).
        article_lengths: per-sequence lengths, in the same order as the batch columns.

        Returns ((h_n, c_n), output): the final hidden and cell states with both
        directions concatenated along dim 1, and the vocabulary projection of h_n.
        """
        embedded = self.embed(articles)  # (T,B,N)
        packed = pack_padded_sequence(embedded, list(article_lengths))
        packed_states, (h_last, c_last) = self.lstm(packed)
        # hidden_states (T,B,H*2)
        self.hidden_states, self.sequence_lengths = pad_packed_sequence(packed_states)

        # Join the two directions of the final hidden and cell states
        h_n = torch.cat((h_last[0], h_last[1]), dim=1)
        c_n = torch.cat((c_last[0], c_last[1]), dim=1)

        return (h_n, c_n), self.linear_transform(h_n)
    
# Build the encoder and move it to the GPU; the call is left as the cell's last
# expression, which is what produces the module summary shown as cell output
encoder = Encoder(batch_size)
encoder.cuda()

Encoder(
  (embed): Embedding(100000, 128)
  (lstm): LSTM(128, 512, bidirectional=True)
  (linear_transform): Linear(in_features=1024, out_features=100000, bias=True)
)

In [9]:
class Decoder(nn.Module):
    """One-step LSTM decoder over frozen pretrained embeddings.

    ``hidden`` is the (h, c) state of the LSTM cell. It must be set by the caller before
    the first step (the training code seeds it with the encoder's final state) and is
    overwritten on every forward call.
    Reads the module-level ``vocab_size``, ``num_dims``, ``hidden_size``, ``embeddings``
    and, for the attention scores, the global ``encoder``.
    """

    def __init__(self):
        super(Decoder, self).__init__()

        # Lookup table that stores word embeddings
        self.embed = nn.Embedding(vocab_size, num_dims).cuda()
        self.embed.weight.data.copy_(torch.from_numpy(embeddings))
        self.embed.weight.requires_grad = False

        self.hidden = None
        # State width is 2 * hidden_size so the encoder's concatenated state fits directly
        self.lstm_cell = nn.LSTMCell(num_dims, 2 * hidden_size).cuda()

        # Linear transformation
        self.linear_transform = nn.Linear(2 * hidden_size, vocab_size)

        #self.W_att = nn.Parameter()

    def attention(self, h_t, batch_size):
        """Return the raw attention scores of h_t against every encoder state, shape (T,B).

        Only the scoring step exists so far; turning the scores into weights and a
        context vector is not implemented and nothing calls this method yet.
        """
        return self.score_function(h_t, batch_size)

    def score_function(self, h_t, batch_size):
        """Dot-product score between h_t and each encoder hidden state.

        h_t: decoder hidden state, (B, 2*hidden_size).
        batch_size: unused; kept so the signature stays compatible.

        Returns a (T,B) tensor where entry [t, b] is the dot product of
        encoder.hidden_states[t, b] with h_t[b].
        """
        # Broadcasting h_t over the time axis computes every per-sample dot product in one
        # differentiable op, instead of a (B,B) matrix product per step followed by diag()
        return torch.sum(encoder.hidden_states * h_t.unsqueeze(0), dim=2) # (T,B)

    def forward(self, input):
        """Advance the decoder by one step.

        input: LongTensor of size B with the previous word ids.
        Returns the unnormalised vocabulary scores, shape (B,V).
        """
        # input is a LongTensor of size B
        input = self.embed(input) #(B,N)

        self.hidden = self.lstm_cell(input, self.hidden)

        # output has shape (B,V)
        output = self.linear_transform(self.hidden[0])
        return output
    
# Build the decoder and move it to the GPU; the call is left as the cell's last
# expression, which is what produces the module summary shown as cell output
decoder = Decoder()
decoder.cuda()

Decoder(
  (embed): Embedding(100000, 128)
  (lstm_cell): LSTMCell(128, 1024)
  (linear_transform): Linear(in_features=1024, out_features=100000, bias=True)
)

# Show every decoder parameter that will actually be trained (frozen ones are skipped)
for name, param in decoder.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

In [10]:
learning_rate = 4.0 #3.0, 3.5

# Only parameters that require gradients are handed to the optimizers
# (the embedding tables are frozen)
encoder_parameters = [p for p in encoder.parameters() if p.requires_grad]
decoder_parameters = [p for p in decoder.parameters() if p.requires_grad]
# One plain SGD optimizer per network
encoder_optimizer = torch.optim.SGD(encoder_parameters, lr=learning_rate)
decoder_optimizer = torch.optim.SGD(decoder_parameters, lr=learning_rate)
# Loss over variable-length sequences in a batch:
# - size_average=False makes the loss a sum over the minibatch instead of a mean,
#   so it can be divided by the number of non-padding tokens afterwards.
# - ignore_index=0 excludes targets equal to the padding id from the loss and its gradient.
# NLLLoss expects log-probabilities as input.
# Alternative kept for reference:
#loss_function = nn.CrossEntropyLoss(size_average=False, ignore_index=0).cuda() # 0 is the index of <pad>
loss_function = nn.NLLLoss(size_average=False, ignore_index=0).cuda() # 0 is the index of <pad>

def train_model(batch):
    """Run one optimisation step on a batch and return its loss per target token.

    batch: Batch whose ``article`` and ``abstract`` fields are (ids, lengths) pairs.
    The ids are LongTensor variables of shape (max_sequence_length, B) containing word
    indices; the lengths are LongTensors of size B.

    The decoder is fed its own most likely word from the previous step. The first
    input is the word predicted from the encoder's final state.
    Uses the global encoder, decoder, both optimizers, loss_function, softmax and batch_size.
    """
    loss = 0
    # Clear optimizer gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    articles, article_lengths = batch.article
    abstracts, abstract_lengths = batch.abstract
    hiddenT, output = encoder(articles, article_lengths)
    # Initialize decoder hidden state
    decoder.hidden = hiddenT
    # First input to the decoder is the predicted word from the last state of encoder
    output = softmax(output, batch_size)
    _, next_input = torch.topk(output, 1, dim=1)
    # Looping over all the time steps of the abstracts
    for t in range(torch.max(abstract_lengths)):
        # topk keeps the k dimension and returns (B,1); the decoder expects size B, so the
        # extra dimension is removed before the embedding lookup and the LSTM cell
        output = decoder(next_input.squeeze(1))
        output = softmax(output, batch_size)
        _, next_input = torch.topk(output, 1, dim=1)
        loss += loss_function(output, abstracts[t])

    # Summed loss divided by the number of non-padding target tokens
    loss = loss/torch.sum(abstract_lengths)
    loss.backward()

    #nn.utils.clip_grad_norm(encoder.parameters(), 0.5)
    #nn.utils.clip_grad_norm(decoder.parameters(), 0.5)

    encoder_optimizer.step()
    decoder_optimizer.step()

    # Reset hidden_list for the next batch; the decoder itself does not read this
    # attribute, it is kept because other code in the notebook still refers to it
    decoder.hidden_list = []

    return loss.data[0]

In [11]:
def validation_loss(df):
    """Return the mean of calc_loss over df, evaluated one sequence at a time.

    df: dataframe accepted by BatchGenerator.
    """
    batch_size = 1
    generator = BatchGenerator(batch_size, df)
    total, step = 0, 0
    while True:
        try:
            batch = generator.get_batch()
        except StopIteration:
            # The generator signals the end of the dataframe this way
            break
        step += 1
        total += calc_loss(batch, batch_size)
    return total/step

def calc_loss(batch, batch_size):
    """Compute the per-token loss of one batch without updating any weights.

    Mirrors the decoding procedure of train_model: the decoder state is seeded with the
    encoder's final state and each step is fed the most likely word of the previous one.

    batch: Batch with (ids, lengths) pairs in ``article`` and ``abstract``.
    batch_size: unused; kept so existing callers keep working.

    Returns the summed loss divided by the number of non-padding target tokens.
    """
    articles, article_lengths = batch.article
    abstracts, abstract_lengths = batch.abstract

    # Inference only (legacy autograd flag)
    articles.volatile = True
    abstracts.volatile = True

    # Encoder and decoder outputs are (B,V), so the vocabulary is always dimension 1
    log_softmax = nn.LogSoftmax(dim=1)

    hiddenT, output = encoder(articles, article_lengths)
    # Seed the decoder with the encoder's final state; without this the LSTM cell would
    # continue from whatever state the previous batch left behind
    decoder.hidden = hiddenT
    _, next_input = torch.topk(log_softmax(output), 1, dim=1)

    loss = 0
    for t in range(torch.max(abstract_lengths)):
        # topk returns (B,1); the decoder expects a LongTensor of size B
        output = log_softmax(decoder(next_input.squeeze(1)))
        _, next_input = torch.topk(output, 1, dim=1)
        loss += loss_function(output, abstracts[t])
    loss = loss/torch.sum(abstract_lengths)
    return loss

In [12]:
def softmax(output, batch_size):
    """Return log-probabilities of output, normalised over its last dimension.

    Despite the name this is a log-softmax, which is what NLLLoss expects.

    output: scores of shape (B,V), or a 1-D vector of V scores.
    batch_size: unused; kept so existing callers keep working.
    """
    # Normalising over the last dimension is correct for both layouts. Choosing dim 0
    # whenever batch_size == 1 would normalise a (1,V) input over its single batch entry
    # and return all zeros.
    log_softmax = nn.LogSoftmax(dim=output.dim() - 1)
    return log_softmax(output)

In [13]:
# Training script: runs train_model over the (truncated) training frame for a fixed
# number of passes, halving the learning rate on a schedule and logging the loss.
cudnn.benchmark = True
# NOTE(review): 'fasttest' looks like a typo and this assignment probably has no effect;
# confirm which cudnn flag was intended before relying on it
cudnn.fasttest = True
# Number of passes over the training frame
epochs = 7000 #7000

# Put both networks in training mode
encoder.train()
decoder.train()

train_df = pd.read_csv('datasets/train.csv')
val_df = pd.read_csv('datasets/val.csv')
# Global step counter across all epochs (starts at 1)
iteration = 1

for epoch in range(epochs):
    
    # Every 1500 epochs, including epoch 0, the learning rate is halved. The first halving
    # happens before any training step, so training starts at half of the value assigned
    # when the optimizers were first created.
    if epoch % 1500 == 0: #500, 275
        learning_rate = learning_rate / 2 #2
        # Filter parameters that do not require gradients
        encoder_parameters = filter(lambda p: p.requires_grad, encoder.parameters())
        decoder_parameters = filter(lambda p: p.requires_grad, decoder.parameters())
        # Optimizers are rebuilt with the new rate; train_model looks them up by their
        # global names, so rebinding the names here is what makes the change take effect
        encoder_optimizer = torch.optim.SGD(encoder_parameters, lr=learning_rate)
        decoder_optimizer = torch.optim.SGD(decoder_parameters, lr=learning_rate)
        print('')
        print('learning rate: %f' % learning_rate)
        print('')
        
    # A fresh generator per epoch restarts iteration from the first row; only the first
    # 64 training rows are used here
    generator = BatchGenerator(batch_size, train_df[:64]) #64

    while True:
        # get_batch raises StopIteration once the frame is exhausted, which ends the epoch
        try: 
            batch = generator.get_batch()
        except StopIteration: break
        loss = train_model(batch)
        
        # Report and log the training loss on every second step
        if iteration % 2 == 0:
            print('Average minibatch loss at step %d: %.3f' % (iteration, loss))
            writer.add_scalar('train_loss', loss, iteration)
            writer.export_scalars_to_json("./all_scalars.json")
        
        # The validation pass below is disabled: it is wrapped in a string literal, which
        # is evaluated as a no-op expression on every step
        """if iteration % 8 == 0:    
            encoder.eval()
            decoder.eval()
            val_loss = validation_loss(val_df[:8]) # truncating validation dataframe
            print('Validation loss: %.3f' % val_loss)
            
            writer.add_scalar('valid_loss', val_loss, iteration)
            writer.export_scalars_to_json("./all_scalars.json")
            
            encoder.train()
            decoder.train()"""
        iteration += 1


learning rate: 2.000000

Average minibatch loss at step 2: 11.483
Average minibatch loss at step 4: 11.416
Average minibatch loss at step 6: 11.333
Average minibatch loss at step 8: 11.196
Average minibatch loss at step 10: 10.756
Average minibatch loss at step 12: 11.005
Average minibatch loss at step 14: 10.454
Average minibatch loss at step 16: 10.442
Average minibatch loss at step 18: 9.395
Average minibatch loss at step 20: 9.630
Average minibatch loss at step 22: 8.832
Average minibatch loss at step 24: 8.473
Average minibatch loss at step 26: 9.308
Average minibatch loss at step 28: 10.397
Average minibatch loss at step 30: 9.279
Average minibatch loss at step 32: 8.844


KeyboardInterrupt: 

# Persist the learned weights (state dicts only, not the module objects) to the files
# 'encoder' and 'decoder' in the current working directory
torch.save(encoder.state_dict(), 'encoder')
torch.save(decoder.state_dict(), 'decoder')