In [1]:
import numpy as np
import pickle
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.backends.cudnn as cudnn

In [2]:
# 2 million word vectors trained on Common Crawl (600B tokens).
# The handle is kept open across cells: the header line is read from it next,
# and the vocabulary-building cell closes it after reading the vectors.
embedding_file = open('processed_data/crawl-300d-2M.vec')

In [3]:
# Header line of the fastText text format: "<number of words> <vector size>".
# Every following line holds one word and then its vector components, all
# separated by single spaces, with words ordered by descending frequency.
meta = embedding_file.readline()
header_fields = meta.split()
num_words = int(header_fields[0])
num_dims = int(header_fields[1])

In [4]:
# Build the full vocabulary from the fastText file.
# dictionary -> word : index
# reverse_dictionary -> index : word
# Some 'words' are swallowed by split() (the line then has fewer than
# num_dims + 1 fields). Those lines are ignored.
special_tokens = ['PAD_token',          # padding
                  'Start_of_Sequence',  # start of sequence
                  'End_of_Sequence',    # end of sequence
                  'UNK_token']          # unknown word

dictionary = dict()
# Row k stores the embedding of the word whose index is k.
# num_words only counts the words listed in the file, so extra rows are
# reserved for the special tokens; without them a file in which no line is
# skipped would write past the end of the array.
embeddings = np.zeros((num_words + len(special_tokens), num_dims))

# Special tokens have no pre-trained vector; give each a random one.
for token in special_tokens:
    index = len(dictionary)
    dictionary[token] = index
    embeddings[index] = np.random.rand(num_dims)

try:
    for line in embedding_file:
        items = line.split()
        # Skip words not separated by split()
        if len(items) != (num_dims + 1):
            continue
        word = items[0]
        # Keep the first (most frequent) occurrence of a repeated word so
        # that dictionary indices and embedding rows can never drift apart.
        if word in dictionary:
            continue
        index = len(dictionary)
        dictionary[word] = index
        embeddings[index] = np.asarray([float(value) for value in items[1:]],
                                       dtype=np.float64)
finally:
    # Release the file handle even if a malformed line raises.
    embedding_file.close()

reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
max_vocab = len(dictionary)
# Drop the rows reserved for lines that were skipped.
embeddings = embeddings[:max_vocab]

print("Total words: %d" % max_vocab)
print("Number of dimensions: %d" % num_dims)

Total words: 1999999
Number of dimensions: 300


In [5]:
input_vocab_size = 40000
output_vocab_size = 40000


def _extract_vocabulary(index_to_word_all, all_embeddings, vocab_size):
    """Select the first `vocab_size` entries of the full vocabulary.

    Args:
        index_to_word_all: dict mapping index -> word for the full vocabulary.
        all_embeddings: 2-D array whose row k is the embedding of index k.
        vocab_size: number of leading indices (0 .. vocab_size - 1) to keep.

    Returns:
        (word -> index dict, index -> word dict, embedding matrix with
        `vocab_size` rows). The matrix is a copy, so it stays valid after the
        full embedding table is deleted.

    Raises:
        ValueError: if `vocab_size` exceeds the size of the full vocabulary.
    """
    if vocab_size > len(index_to_word_all):
        raise ValueError('Requested vocabulary size %d exceeds the %d available words'
                         % (vocab_size, len(index_to_word_all)))
    word_to_index = dict()
    index_to_word = dict()
    for index in range(vocab_size):
        word = index_to_word_all[index]
        word_to_index[word] = index
        index_to_word[index] = word
    return word_to_index, index_to_word, all_embeddings[:vocab_size].copy()


# Input words and embeddings
input_dictionary, input_reverse_dictionary, input_embeddings = _extract_vocabulary(
    reverse_dictionary, embeddings, input_vocab_size)
# Output words and embeddings.
# Every row of the output matrix is filled; the previous loop rebound
# output_embeddings to a single row vector instead of assigning row i.
output_dictionary, output_reverse_dictionary, output_embeddings = _extract_vocabulary(
    reverse_dictionary, embeddings, output_vocab_size)

# The full tables are large and no longer needed.
del dictionary, reverse_dictionary, embeddings
print('Input vocabulary size: %d' % input_vocab_size)
print('Output vocabulary size: %d' % output_vocab_size)

Input vocabulary size: 40000
Output vocabulary size: 40000


In [6]:
# Load preprocessed titles and articles
titles = pickle.load(open('processed_data/titles', 'rb'))
articles = pickle.load(open('processed_data/articles', 'rb'))

In [7]:
# Only this many leading tokens of each article are kept
max_article_size = 80
# Articles with fewer tokens than this are dropped
min_article_size = 30

# Tokenise every title/article pair on whitespace.
# A pair is dropped when the title is empty, the (truncated) article is
# shorter than min_article_size, or the article is not longer than its title.
kept_titles = list()
kept_articles = list()
max_title_size = 0
for idx, raw_article in enumerate(articles):
    title = titles[idx].split()
    article = raw_article.split()[:max_article_size]
    # The longest title is tracked over all pairs, including dropped ones
    max_title_size = max(max_title_size, len(title))
    if (not title) or len(article) < min_article_size or len(article) <= len(title):
        continue
    kept_titles.append(title)
    kept_articles.append(article)
titles = kept_titles
articles = kept_articles
del kept_titles, kept_articles

print('Total number of samples: %d' % len(articles))
print('Maximum article size: %d' % max_article_size)
print('Maximum title size: %d' % max_title_size)

Total number of samples: 141549
Maximum article size: 80
Maximum title size: 37


In [8]:
# Validation set size
valid_size = 1000

# Separate into training and validation sets
valid_titles = titles[:valid_size]
valid_articles = articles[:valid_size]
train_titles = titles[valid_size:]
train_articles = articles[valid_size:]
train_size = len(train_articles)
del titles, articles

print('Training set size: %d' % train_size)
print('Valid set size: %d' % valid_size)

Training set size: 140549
Valid set size: 1000


In [9]:
# Number of article/title pairs in one training batch
batch_size = 128
# Padded length of every article tensor (articles are truncated to
# max_article_size tokens when they are tokenised)
input_max_len = max_article_size
# Longest title length; BatchGenerator._labels adds two positions to this
# for the start-of-sequence and end-of-sequence tokens
output_max_len = max_title_size

class BatchGenerator(object): # Needs refactoring
    """Serves article/title mini-batches as padded word-index tensors.

    The data is split into `batch_size` equally spaced segments and one cursor
    walks through each segment, so every call to `_next_seq` takes one sample
    per cursor and advances it (wrapping around at the end of the data).

    Padded lengths come from the module-level `input_max_len` and
    `output_max_len`; word indices come from the module-level
    `input_dictionary` and `output_dictionary`. Words missing from a
    dictionary map to its 'UNK_token' entry.

    Tensors are allocated on the GPU when CUDA is available and on the CPU
    otherwise.
    """

    def __init__(self, article_list, title_list, batch_size):
        """
        Args:
            article_list: list of articles, each a list of words.
            title_list: list of titles (lists of words), parallel to
                `article_list`.
            batch_size: number of samples returned per batch.
        """
        self._article_list = article_list
        self._title_list = title_list
        self._list_size = len(article_list) # or title_list
        self._batch_size = batch_size
        self._input_max_len = input_max_len
        self._output_max_len = output_max_len
        segment = self._list_size // batch_size
        self._cursor = [offset*segment for offset in range(batch_size)]

    @staticmethod
    def _new_long_tensor(size):
        """Return a new zero-filled LongTensor with `size` elements.

        Zero is the index of 'PAD_token' in the vocabulary built earlier in
        the notebook, so a fresh tensor is an all-padding sequence.
        """
        if torch.cuda.is_available():
            tensor_type = torch.cuda.LongTensor
        else:
            tensor_type = torch.LongTensor
        return tensor_type(size).fill_(0)

    @staticmethod
    def _encode(words, vocabulary):
        """Map `words` to a CPU LongTensor of indices, using 'UNK_token' for unknown words."""
        unknown = vocabulary['UNK_token']
        indices = [vocabulary.get(word, unknown) for word in words]
        return torch.from_numpy(np.asarray(indices, dtype=np.int64))

    def _next_seq(self):
        """Build the next batch of articles.

        Returns:
            seq_list: list of `batch_size` LongTensors of length
                `input_max_len`, word indices left-aligned and zero padded.
            seq_lens: LongTensor with the unpadded length of each sequence.
            label_list: the titles (lists of words) matching `seq_list`.
            When `batch_size > 1` all three are ordered by descending
            article length.
        """
        # List to hold articles
        seq_list = list()
        # List to hold titles
        label_list = list()
        # Tensor to hold length of each sequence in a batch
        seq_lens = self._new_long_tensor(self._batch_size)

        for b in range(self._batch_size):
            # Each article as a list of words, cut to the padded length
            words = self._article_list[self._cursor[b]][:self._input_max_len]
            length = len(words)
            seq_lens[b] = length
            # A fresh tensor per sample: reusing one tensor would make every
            # list entry alias the same storage (all rows identical) and leave
            # stale tokens from longer, earlier samples behind.
            sequence = self._new_long_tensor(self._input_max_len)
            if length > 0:
                # Place word indices from the left
                sequence[:length] = self._encode(words, input_dictionary)
            seq_list.append(sequence)
            # Corresponding title
            label_list.append(self._title_list[self._cursor[b]])
            self._cursor[b] = (self._cursor[b] + 1) % self._list_size
        if self._batch_size > 1:
            # Sort sequences in descending order of length
            seq_lens, indices = torch.sort(seq_lens, descending=True)
            order = indices.tolist()
            seq_list = [seq_list[i] for i in order]
            label_list = [label_list[i] for i in order]
        return seq_list, seq_lens, label_list

    def _labels(self, label_list, hiddenT):
        """Turn titles into decoder sequences and reorder the encoder state to match.

        Args:
            label_list: `batch_size` titles (lists of words).
            hiddenT: (hidden, cell) pair whose second dimension indexes the
                samples of the batch.

        Returns:
            seq_list: list of LongTensors of length `output_max_len + 2`:
                'Start_of_Sequence', the title's word indices,
                'End_of_Sequence', then zero padding.
            seq_lens: LongTensor of sequence lengths (title length + 2).
            (hidden, cell): the given state.
            When `batch_size > 1` the sequences are ordered by descending
            length and the state is permuted along its second dimension in
            the same way.
        """
        hidden = hiddenT[0]
        cell = hiddenT[1]
        # List to hold titles
        seq_list = list()
        # Tensor to hold length of each sequence in a batch
        seq_lens = self._new_long_tensor(self._batch_size)

        for b in range(self._batch_size):
            words = label_list[b][:self._output_max_len]
            length = len(words)
            # Two extra positions for the start and end tokens
            seq_lens[b] = length + 2
            # Fresh tensor per title (see _next_seq for why)
            sequence = self._new_long_tensor(self._output_max_len + 2)
            sequence[0] = output_dictionary['Start_of_Sequence']
            if length > 0:
                # Place word indices after the start token
                sequence[1:length+1] = self._encode(words, output_dictionary)
            sequence[length+1] = output_dictionary['End_of_Sequence']
            seq_list.append(sequence)
        if self._batch_size > 1:
            # Sort sequences in descending order of length
            seq_lens, indices = torch.sort(seq_lens, descending=True)
            seq_list = [seq_list[i] for i in indices.tolist()]
            # Sort last hidden layer of encoder the same way
            hidden = hidden[:,indices]
            cell = cell[:,indices]
        return seq_list, seq_lens, (hidden, cell)
            

# Training batches hold batch_size samples; validation is served one sample at a time
train_batches = BatchGenerator(article_list=train_articles,
                               title_list=train_titles,
                               batch_size=batch_size)
valid_batches = BatchGenerator(article_list=valid_articles,
                               title_list=valid_titles,
                               batch_size=1)

In [19]:
# Smoke test: build 1000 consecutive training batches.
# NOTE(review): every _next_seq() call advances train_batches' cursors, so any
# cell run after this one starts 1000 batches into the data. The cell's
# execution count (19) is also out of sequence with its neighbours; it looks
# like a leftover debugging cell -- confirm whether it should be kept.
for i in range(1000):
    train_batches._next_seq()

In [10]:
# Number of hidden units in each hidden layer
encoder_hidden_size = 512
# Number of recurrent layers in encoder
encoder_num_layers = 2
# Encoder dropout
encoder_dropout = 0

class EncoderRNN(nn.Module):
    """LSTM encoder over a frozen, pre-trained word-embedding table.

    The embedding size is read from the module-level `num_dims`. The module
    places itself on the GPU when CUDA is available and stays on the CPU
    otherwise.

    `self.hidden` must be set (see `init_hidden`) before the module is called;
    `forward` overwrites it with the final LSTM state.
    """

    def __init__(self, embeddings, vocab_size, hidden_size, num_layers, dropout):
        """
        Args:
            embeddings: numpy array copied into the embedding table.
            vocab_size: number of rows of the embedding table.
            hidden_size: number of hidden units per LSTM layer.
            num_layers: number of stacked LSTM layers.
            dropout: dropout value passed to nn.LSTM.
        """
        super(EncoderRNN, self).__init__()

        # Hyperparameters
        self._vocab_size = vocab_size
        self._hidden_size = hidden_size
        self._num_layers = num_layers
        self._dropout = dropout

        # Hidden layer and cell state of model
        # Initialize before calling model
        self.hidden = None

        # Lookup table that stores word embeddings; it is excluded from
        # training by turning off its gradient.
        self.embed = nn.Embedding(self._vocab_size, num_dims)
        self.embed.weight.data.copy_(torch.from_numpy(embeddings))
        self.embed.weight.requires_grad = False

        # Pytorch lstm module
        self.lstm = nn.LSTM(num_dims, self._hidden_size,
                            self._num_layers, dropout=self._dropout)

        # Move every submodule to the GPU in one place when one is present
        if torch.cuda.is_available():
            self.cuda()

    def init_hidden(self, batch_size):
        """Return a zero (hidden, cell) state of shape (num_layers, batch_size, hidden_size).

        The tensors are created from the embedding weights, so they share the
        module's tensor type and device.
        """
        weights = self.embed.weight.data
        state_shape = (self._num_layers, batch_size, self._hidden_size)
        return (Variable(weights.new(*state_shape).zero_(), requires_grad=False),
                Variable(weights.new(*state_shape).zero_(), requires_grad=False))

    def forward(self, seq_list, seq_lens):
        """Encode a batch of padded word-index sequences.

        Args:
            seq_list: list of 1-D LongTensors of equal (padded) length,
                ordered by descending true length.
            seq_lens: LongTensor with the true length of each sequence.

        Returns:
            The final (hidden, cell) state of the LSTM, also stored in
            `self.hidden`.
        """
        lengths = seq_lens.cpu().tolist()
        # Only the first max(lengths) positions can hold real tokens. Cutting
        # every sequence there keeps the time dimension equal to the longest
        # sequence of this batch even when the tensors are padded further.
        max_len = max(lengths)
        # (time, batch) index matrix, embedded with a single lookup
        word_ids = Variable(torch.stack([seq[:max_len] for seq in seq_list], 1),
                            requires_grad=False)
        if self.embed.weight.data.is_cuda:
            word_ids = word_ids.cuda()
        inputs = self.embed(word_ids)
        packed_input = pack_padded_sequence(inputs, lengths)
        output, self.hidden = self.lstm(packed_input, (self.hidden))
        # Final hidden state
        return self.hidden
    
# Encoder over the input vocabulary, placed on the GPU
encoder = EncoderRNN(embeddings=input_embeddings,
                     vocab_size=input_vocab_size,
                     hidden_size=encoder_hidden_size,
                     num_layers=encoder_num_layers,
                     dropout=encoder_dropout)
encoder.cuda()

EncoderRNN(
  (embed): Embedding(40000, 300)
  (lstm): LSTM(300, 512, num_layers=2)
)

In [11]:
# Number of hidden units in each hidden layer
decoder_hidden_size = 512
# Number of recurrent layers in encoder
decoder_num_layers = 2
# Encoder dropout
decoder_dropout = 0

class DecoderRNN(nn.Module):
    """LSTM decoder that scores the vocabulary at every step of a title.

    Uses a frozen, pre-trained embedding table, an LSTM, and a linear layer
    that maps each LSTM output to one score per vocabulary word. The
    embedding size is read from the module-level `num_dims`. The module places
    itself on the GPU when CUDA is available and stays on the CPU otherwise.

    `self.hidden` must be set to the initial (hidden, cell) state before the
    module is called; `forward` overwrites it with the final LSTM state.
    """

    def __init__(self, embeddings, vocab_size, hidden_size, num_layers, dropout):
        """
        Args:
            embeddings: numpy array copied into the embedding table.
            vocab_size: number of rows of the embedding table and number of
                output scores per step.
            hidden_size: number of hidden units per LSTM layer.
            num_layers: number of stacked LSTM layers.
            dropout: dropout value passed to nn.LSTM.
        """
        super(DecoderRNN, self).__init__()

        # Hyperparameters
        self._vocab_size = vocab_size
        self._hidden_size = hidden_size
        self._num_layers = num_layers
        self._dropout = dropout

        # Hidden layer and cell state of model
        # Initialize before calling model
        self.hidden = None

        # Lookup table that stores word embeddings; it is excluded from
        # training by turning off its gradient.
        self.embed = nn.Embedding(self._vocab_size, num_dims)
        self.embed.weight.data.copy_(torch.from_numpy(embeddings))
        self.embed.weight.requires_grad = False

        # Pytorch lstm module
        self.lstm = nn.LSTM(num_dims, self._hidden_size,
                            self._num_layers, dropout=self._dropout)

        # Linear transformation from LSTM output to vocabulary scores
        self.hidden2word = nn.Linear(self._hidden_size, self._vocab_size)

        # Move every submodule (including hidden2word) to the GPU in one
        # place when one is present
        if torch.cuda.is_available():
            self.cuda()

    def forward(self, seq_list, seq_lens):
        """Score the vocabulary at every input step of each sequence.

        The last token of every sequence is not fed to the LSTM, so a
        sequence of length n contributes n - 1 input steps.

        Args:
            seq_list: list of 1-D LongTensors of equal (padded) length,
                ordered by descending true length.
            seq_lens: LongTensor with the true length of each sequence.

        Returns:
            Scores of shape (max(seq_lens) - 1, batch, vocab_size). Steps
            beyond a sequence's own n - 1 inputs hold the linear layer
            applied to zero padding.
        """
        # One step fewer than the full length: the final token is never an input
        lengths = [length - 1 for length in seq_lens.cpu().tolist()]
        # Cut every padded sequence to the longest input of this batch so
        # that the time dimension matches the lengths given to the packer.
        max_len = max(lengths)
        # (time, batch) index matrix, embedded with a single lookup
        word_ids = Variable(torch.stack([seq[:max_len] for seq in seq_list], 1),
                            requires_grad=False)
        if self.embed.weight.data.is_cuda:
            word_ids = word_ids.cuda()
        inputs = self.embed(word_ids)
        # Pack with the input lengths (not the full sequence lengths), which
        # is what the embedded tensor actually contains
        packed_input = pack_padded_sequence(inputs, lengths)
        packed_output, self.hidden = self.lstm(packed_input, (self.hidden))
        output, _ = pad_packed_sequence(packed_output)
        output = self.hidden2word(output)

        return output
        
# Decoder over the output vocabulary, placed on the GPU
decoder = DecoderRNN(embeddings=output_embeddings,
                     vocab_size=output_vocab_size,
                     hidden_size=decoder_hidden_size,
                     num_layers=decoder_num_layers,
                     dropout=decoder_dropout)
decoder.cuda()

DecoderRNN(
  (embed): Embedding(40000, 300)
  (lstm): LSTM(300, 512, num_layers=2)
  (hidden2word): Linear(in_features=512, out_features=40000)
)

In [12]:
learning_rate = 0.1

# Keep only the parameters that require gradients (the embedding tables have
# requires_grad switched off). They are materialised as lists because
# filter() objects are single-use iterators on Python 3: the optimizer
# constructor would exhaust them and leave these names empty for later use.
encoder_parameters = [p for p in encoder.parameters() if p.requires_grad]
decoder_parameters = [p for p in decoder.parameters() if p.requires_grad]
# Optimizers
encoder_optimizer = torch.optim.SGD(encoder_parameters, lr=learning_rate)
decoder_optimizer = torch.optim.SGD(decoder_parameters, lr=learning_rate)
# Loss function
loss_function = nn.CrossEntropyLoss()

def train():
    """Run the encoder and decoder forward on the next training batch.

    Clears the gradients of both models and both optimizers, resets the
    encoder state, encodes the next batch of articles and feeds the matching
    titles through the decoder, starting from the encoder's final state.

    NOTE(review): no loss is computed and neither backward() nor an optimizer
    step is called here, so this function does not update any parameters yet;
    the decoder output is discarded and nothing is returned.
    """
    # Reset gradients held by the models, then by the optimizers
    for model in (encoder, decoder):
        model.zero_grad()
    for optimizer in (encoder_optimizer, decoder_optimizer):
        optimizer.zero_grad()
    # Start the encoder from a fresh hidden/cell state
    encoder.hidden = encoder.init_hidden(batch_size)

    # Articles and their lengths arrive sorted by the batch generator;
    # the titles come back in the same order as the articles
    article_batch, article_lens, title_batch = train_batches._next_seq()
    encoder_state = encoder(article_batch, article_lens)
    # Convert titles to index sequences and reorder the encoder state to match
    title_batch, title_lens, encoder_state = train_batches._labels(title_batch, encoder_state)
    # The decoder starts from the encoder's final state
    decoder.hidden = encoder_state
    output = decoder(title_batch, title_lens)

    torch.cuda.empty_cache()

In [14]:
# Run train() once on the next training batch.
# NOTE(review): train() as defined in this notebook prints nothing and returns
# nothing, so the saved output under this cell appears to come from an earlier
# version of the function -- re-run after Restart & Run All to confirm.
train()

22
Variable containing:
(  0  ,.,.) = 
  1.7053e-02  2.4089e-02 -9.1320e-04  ...   4.5620e-03 -1.6555e-02  2.2388e-02
  1.7053e-02  2.4089e-02 -9.1320e-04  ...   4.5620e-03 -1.6555e-02  2.2388e-02
  1.7053e-02  2.4089e-02 -9.1320e-04  ...   4.5620e-03 -1.6555e-02  2.2388e-02
                 ...                   ⋱                   ...                
  1.7053e-02  2.4089e-02 -9.1320e-04  ...   4.5620e-03 -1.6555e-02  2.2388e-02
  1.7053e-02  2.4089e-02 -9.1320e-04  ...   4.5620e-03 -1.6555e-02  2.2388e-02
  1.2575e-02  2.2359e-02  1.1125e-03  ...   6.2122e-03 -2.1434e-02  2.3169e-02

(  1  ,.,.) = 
  1.6120e-02  1.9810e-02 -2.3937e-04  ...   7.7645e-03 -1.6024e-02  2.8672e-02
  1.6120e-02  1.9810e-02 -2.3937e-04  ...   7.7645e-03 -1.6024e-02  2.8672e-02
  1.6120e-02  1.9810e-02 -2.3937e-04  ...   7.7645e-03 -1.6024e-02  2.8672e-02
                 ...                   ⋱                   ...                
  1.6120e-02  1.9810e-02 -2.3937e-04  ...   7.7645e-03 -1.6024e-02  2.8672e-