In [1]:
import numpy as np
import pickle
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.backends.cudnn as cudnn

In [2]:
# 2 million word vectors trained on Common Crawl (600B tokens)
# The .vec file is UTF-8 text; naming the encoding explicitly keeps decoding
# independent of the platform's default locale.
# The handle is closed by the cell that builds the vocabulary.
embedding_file = open('processed_data/crawl-300d-2M.vec', encoding='utf-8')

In [3]:
# fastText text format:
#   * line 1 holds two integers: vocabulary size and vector dimensionality
#   * every following line is a word and then its vector components
#   * fields are separated by single spaces
#   * words appear in order of descending frequency
meta = embedding_file.readline()
header_fields = meta.split()
num_words = int(header_fields[0])
num_dims = int(header_fields[1])

In [4]:
# Build the vocabulary and the embedding matrix in one pass over the file.
# dictionary -> word : index
# reverse_dictionary -> index : word
# Start-of-sequence, end-of-sequence and unknown-word tokens, in index order.
special_tokens = ['Start_of_Sequence', 'End_of_Sequence', 'UNK_token']
dictionary = dict()
# To store embeddings for each word: one row per word announced in the file
# header plus one per special token. Unused rows are trimmed after loading.
embeddings = np.zeros((num_words + len(special_tokens), num_dims))

# Special tokens have no pretrained vector, so they get random ones.
for token in special_tokens:
    index = len(dictionary)
    dictionary[token] = index
    embeddings[index] = np.random.rand(num_dims)

try:
    for line in embedding_file:
        items = line.split()
        # Some 'words' contain whitespace, so split() yields the wrong number
        # of fields for them. Skip those lines.
        if len(items) != (num_dims + 1):
            continue
        word = items[0]
        # Words are ordered by descending frequency, so the first occurrence
        # wins. Re-assigning a duplicate key would leave two words sharing an
        # index and shift every later word off its embedding row.
        if word in dictionary:
            continue
        # A single counter (the dictionary size) drives both the word index
        # and the embedding row, so the two can never drift apart.
        index = len(dictionary)
        dictionary[word] = index
        embeddings[index] = np.asarray([float(value) for value in items[1:]], dtype=np.float64)
finally:
    # Release the file handle even if a line fails to parse.
    embedding_file.close()

reverse_dictionary = {index: word for word, index in dictionary.items()}
max_vocab = len(dictionary)
# Drop the rows reserved for lines that were skipped.
embeddings = embeddings[:max_vocab]

print("Total words: %d" % max_vocab)
print("Number of dimensions: %d" % num_dims)

Total words: 1999998
Number of dimensions: 300


In [1]:
## Truncate vocabulary

In [16]:
# Load preprocessed titles and articles
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files that this project produced itself.
# The context managers close both files as soon as their contents are read.
with open('processed_data/titles', 'rb') as titles_file:
    titles = pickle.load(titles_file)
with open('processed_data/articles', 'rb') as articles_file:
    articles = pickle.load(articles_file)

In [17]:
# How many leading tokens of each article are kept
article_size = 100
# How many examples are held out for validation
valid_size = 1000

# Tokenize on whitespace; one title is consumed per article, and each article
# is cut down to its first article_size tokens.
titles = [titles[k].split() for k in range(len(articles))]
articles = [articles[k].split()[:article_size] for k in range(len(articles))]

# The first valid_size examples form the validation set, the rest is training data.
valid_titles, train_titles = titles[:valid_size], titles[valid_size:]
valid_articles, train_articles = articles[:valid_size], articles[valid_size:]
# The combined lists are no longer needed once the split exists.
del titles, articles

In [None]:
# Number of articles served per training batch
batch_size = 128
# Number of tokens (time steps) read from each article
seq_len = 80

class BatchGenerator(object):
    """Serve batches of word indices read from tokenized articles.

    ``batch_size`` cursors are spread evenly over ``article_list``. Every call
    to ``_next`` reads the first ``seq_len`` tokens of the article under each
    cursor, then moves every cursor forward by one article, wrapping around
    at the end of the list.

    Args:
        article_list: list of articles, each a list of word strings.
        title_list: list of titles, aligned index-for-index with
            ``article_list``.
        batch_size: number of articles per batch.
        seq_len: number of time steps (tokens) read from each article.
        word_to_index: optional word -> index mapping that must contain the
            'UNK_token' and 'End_of_Sequence' keys. When omitted, the
            notebook-level ``dictionary`` is looked up at batch time.
    """

    def __init__(self, article_list, title_list, batch_size, seq_len, word_to_index=None):
        self._article_list = article_list
        self._title_list = title_list
        self._article_list_size = len(article_list)
        self._batch_size = batch_size
        self._seq_len = seq_len
        self._word_to_index = word_to_index
        segment = self._article_list_size // batch_size
        # cursor -> one article offset per batch slot, spaced `segment` articles apart
        self._cursor = [offset * segment for offset in range(batch_size)]

    def _next_batch(self, step):
        """Return a LongTensor of shape (batch_size,) for time step ``step``.

        Entry ``b`` is the index of the word at position ``step`` of the
        article under cursor ``b``. Words missing from the vocabulary map to
        'UNK_token'; positions past the end of a short article map to
        'End_of_Sequence'.
        """
        vocab = self._word_to_index if self._word_to_index is not None else dictionary
        unk_index = vocab['UNK_token']
        pad_index = vocab['End_of_Sequence']
        indices = []
        for cursor in self._cursor:
            article = self._article_list[cursor]
            if step < len(article):
                indices.append(vocab.get(article[step], unk_index))
            else:
                # Articles shorter than seq_len are padded instead of raising IndexError.
                indices.append(pad_index)
        # Build the tensor from known values (no uninitialised memory) and
        # move it to the GPU only when one is present.
        batch = torch.LongTensor(indices)
        if torch.cuda.is_available():
            batch = batch.cuda()
        return batch

    def _next(self):
        """Return ``(batches, labels)`` and advance every cursor by one article.

        ``batches`` is a list of ``seq_len`` tensors, one per time step.
        ``labels`` holds the ``title_list`` entry for each cursor position.
        """
        batches = [self._next_batch(step) for step in range(self._seq_len)]
        labels = [self._title_list[cursor] for cursor in self._cursor]
        self._cursor = [(cursor + 1) % self._article_list_size for cursor in self._cursor]
        return batches, labels
    
# Training generator uses the notebook-wide batch size and sequence length.
train_batches = BatchGenerator(article_list=train_articles, title_list=train_titles,
                               batch_size=batch_size, seq_len=seq_len)
# Validation generator serves one article and one time step per call.
valid_batches = BatchGenerator(article_list=valid_articles, title_list=valid_titles,
                               batch_size=1, seq_len=1)

![LSTM_doc.png](attachment:LSTM_doc.png)

In [None]:
# Encoder hyperparameters
hidden_size = 512  # hidden units in each recurrent layer
num_layers = 2     # number of stacked recurrent layers
dropout = 0        # dropout value handed to the LSTM

class EncoderRNN(nn.Module):
    
    def __init__(self, embeddings, hidden_size, num_layers, dropout):
        super(EncoderRNN, self).__init__()
        # Hyperparameters
        self._hidden_size = hidden_size
        self._num_layers = num_layers
        self._dropout = dropout
        
        # Hidden layer and cell state of model
        # Initialize before training, validation and testing
        self.hidden = None
        
        # Lookup table that stores word embeddings
        self.embed = nn.Embedding(vocab_size, num_dims)
        self.embed.weight.data.copy_(torch.from_numpy(embeddings))
        self.embed.weight.requires_grad = False
        
        # Pytorch lstm module
        self.lstm = nn.LSTM(num_dims, self._hidden_size, 
                            self._num_layers, dropout=self._dropout)
        
    def init_hidden(self, batch_size):
        return (Variable(torch.cuda.FloatTensor(self._num_layers, batch_size,
                    self._hidden_size).fill_(0), requires_grad=False), 
                Variable(torch.cuda.FloatTensor(self._num_layers, batch_size, 
                    self._hidden_size).fill_(0), 
                         requires_grad=False))
    