In [1]:
import numpy as np
import pickle
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.backends.cudnn as cudnn

In [2]:
# 2 million word vectors trained on Common Crawl (600B tokens)
# Open with an explicit encoding so decoding does not depend on the platform's
# locale default, and split lines on '\n' only (this mirrors the loading recipe
# published with the fastText vectors). The handle is consumed and closed by
# the cells that parse the file.
embedding_file = open('processed_data/crawl-300d-2M.vec', 'r', encoding='utf-8', newline='\n')

In [3]:
# Header line of the fastText text format: "<number of words> <vector size>".
# Every following line is a word and then its vector components, all separated
# by single spaces; words appear in descending order of frequency.
meta = embedding_file.readline()
header_fields = meta.split()
num_words, num_dims = int(header_fields[0]), int(header_fields[1])

In [4]:
# Build the vocabulary and the embedding matrix from the opened vector file.
# dictionary -> word : index
# reverse_dictionary -> index : word
# Some 'words' are not picked up by split(), so their lines do not have
# num_dims + 1 fields. Those lines are ignored.

# Special tokens occupy the first rows, in this order:
# padding, start of sequence, end of sequence, unknown word.
special_tokens = ['PAD_token', 'Start_of_Sequence', 'End_of_Sequence', 'UNK_token']

dictionary = dict()
# To store embeddings for each word. One row per word announced in the header
# plus one per special token; allocating only num_words rows overflows whenever
# fewer lines are skipped than special tokens are added.
embeddings = np.zeros((num_words + len(special_tokens), num_dims))

# Special tokens get random vectors (drawn in the order listed above).
for token in special_tokens:
    index = len(dictionary)
    dictionary[token] = index
    embeddings[index] = np.random.rand(num_dims)

try:
    for line in embedding_file:
        items = line.split()
        # Skip words not separated by split()
        if len(items) != (num_dims + 1):
            continue
        word = items[0]
        # Keep the first occurrence of a word. Re-inserting an existing key does
        # not grow the dictionary, which would leave two words sharing an index
        # and shift every later embedding row away from its word.
        if word in dictionary:
            continue
        # The row index is always derived from the dictionary size so that
        # dictionary[word] and the embeddings row cannot drift apart.
        index = len(dictionary)
        dictionary[word] = index
        embeddings[index] = np.asarray([float(value) for value in items[1:]], dtype=np.float64)
finally:
    # Release the file handle even if a malformed line raises while parsing.
    embedding_file.close()

reverse_dictionary = {index: word for word, index in dictionary.items()}
max_vocab = len(dictionary)
# Drop the unused rows left over from skipped lines.
embeddings = embeddings[:max_vocab]

print("Total words: %d" % max_vocab)
print("Number of dimensions: %d" % num_dims)

Total words: 1999999
Number of dimensions: 300


In [5]:
# Restrict the full vocabulary to its first input_vocab_size / output_vocab_size
# entries and build separate lookup tables and embedding matrices for the
# input side and the output side.
input_vocab_size = 40000
output_vocab_size = 40000

# Input words and embeddings
input_dictionary = dict()
input_reverse_dictionary = dict()
input_embeddings = np.zeros((input_vocab_size, num_dims))
for index in range(input_vocab_size):
    word = reverse_dictionary[index]
    input_dictionary[word] = index
    input_reverse_dictionary[index] = word
    input_embeddings[index] = embeddings[index]

# Output words and embeddings
output_dictionary = dict()
output_reverse_dictionary = dict()
output_embeddings = np.zeros((output_vocab_size, num_dims))
for index in range(output_vocab_size):
    word = reverse_dictionary[index]
    output_dictionary[word] = index
    output_reverse_dictionary[index] = word
    # Fill row `index` of the matrix. Assigning to the bare name would rebind
    # output_embeddings to a single vector and discard the matrix.
    output_embeddings[index] = embeddings[index]

# Free the full-size tables; only the truncated copies are kept.
del dictionary, reverse_dictionary, embeddings
print('Input vocabulary size: %d' % input_vocab_size)
print('Output vocabulary size: %d' % output_vocab_size)

Input vocabulary size: 40000
Output vocabulary size: 40000


In [32]:
# Load preprocessed titles and articles
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that come from a trusted source.
# The context managers close each file as soon as it has been read.
with open('processed_data/titles', 'rb') as titles_file:
    titles = pickle.load(titles_file)
with open('processed_data/articles', 'rb') as articles_file:
    articles = pickle.load(articles_file)

In [33]:
# Articles are truncated to their first max_article_size tokens; a sample is
# kept only if the truncated article has at least min_article_size tokens and
# is strictly longer than its title.
max_article_size = 80
min_article_size = 30

# Tokenize every title/article pair on whitespace and collect the pairs that
# pass the length checks.
kept_titles = []
kept_articles = []
for idx, raw_article in enumerate(articles):
    title_tokens = titles[idx].split()
    article_tokens = raw_article.split()[:max_article_size]
    # Too short overall, or not longer than the title: drop the pair.
    if len(article_tokens) < min_article_size or len(article_tokens) <= len(title_tokens):
        continue
    kept_titles.append(title_tokens)
    kept_articles.append(article_tokens)

# From here on, titles and articles are lists of token lists.
titles = kept_titles
articles = kept_articles
del kept_titles, kept_articles

print('Total number of samples: %d' % len(articles))

Total number of samples: 141564


In [34]:
# Validation set size
valid_size = 1000

# Separate into training and validation sets
valid_titles = titles[:valid_size]
valid_articles = articles[:valid_size]
train_titles = titles[valid_size:]
train_articles = articles[valid_size:]
train_size = len(train_articles)
del titles, articles

print('Training set size: %d' % train_size)
print('Valid set size: %d' % valid_size)

Training set size: 140564
Valid set size: 1000
