In [1]:
import numpy as np
import pickle

In [2]:
# 2 million word vectors trained on Common Crawl (600B tokens).
# The file is opened explicitly as UTF-8 instead of relying on the
# locale-dependent default encoding, so non-ASCII words decode the same way on
# every machine.
# The handle is intentionally left open here: later cells read the header line
# from it, iterate over the remaining lines, and then close it.
embedding_file = open('processed_data/crawl-300d-2M.vec', encoding='utf-8')

In [3]:
# Header of the fastText text format: "<number of words> <vector size>".
# Every subsequent line holds one word followed by its space-separated vector
# components; words are ordered by descending frequency.
meta = embedding_file.readline()
header_fields = meta.split()
num_words = int(header_fields[0])
num_dims = int(header_fields[1])

In [4]:
# Build the vocabulary and the embedding matrix from the remaining lines of
# the embedding file (the header line has already been consumed).
#   dictionary         -> word  : index (row of `embeddings`)
#   reverse_dictionary -> index : word
# Some 'words' are not picked up cleanly by split(): a line whose field count
# is not num_dims + 1 is skipped (presumably the word itself contains
# characters that split() treats as whitespace).

# Special tokens occupy the first rows, in this order:
# padding, start of sequence, end of sequence, unknown word.
special_tokens = ['PAD_token', 'Start_of_Sequence', 'End_of_Sequence', 'UNK_token']

dictionary = dict()
# Reserve rows for the special tokens on top of the num_words vectors announced
# by the header; sizing the matrix with num_words alone overflows unless at
# least len(special_tokens) lines happen to be skipped.
embeddings = np.zeros((num_words + len(special_tokens), num_dims))
# `i` is the single source of truth for "next free row", so a word's index in
# `dictionary` always matches the row holding its vector.
i = 0

# NOTE(review): every special token, including PAD_token, gets a random vector
# (not zeros) -- confirm this is what downstream code expects.
for token in special_tokens:
    dictionary[token] = i
    embeddings[i] = np.random.rand(num_dims)
    i += 1

try:
    for line in embedding_file:
        items = line.split()
        # Skip words not separated by split()
        if len(items) != (num_dims + 1):
            continue
        word = items[0]
        # Keep the first occurrence only. A repeated word (or a word that
        # collides with a special-token name) would otherwise overwrite an
        # existing index and leave dictionary indices and rows out of sync.
        if word in dictionary:
            continue
        dictionary[word] = i
        embeddings[i] = np.asarray([float(value) for value in items[1:]], dtype=np.float64)
        i += 1
finally:
    # Release the handle even if a malformed line raises part-way through.
    embedding_file.close()

reverse_dictionary = {index: word for word, index in dictionary.items()}
max_vocab = len(dictionary)
# Drop the unused rows left over from skipped lines.
embeddings = embeddings[:max_vocab]

print("Total words: %d" % max_vocab)
print("Number of dimensions: %d" % num_dims)

Total words: 1999998
Number of dimensions: 300


In [16]:
# Load preprocessed titles and articles.
# Each file is opened in a context manager so its handle is closed as soon as
# the object has been unpickled, instead of being left open.
# NOTE: pickle.load can execute arbitrary code while loading; only use it on
# files produced by this project's own preprocessing step.
with open('processed_data/titles', 'rb') as titles_file:
    titles = pickle.load(titles_file)
with open('processed_data/articles', 'rb') as articles_file:
    articles = pickle.load(articles_file)

In [17]:
# Validation set size
valid_size = 1000
# Number of tokens to extract from the beginning of each article
article_size = 100

# Tokenise on whitespace: each title becomes its full list of words, each
# article is cut down to its first `article_size` words.
title_tokens = []
article_tokens = []
for i in range(len(articles)):
    title_tokens.append(titles[i].split())
    article_tokens.append(articles[i].split()[:article_size])
titles, articles = title_tokens, article_tokens
del title_tokens, article_tokens

# The first `valid_size` examples form the validation set; the remainder is
# used for training.
valid_titles, train_titles = titles[:valid_size], titles[valid_size:]
valid_articles, train_articles = articles[:valid_size], articles[valid_size:]
# Drop the combined lists; only the train/validation splits are kept.
del titles, articles