# Get all the vocabulary

This processes the dataset and gets the tokens, and assigns and ID to each one.

Make the vocabulary key
* ID 0 is _UNKNOWN_

In [1]:
import os, sys, re
import gzip

In [2]:
import gensim

In [3]:
import numpy as np

In [4]:
# Set the path to the downloaded GenSim word vectors file that you downloaded (see README)
model = gensim.models.KeyedVectors.load_word2vec_format('/media/thomas/026919b3-ea3e-4923-96aa-7f83aae1d652/pretrained_gensim/GoogleNews-vectors-negative300.bin', binary=True)

In [5]:
# Where the txt files are
input_folder = 'data/raw'

In [6]:
output_folder = 'data/processed'

In [7]:
model_folder = 'output'

In [8]:
import pickle as pkl, gzip, pandas as pd
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+|\'')

In [9]:
def preprocess(text):
    tokens = tokenizer.tokenize(text)
    preprocessed  = []
    for token in tokens:
        if len(token) > 2 and token == token[0].upper() + token[1:].lower():
            token = "_PROPERNAME_"
        preprocessed.append(token.lower())
    return preprocessed

In [10]:
all_vocab_used = {}

In [11]:
for root, folder, files in os.walk(input_folder):
    for file_name in files:
        with open(input_folder + '/' + file_name, 'r') as f:
            all_lines = list(f)
            for l in all_lines:
                tokens = preprocess(l)
                for t in tokens:
                    if t in model.vocab:
                        if t not in all_vocab_used:
                            all_vocab_used[t] = 0
                        all_vocab_used[t] = all_vocab_used[t] + 1

In [12]:
print ("Vocab size", len(all_vocab_used))

Vocab size 22443


In [13]:
import operator
sorted_vocab = list(sorted(all_vocab_used.items(), key=operator.itemgetter(1), reverse=True))

In [14]:
sorted_vocab = [("_UNKNOWN_",-1)] + sorted_vocab[:9999]

In [15]:
sorted_vocab[:10]

[('_UNKNOWN_', -1),
 ('the', 39880),
 ('i', 33379),
 ('boundary', 20282),
 ('in', 15136),
 ('it', 12987),
 ('was', 12843),
 ('you', 12608),
 ('he', 12348),
 ('her', 12188)]

In [16]:
abridged_index2word = []
abridged_word2index = {}
ctr = 0
for w, f in sorted_vocab:
    abridged_index2word.append(w)
    abridged_word2index[w] = ctr
    ctr += 1

In [17]:
import gzip, pickle as pkl
with gzip.GzipFile("data/abridged_index2word.pkl.gz", "wb") as f:
    pkl.dump(abridged_index2word, f)
with gzip.GzipFile("data/abridged_word2index.pkl.gz", "wb") as f:
    pkl.dump(abridged_word2index, f)

In [18]:
LENGTH_OF_GENSIM_VECTOR = 300

In [19]:
# store the embeddings in a numpy array

embedding_matrix = np.zeros((len(abridged_index2word), LENGTH_OF_GENSIM_VECTOR))
for i in range(len(abridged_index2word)):
    if i == 0:
        continue #  _UNKNOWN_
    if i % 10000 == 0:
        print (i, "of", len(abridged_index2word))
    embedding_vector = model.wv[abridged_index2word[i]]
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [20]:
#free memory
del(model)

In [21]:
filter_sizes = "3,4,5"
num_filters = 128
l2_reg_lambda = 0
num_checkpoints = 10

In [22]:
import tensorflow as tf

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


In [23]:
import text_cnn
cnn = text_cnn.TextCNN(
            sequence_length=1000,
            num_classes=2, 
            vocab_size=(len(abridged_index2word)),
            embedding_size=LENGTH_OF_GENSIM_VECTOR,
            filter_sizes=list(map(int, filter_sizes.split(","))),
            num_filters=num_filters,
            l2_reg_lambda=l2_reg_lambda)



In [24]:
global_step = tf.Variable(0, name="global_step", trainable=False)

In [25]:
sess = tf.InteractiveSession()

In [26]:
tf.initialize_all_variables()

Instructions for updating:
Use `tf.global_variables_initializer` instead.


<tf.Operation 'init' type=NoOp>

In [27]:
embedding_placeholder = tf.placeholder(tf.float32, [len(abridged_index2word), LENGTH_OF_GENSIM_VECTOR])
embedding_init = cnn.W.assign(embedding_placeholder)

In [28]:
tf.global_variables()

[<tf.Variable 'embedding/W:0' shape=(10000, 300) dtype=float32_ref>,
 <tf.Variable 'conv-maxpool-3/W:0' shape=(3, 300, 1, 128) dtype=float32_ref>,
 <tf.Variable 'conv-maxpool-3/b:0' shape=(128,) dtype=float32_ref>,
 <tf.Variable 'conv-maxpool-4/W:0' shape=(4, 300, 1, 128) dtype=float32_ref>,
 <tf.Variable 'conv-maxpool-4/b:0' shape=(128,) dtype=float32_ref>,
 <tf.Variable 'conv-maxpool-5/W:0' shape=(5, 300, 1, 128) dtype=float32_ref>,
 <tf.Variable 'conv-maxpool-5/b:0' shape=(128,) dtype=float32_ref>,
 <tf.Variable 'W:0' shape=(384, 2) dtype=float32_ref>,
 <tf.Variable 'output/b:0' shape=(2,) dtype=float32_ref>,
 <tf.Variable 'global_step:0' shape=() dtype=int32_ref>]

In [29]:
gensim_weights = [x for x in tf.global_variables() if "embedding" in str(x)]
other_weights = [x for x in tf.global_variables() if "embedding" not in str(x)]

In [30]:
gensim_weights

[<tf.Variable 'embedding/W:0' shape=(10000, 300) dtype=float32_ref>]

In [31]:
sess.run(embedding_init, feed_dict={embedding_placeholder: embedding_matrix})

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.08007812,  0.10498047,  0.04980469, ...,  0.00366211,
         0.04760742, -0.06884766],
       [-0.22558594, -0.01953125,  0.09082031, ...,  0.02819824,
        -0.17773438, -0.00604248],
       ...,
       [ 0.05664062,  0.0168457 ,  0.12890625, ..., -0.15332031,
         0.171875  , -0.20996094],
       [ 0.00202942,  0.38867188,  0.15332031, ...,  0.1640625 ,
         0.32421875, -0.03344727],
       [-0.00259399,  0.09082031, -0.07519531, ...,  0.18554688,
         0.10205078,  0.15332031]], dtype=float32)

In [32]:
saver_embedding = tf.train.Saver(gensim_weights)

In [33]:
saver_embedding.save(sess, model_folder + "/gensim_weights")

'output/gensim_weights'