In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import random
import os
import pickle
import string
import requests
import collections
import io
import tarfile
import urllib.request
from nltk.corpus import stopwords
from tensorflow.python.framework import ops
# Reset TensorFlow's default graph before any ops are created below.
ops.reset_default_graph()
# Project-local helper module; later cells call it to load and normalize the
# movie data, build the vocabulary, and generate training batches.
import text_helpers
# One session, reused by every sess.run(...) call in the rest of the notebook.
sess = tf.Session()

In [2]:
# Folder (relative to the working directory) where the vocabulary pickle and
# the embedding checkpoint are written during training.
data_folder_name = 'temp'
# exist_ok=True makes this a single idempotent call: it avoids the
# check-then-create race of `if not os.path.exists(...)` and is safe to re-run.
os.makedirs(data_folder_name, exist_ok=True)

In [3]:
# --- Model / data dimensions ---
vocabulary_size = 2000      # passed to build_dictionary and used as num_classes for the NCE loss
embedding_size = 50         # width of each embedding vector
window_size = 3             # x_inputs carries 2*window_size context ids per example

# --- Optimisation ---
batch_size = 200
generations = 50000         # number of training iterations
model_learning_rate = 0.05
num_sampled = batch_size // 2   # num_sampled argument of the NCE loss

# --- Reporting / checkpoint cadence (in training steps) ---
print_loss_every = 1000
print_valid_every = 5000
save_embeddings_every = 5000

# --- Text resources ---
stops = stopwords.words('english')   # handed to text_helpers.normalize_text
# Probe words whose nearest neighbours are printed during training.
valid_words = ['love', 'hate', 'happy', 'sad', 'man', 'woman']

In [4]:
# Load the raw reviews with their labels, then normalize the text using the
# stop-word list from the configuration cell.
texts, target = text_helpers.load_movie_data()
texts = text_helpers.normalize_text(texts, stops)

# Keep only reviews that still have more than two whitespace-separated tokens,
# filtering labels and texts with the same mask so they stay aligned.
has_enough_words = [len(review.split()) > 2 for review in texts]
target = [target[ix] for ix, keep in enumerate(has_enough_words) if keep]
texts = [review for review, keep in zip(texts, has_enough_words) if keep]

In [5]:
# Vocabulary mapping built from the filtered reviews (capped at vocabulary_size).
word_dictionary = text_helpers.build_dictionary(texts, vocabulary_size)
# Inverse mapping, used later to turn indices back into words for printing.
word_dictionary_rev = {index: word for word, index in word_dictionary.items()}
# Reviews converted to their numeric form via the vocabulary.
text_data = text_helpers.text_to_numbers(texts, word_dictionary)
# Vocabulary entries for the probe words; raises KeyError if a probe word is
# missing from the vocabulary.
valid_examples = [word_dictionary[word] for word in valid_words]

In [6]:
# Trainable embedding matrix, one row per vocabulary entry, initialised
# uniformly in [-1, 1).
embeddings = tf.Variable(
    tf.random_uniform(shape=[vocabulary_size, embedding_size],
                      minval=-1.0, maxval=1.0))

# Each training example feeds 2*window_size context ids and one target id.
x_inputs = tf.placeholder(dtype=tf.int32, shape=[batch_size, 2*window_size])
y_target = tf.placeholder(dtype=tf.int32, shape=[batch_size, 1])

# Fixed set of probe-word ids used for the nearest-neighbour report.
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [7]:
# CBOW input: start from zeros and add the embedding of every context
# position, so each row of `embed` is the sum of that example's context vectors.
embed = tf.zeros([batch_size, embedding_size])
for context_pos in range(2*window_size):
    context_ids = x_inputs[:, context_pos]
    embed = embed + tf.nn.embedding_lookup(embeddings, context_ids)

In [8]:
# Output-layer parameters for noise-contrastive estimation: one weight row and
# one bias per vocabulary entry.
nce_weights = tf.Variable(
    tf.truncated_normal(shape=[vocabulary_size, embedding_size],
                        stddev=1.0 / np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros(shape=[vocabulary_size]))

# NCE loss of the summed context embedding against the target word, using
# num_sampled negative classes; the scalar training loss is its mean.
nce_per_example = tf.nn.nce_loss(
    weights=nce_weights,
    biases=nce_biases,
    labels=y_target,
    inputs=embed,
    num_sampled=num_sampled,
    num_classes=vocabulary_size,
)
loss = tf.reduce_mean(nce_per_example)

In [9]:
# Row-wise L2 norm of the embedding matrix (kept as a column so it broadcasts).
squared_norm = tf.reduce_sum(tf.square(embeddings), axis=1, keepdims=True)
norm = tf.sqrt(squared_norm)
normalized_embeddings = embeddings / norm

# Cosine similarity between each probe word and every vocabulary entry:
# dot products of unit-length rows.
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [10]:
# Checkpoint only the embedding matrix, stored under the name "embeddings".
saver = tf.train.Saver(var_list={"embeddings": embeddings})

In [11]:
# Plain gradient descent on the NCE loss. Note that `optimizer` is the
# training op returned by minimize(), not the optimizer object itself.
gradient_descent = tf.train.GradientDescentOptimizer(learning_rate=model_learning_rate)
optimizer = gradient_descent.minimize(loss)

# Initialise every variable created so far in the session.
init = tf.global_variables_initializer()
sess.run(init)

In [12]:
# Keep only sequences with at least 2*window_size + 1 entries (presumably so
# that a full context window plus a target can be drawn from each).
text_data = [seq for seq in text_data if len(seq) >= (2*window_size + 1)]

# Loss values and the (1-based) steps at which they were recorded.
loss_vec, loss_x_vec = [], []

for i in range(generations):
    step = i + 1

    # Draw a CBOW batch and run one optimisation step on it.
    batch_inputs, batch_labels = text_helpers.generate_batch_data(
        text_data, batch_size, window_size, method='cbow')
    feed_dict = {x_inputs: batch_inputs, y_target: batch_labels}
    sess.run(optimizer, feed_dict=feed_dict)

    # Periodically re-evaluate the loss on the same batch and record it.
    if step % print_loss_every == 0:
        loss_val = sess.run(loss, feed_dict=feed_dict)
        loss_vec.append(loss_val)
        loss_x_vec.append(step)
        print('Loss at step {} : {}'.format(step, loss_val))

    # Periodically print the nearest neighbours of each probe word.
    if step % print_valid_every == 0:
        sim = sess.run(similarity, feed_dict=feed_dict)
        top_k = 5
        for j, valid_ix in enumerate(valid_examples):
            valid_word = word_dictionary_rev[valid_ix]
            # Sort by descending similarity and skip position 0, which is
            # expected to be the probe word itself.
            nearest = (-sim[j, :]).argsort()[1:top_k+1]
            log_str = "Nearest to {}:".format(valid_word)
            log_str += ''.join(
                ' {},'.format(word_dictionary_rev[nearest[k]]) for k in range(top_k))
            print(log_str)

    # Periodically persist the vocabulary and the embedding checkpoint.
    if step % save_embeddings_every == 0:
        vocab_path = os.path.join(data_folder_name,'movie_vocab.pkl')
        with open(vocab_path, 'wb') as f:
            pickle.dump(word_dictionary, f)
        model_checkpoint_path = os.path.join(
            os.getcwd(), data_folder_name, 'cbow_movie_embeddings.ckpt')
        save_path = saver.save(sess, model_checkpoint_path)
        print('Model saved in file: {}'.format(save_path))

Loss at step 1000 : 8.026020050048828
Loss at step 2000 : 6.135557174682617
Loss at step 3000 : 6.0761189460754395
Loss at step 4000 : 5.532122611999512
Loss at step 5000 : 5.158099174499512
Nearest to love: powerful, formula, kids, madness, extremely,
Nearest to hate: complex, country, angst, alabama, tv,
Nearest to happy: manages, potential, deep, much, unfolds,
Nearest to sad: wrong, classic, best, wont, guns,
Nearest to man: young, trying, offers, didnt, madness,
Nearest to woman: hits, average, martin, computer, stuff,
Model saved in file: C:\TensorFlow for Neural Network Solutions\Section 2\temp\cbow_movie_embeddings.ckpt
Loss at step 6000 : 5.214013576507568
Loss at step 7000 : 4.8917036056518555
Loss at step 8000 : 4.561053276062012
Loss at step 9000 : 4.625123977661133
Loss at step 10000 : 4.675836563110352
Nearest to love: powerful, formula, kids, falls, extremely,
Nearest to hate: complex, country, angst, alabama, tv,
Nearest to happy: manages, potential, deep, much, unfolds