Source: https://adventuresinmachinelearning.com/word2vec-tutorial-tensorflow/

In [1]:
import os
import json
import tensorflow as tf
import numpy as np
import collections
import math
import random
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
import re
# Vocabulary cap: passed to build_dataset as n_words (the n_words - 1 most
# frequent tokens plus the 'UNK' bucket) and used as the row count of the
# embedding and NCE weight matrices.
vocabulary_size = 50000

In [44]:
def remove_symbols(doc):
    """Drop punctuation tokens and strip punctuation characters from the rest.

    Args:
        doc: iterable of token strings (e.g. the output of ``word_tokenize``).

    Returns:
        A new list of tokens in their original order. Tokens that are pure
        punctuation are skipped, the characters ``, . ' ? !`` are removed from
        the remaining ones, and any token left empty by that stripping (for
        example ``''`` or ``?!``) is discarded rather than kept as ``""``.
    """
    punctuation = {'.', '..', '...', '?', ':', '!', ',', '\'', ';', '``'}
    cleaned_doc = []
    for word in doc:
        if word in punctuation:
            continue
        stripped = re.sub(r"[,.\'?!]", "", word)
        # Tokens such as "''" or "?!" are not in the punctuation set but
        # consist only of stripped characters; keeping them would add an
        # empty-string "word" to the vocabulary.
        if stripped:
            cleaned_doc.append(stripped)
    return cleaned_doc
def clean_doc(doc):
    """Normalize one raw document string and split it into cleaned tokens.

    Steps, in order: delete every apostrophe, lowercase the text, insert a
    space after any period that is directly followed by a letter or
    underscore (so "end.next" tokenizes as two words), tokenize with
    ``word_tokenize`` and finally pass the tokens through ``remove_symbols``.

    Args:
        doc: the document text as a single string.

    Returns:
        The list of tokens produced by ``remove_symbols``.
    """
    without_apostrophes = re.sub(r'\'', '', doc)
    lowered = without_apostrophes.lower()
    sentence_spaced = re.sub(r'\.(?=[^ \W\d])', '. ', lowered)
    return remove_symbols(word_tokenize(sentence_spaced))

In [45]:
def read_data(directory='../sermons/preachingtoday'):
    """Load every sermon file in ``directory`` and return one flat token list.

    Each file is parsed as JSON and must contain a ``'text'`` entry; that text
    is tokenized with ``clean_doc`` and the tokens of all files are
    concatenated.

    Args:
        directory: folder holding the sermon JSON files. Defaults to the
            location used by this notebook.

    Returns:
        A list of token strings covering all files.
    """
    data = []
    # os.listdir order is OS-dependent; sorting keeps the token stream (and
    # therefore the tie-breaking of equally frequent words) stable across runs.
    for sermon in sorted(os.listdir(directory)):
        # The context manager closes each file; the original left every
        # handle open until garbage collection.
        with open(os.path.join(directory, sermon), 'rb') as f:
            contents = json.load(f)
        data.extend(clean_doc(contents['text']))
    return data

def build_dataset(words, n_words):
    """Turn a token list into integer ids plus the lookup tables around them.

    Args:
        words: list of token strings.
        n_words: vocabulary size including the 'UNK' entry; only the
            ``n_words - 1`` most frequent tokens get their own id.

    Returns:
        A tuple ``(data, count, dictionary, reversed_dictionary)``:
        ``data`` is ``words`` encoded as ids (0 for out-of-vocabulary tokens),
        ``count`` is ``[['UNK', n_unknown], (token, frequency), ...]``,
        ``dictionary`` maps token -> id and ``reversed_dictionary`` maps
        id -> token.
    """
    frequent = collections.Counter(words).most_common(n_words - 1)

    # Ids are handed out in insertion order, with 'UNK' taking id 0.
    dictionary = {}
    for token in ['UNK'] + [token for token, _freq in frequent]:
        dictionary[token] = len(dictionary)

    data = [dictionary.get(token, 0) for token in words]
    unk_count = sum(1 for token in words if token not in dictionary)

    count = [['UNK', unk_count]]
    count.extend(frequent)

    reversed_dictionary = {index: token for token, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

def generate_batch(data, batch_size, num_skips, skip_window):
    """Produce one skip-gram training batch from ``data``.

    Reads from and updates the module-level cursor ``data_index``, wrapping
    around at the end of ``data``.

    Args:
        data: sequence of integer word ids.
        batch_size: number of (input, context) pairs to emit; must be a
            multiple of ``num_skips``.
        num_skips: how many context words are drawn per input word; at most
            ``2 * skip_window``.
        skip_window: number of words considered on each side of the input.

    Returns:
        ``(batch, context)``: an int32 array of shape ``(batch_size,)`` with
        the input word ids and an int32 array of shape ``(batch_size, 1)``
        with the matching context word ids.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    window_len = 2 * skip_window + 1  # [ skip_window input_word skip_window ]
    total = len(data)
    inputs = np.empty(batch_size, dtype=np.int32)
    contexts = np.empty((batch_size, 1), dtype=np.int32)

    # Prime the sliding window with the next `window_len` ids.
    window = collections.deque(maxlen=window_len)
    for _ in range(window_len):
        window.append(data[data_index])
        data_index = (data_index + 1) % total

    row = 0
    for _ in range(batch_size // num_skips):
        # Start on the centre slot so the first draw is always re-sampled.
        pick = skip_window
        used = [skip_window]
        for _ in range(num_skips):
            # Rejection-sample a window slot that has not been used yet.
            while pick in used:
                pick = random.randint(0, window_len - 1)
            used.append(pick)
            inputs[row] = window[skip_window]
            contexts[row, 0] = window[pick]
            row += 1
        # Slide the window one position to the right.
        window.append(data[data_index])
        data_index = (data_index + 1) % total

    # Step the cursor back by one window so the words at the end of this
    # batch are not skipped by the next call.
    data_index = (data_index + total - window_len) % total
    return inputs, contexts

def reset_graph(seed: int = 42) -> None:
    """Reset the default TensorFlow graph and re-seed TensorFlow and NumPy.

    Args:
        seed: value passed to both ``tf.set_random_seed`` and
            ``np.random.seed``.
    """
    # Reset first so the seed is set after the default graph has been cleared.
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

In [46]:
# read_data() tokenizes with nltk's word_tokenize, which needs the 'punkt'
# tokenizer package. Fetch it here if it is missing so this cell also works
# on a fresh kernel / machine where it has not been downloaded yet.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Tokenize every sermon, then encode the tokens as integer ids.
# `data` is the id sequence, `count` the per-token frequencies and
# `reverse_dictionary` maps id -> token; the token -> id map is not used later.
vocabulary = read_data()
data, count, unused_dictionary, reverse_dictionary = build_dataset(
    vocabulary, vocabulary_size)

In [9]:
# Download the NLTK 'punkt' tokenizer package; word_tokenize (used in
# clean_doc) needs it to be installed.
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Zach
[nltk_data]     Dunkerton\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [48]:
# --- Batch generation ------------------------------------------------------
data_index = 0       # Cursor into `data`; generate_batch reads and advances it.
batch_size = 128     # (input, context) pairs per training step.
skip_window = 1      # How many words to consider left and right.
num_skips = 2        # How many times to reuse an input to generate a label.

# --- Model -------------------------------------------------------------------
embedding_size = 128  # Dimension of the embedding vector.
num_sampled = 64      # Number of negative examples to sample.

# --- Validation words --------------------------------------------------------
# A random set of words whose nearest neighbours are printed during training.
# Candidates are limited to low numeric ids, which by construction are the
# most frequent words. These three values only affect what gets displayed,
# not the training computation.
valid_window = 100   # Only pick dev samples in the head of the distribution.
valid_size = 16      # Random set of words to evaluate similarity on.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
# Build the skip-gram graph: embedding lookup -> NCE loss -> SGD step, plus a
# cosine-similarity op used to print nearest neighbours of the validation words.
graph = tf.Graph()

with graph.as_default():

    # Input data: one word id per example, one context word id per example,
    # and the fixed ids of the validation words.
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # Embedding matrix (one row per vocabulary id, initialised uniformly
        # in [-1, 1)) and the lookup of the rows for the current inputs.
        with tf.name_scope('embeddings'):
            embeddings = tf.Variable(
                tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss: a weight row and a bias
        # per vocabulary id.
        with tf.name_scope('weights'):
            nce_weights = tf.Variable(
                tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        with tf.name_scope('biases'):
            nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nn.nce_loss automatically draws a new sample of the negative labels
    # each time we evaluate the loss.
    # Explanation of the meaning of NCE loss:
    #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(
              weights=nce_weights,
              biases=nce_biases,
              labels=train_labels,
              inputs=embed,
              num_sampled=num_sampled,
              num_classes=vocabulary_size))

    # Add the loss value as a scalar to summary.
    tf.summary.scalar('loss', loss)

    # Construct the SGD optimizer using a learning rate of 1.0.
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # Compute the cosine similarity between the validation words and all
    # embeddings: rows are L2-normalised, so the matrix product below yields
    # one row of similarities (over the whole vocabulary) per validation word.
    # NOTE(review): `keep_dims` is the older spelling of `keepdims`; newer
    # TensorFlow releases may warn about or reject it — confirm against the
    # installed version.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                              valid_dataset)
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)

    # Merge all summaries.
    merged = tf.summary.merge_all()

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # Create a saver (not used by the cells shown in this notebook).
    saver = tf.train.Saver()

In [49]:
# Train the skip-gram model for `num_steps` SGD steps, printing the running
# average loss every 2000 steps and the nearest neighbours of the validation
# words every 10000 steps. The L2-normalised embedding matrix is kept in
# `final_embeddings` once training is done.
num_steps = 100001
with tf.Session(graph=graph) as session:
    # We must initialize all variables before we use them.
    init.run()
    print('Initialized')
    top_k = 8  # number of nearest neighbors
    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(
            data, batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # We perform one update step by evaluating the optimizer op; the loss
        # is fetched in the same call so it can be logged. No summary writer
        # exists in this notebook, so the merged summary and run metadata are
        # not requested.
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000
            # batches (at step 0 it is just the first batch's loss).
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                # Sort by descending similarity; position 0 is skipped because
                # a word's most similar embedding is its own.
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for word_id in nearest:
                    log_str = '%s %s,' % (log_str, reverse_dictionary[word_id])
                print(log_str)
    # Evaluate while the session is still open; this runs once after the loop
    # regardless of how many steps were executed.
    final_embeddings = normalized_embeddings.eval(session=session)

Initialized
Average loss at step  0 :  268.5318908691406
Nearest to my: persia, basketball, olds, murmur, films, joi, churches, 45f,
Nearest to at: circumcise, fallenness, ijm, strict, neglect, latourette, generation—80, twelve-thirty,
Nearest to says: coupling, scented, outrank, worldview, envied, 1:19, trumpet, pistos,
Nearest to going: bradley, over—plant, confines, worrier, 5-story, exclaim, thousand, church-going,
Nearest to who: vanity, drama, overshadowed, embassy, cares, lost—just, inside, parched,
Nearest to our: strouds, classism, 89:20, ahisamach, regroup, kennedy, lennons, hupo,
Nearest to one: do—comes, oppressed, 701, voyage, capture, came—so, nightclubs, angry—that,
Nearest to not: faithfully, epidemic, roam, substitution, measure-your, fortified, oddest, scrubs,
Nearest to do: 85000, gush, grandkid, shemesh, rileys, bill—a, truenow, re-creator,
Nearest to way: hills—would, zac, glens, theater—a, edifice, fu, obsessions, sputtered,
Nearest to its: batten, 2:30–31, unstai

Average loss at step  52000 :  4.941928084492684
Average loss at step  54000 :  4.922246501445771
Average loss at step  56000 :  4.849904831886292
Average loss at step  58000 :  4.729203828215599
Average loss at step  60000 :  4.724402657151222
Nearest to my: your, his, our, their, her, the, amp, a,
Nearest to at: in, on, with, strict, for, from, there, shoe-shine,
Nearest to says: said, ostrich, ), replied, representatives, quot, seeks, costume,
Nearest to going: talking, preoccupation, confines, tia, coming, able, ahzahzel, looking,
Nearest to who: he, she, that, they, drainage, cares, mode, actually,
Nearest to our: their, your, my, his, the, her, amp, peninsula,
Nearest to one: amp, allude, settled, surrender, recycled, wilkerson, yours, stork,
Nearest to not: amp, never, faithfully, also, it, blah, gollum, walters,
Nearest to do: yeast, amp, see, did, shevardnadze, change, budgeting, if,
Nearest to way: thing, direction, june, life, reason, experience, book, baby,
Nearest to its: 

In [50]:
# Map every vocabulary token to its row of the trained embedding matrix,
# walking the ids 0 .. len(reverse_dictionary) - 1 in order.
final_dictionary = {
    reverse_dictionary[word_id]: final_embeddings[word_id]
    for word_id in range(len(reverse_dictionary))
}

In [51]:
import pickle

# Persist the token -> embedding-vector mapping so it can be reloaded later
# without retraining.
with open('embeddings.pkl', 'wb') as pickle_file:
    pickle.dump(final_dictionary, pickle_file)
