In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import random
import os
import pickle
import string
import collections
import io
import json
import re
import tarfile
import urllib.request
import text_helpers
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.python.framework import ops
vocabulary_size = 50000

In [6]:
def remove_symbols(doc):
    cleaned_doc = []
    punctuation = ['.', '..', '...', '?', ':', '!', ',', '\'', ';', '``']
    for word in doc:
        if word not in punctuation: 
            cleaned_doc.append( re.sub(r"[,.\'?!]", "", word))
    return cleaned_doc
def clean_doc(doc):
    words = re.sub(r'\'', '', doc).lower()
    words = word_tokenize(re.sub(r'\.(?=[^ \W\d])', '. ', words))
    words = remove_symbols(words)
    return words

def read_data():
    doc_data = []
    vocab_data = []
    for sermon in os.listdir('../sermons/preachingtoday'):
        f=open('../sermons/preachingtoday/'+ sermon, 'rb')
        contents =f.read()
        contents = json.loads(contents)
        vocab_data += clean_doc(contents['text'])
        doc_data.append(contents['text'])
    return doc_data, vocab_data

def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys(), ))
    return data, count, dictionary, reversed_dictionary

def text_to_numbers(sentences, word_dict):
    # Initialize the returned data
    data = []
    for sentence in sentences:
        sentence_data = []
        # For each word, either use selected index or rare word index
        for word in sentence.split():
            if word in word_dict:
                word_ix = word_dict[word]
            else:
                word_ix = 0
            sentence_data.append(word_ix)
        data.append(sentence_data)
    return(data)

def generate_batch_data(sentences, batch_size, window_size, method='skip_gram'):
    # Fill up data batch
    batch_data = []
    label_data = []
    while len(batch_data) < batch_size:
        # select random sentence to start
        rand_sentence_ix = int(np.random.choice(len(sentences), size=1))
        rand_sentence = sentences[rand_sentence_ix]
        # Generate consecutive windows to look at
        window_sequences = [rand_sentence[max((ix-window_size),0):(ix+window_size+1)] for ix, x in enumerate(rand_sentence)]
        # Denote which element of each window is the center word of interest
        label_indices = [ix if ix<window_size else window_size for ix,x in enumerate(window_sequences)]
        
        # Pull out center word of interest for each window and create a tuple for each window
        if method=='skip_gram':
            batch_and_labels = [(x[y], x[:y] + x[(y+1):]) for x,y in zip(window_sequences, label_indices)]
            # Make it in to a big list of tuples (target word, surrounding word)
            tuple_data = [(x, y_) for x,y in batch_and_labels for y_ in y]
            batch, labels = [list(x) for x in zip(*tuple_data)]
        elif method=='cbow':
            batch_and_labels = [(x[:y] + x[(y+1):], x[y]) for x,y in zip(window_sequences, label_indices)]
            # Only keep windows with consistent 2*window_size
            batch_and_labels = [(x,y) for x,y in batch_and_labels if len(x)==2*window_size]
            batch, labels = [list(x) for x in zip(*batch_and_labels)]
        elif method=='doc2vec':
            # For doc2vec we keep LHS window only to predict target word
            batch_and_labels = [(rand_sentence[i:i+window_size], rand_sentence[i+window_size]) for i in range(0, len(rand_sentence)-window_size)]
            batch, labels = [list(x) for x in zip(*batch_and_labels)]
            # Add document index to batch!! Remember that we must extract the last index in batch for the doc-index
            batch = [x + [rand_sentence_ix] for x in batch]
        else:
            raise ValueError('Method {} not implmented yet.'.format(method))
            
        # extract batch and labels
        batch_data.extend(batch[:batch_size])
        label_data.extend(labels[:batch_size])
    # Trim batch and label at the end
    batch_data = batch_data[:batch_size]
    label_data = label_data[:batch_size]
    
    # Convert to numpy array
    batch_data = np.array(batch_data)
    label_data = np.transpose(np.array([label_data]))
    
    return(batch_data, label_data)

In [7]:
docs, vocab = read_data()

In [8]:
data, count, word_dictionary, reverse_dictionary = build_dataset(
vocab, vocabulary_size)

In [52]:
'the' in reverse_dictionary

True

In [9]:
text_data = text_to_numbers(docs, word_dictionary)

In [39]:
valid_words = ['love', 'hate', 'happy', 'sad', 'man', 'woman']
valid_examples = [word_dictionary[x] for x in valid_words]
valid_examples

[89, 1315, 715, 1935, 102, 298]

In [41]:
for j in range(len(valid_words)):
    valid_word = reverse_dictionary[valid_examples[j]]
    print(valid_word)

love
hate
happy
sad
man
woman


In [19]:
# Start a graph session
sess = tf.Session()
data_folder_name = "embeddings"

# Declare model parameters
batch_size = 500
#vocabulary_size = 7500
generations = 100000
model_learning_rate = 0.001

embedding_size = 400  # Word embedding size
doc_embedding_size = 200  # Document embedding size
concatenated_size = embedding_size + doc_embedding_size

num_sampled = int(batch_size / 2)  # Number of negative examples to sample.
window_size = 3  # How many words to consider to the left.

# Add checkpoints to training
save_embeddings_every = 5000
print_valid_every = 5000
print_loss_every = 100

# Declare stop words
# stops = stopwords.words('english')
stops = []

# We pick a few test words for validation.
valid_words = ['love', 'hate', 'happy', 'sad', 'man', 'woman']
# Later we will have to transform these into indices

# Load the movie review data
#print('Loading Data')
#texts, target = text_helpers.load_movie_data(data_folder_name)

# Normalize text
#print('Normalizing Text Data')
#texts = text_helpers.normalize_text(texts, stops)

# Texts must contain at least 3 words
#target = [target[ix] for ix, x in enumerate(texts) if len(x.split()) > window_size]
#texts = [x for x in texts if len(x.split()) > window_size]
#assert (len(target) == len(texts))

# Build our data set and dictionaries
#print('Creating Dictionary')
#word_dictionary = text_helpers.build_dictionary(texts, vocabulary_size)
#word_dictionary_rev = dict(zip(word_dictionary.values(), word_dictionary.keys()))
#text_data = text_helpers.text_to_numbers(texts, word_dictionary)

# Get validation word keys
valid_examples = [word_dictionary[x] for x in valid_words]

print('Creating Model')
# Define Embeddings:
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
doc_embeddings = tf.Variable(tf.random_uniform([len(docs), doc_embedding_size], -1.0, 1.0))

# NCE loss parameters
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, concatenated_size],
                                              stddev=1.0 / np.sqrt(concatenated_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Create data/target placeholders
x_inputs = tf.placeholder(tf.int32, shape=[None, window_size + 1])  # plus 1 for doc index
y_target = tf.placeholder(tf.int32, shape=[None, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

# Lookup the word embedding
# Add together element embeddings in window:
embed = tf.zeros([batch_size, embedding_size])
for element in range(window_size):
    embed += tf.nn.embedding_lookup(embeddings, x_inputs[:, element])

doc_indices = tf.slice(x_inputs, [0, window_size], [batch_size, 1])
doc_embed = tf.nn.embedding_lookup(doc_embeddings, doc_indices)

# concatenate embedding
final_embed = tf.concat([embed, tf.squeeze(doc_embed)], 1)

# Get loss from prediction
loss = tf.reduce_mean(tf.nn.nce_loss(nce_weights, nce_biases, y_target, final_embed, num_sampled, vocabulary_size))

# Create optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate=model_learning_rate)
train_step = optimizer.minimize(loss)

# Cosine similarity between words
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

# Create model saving operation
saver = tf.train.Saver({"embeddings": embeddings, "doc_embeddings": doc_embeddings})

# Add variable initializer.
init = tf.global_variables_initializer()
sess.run(init)

# Run the skip gram model.
print('Starting Training')
loss_vec = []
loss_x_vec = []
for i in range(generations):
    batch_inputs, batch_labels = generate_batch_data(text_data, batch_size, window_size, method='doc2vec')
    feed_dict = {x_inputs: batch_inputs, y_target: batch_labels}

    # Run the train step
    sess.run(train_step, feed_dict=feed_dict)

    # Return the loss
    if (i + 1) % print_loss_every == 0:
        loss_val = sess.run(loss, feed_dict=feed_dict)
        loss_vec.append(loss_val)
        loss_x_vec.append(i + 1)
        print('Loss at step {} : {}'.format(i + 1, loss_val))

    # Validation: Print some random words and top 5 related words
    if (i + 1) % print_valid_every == 0:
        sim = sess.run(similarity, feed_dict=feed_dict)
        for j in range(len(valid_words)):
            valid_word = reverse_dictionary[valid_examples[j]]
            top_k = 5  # number of nearest neighbors
            nearest = (-sim[j, :]).argsort()[1:top_k + 1]
            log_str = "Nearest to {}:".format(valid_word)
            for k in range(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str = '{} {},'.format(log_str, close_word)
            print(log_str)

    # Save dictionary + embeddings
    if (i + 1) % save_embeddings_every == 0:
        # Save vocabulary dictionary
        with open('sermon_vocab.pkl', 'wb') as handle:
            pickle.dump(word_dictionary, handle)

        # Save embeddings
        model_checkpoint_path = os.path.join(os.getcwd(), data_folder_name, 'doc2vec_sermon_embeddings.ckpt')
        save_path = saver.save(sess, model_checkpoint_path)
        print('Model saved in file: {}'.format(save_path))
    if i == generations-1:
        final_embeddings = doc_embeddings.eval(session=sess)

Creating Model
Starting Training
Loss at step 100 : 871.489990234375
Loss at step 200 : 769.6008911132812
Loss at step 300 : 767.6314086914062
Loss at step 400 : 741.4744262695312
Loss at step 500 : 746.8828735351562
Loss at step 600 : 706.4240112304688
Loss at step 700 : 689.4671020507812
Loss at step 800 : 764.1255493164062
Loss at step 900 : 730.1973876953125
Loss at step 1000 : 646.8015747070312
Loss at step 1100 : 588.3591918945312
Loss at step 1200 : 779.9973754882812
Loss at step 1300 : 661.6900024414062
Loss at step 1400 : 620.6868896484375
Loss at step 1500 : 613.6605224609375
Loss at step 1600 : 569.128662109375
Loss at step 1700 : 587.5671997070312
Loss at step 1800 : 474.6956481933594
Loss at step 1900 : 549.7796020507812
Loss at step 2000 : 501.3656311035156
Loss at step 2100 : 461.3993225097656
Loss at step 2200 : 527.5021362304688
Loss at step 2300 : 492.22674560546875
Loss at step 2400 : 457.65313720703125
Loss at step 2500 : 496.4341735839844
Loss at step 2600 : 483.78

Loss at step 17100 : 97.66463470458984
Loss at step 17200 : 94.16352844238281
Loss at step 17300 : 80.04353332519531
Loss at step 17400 : 96.1478042602539
Loss at step 17500 : 127.2129135131836
Loss at step 17600 : 85.40840911865234
Loss at step 17700 : 101.59691619873047
Loss at step 17800 : 68.8856430053711
Loss at step 17900 : 114.440673828125
Loss at step 18000 : 78.1099853515625
Loss at step 18100 : 65.07344818115234
Loss at step 18200 : 120.3863296508789
Loss at step 18300 : 65.53349304199219
Loss at step 18400 : 81.8661880493164
Loss at step 18500 : 88.90116882324219
Loss at step 18600 : 135.56089782714844
Loss at step 18700 : 134.449462890625
Loss at step 18800 : 84.70638275146484
Loss at step 18900 : 68.40701293945312
Loss at step 19000 : 123.02112579345703
Loss at step 19100 : 72.5509033203125
Loss at step 19200 : 78.45741271972656
Loss at step 19300 : 75.48531341552734
Loss at step 19400 : 79.20526885986328
Loss at step 19500 : 101.80577087402344
Loss at step 19600 : 119.058

Loss at step 34100 : 61.81928253173828
Loss at step 34200 : 48.15431213378906
Loss at step 34300 : 33.78941345214844
Loss at step 34400 : 50.130516052246094
Loss at step 34500 : 55.22766876220703
Loss at step 34600 : 45.55915832519531
Loss at step 34700 : 63.98863983154297
Loss at step 34800 : 39.53145980834961
Loss at step 34900 : 57.73471450805664
Loss at step 35000 : 67.77645111083984
Nearest to love: accost, singers, 2014, most, workshop,
Nearest to hate: scandalous, epitaph, gulls, quicke, 213,
Nearest to happy: sunday—glorify, 17th, related, dragonfly, totality,
Nearest to sad: c=christmas, criticizing, portion, shell, music,
Nearest to man: that, who, UNK, menservants, on,
Nearest to woman: 11:20, century, fulfillment—the, healer, fidel,
Model saved in file: C:\Users\Zach Dunkerton\Documents\GitHub\machine_churching\Algorithms\embeddings\doc2vec_sermon_embeddings.ckpt
Loss at step 35100 : 56.83425521850586
Loss at step 35200 : 57.89409255981445
Loss at step 35300 : 82.2665100097

Loss at step 50100 : 38.711055755615234
Loss at step 50200 : 37.27855682373047
Loss at step 50300 : 36.54139709472656
Loss at step 50400 : 39.19184494018555
Loss at step 50500 : 27.339439392089844
Loss at step 50600 : 38.66625213623047
Loss at step 50700 : 28.611473083496094
Loss at step 50800 : 33.64931869506836
Loss at step 50900 : 35.5037956237793
Loss at step 51000 : 45.354766845703125
Loss at step 51100 : 33.45505142211914
Loss at step 51200 : 37.534610748291016
Loss at step 51300 : 45.8934440612793
Loss at step 51400 : 23.161954879760742
Loss at step 51500 : 32.97930145263672
Loss at step 51600 : 37.09926986694336
Loss at step 51700 : 29.898447036743164
Loss at step 51800 : 38.036373138427734
Loss at step 51900 : 23.833248138427734
Loss at step 52000 : 36.934566497802734
Loss at step 52100 : 32.20952606201172
Loss at step 52200 : 49.86213302612305
Loss at step 52300 : 39.35600662231445
Loss at step 52400 : 33.090030670166016
Loss at step 52500 : 32.19656753540039
Loss at step 526

Loss at step 67300 : 23.784318923950195
Loss at step 67400 : 27.247180938720703
Loss at step 67500 : 51.230125427246094
Loss at step 67600 : 31.554725646972656
Loss at step 67700 : 33.06382751464844
Loss at step 67800 : 38.35008239746094
Loss at step 67900 : 28.02872657775879
Loss at step 68000 : 31.427011489868164
Loss at step 68100 : 39.627037048339844
Loss at step 68200 : 26.19187355041504
Loss at step 68300 : 28.978185653686523
Loss at step 68400 : 28.20319175720215
Loss at step 68500 : 26.17270851135254
Loss at step 68600 : 18.189538955688477
Loss at step 68700 : 22.26222801208496
Loss at step 68800 : 37.334197998046875
Loss at step 68900 : 21.652828216552734
Loss at step 69000 : 30.574934005737305
Loss at step 69100 : 13.614484786987305
Loss at step 69200 : 35.06621551513672
Loss at step 69300 : 28.519834518432617
Loss at step 69400 : 27.390230178833008
Loss at step 69500 : 25.695547103881836
Loss at step 69600 : 29.095428466796875
Loss at step 69700 : 31.54185676574707
Loss at s

Loss at step 84500 : 30.38296890258789
Loss at step 84600 : 26.361303329467773
Loss at step 84700 : 30.175214767456055
Loss at step 84800 : 41.73878860473633
Loss at step 84900 : 22.896841049194336
Loss at step 85000 : 19.18286895751953
Nearest to love: most, be, accost, as, you,
Nearest to hate: scandalous, epitaph, gulls, quicke, 213,
Nearest to happy: sunday—glorify, 17th, related, dragonfly, eunuch,
Nearest to sad: c=christmas, criticizing, portion, music, shell,
Nearest to man: that, who, on, the, a,
Nearest to woman: 11:20, fulfillment—the, century, healer, missile,
Model saved in file: C:\Users\Zach Dunkerton\Documents\GitHub\machine_churching\Algorithms\embeddings\doc2vec_sermon_embeddings.ckpt
Loss at step 85100 : 29.867206573486328
Loss at step 85200 : 17.50478172302246
Loss at step 85300 : 32.36941146850586
Loss at step 85400 : 25.386396408081055
Loss at step 85500 : 19.202865600585938
Loss at step 85600 : 24.102252960205078
Loss at step 85700 : 20.85150718688965
Loss at ste

In [21]:
import pickle
with open('doc2vec_embeddings.pkl', 'wb') as handle:
    pickle.dump(final_embeddings, handle)