In [1]:
import tensorflow as tf
import math
import os
import time
import itertools
import numpy as np
from data_helpers import RWBatchGenerator

In [2]:
# Register the notebook's configuration flags. Arguments are passed
# positionally as (flag name, default value, help text).

# Directory holding the dataset files.
tf.flags.DEFINE_string(
    'data_dir',
    'data/stackexchange/datascience/',
    'directory of dataset')

# Checkpoint cadence (in training steps) and how many checkpoints to retain.
tf.flags.DEFINE_integer(
    "checkpoint_every",
    5000,
    "Save model after this many steps (default: 5000)")
tf.flags.DEFINE_integer(
    "num_checkpoints",
    5,
    "Number of checkpoints to store (default: 5)")

In [3]:
# Parse the registered flags, then echo every flag as NAME=value in
# alphabetical order so the run configuration is visible in the cell output.
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()  # private TensorFlow API -- NOTE(review): tied to the installed TF version

print("\nParameters:")
parsed_flags = FLAGS.__flags
for attr in sorted(parsed_flags):
    value = parsed_flags[attr]
    print("{}={}".format(attr.upper(), value))
print("")


Parameters:
CHECKPOINT_EVERY=5000
DATA_DIR=data/stackexchange/datascience/
NUM_CHECKPOINTS=5



In [4]:
# Hyper-parameters, corpus loading and validation-sample selection.
data_dir = FLAGS.data_dir

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 3       # How many words to consider left and right.
num_skips = 4         # How many times to reuse an input to generate a label.

# os.path.join works whether or not data_dir carries a trailing slash.
walks = RWBatchGenerator.read_walks(os.path.join(data_dir, "random_walks.txt"))

# Number of distinct ids appearing in the walks. chain.from_iterable streams
# over the walks instead of unpacking every walk into a call argument.
# NOTE(review): the embedding table built later has exactly this many rows, so
# ids are presumably dense integers in [0, vocabulary_size) -- confirm against
# data_helpers.
vocabulary_size = len(set(itertools.chain.from_iterable(walks)))

generator = RWBatchGenerator(walks, batch_size, num_skips, skip_window)

# We pick a random validation set to sample nearest neighbors, restricted to
# ids below valid_window.
# NOTE(review): the original comment states that low ids are the most frequent
# "by construction" -- confirm that this holds for this dataset.
valid_size = 16     # Random set of words to evaluate similarity on.
# Clamp the sampling window so a graph with fewer than 100 nodes cannot yield
# validation ids that fall outside the embedding table.
valid_window = min(100, vocabulary_size)
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.



In [5]:
# Build the training graph: embedding table, NCE loss, SGD step and a
# cosine-similarity probe over the validation ids.
graph = tf.Graph()

with graph.as_default():
    # Feeds for one minibatch, plus the fixed validation ids as a constant.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        with tf.name_scope('embedding'):
            # One row per id, initialised uniformly in [-1, 1).
            initial_embeddings = tf.random_uniform(
                [vocabulary_size, embedding_size], -1.0, 1.0)
            embeddings = tf.Variable(initial_embeddings)
            # Rows of the table selected by the current batch of inputs.
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        with tf.name_scope('nce'):
            # Output-side weights and biases used by the NCE loss.
            nce_stddev = 1.0 / math.sqrt(embedding_size)
            initial_nce_weights = tf.truncated_normal(
                [vocabulary_size, embedding_size], stddev=nce_stddev)
            nce_weights = tf.Variable(initial_nce_weights)
            nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Average NCE loss over the batch. tf.nn.nce_loss draws a new sample of
    # negative labels each time the loss is evaluated.
    with tf.name_scope('loss'):
        per_example_loss = tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=train_labels,
            inputs=embed,
            num_sampled=num_sampled,
            num_classes=vocabulary_size)
        loss = tf.reduce_mean(per_example_loss)

    # Plain SGD with a learning rate of 1.0.
    sgd = tf.train.GradientDescentOptimizer(1.0)
    optimizer = sgd.minimize(loss)

    # Cosine similarity between the validation ids and every embedding: each
    # row is divided by its L2 norm, so the matrix product below yields cosines.
    squared_row_sums = tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True)
    norm = tf.sqrt(squared_row_sums)
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)

    # Op that initialises every variable created above.
    init = tf.global_variables_initializer()

In [6]:
# Unix timestamp of this run. It is only referenced by the disabled per-run
# directory naming ("deepwalk-<timestamp>"), so every run currently shares
# the same output directory.
timestamp = str(int(time.time()))

# Everything is written below ./runs/deepwalk; a directory left over from a
# previous run is removed first so the run starts from a clean slate.
runs_root = os.path.join(os.path.curdir, "runs")
out_dir = os.path.abspath(os.path.join(runs_root, "deepwalk"))
if tf.gfile.Exists(out_dir):
    tf.gfile.DeleteRecursively(out_dir)
tf.gfile.MakeDirs(out_dir)

print("Writing to {}\n".format(out_dir))

# Summary config: a single scalar (the training loss) merged into one op.
loss_summary = tf.summary.scalar("loss", loss)
train_summary_op = tf.summary.merge([loss_summary])
summaries_root = os.path.join(out_dir, "summaries")
train_summary_dir = os.path.join(summaries_root, "train")

# Checkpoint config: checkpoint files are written as <checkpoint_dir>/model-*.
checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
checkpoint_prefix = os.path.join(checkpoint_dir, "model")
checkpoint_dir_missing = not os.path.exists(checkpoint_dir)
if checkpoint_dir_missing:
    os.makedirs(checkpoint_dir)

Writing to /home/cloud-user/code/network_embedding/runs/deepwalk



In [None]:
from tensorflow.contrib.tensorboard.plugins import projector

def save_embedding_for_viz(embeddings):
    """Export an embedding tensor for the TensorBoard embedding projector.

    The tensor is evaluated in the current default session, copied into a
    variable named ``node_embedding`` and saved as ``model2.ckpt-1`` inside
    the module-level ``checkpoint_dir``. A projector config pointing at that
    variable and at ``labels_for_visualization.tsv`` (looked up in the
    module-level ``data_dir``) is written to the same directory.

    The variable and its saver live in a throwaway graph and session, so
    calling this repeatedly during training neither adds nodes to the
    training graph nor changes the exported tensor name between calls.

    Args:
        embeddings: tensor or variable to export; it must be evaluable in the
            default session (the function calls ``embeddings.eval()``).
    """
    embeddings_val = embeddings.eval()

    # Build the snapshot variable in an isolated graph/session so the training
    # graph does not grow by one variable and one saver per call.
    viz_graph = tf.Graph()
    with viz_graph.as_default():
        embedding_var = tf.Variable(embeddings_val, name='node_embedding')
        viz_saver = tf.train.Saver([embedding_var])
        with tf.Session(graph=viz_graph) as viz_session:
            viz_session.run(embedding_var.initializer)
            viz_saver.save(
                viz_session, os.path.join(checkpoint_dir, 'model2.ckpt'), 1)

    # Format: tensorflow/tensorboard/plugins/projector/projector_config.proto
    config = projector.ProjectorConfig()

    # You can add multiple embeddings. Here we add only one.
    embedding = config.embeddings.add()
    embedding.tensor_name = embedding_var.name

    # Link this tensor to its metadata file (e.g. labels). The path is resolved
    # against the current working directory instead of a machine-specific
    # absolute prefix.
    embedding.metadata_path = os.path.abspath(
        os.path.join(data_dir, 'labels_for_visualization.tsv'))

    # Use the same LOG_DIR where the checkpoint was stored. The writer is only
    # needed to locate that directory, so it is closed as soon as the config
    # file has been written.
    summary_writer = tf.summary.FileWriter(checkpoint_dir)
    try:
        # Writes projector_config.pbtxt into the LOG_DIR; TensorBoard reads
        # this file during startup.
        projector.visualize_embeddings(summary_writer, config)
    finally:
        summary_writer.close()

    print('embedding for visualization saved')


In [None]:
# Step 5: Begin training.
# The loop is effectively open-ended and is normally stopped by interrupting
# the kernel; the try/finally below makes sure the summary writer is closed
# (and its pending events written out) when that happens.
num_steps = 9999999
top_k = 8  # number of nearest neighbors reported per validation id

with tf.Session(graph=graph) as session:
    train_summary_writer = tf.summary.FileWriter(train_summary_dir, session.graph)
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

    # We must initialize all variables before we use them.
    init.run()
    print('Initialized')

    average_loss = 0
    try:
        for step in range(num_steps):
            batch_inputs, batch_labels = generator.next_batch()

            # Labels are fed with a trailing axis of size 1 to match the
            # [batch_size, 1] placeholder.
            feed_dict = {train_inputs: batch_inputs,
                         train_labels: np.expand_dims(np.array(batch_labels), -1)}

            # One update step: running the optimizer op applies the gradients;
            # the loss value and its summary are fetched in the same call.
            _, loss_val, summaries = session.run(
                [optimizer, loss, train_summary_op], feed_dict=feed_dict)
            average_loss += loss_val

            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last
                # 2000 batches (at step 0 it is the loss of the first batch).
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0

            # Note that this is expensive (~20% slowdown if computed every 500 steps)
            if step % 10000 == 0:
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = valid_examples[i]
                    # Sort by descending similarity and skip position 0, which
                    # is expected to be the validation id itself.
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = 'Nearest to %s:' % valid_word
                    # Iterating over `nearest` directly also copes with fewer
                    # than top_k neighbours being available.
                    for close_word in nearest:
                        log_str = '%s %s,' % (log_str, close_word)
                    print(log_str)

            if step % FLAGS.checkpoint_every == 0 and step > 0:
                path = saver.save(session, checkpoint_prefix, global_step=step)

                print("Saved model checkpoint to {}\n".format(path))
                save_embedding_for_viz(normalized_embeddings)

            train_summary_writer.add_summary(summaries, step)
    finally:
        # Runs on normal completion, on errors and on kernel interrupts.
        train_summary_writer.close()
       


Initialized
Average loss at step  0 :  218.67350769
Nearest to 18: 754, 1908, 1589, 260, 3766, 1044, 2935, 3877,
Nearest to 15: 5084, 1641, 2722, 1618, 1075, 131, 4130, 295,
Nearest to 63: 641, 1188, 2909, 5000, 68, 2019, 3872, 313,
Nearest to 88: 3010, 3964, 340, 1050, 3135, 1833, 1590, 1139,
Nearest to 27: 4473, 2594, 2144, 3180, 3538, 2999, 787, 2258,
Nearest to 30: 5074, 4943, 1914, 1367, 1283, 949, 1204, 4516,
Nearest to 95: 3073, 4566, 2534, 4756, 1373, 40, 1365, 4727,
Nearest to 24: 3873, 2856, 854, 3201, 3798, 3989, 4370, 1913,
Nearest to 21: 2867, 2400, 3312, 1507, 3138, 4674, 3493, 2733,
Nearest to 32: 3323, 2393, 4291, 4009, 1399, 2913, 4908, 3084,
Nearest to 45: 940, 4705, 1379, 3612, 1876, 639, 3521, 811,
Nearest to 13: 3524, 418, 229, 4351, 1650, 4120, 890, 4511,
Nearest to 49: 2572, 566, 803, 5050, 1229, 1968, 3005, 4826,
Nearest to 71: 69, 4257, 664, 4519, 1999, 3610, 1125, 4198,
Nearest to 76: 2365, 5015, 3489, 260, 2850, 3109, 2795, 3353,
Nearest to 92: 1159, 1663, 42