In [1]:
import os
import time
import pickle as pkl
import numpy as np
import tensorflow as tf
import datetime
import pandas as pd
import itertools


from scipy.sparse import csr_matrix
from sklearn.cross_validation import train_test_split
from tensorflow.contrib import learn
from itertools import repeat, chain

from kim_cnn import KimCNN
from word2vec import Word2Vec
from combined import Combined
from eval_helpers import label_lists_to_sparse_tuple
from data_helpers import batch_iter, RWBatchGenerator
from tf_helpers import save_embedding_for_viz

In [2]:
# Dataset location and label filtering
tf.flags.DEFINE_string('data_dir', 'data/stackexchange/datascience/', 'directory of dataset')
tf.flags.DEFINE_integer('tag_freq_threshold', 5, 'minimum frequency of a tag')

# Train/dev split and document truncation
tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")
# A document length is a token count, so it is an integer flag: the value is handed
# to VocabularyProcessor as the maximum document length, where a float such as
# 2000.0 is not a meaningful size.
tf.flags.DEFINE_integer("max_document_length", 2000, "Maximum length of document, exceeding part is truncated")

# Architectural parameters for KimCNN

tf.flags.DEFINE_string("loss_function", 'sigmoid', "loss function: (softmax|sigmoid) (Default: sigmoid)")

# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
# Parsed later by splitting on ',' and converting each piece to int
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")


In [3]:
# Flags of the deep walk part (all integers), declared from one
# (name, default, help) table. They are consumed further down when the random
# walk batch generator and the Word2Vec model are constructed.
_DW_INTEGER_FLAGS = (
    ("dw_batch_size", 128, "Batch Size for deep walk model (default: 128)"),
    ("dw_skip_window", 3, "How many words to consider left and right. (default: 3)"),
    ("dw_num_skips", 4, "How many times to reuse an input to generate a label. (default: 4)"),
    ("dw_embedding_size", 128, "Dimensionality of node embedding. (default: 128)"),
    ("dw_num_negative_samples", 64, "Number of negative examples to sample. (default: 64)"),
)
for _flag_name, _flag_default, _flag_help in _DW_INTEGER_FLAGS:
    tf.flags.DEFINE_integer(_flag_name, _flag_default, _flag_help)

In [4]:
# Global training schedule: one (name, default, help) row per integer flag.
_SCHEDULE_FLAGS = (
    ("num_epochs", 200, "Number of training epochs (default: 200)"),
    ("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)"),
    ("checkpoint_every", 100, "Save model after this many steps (default: 100)"),
    ("num_checkpoints", 5, "Number of checkpoints to store (default: 5)"),
)
for _flag_name, _flag_default, _flag_help in _SCHEDULE_FLAGS:
    tf.flags.DEFINE_integer(_flag_name, _flag_default, _flag_help)

# Misc parameters: boolean switches that are forwarded to tf.ConfigProto
# when the session is created.
_DEVICE_FLAGS = (
    ("allow_soft_placement", True, "Allow device soft device placement"),
    ("log_device_placement", False, "Log placement of ops on devices"),
)
for _flag_name, _flag_default, _flag_help in _DEVICE_FLAGS:
    tf.flags.DEFINE_boolean(_flag_name, _flag_default, _flag_help)

In [5]:
# Parse the declared flags, then echo every one as NAME=value, ordered by name.
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

print("\nParameters:")
parsed_flags = sorted(FLAGS.__flags.items())
for flag_name, flag_value in parsed_flags:
    print("{}={}".format(flag_name.upper(), flag_value))
print("")

# Dataset root; the input files read by the following cells are located under it.
data_dir = FLAGS.data_dir


Parameters:
ALLOW_SOFT_PLACEMENT=True
BATCH_SIZE=64
CHECKPOINT_EVERY=1000
DATA_DIR=data/stackexchange/datascience/
DEV_SAMPLE_PERCENTAGE=0.1
DROPOUT_KEEP_PROB=0.5
DW_BATCH_SIZE=128
DW_EMBEDDING_SIZE=128
DW_NUM_NEGATIVE_SAMPLES=64
DW_NUM_SKIPS=4
DW_SKIP_WINDOW=3
EMBEDDING_DIM=128
EVALUATE_EVERY=100
FILTER_SIZES=3,4,5
L2_REG_LAMBDA=0.0
LOG_DEVICE_PLACEMENT=False
LOSS_FUNCTION=sigmoid
MAX_DOCUMENT_LENGTH=2000
NUM_CHECKPOINTS=5
NUM_EPOCHS=200
NUM_FILTERS=128
TAG_FREQ_THRESHOLD=5



In [6]:
# load text data and label information

text_path = os.path.join(data_dir, "input_text.csv")
tdf = pd.read_csv(text_path, header=None)  # the csv has no header row
x_text = tdf[1]  # column 1 is the text handed to the vocabulary processor

# Turn every document into a sequence of word ids (length bounded by max_document_length)
vocab_processor = learn.preprocessing.VocabularyProcessor(FLAGS.max_document_length)
X = np.array(list(vocab_processor.fit_transform(x_text)))
# One id per document, in row order
node_ids = np.arange(X.shape[0])

# load train/test data
# NOTE: unpickling can execute arbitrary code -- only load Y.pkl from a trusted source.
# The context manager makes sure the file handle is closed after loading.
with open(os.path.join(data_dir, "Y.pkl"), 'rb') as label_file:
    Y_labels = pkl.load(label_file)

# Dense document x label indicator matrix: entry (i, j) is 1 when label j is in Y_labels[i].
col_indx = list(chain.from_iterable(Y_labels))
# Row index i repeated once for every label of document i, aligned with col_indx
row_indx = [i for i, ls in enumerate(Y_labels) for _ in ls]
size = len(col_indx)
# NOTE(review): the number of columns is the count of distinct label ids, which only
# fits the ids if they are contiguous starting at 0 -- confirm against how Y.pkl is built.
Y_binary = csr_matrix((np.ones(size), (row_indx, col_indx)),
                      shape=(len(Y_labels), len(set(col_indx)))).toarray()


# split data
# All four arrays are split with the same shuffling (fixed seed), so rows stay aligned.
x_train, x_dev, y_train_binary, y_dev_binary, y_train_labels, y_dev_labels, train_node_ids, dev_node_ids = train_test_split(
    X, Y_binary, Y_labels, node_ids, train_size=1 - FLAGS.dev_sample_percentage, random_state=42)
print("Train/Dev split: {:d}/{:d}".format(len(x_train), len(x_dev)))

num_classes = y_train_binary.shape[1]
print("num of classes: {:d}".format(num_classes))

Train/Dev split: 4630/515
num of classes: 328


In [7]:
# load node embedding data

# Located with os.path.join like the other input files; string formatting produced
# a doubled separator whenever data_dir already ended with '/'.
walks = RWBatchGenerator.read_walks(os.path.join(data_dir, "random_walks.txt"))

# Number of distinct nodes that appear in at least one walk
vocabulary_size = len(set(itertools.chain.from_iterable(walks)))

# Batch source for the graph objective, configured from the dw_* flags
dw_data_generator = RWBatchGenerator(
    walks, FLAGS.dw_batch_size, FLAGS.dw_num_skips, FLAGS.dw_skip_window)


In [8]:
# Training
# ==================================================


# Everything below is built inside a fresh graph that is made the default for this cell.
with tf.Graph().as_default():
    # Device placement behaviour comes from the two misc flags
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    # NOTE(review): the session is created without a context manager and no
    # sess.close() is visible in this cell.
    sess = tf.Session(config=session_conf)

    with sess.as_default():
        # Text classifier; its ops are grouped under the 'kim_cnn' name scope
        with tf.name_scope('kim_cnn'):
            cnn = KimCNN(
                sequence_length=x_train.shape[1],
                num_classes=num_classes,
                vocab_size=len(vocab_processor.vocabulary_),
                embedding_size=FLAGS.embedding_dim,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                l2_reg_lambda=FLAGS.l2_reg_lambda,
                loss_function=FLAGS.loss_function,
                redefine_output_layer=True)

        # Node embedding model over the random walks; vocabulary_size is the
        # number of distinct nodes found in the walks
        with tf.name_scope('dw'):
            dw = Word2Vec(FLAGS.dw_num_negative_samples,
                          vocabulary_size,
                          FLAGS.dw_embedding_size)
        
        # Joint model built from both parts; it exposes label_loss, graph_loss,
        # p1/p3/p5 and node_ids, which the rest of this cell uses
        with tf.name_scope('combined'):
            model = Combined(cnn, dw)

        # Non-trainable step counter. It is not passed to either minimize() call
        # below, so the optimizers themselves do not advance it.
        global_step = tf.Variable(0, name="global_step", trainable=False)
        
        # One optimizer per objective: Adam (learning rate 1e-3) for the label loss,
        # plain gradient descent (learning rate 1.0) for the graph loss
        label_train_op = tf.train.AdamOptimizer(1e-3).minimize(model.label_loss)        
        graph_train_op = tf.train.GradientDescentOptimizer(1.0).minimize(model.graph_loss)


        # Output directory for models and summaries
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", 'combined'))
        print("Writing to {}\n".format(out_dir))

        # Start from an empty directory: output of a previous run is deleted first
        if tf.gfile.Exists(out_dir):
            print('cleaning ', out_dir)
            tf.gfile.DeleteRecursively(out_dir)
        tf.gfile.MakeDirs(out_dir)
        
        # Scalar summaries: the two losses and the precision values p1 / p3 / p5
        label_loss_summary = tf.summary.scalar("label_loss", model.label_loss)
        graph_loss_summary = tf.summary.scalar("graph_loss", model.graph_loss)
        p1 = tf.summary.scalar("p1", model.p1)
        p3 = tf.summary.scalar("p3", model.p3)
        p5 = tf.summary.scalar("p5", model.p5)

        # Train and dev merge the same five scalars; only the target directory differs
        all_scalar_summaries = [label_loss_summary, graph_loss_summary, p1, p3, p5]
        summaries_root = os.path.join(out_dir, "summaries")

        # Train summaries
        train_summary_op = tf.summary.merge(all_scalar_summaries)
        train_summary_dir = os.path.join(summaries_root, "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.summary.merge(all_scalar_summaries)
        dev_summary_dir = os.path.join(summaries_root, "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

        # Checkpoints go to <out_dir>/checkpoints; the directory has to exist
        # before the saver writes into it, so create it when missing
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        # Keep only the most recent FLAGS.num_checkpoints checkpoints
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

        # Store the fitted vocabulary next to the model
        vocab_path = os.path.join(out_dir, "vocab")
        vocab_processor.save(vocab_path)

        # Initialise every variable before the first training step
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        
        def train_label_step(x_batch, y_batch_binary, y_batch_labels, node_ids, writer):
            """
            Run one optimisation step for the label part and log it.

            Args:
                x_batch: documents of the batch, fed to `model.cnn.input_x`.
                y_batch_binary: binary label indicators, fed to
                    `model.cnn.input_y_binary`.
                y_batch_labels: per-document label lists; converted with
                    `label_lists_to_sparse_tuple` before being fed.
                node_ids: node ids of the documents, fed to `model.node_ids`.
                writer: summary writer that receives the summaries of this step.
            """
            feed_dict = {
              model.cnn.input_x: x_batch,
              model.cnn.input_y_binary: y_batch_binary,
              model.cnn.input_y_labels: label_lists_to_sparse_tuple(
                  y_batch_labels, num_classes),  # needs some conversion
              model.node_ids: node_ids, # node ids
              model.cnn.dropout_keep_prob: FLAGS.dropout_keep_prob,

              # The graph-side placeholders carry no real data in this step; they
              # are fed dummy values because a value is needed for every placeholder.
              # NOTE(review): train_summary_op also merges the graph_loss summary,
              # so the graph loss logged from this step comes from these dummies.
              model.dw.train_inputs: [0],
              model.dw.train_labels: [[0]],
            }
            _, step, summaries, label_loss, p1_value, p3_value, p5_value = sess.run(
                [label_train_op, global_step, train_summary_op, model.label_loss, model.p1, model.p3, model.p5],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, label loss {:g}, p1 {:g}, p3 {:g}, p5 {:g}".format(
                time_str, step, label_loss, p1_value, p3_value, p5_value))
            # Write through the writer that was passed in (as the other step
            # functions do) instead of reaching for the global train writer.
            writer.add_summary(summaries, step)

        def train_graph_step(x_batch, batch_labels, writer):
            """
            Run a single optimisation step for the graph part and log it.

            Args:
                x_batch: inputs of the batch, fed to `model.dw.train_inputs`.
                batch_labels: target for every input; fed to
                    `model.dw.train_labels` with an extra trailing axis.
                writer: summary writer that receives the summaries of this step.
            """
            # Real inputs of the graph objective
            graph_inputs = {
                model.dw.train_inputs: x_batch,
                model.dw.train_labels: np.expand_dims(np.array(batch_labels), -1),
            }
            # The label-side placeholders carry no real data here but still need a
            # value: one nonsense document with an all-zero label vector.
            unused_label_inputs = {
                model.cnn.input_x: list(vocab_processor.transform(["asdfkjahdkfhakslfh"])),
                model.cnn.input_y_binary: [[0] * num_classes],
                model.cnn.input_y_labels: label_lists_to_sparse_tuple(
                    [[0]], num_classes),
                model.node_ids: [0],
                model.cnn.dropout_keep_prob: FLAGS.dropout_keep_prob,
            }
            feed_dict = dict(graph_inputs)
            feed_dict.update(unused_label_inputs)

            fetches = [graph_train_op, global_step, train_summary_op, model.graph_loss]
            _, step, summaries, graph_loss = sess.run(fetches, feed_dict)

            time_str = datetime.datetime.now().isoformat()
            message = "{}: step {}, graph loss {:g}".format(
                time_str, step, graph_loss)
            print(message)
            writer.add_summary(summaries, step)

            
        def dev_step(x_batch, y_batch_binary, y_batch_labels, node_ids, writer):
            """
            Score the label part on the given data without training.

            No train op is fetched and the dropout keep probability is fed as 1.0.
            The label loss and p1 / p3 / p5 are printed, and the merged dev
            summaries are added to `writer` under the current step.
            """
            sparse_labels = label_lists_to_sparse_tuple(y_batch_labels, num_classes)
            feed_dict = {
                model.cnn.input_x: x_batch,
                model.cnn.input_y_binary: y_batch_binary,
                model.cnn.input_y_labels: sparse_labels,
                model.node_ids: node_ids,
                model.cnn.dropout_keep_prob: 1.0,
            }
            # Graph-side placeholders carry no real data during evaluation; feed dummies
            feed_dict[model.dw.train_inputs] = [0]
            feed_dict[model.dw.train_labels] = [[0]]

            fetches = [global_step, dev_summary_op, model.label_loss, model.p1, model.p3, model.p5]
            step, summaries, label_loss, p1_value, p3_value, p5_value = sess.run(fetches, feed_dict)

            time_str = datetime.datetime.now().isoformat()
            message = "{}: step {}, label loss {:g}, p1 {:g}, p3 {:g}, p5 {:g}".format(
                time_str, step, label_loss, p1_value, p3_value, p5_value)
            print(message)

            writer.add_summary(summaries, step)

        # Pair every training document with its binary labels, label list and node id;
        # batch_iter then yields mini-batches of these examples.
        batches = batch_iter(
            list(zip(x_train, y_train_binary, y_train_labels, train_node_ids)), FLAGS.batch_size, FLAGS.num_epochs)

        # Built once, before the loop: running this op adds 1 to the global_step variable.
        # `global_step += 1` on a tf.Variable does not assign to it -- it rebinds the
        # Python name to a new `global_step + 1` tensor, which adds an op to the graph on
        # every iteration and leaves the variable (and so every checkpoint) at 0.
        increment_global_step = tf.assign_add(global_step, 1)

        for batch in batches:
            # train label part
            # The batch's label lists get their own name so the full y_train_labels
            # list is not overwritten (which would break re-running this cell).
            x_batch, y_batch_binary, y_batch_labels, x_node_ids = zip(*batch)
            train_label_step(x_batch, y_batch_binary, y_batch_labels, x_node_ids, train_summary_writer)

            # train graph part
            batch_inputs, batch_labels = dw_data_generator.next_batch()
            train_graph_step(batch_inputs, batch_labels, train_summary_writer)

            # The label step and the graph step above share this step number
            current_step = tf.train.global_step(sess, global_step)

            if current_step % FLAGS.evaluate_every == 0:
                print("\nEvaluation:")
                dev_step(x_dev, y_dev_binary, y_dev_labels, dev_node_ids, dev_summary_writer)
                print("")

            if current_step % FLAGS.checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))

            # One full iteration (label step + graph step) is done: advance the counter
            sess.run(increment_global_step)

cnn: <class 'kim_cnn.KimCNN'>
dw: <class 'word2vec.Word2Vec'>
use sigmoid xentropy
Writing to /home/cloud-user/code/network_embedding/runs/combined

2017-09-23T16:13:52.641463: step 0, label loss 1.85806, p1 0.015625, p3 0.015625, p5 0.0125
2017-09-23T16:13:52.686052: step 0, graph loss 210.697

Evaluation:
2017-09-23T16:13:56.735411: step 0, label loss 0.938104, p1 0.023301, p3 0.00906149, p5 0.00776699

Saved model checkpoint to /home/cloud-user/code/network_embedding/runs/combined/checkpoints/model-0

2017-09-23T16:13:58.539058: step 1, label loss 1.35152, p1 0, p3 0, p5 0
2017-09-23T16:13:58.585387: step 1, graph loss 194.461
2017-09-23T16:14:00.221543: step 2, label loss 0.932381, p1 0, p3 0, p5 0
2017-09-23T16:14:00.264123: step 2, graph loss 179.424
2017-09-23T16:14:01.924874: step 3, label loss 0.646928, p1 0.015625, p3 0.00520833, p5 0.00625
2017-09-23T16:14:01.967379: step 3, graph loss 210.598
2017-09-23T16:14:03.616980: step 4, label loss 0.432819, p1 0.015625, p3 0.0052083

2017-09-23T16:15:26.994647: step 53, label loss 0.0640461, p1 0.125, p3 0.0989583, p5 0.096875
2017-09-23T16:15:27.039386: step 53, graph loss 119.315
2017-09-23T16:15:28.663191: step 54, label loss 0.0644611, p1 0.171875, p3 0.125, p5 0.10625
2017-09-23T16:15:28.711150: step 54, graph loss 141.159
2017-09-23T16:15:30.356055: step 55, label loss 0.0602297, p1 0.1875, p3 0.130208, p5 0.103125
2017-09-23T16:15:30.401563: step 55, graph loss 125.379
2017-09-23T16:15:32.043772: step 56, label loss 0.0638242, p1 0.21875, p3 0.130208, p5 0.103125
2017-09-23T16:15:32.091411: step 56, graph loss 145.913
2017-09-23T16:15:33.763648: step 57, label loss 0.0599512, p1 0.171875, p3 0.130208, p5 0.10625
2017-09-23T16:15:33.811067: step 57, graph loss 134.918
2017-09-23T16:15:35.472328: step 58, label loss 0.0650965, p1 0.21875, p3 0.135417, p5 0.109375
2017-09-23T16:15:35.519970: step 58, graph loss 141.108
2017-09-23T16:15:37.146324: step 59, label loss 0.0603348, p1 0.25, p3 0.15625, p5 0.11875
20

2017-09-23T16:17:02.104260: step 107, label loss 0.0532578, p1 0.140625, p3 0.109375, p5 0.10625
2017-09-23T16:17:02.159429: step 107, graph loss 106.569
2017-09-23T16:17:03.818466: step 108, label loss 0.0582029, p1 0.0625, p3 0.0989583, p5 0.09375
2017-09-23T16:17:03.878189: step 108, graph loss 109.457
2017-09-23T16:17:05.506372: step 109, label loss 0.0529901, p1 0.171875, p3 0.109375, p5 0.096875
2017-09-23T16:17:05.561482: step 109, graph loss 95.3499
2017-09-23T16:17:07.244152: step 110, label loss 0.050586, p1 0.09375, p3 0.0729167, p5 0.08125
2017-09-23T16:17:07.298398: step 110, graph loss 89.3025
2017-09-23T16:17:09.017414: step 111, label loss 0.0546352, p1 0.15625, p3 0.140625, p5 0.1125
2017-09-23T16:17:09.073174: step 111, graph loss 120.993
2017-09-23T16:17:10.764453: step 112, label loss 0.0553882, p1 0.109375, p3 0.0885417, p5 0.084375
2017-09-23T16:17:10.818508: step 112, graph loss 129.256
2017-09-23T16:17:12.493678: step 113, label loss 0.0537053, p1 0.171875, p3 0

2017-09-23T16:18:33.560709: step 161, label loss 0.0527037, p1 0.125, p3 0.0989583, p5 0.096875
2017-09-23T16:18:33.620712: step 161, graph loss 99.178
2017-09-23T16:18:35.240543: step 162, label loss 0.0473902, p1 0.140625, p3 0.0989583, p5 0.1
2017-09-23T16:18:35.305962: step 162, graph loss 104.828
2017-09-23T16:18:36.936738: step 163, label loss 0.0502322, p1 0.21875, p3 0.130208, p5 0.11875
2017-09-23T16:18:37.000001: step 163, graph loss 85.8571
2017-09-23T16:18:38.620950: step 164, label loss 0.0524488, p1 0.15625, p3 0.145833, p5 0.14375
2017-09-23T16:18:38.683015: step 164, graph loss 99.4061
2017-09-23T16:18:40.309425: step 165, label loss 0.0516906, p1 0.140625, p3 0.135417, p5 0.134375
2017-09-23T16:18:40.369779: step 165, graph loss 105.829
2017-09-23T16:18:41.992866: step 166, label loss 0.0503789, p1 0.0625, p3 0.078125, p5 0.0875
2017-09-23T16:18:42.057130: step 166, graph loss 102.402
2017-09-23T16:18:43.685766: step 167, label loss 0.0524008, p1 0.1875, p3 0.114583, p

2017-09-23T16:20:08.330878: step 214, label loss 0.0523593, p1 0.15625, p3 0.104167, p5 0.115625
2017-09-23T16:20:08.395140: step 214, graph loss 62.746
2017-09-23T16:20:10.056365: step 215, label loss 0.0473336, p1 0.15625, p3 0.140625, p5 0.115625
2017-09-23T16:20:10.121439: step 215, graph loss 97.2988
2017-09-23T16:20:11.769026: step 216, label loss 0.0511587, p1 0.25, p3 0.161458, p5 0.128125
2017-09-23T16:20:11.834038: step 216, graph loss 85.1765
2017-09-23T16:20:13.459272: step 217, label loss 0.0459071, p1 0.1875, p3 0.140625, p5 0.10625
2017-09-23T16:20:13.524506: step 217, graph loss 101.784
2017-09-23T16:20:14.146365: step 218, label loss 0.0509102, p1 0.136364, p3 0.0757576, p5 0.0727273
2017-09-23T16:20:14.212829: step 218, graph loss 99.8138
2017-09-23T16:20:15.873478: step 219, label loss 0.0462685, p1 0.25, p3 0.151042, p5 0.134375
2017-09-23T16:20:15.947543: step 219, graph loss 87.0208
2017-09-23T16:20:17.655428: step 220, label loss 0.0445792, p1 0.203125, p3 0.1406

2017-09-23T16:21:41.763807: step 268, label loss 0.0487234, p1 0.1875, p3 0.171875, p5 0.13125
2017-09-23T16:21:41.837898: step 268, graph loss 77.1163
2017-09-23T16:21:43.462766: step 269, label loss 0.0451714, p1 0.171875, p3 0.109375, p5 0.084375
2017-09-23T16:21:43.535756: step 269, graph loss 68.1483
2017-09-23T16:21:45.175264: step 270, label loss 0.0468382, p1 0.15625, p3 0.119792, p5 0.103125
2017-09-23T16:21:45.247323: step 270, graph loss 95.0827
2017-09-23T16:21:46.883942: step 271, label loss 0.0402676, p1 0.28125, p3 0.161458, p5 0.128125
2017-09-23T16:21:46.953071: step 271, graph loss 95.2819
2017-09-23T16:21:48.581670: step 272, label loss 0.046704, p1 0.265625, p3 0.192708, p5 0.1625
2017-09-23T16:21:48.662359: step 272, graph loss 95.0237
2017-09-23T16:21:50.303783: step 273, label loss 0.0469143, p1 0.375, p3 0.21875, p5 0.1625
2017-09-23T16:21:50.386014: step 273, graph loss 95.8912
2017-09-23T16:21:52.005807: step 274, label loss 0.0445276, p1 0.171875, p3 0.15625,

2017-09-23T16:23:17.120478: step 321, label loss 0.0416058, p1 0.234375, p3 0.1875, p5 0.175
2017-09-23T16:23:17.201201: step 321, graph loss 69.5841
2017-09-23T16:23:18.901790: step 322, label loss 0.0434087, p1 0.234375, p3 0.166667, p5 0.13125
2017-09-23T16:23:18.980938: step 322, graph loss 84.3389
2017-09-23T16:23:20.686048: step 323, label loss 0.0462446, p1 0.203125, p3 0.166667, p5 0.153125
2017-09-23T16:23:20.768140: step 323, graph loss 61.0291
2017-09-23T16:23:22.411857: step 324, label loss 0.0433876, p1 0.15625, p3 0.166667, p5 0.15
2017-09-23T16:23:22.498293: step 324, graph loss 76.9003
2017-09-23T16:23:24.204735: step 325, label loss 0.039667, p1 0.171875, p3 0.140625, p5 0.121875
2017-09-23T16:23:24.289476: step 325, graph loss 60.3389
2017-09-23T16:23:25.953371: step 326, label loss 0.0443136, p1 0.265625, p3 0.177083, p5 0.146875
2017-09-23T16:23:26.037760: step 326, graph loss 74.1743
2017-09-23T16:23:27.754932: step 327, label loss 0.0376907, p1 0.28125, p3 0.21354

KeyboardInterrupt: 