In [43]:
batch_size = 200
num_hidden1 = 500 # 1st layer
num_hidden2 = 500 # 2nd layer

log_dir = "./log"

max_features = 500 # Tfidf features

max_steps = 10000

In [44]:
from blt_dataset import CATEGORIES, export_events_data

events = export_events_data("../datasets/events1.json")

In [None]:
# How to represent cyclic input?
#   For example, how to represent that 31.21 is close to 01.01? Or we should leave it to NN?
# Discretize!
# span in days from the beginning of each month
# span in hours from daily hour, whether event starts in the morning or notwis
# add dropout
# add tfidf from title

In [45]:
raw_texts = [e['description'] for e in events]

In [46]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

# nltk.download() # needs 'punkt' package


def tokenizer(text):
    words = nltk.word_tokenize(text)
    return words

# Create TF-IDF of texts
tfidf = TfidfVectorizer(tokenizer=tokenizer, stop_words=None, max_features=max_features)
sparse_tfidf_texts = tfidf.fit_transform(raw_texts)

In [47]:
import numpy as np

def flattener(left, right):
    try:
        res = reduce(flattener, right, left)
    except TypeError:
        left.append(right)
        res = left
    return res

def flatten(seq):
    return reduce(flattener, seq, [])

attrs_bool = np.nan_to_num(np.array([e['attrs_bool'] for e in events], dtype = np.float32))
attrs_bool.shape
print(np.sum(attrs_bool))

# attrs_scale01 = np.array([np.concatenate(e['attrs_scale01']).tolist() for e in events], dtype = np.float32)
# [np.concatenate(e['attrs_scale01']).tolist() for e in events]
attrs_scale01 = np.array([flatten(e['attrs_scale01']) for e in events], dtype = np.float32)
attrs_scale01 /= np.max(attrs_scale01, axis = 0)
attrs_scale01.shape
print(np.sum(attrs_scale01))

attrs_logscale01 = np.ma.log(np.array([flatten(e['attrs_logscale01']) for e in events], dtype = np.float32)).filled(0)
attrs_logscale01 /= np.max(attrs_logscale01, axis = 0)
attrs_logscale01.shape
print(np.sum(attrs_logscale01))

193847.0
172139.0
82787.9


In [48]:
features = np.hstack([
    sparse_tfidf_texts.todense(),
    attrs_bool,
    attrs_scale01,
    attrs_logscale01,
])
print(features.shape)

target = np.array([e['category'] for e in events])
print(target.shape)

np.sum(features)

(107838, 519)
(107838,)


960172.26506490353

In [49]:
import numpy as np

num_samples, num_features = features.shape

train_indices = np.random.choice(num_samples, int(0.8*num_samples), replace=False)
test_indices = np.array(list(set(range(num_samples)) - set(train_indices)))

features_train = features[train_indices]
features_test = features[test_indices]

target_train = np.array([x for ix, x in enumerate(target) if ix in train_indices])
target_test = np.array([x for ix, x in enumerate(target) if ix in test_indices])

In [50]:
import tensorflow as tf
import math
import time
from datetime import datetime
import os.path
from tensorflow.python.framework import ops
ops.reset_default_graph()   

NUM_CLASSES = len(CATEGORIES)

current_log_dir = os.path.join(log_dir, datetime.now().strftime("%Y-%m-%d-%H_%M_%S"))
tf.gfile.MakeDirs(current_log_dir)

with tf.Graph().as_default():
    x_data = tf.placeholder(shape=[None, num_features], dtype=tf.float32)
    y_target = tf.placeholder(tf.int32, shape=(None))

    keep_prob = tf.placeholder(tf.float32, name = 'keep_prob')
    
    with tf.name_scope('hidden1'):
        weights1 = tf.Variable(
            tf.truncated_normal([num_features, num_hidden1],
                            stddev=1.0 / math.sqrt(float(num_features))),
                            name='weights')
        biases1 = tf.Variable(tf.zeros([num_hidden1]),
                             name='biases')
        hidden_relu1 = tf.nn.relu(tf.matmul(x_data, weights1) + biases1)
        hidden1 = tf.nn.dropout(hidden_relu1, keep_prob, name = 'dropout')

    with tf.name_scope('hidden2'):
        weights2 = tf.Variable(
            tf.truncated_normal([num_hidden1, num_hidden2],
                            stddev=1.0 / math.sqrt(float(num_hidden1))),
                            name='weights')
        biases2 = tf.Variable(tf.zeros([num_hidden2]),
                             name='biases')
        hidden_relu2 = tf.nn.relu(tf.matmul(hidden1, weights2) + biases2)
        hidden2 = tf.nn.dropout(hidden_relu2, keep_prob, name = 'dropout')

    with tf.name_scope('softmax_linear'):
        weights_sm = tf.Variable(
            tf.truncated_normal([num_hidden2, NUM_CLASSES],
                            stddev=1.0 / math.sqrt(float(num_hidden2))),
                            name='weights')
        biases_sm = tf.Variable(tf.zeros([NUM_CLASSES]),
                             name='biases')
        logits = tf.matmul(hidden2, weights_sm) + biases_sm

    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=y_target,
        logits=logits,
        name='xentropy')
    loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
    tf.summary.scalar('loss', loss)

    # optimizer = tf.train.AdamOptimizer(0.00025)
    optimizer = tf.train.GradientDescentOptimizer(0.00025)
    global_step = tf.Variable(0, name='global_step', trainable=False)
    train_op = optimizer.minimize(loss, global_step=global_step)

    prediction = tf.nn.in_top_k(logits, y_target, 1)
    predictions_correct = tf.cast(prediction, tf.float32)
    accuracy = tf.reduce_mean(predictions_correct)
    tf.summary.scalar('accuracy', accuracy)
    
    summary = tf.summary.merge_all()
    
    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver()

    sess = tf.Session()
    
    # Instantiate a SummaryWriter to output summaries and the Graph.
    summary_writer = tf.summary.FileWriter(current_log_dir, sess.graph)
    
    init = tf.global_variables_initializer()

    sess.run(init)

    for i in range(max_steps):
        rand_index = np.random.choice(features_train.shape[0], size=batch_size)
        rand_x = features_train[rand_index]
        rand_y = np.transpose([target_train[rand_index]]).ravel()
        feed_dict = {
            x_data: rand_x,
            y_target: rand_y,
            keep_prob: 0.5,
        }
        
        sess.run(train_op, feed_dict=feed_dict)

        # Only record loss and accuracy every 100 generations
        if (i+1)%100==0:
            feed_dict_train = {
                x_data: rand_x,
                y_target: rand_y,
                keep_prob: 1.0,
            }

            feed_dict_test = {
                x_data: features_test,
                y_target: np.transpose([target_test]).ravel(),
                keep_prob: 1.0,
            }

            train_loss_temp = sess.run(loss, feed_dict=feed_dict_train)

            test_loss_temp = sess.run(loss, feed_dict=feed_dict_test)

            train_acc_temp = sess.run(accuracy, feed_dict=feed_dict_train)

            test_acc_temp = sess.run(accuracy, feed_dict=feed_dict_test)
            
            summary_str = sess.run(summary, feed_dict=feed_dict)
            summary_writer.add_summary(summary_str, i)
            summary_writer.flush()

        if (i+1)%500==0:
            acc_and_loss = [i+1, train_loss_temp, test_loss_temp, train_acc_temp * 100, test_acc_temp * 100]
            acc_and_loss = [np.round(x,3) for x in acc_and_loss]
            print('Generation # {}. Train Loss (Test Loss): {:.3f} ({:.3f}). Train Acc (Test Acc): {:.3f} ({:.3f})'.format(*acc_and_loss))
            
            checkpoint_file = os.path.join(current_log_dir, 'model.ckpt')
            saver.save(sess, checkpoint_file, global_step=i)
    
print("Done.")

Generation # 500. Train Loss (Test Loss): 2.624 (2.625). Train Acc (Test Acc): 0.185 (0.178)
Generation # 1000. Train Loss (Test Loss): 2.606 (2.610). Train Acc (Test Acc): 0.245 (0.198)
Generation # 1500. Train Loss (Test Loss): 2.599 (2.596). Train Acc (Test Acc): 0.205 (0.199)
Generation # 2000. Train Loss (Test Loss): 2.592 (2.583). Train Acc (Test Acc): 0.170 (0.200)
Generation # 2500. Train Loss (Test Loss): 2.571 (2.570). Train Acc (Test Acc): 0.220 (0.200)
Generation # 3000. Train Loss (Test Loss): 2.559 (2.556). Train Acc (Test Acc): 0.220 (0.200)
Generation # 3500. Train Loss (Test Loss): 2.538 (2.543). Train Acc (Test Acc): 0.225 (0.201)
Generation # 4000. Train Loss (Test Loss): 2.536 (2.530). Train Acc (Test Acc): 0.200 (0.201)
Generation # 4500. Train Loss (Test Loss): 2.523 (2.517). Train Acc (Test Acc): 0.220 (0.203)
Generation # 5000. Train Loss (Test Loss): 2.494 (2.504). Train Acc (Test Acc): 0.255 (0.203)
Generation # 5500. Train Loss (Test Loss): 2.491 (2.491). Tra