In [10]:
batch_size = 200
num_hidden1 = 500 # 1st layer
num_hidden2 = 500 # 2nd layer

log_dir = "./log"

max_features = 500 # Tfidf features

min_steps = 20000

In [2]:
from blt_dataset import CATEGORIES, export_events_data

events = export_events_data("../datasets/events1.json")

In [3]:
# How to represent cyclic input?
#   For example, how to represent that 31.12 is close to 01.01? Or we should leave it to NN?
# Discretize!
# span in days from the beginning of each month
# span in hours from daily hour, whether event starts in the morning or notwis

In [4]:
raw_desc_texts = [e['description'] for e in events]
raw_title_texts = [e['title'] for e in events]

In [5]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

# nltk.download() # needs 'punkt' package


def tokenizer(text):
    words = nltk.word_tokenize(text)
    return words

# Create TF-IDF of texts
tfidf_desc = TfidfVectorizer(tokenizer=tokenizer, stop_words=None, max_features=max_features)
sparse_tfidf_desc = tfidf_desc.fit_transform(raw_desc_texts)

tfidf_title = TfidfVectorizer(tokenizer=tokenizer, stop_words=None, max_features=max_features)
sparse_tfidf_title = tfidf_title.fit_transform(raw_title_texts)

In [6]:
import numpy as np

attrs_bool = np.nan_to_num(np.array([e['attrs_bool'] for e in events], dtype = np.float32))
attrs_bool.shape
print(np.sum(attrs_bool))

attrs_scale01 = np.array([e['attrs_scale01'] for e in events], dtype = np.float32)
attrs_scale01 /= np.max(attrs_scale01, axis = 0)
attrs_scale01.shape
print(np.sum(attrs_scale01))

attrs_logscale01 = np.ma.log(np.array([e['attrs_logscale01'] for e in events], dtype = np.float32)).filled(0)
attrs_logscale01 /= np.max(attrs_logscale01, axis = 0)
attrs_logscale01.shape
print(np.sum(attrs_logscale01))

193847.0
172139.0
82787.9


In [7]:
features = np.hstack([
    sparse_tfidf_desc.todense(),
    sparse_tfidf_title.todense(),
    attrs_bool,
    attrs_scale01,
    attrs_logscale01,
])
print(features.shape)

target = np.array([e['category'] for e in events])
print(target.shape)

np.sum(features)

(107838, 1019)
(107838,)


1091126.0498753795

In [8]:
import numpy as np

num_samples, num_features = features.shape

train_indices = np.random.choice(num_samples, int(0.8*num_samples), replace=False)
test_indices = np.array(list(set(range(num_samples)) - set(train_indices)))

features_train = features[train_indices]
features_test = features[test_indices]

target_train = np.array([x for ix, x in enumerate(target) if ix in train_indices])
target_test = np.array([x for ix, x in enumerate(target) if ix in test_indices])

In [11]:
import tensorflow as tf
import math
import time
from datetime import datetime
import os.path
from tensorflow.python.framework import ops
ops.reset_default_graph()   

num_classes = len(CATEGORIES)

current_log_dir = os.path.join(log_dir, datetime.now().strftime("%Y-%m-%d-%H_%M_%S"))
tf.gfile.MakeDirs(current_log_dir)

with tf.Graph().as_default():
    x_data = tf.placeholder(shape=[None, num_features], dtype=tf.float32)
    y_target = tf.placeholder(tf.int32, shape=(None))

    keep_prob = tf.placeholder(tf.float32, name = 'keep_prob')
    
    with tf.name_scope('hidden1'):
        weights1 = tf.Variable(
            tf.truncated_normal([num_features, num_hidden1],
                            stddev=1.0 / math.sqrt(float(num_features))),
                            name='weights')
        biases1 = tf.Variable(tf.zeros([num_hidden1]),
                             name='biases')
        hidden_relu1 = tf.nn.relu(tf.matmul(x_data, weights1) + biases1)
        hidden1 = tf.nn.dropout(hidden_relu1, keep_prob, name = 'dropout')

    with tf.name_scope('hidden2'):
        weights2 = tf.Variable(
            tf.truncated_normal([num_hidden1, num_hidden2],
                            stddev=1.0 / math.sqrt(float(num_hidden1))),
                            name='weights')
        biases2 = tf.Variable(tf.zeros([num_hidden2]),
                             name='biases')
        hidden_relu2 = tf.nn.relu(tf.matmul(hidden1, weights2) + biases2)
        hidden2 = tf.nn.dropout(hidden_relu2, keep_prob, name = 'dropout')

    with tf.name_scope('softmax_linear'):
        weights_sm = tf.Variable(
            tf.truncated_normal([num_hidden2, num_classes],
                            stddev=1.0 / math.sqrt(float(num_hidden2))),
                            name='weights')
        biases_sm = tf.Variable(tf.zeros([num_classes]),
                             name='biases')
        logits = tf.matmul(hidden2, weights_sm) + biases_sm

    with tf.name_scope('loss'):
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=y_target,
            logits=logits,
            name='xentropy')
        loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
        tf.summary.scalar('loss', loss)

    with tf.name_scope('train'):
        # optimizer = tf.train.AdamOptimizer(0.00025)
        optimizer = tf.train.GradientDescentOptimizer(0.00025)
        global_step = tf.Variable(0, name='global_step', trainable=False)
        train_op = optimizer.minimize(loss, global_step=global_step)

    with tf.name_scope('accuracy1'):
        prediction1 = tf.nn.in_top_k(logits, y_target, 1)
        accuracy1 = tf.reduce_mean(tf.cast(prediction1, tf.float32))
        tf.summary.scalar('accuracy1', accuracy1)
    
    with tf.name_scope('accuracy2'):
        prediction2 = tf.nn.in_top_k(logits, y_target, 2)
        accuracy2 = tf.reduce_mean(tf.cast(prediction2, tf.float32))
        tf.summary.scalar('accuracy2', accuracy2)
    
    summary = tf.summary.merge_all()
    
    saver = tf.train.Saver()

    sess = tf.Session()
    
    train_writer = tf.summary.FileWriter(current_log_dir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(current_log_dir + '/test')
    
    init = tf.global_variables_initializer()

    sess.run(init)
    
    best_result = 0.0
    max_steps = min_steps
    i = 0
    while i < max_steps:
        rand_index = np.random.choice(features_train.shape[0], size=batch_size)
        rand_x = features_train[rand_index]
        rand_y = np.transpose([target_train[rand_index]]).ravel()
        feed_dict = {
            x_data: rand_x,
            y_target: rand_y,
            keep_prob: 0.5,
        }
        
        sess.run(train_op, feed_dict=feed_dict)

        # Only record loss and accuracy every 100 generations
        if (i+1)%100==0:
            feed_dict_train = {
                x_data: rand_x,
                y_target: rand_y,
                keep_prob: 1.0,
            }

            feed_dict_test = {
                x_data: features_test,
                y_target: np.transpose([target_test]).ravel(),
                keep_prob: 1.0,
            }

            train_loss_temp = sess.run(loss, feed_dict=feed_dict_train)
            train_acc_temp = sess.run(accuracy1, feed_dict=feed_dict_train)
            train_acc2_temp = sess.run(accuracy2, feed_dict=feed_dict_train)
            summary_str = sess.run(summary, feed_dict=feed_dict_train)
            train_writer.add_summary(summary_str, i)
            train_writer.flush()

            test_loss_temp = sess.run(loss, feed_dict=feed_dict_test)
            test_acc_temp = sess.run(accuracy1, feed_dict=feed_dict_test)
            test_acc2_temp = sess.run(accuracy2, feed_dict=feed_dict_train)
            summary_str = sess.run(summary, feed_dict=feed_dict_test)
            test_writer.add_summary(summary_str, i)
            test_writer.flush()

        if (i+1)%500==0:
            acc_and_loss = [i+1, train_loss_temp, test_loss_temp, train_acc_temp * 100, test_acc_temp * 100]
            acc_and_loss = [np.round(x,3) for x in acc_and_loss]
            print('Generation # {}. Train Loss (Test Loss): {:.3f} ({:.3f}). Train Acc (Test Acc): {:.3f} ({:.3f})'.format(*acc_and_loss))
            
            saver.save(sess, current_log_dir + '/model.ckpt', global_step=i)
            
        if test_acc_temp > best_result:
            best_result = test_acc_temp
            max_steps = np.max([max_steps, i * 2.0])
            print("Accuracy improved, now steps count is %d" % max_steps)
            
        i += 1
    
print("Done.")

Accuracy improved, now steps count is 20000
Generation # 500. Train Loss (Test Loss): 2.633 (2.631). Train Acc (Test Acc): 9.000 (9.737)
Generation # 1000. Train Loss (Test Loss): 2.621 (2.619). Train Acc (Test Acc): 16.500 (15.996)
Generation # 1500. Train Loss (Test Loss): 2.612 (2.608). Train Acc (Test Acc): 14.500 (19.891)
Generation # 2000. Train Loss (Test Loss): 2.597 (2.596). Train Acc (Test Acc): 24.000 (20.391)
Generation # 2500. Train Loss (Test Loss): 2.581 (2.585). Train Acc (Test Acc): 21.500 (20.530)
Generation # 3000. Train Loss (Test Loss): 2.578 (2.574). Train Acc (Test Acc): 22.500 (20.498)
Generation # 3500. Train Loss (Test Loss): 2.567 (2.562). Train Acc (Test Acc): 21.000 (20.605)
Generation # 4000. Train Loss (Test Loss): 2.559 (2.551). Train Acc (Test Acc): 20.000 (20.591)
Generation # 4500. Train Loss (Test Loss): 2.539 (2.540). Train Acc (Test Acc): 20.000 (20.707)
Generation # 5000. Train Loss (Test Loss): 2.534 (2.529). Train Acc (Test Acc): 19.000 (20.758)