# W266 Term Project: Event Temporal State Identification

## CNN Model

### John Chiang, Vincent Chu

In [10]:
#! /usr/bin/env python

import numpy as np
import os
import time
import datetime
#import data_helpers

# scikit-learn
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Tensorflow
import tensorflow as tf
from tensorflow.contrib import learn

# CNN
from text_cnn import TextCNN
from nlp_cnn import NLPCNN

# Custom libraries
import societal_data_processor as sdp

### Parameters for Data Loading, CNN Model Ops and CNN Model Training

In [2]:
#===========================================================================================================
# Parameters
#===========================================================================================================

# Registers the command-line style flags that configure data loading, the CNN
# and the training loop, then echoes the resulting values.
# Presumably a flag can only be registered once per kernel, so re-running this
# cell makes the first DEFINE_* call raise; that case is reported below and the
# previously registered flags stay in effect.
try:
    # Data loading params
    tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")
    # NOTE(review): the default below is a machine-specific absolute path.
    tf.flags.DEFINE_string("data_dir", '/home/vslchu/w266/project/data/eventstatus_eng/', "Directory for Annotated Societal Events Data")

    # Model Hyperparameters (help texts state the defaults actually used here)
    tf.flags.DEFINE_integer("embedding_dim", 50, "Dimensionality of character embedding (default: 50)")
    tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
    tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)") #50
    tf.flags.DEFINE_float("dropout_keep_prob", 1, "Dropout keep probability (default: 1)") #0.5
    tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

    # Training parameters
    tf.flags.DEFINE_integer("batch_size", 50, "Batch Size (default: 50)")
    tf.flags.DEFINE_integer("num_epochs", 5, "Number of training epochs (default: 5)")
    tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
    tf.flags.DEFINE_integer("checkpoint_every", 500, "Save model after this many steps (default: 500)")
    tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")

    # Misc Parameters
    tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
    tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
except Exception as err:
    # `Exception` (not a bare `except:`) lets KeyboardInterrupt/SystemExit
    # propagate, and the error text is shown so an unrelated failure is not
    # silently reported as a harmless re-definition.
    print("Tf Flags already defined: {}".format(err))

params = tf.flags.FLAGS
params._parse_flags()

# Echo every flag, sorted by name, as NAME=value
print("\nParameters:")
for attr, value in sorted(params.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


Parameters:
ALLOW_SOFT_PLACEMENT=True
BATCH_SIZE=50
CHECKPOINT_EVERY=500
DATA_DIR=/home/vslchu/w266/project/data/eventstatus_eng/
DEV_SAMPLE_PERCENTAGE=0.1
DROPOUT_KEEP_PROB=1
EMBEDDING_DIM=50
EVALUATE_EVERY=100
FILTER_SIZES=3,4,5
L2_REG_LAMBDA=0.0
LOG_DEVICE_PLACEMENT=False
NUM_CHECKPOINTS=5
NUM_EPOCHS=5
NUM_FILTERS=128



### Function Definitions

In [3]:
#===========================================================================================================
# Functions
#===========================================================================================================

def load_text_data(data_dir, 
                   processor_ver, 
                   remove_stopwords = False, 
                   replace_num = False, 
                   remove_non_alpha = False, 
                   to_lower = False, 
                   to_subwords = False):
    """Load the annotated chunks, map them to word-id vectors and split train/test.

    Args:
        data_dir: directory passed to `sdp.get_chunks_n_annotations`.
        processor_ver: data-processor version passed to `sdp.get_chunks_n_annotations`.
        remove_stopwords, replace_num, remove_non_alpha, to_lower, to_subwords:
            pre-processing switches forwarded unchanged to
            `sdp.get_chunks_n_annotations`.

    Returns:
        Tuple `(x_train, x_test, y_train, y_test, y_orig_train, y_orig_test,
        vocab_processor)`. The `x_*` / `y_*` items are numpy arrays; the
        `y_orig_*` items are the split of the untransformed annotations;
        `vocab_processor` is the fitted `VocabularyProcessor`.

    Raises:
        ValueError: if no chunks were loaded (the original code failed here
            too, but with an opaque `max()` error).
    """
    # Load data from the annotated files
    print("Loading data...")
    # Only the cleaned chunks and their annotations are used below.
    (_original_chunks, clean_chunks, _clean_chunk_sents, temporal_states, _event_files) = \
        sdp.get_chunks_n_annotations(data_dir,
                                     processor_ver,
                                     remove_stopwords,
                                     replace_num,
                                     remove_non_alpha,
                                     to_lower,
                                     to_subwords)

    if len(clean_chunks) == 0:
        raise ValueError("No chunks were loaded from {}".format(data_dir))

    # Transform annotations into lists of binaries
    y = sdp.transform_annotations_to_binary(temporal_states)

    # Build vocabulary; every chunk is mapped to a vector as long as the
    # longest chunk, measured in space-separated tokens
    max_chunk_length = max(len(chunk.split(" ")) for chunk in clean_chunks)
    print("max_chunk_length =  {}".format(max_chunk_length))

    vocab_processor = learn.preprocessing.VocabularyProcessor(max_chunk_length)
    x = np.array(list(vocab_processor.fit_transform(clean_chunks)))

    # NOTE(review): the three splits are made independently; this assumes
    # sdp.split_train_test_data is deterministic so rows stay aligned - confirm.
    (x_train, x_test) = sdp.split_train_test_data(x, params.dev_sample_percentage)
    (y_train, y_test) = sdp.split_train_test_data(y, params.dev_sample_percentage)
    (y_orig_train, y_orig_test) = sdp.split_train_test_data(temporal_states, params.dev_sample_percentage)

    x_train = np.array(x_train)
    x_test = np.array(x_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split on data (x): {:d}/{:d}".format(len(x_train), len(x_test)))
    print("Train/Dev split on labels (y): {:d}/{:d}".format(len(y_train), len(y_test)))

    return (x_train, x_test, y_train, y_test, y_orig_train, y_orig_test, vocab_processor)

# A single training step on a batch from the training data set
def train_step(sess, cnn, x_batch, y_batch, summaries_on = False, writer = None):
    """Run one optimisation step on a batch and report its loss and accuracy.

    Args:
        sess: session used to run the graph.
        cnn: model object exposing the placeholders (`input_x`, `input_y`,
            `dropout_keep_prob`) and ops (`train_op`, `global_step`, `loss`,
            `accuracy`, `predictions`, `train_summary_op`) used below.
        x_batch: batch of inputs fed to `cnn.input_x`.
        y_batch: batch of labels fed to `cnn.input_y`.
        summaries_on: when True, also evaluate `cnn.train_summary_op` and
            record the result.
        writer: optional summary writer (anything with `add_summary`) to
            record to, mirroring `test_step`. When omitted, a writer on
            `cnn.train_summary_dir` is opened for this step and closed again.

    Returns:
        Tuple `(loss, accuracy, predictions)` for the batch.
    """
    feed_dict = {
      cnn.input_x: x_batch,
      cnn.input_y: y_batch,
      cnn.dropout_keep_prob: params.dropout_keep_prob
    }

    if summaries_on:
        _, step, summaries, loss, accuracy, predictions = sess.run(
            [cnn.train_op, cnn.global_step, cnn.train_summary_op, cnn.loss, cnn.accuracy, cnn.predictions],
            feed_dict)

        if writer is not None:
            writer.add_summary(summaries, step)
        else:
            # Previously a new FileWriter was created on every step and never
            # closed; close it so repeated steps do not leak open writers.
            step_writer = tf.summary.FileWriter(cnn.train_summary_dir, sess.graph)
            try:
                step_writer.add_summary(summaries, step)
            finally:
                step_writer.close()
    else:
        _, step, loss, accuracy, predictions = sess.run(
            [cnn.train_op, cnn.global_step, cnn.loss, cnn.accuracy, cnn.predictions],
            feed_dict)

    time_str = datetime.datetime.now().isoformat()
    print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))

    return (loss, accuracy, predictions)

# Evaluates CNN model on the test data set
def test_step(sess, cnn, x_batch, y_batch, writer = None, summaries_on = False):

    feed_dict = {
      cnn.input_x: x_batch,
      cnn.input_y: y_batch,
      cnn.dropout_keep_prob: 1.0
    }
    
    if summaries_on:    
        step, summaries, loss, accuracy, predictions = sess.run(
            [cnn.global_step, cnn.test_summary_op, cnn.loss, cnn.accuracy, cnn.predictions],
            feed_dict)
        
        if writer:
            writer.add_summary(summaries, step)       
    else:
        step, loss, accuracy, predictions = sess.run(
            [cnn.global_step, cnn.loss, cnn.accuracy, cnn.predictions],
            feed_dict)
        
    time_str = datetime.datetime.now().isoformat()
    print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
    
    return (loss, accuracy, predictions) 

def run_cnn(x_train, y_train, x_test, y_test, vocab_processor): 
    """Build an NLPCNN, train it on the training set and evaluate periodically.

    Training runs over the batches produced by `sdp.batch_iter` using
    `params.batch_size` and `params.num_epochs`. Every `params.evaluate_every`
    steps the whole test set is evaluated with `test_step`; every
    `params.checkpoint_every` steps a checkpoint is saved under
    `<cnn.out_dir>/checkpoints`.

    Args:
        x_train: training inputs; `x_train.shape[1]` is used as the sequence length.
        y_train: training labels; `y_train.shape[1]` is used as the number of classes.
        x_test: test inputs evaluated at each evaluation step.
        y_test: test labels evaluated at each evaluation step.
        vocab_processor: fitted vocabulary processor; only its vocabulary size is used.

    Returns:
        List with one entry per evaluation round, each the list of predictions
        for the test set at that round. Empty if training never reached a
        multiple of `params.evaluate_every` steps.
    """
    test_preds = []

    with tf.Graph().as_default():

        session_conf = tf.ConfigProto(
          allow_soft_placement=params.allow_soft_placement,
          log_device_placement=params.log_device_placement)

        # Using the session as a context manager makes it the default session
        # and closes it on exit; previously it was never closed.
        with tf.Session(config=session_conf) as sess:
            cnn = NLPCNN(sequence_length = x_train.shape[1],
                         num_classes = y_train.shape[1],
                         vocab_size = len(vocab_processor.vocabulary_),
                         embedding_size = params.embedding_dim,
                         filter_sizes = list(map(int, params.filter_sizes.split(","))),
                         num_filters = params.num_filters,
                         l2_reg_lambda = params.l2_reg_lambda)

            cnn.build_core_graph()
            cnn.build_train_test_graph()

            print("cnn.out_dir =  {}".format(cnn.out_dir))

            # Write vocabulary
            #vocab_processor.save(os.path.join(cnn.out_dir, "vocab"))

            checkpoint_dir = os.path.abspath(os.path.join(cnn.out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            # NOTE(review): the limit comes from cnn.num_checkpoints, not the
            # `num_checkpoints` flag in params - confirm this is intended.
            saver = tf.train.Saver(tf.global_variables(), max_to_keep = cnn.num_checkpoints)

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Generate batches
            batches = sdp.batch_iter(
                list(zip(x_train, y_train)), params.batch_size, params.num_epochs)

            batch_count = 0
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                # Training-batch predictions are not needed by the caller, so
                # they are no longer accumulated.
                train_step(sess, cnn, x_batch, y_batch)

                current_step = tf.train.global_step(sess, cnn.global_step)
                print("current_step:  {}".format(current_step))

                if current_step % params.evaluate_every == 0:
                    print("\nPredicting annotation for test data:")
                    #test_summary_writer = tf.summary.FileWriter(cnn.test_summary_dir, sess.graph)
                    #loss, accuracy, predictions = test_step(sess, cnn, x_test, y_test, writer = test_summary_writer)
                    _, _, predictions = test_step(sess, cnn, x_test, y_test, summaries_on = False)
                    test_preds.append(list(predictions))
                    print("")

                if current_step % params.checkpoint_every == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step = current_step)
                    print("Saved model checkpoint to {}\n".format(path))

                batch_count += 1

            # A final evaluation after the last batch is currently disabled:
            #print "\nFinal round of predicting annotation for test data:"
            #test_summary_writer = tf.summary.FileWriter(cnn.test_summary_dir, sess.graph)
            #loss, accuracy, predictions = test_step(sess, cnn, x_test, y_test, writer = test_summary_writer)
            #loss, accuracy, predictions = test_step(sess, cnn, x_test, y_test, summaries_on = False)
            #test_preds.append(list(predictions))

            print("\nRan %d batches during training and created %d rounds of predictions" % (batch_count, len(test_preds)))

    return test_preds

############################################################################################################
# Function Name: eval_preds
# Description  : Scores each round of test-set predictions against the test labels using
#                weighted F1, precision and recall, and prints the scores for every round
# Parameters   :
#   test_preds_list: list of predictions from various evaluation checkpoint during the training cycle
#   test_labels    : Labels for the test data set
# Returns      : list of (f1, precision, recall) tuples, one per entry of test_preds_list
############################################################################################################

def eval_preds(test_preds_list, test_labels):
    """Print and return weighted F1/precision/recall for each prediction round."""
    test_pred_eval = []

    for i, round_preds in enumerate(test_preds_list):
        # Convert the predicted digits back to annotation labels so they can
        # be compared with test_labels
        pred_annotations = sdp.transform_digits_to_annotations(round_preds)

        ### Evaluate Performance of model
        f1 = f1_score(test_labels, pred_annotations, average='weighted')
        precision = precision_score(test_labels, pred_annotations, average='weighted')
        recall = recall_score(test_labels, pred_annotations, average='weighted')

        print("\nPerformance Evaluation of CNN Model (i = %d):" % i)
        print("F1 Score = %f" % f1)
        print("Precision Score = %f" % precision)
        print("Recall Score = %f" % recall)

        test_pred_eval.append((f1, precision, recall))

    return test_pred_eval

### Running various scenarios

In [4]:
############################################################################################################
# Scenario: subword-level tokens, data processor v1, stopwords kept, non-alpha words removed
############################################################################################################

(x_train, x_test, y_train, y_test,
 y_orig_train, y_orig_test, vocab_processor) = load_text_data(
    params.data_dir, 1, remove_non_alpha = True, to_subwords = True)

# Train the CNN, then score the test-set predictions from each evaluation round
test_preds = run_cnn(x_train, y_train, x_test, y_test, vocab_processor)
test_eval = eval_preds(test_preds, y_orig_test)

# Drop the references to this scenario's data
x_train = x_test = y_train = y_test = None
y_orig_train = y_orig_test = vocab_processor = None

Loading data...
max_chunk_length =  643
Vocabulary Size: 8522
Train/Dev split on data (x): 5059/562
Train/Dev split on labels (y): 5059/562
Writing to /home/vslchu/w266/project/code/runs/20170822_0515_UTC

grads_and_vars.shape =  (9, 2)
cnn.out_dir =  /home/vslchu/w266/project/code/runs/20170822_0515_UTC
2017-08-22T05:15:16.328927: step 1, loss 3.67979, acc 0.04
current_step:  1
2017-08-22T05:15:16.657553: step 2, loss 2.43752, acc 0.04
current_step:  2
2017-08-22T05:15:16.981103: step 3, loss 1.47213, acc 0.38
current_step:  3
2017-08-22T05:15:17.305200: step 4, loss 1.65705, acc 0.5
current_step:  4
2017-08-22T05:15:17.619444: step 5, loss 1.96576, acc 0.32
current_step:  5
2017-08-22T05:15:18.135756: step 6, loss 1.24015, acc 0.62
current_step:  6
2017-08-22T05:15:18.661324: step 7, loss 1.60093, acc 0.42
current_step:  7
2017-08-22T05:15:19.182539: step 8, loss 1.34761, acc 0.4
current_step:  8
2017-08-22T05:15:19.707670: step 9, loss 1.66573, acc 0.44
current_step:  9
2017-08-22T0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [5]:
############################################################################################################
# Scenario: subword-level tokens, data processor v2, stopwords kept, non-alpha words removed
############################################################################################################

(x_train, x_test, y_train, y_test,
 y_orig_train, y_orig_test, vocab_processor) = load_text_data(
    params.data_dir, 2, remove_non_alpha = True, to_subwords = True)

# Train the CNN, then score the test-set predictions from each evaluation round
test_preds = run_cnn(x_train, y_train, x_test, y_test, vocab_processor)
test_eval = eval_preds(test_preds, y_orig_test)

# Drop the references to this scenario's data
x_train = x_test = y_train = y_test = None
y_orig_train = y_orig_test = vocab_processor = None

Loading data...
max_chunk_length =  411
Vocabulary Size: 6361
Train/Dev split on data (x): 5059/562
Train/Dev split on labels (y): 5059/562
Writing to /home/vslchu/w266/project/code/runs/20170822_0520_UTC

grads_and_vars.shape =  (9, 2)
cnn.out_dir =  /home/vslchu/w266/project/code/runs/20170822_0520_UTC
2017-08-22T05:20:13.514490: step 1, loss 3.15613, acc 0.1
current_step:  1
2017-08-22T05:20:13.711429: step 2, loss 3.20766, acc 0.18
current_step:  2
2017-08-22T05:20:13.903543: step 3, loss 2.27633, acc 0.32
current_step:  3
2017-08-22T05:20:14.096070: step 4, loss 1.7865, acc 0.24
current_step:  4
2017-08-22T05:20:14.289697: step 5, loss 1.50657, acc 0.46
current_step:  5
2017-08-22T05:20:14.506130: step 6, loss 1.26867, acc 0.5
current_step:  6
2017-08-22T05:20:14.705718: step 7, loss 1.54296, acc 0.52
current_step:  7
2017-08-22T05:20:14.899494: step 8, loss 1.33171, acc 0.56
current_step:  8
2017-08-22T05:20:15.093951: step 9, loss 1.26375, acc 0.62
current_step:  9
2017-08-22T05

In [6]:
############################################################################################################
# Scenario: subword-level tokens, data processor v3, stopwords kept, non-alpha words removed
############################################################################################################

(x_train, x_test, y_train, y_test,
 y_orig_train, y_orig_test, vocab_processor) = load_text_data(
    params.data_dir, 3, remove_non_alpha = True, to_subwords = True)

# Train the CNN, then score the test-set predictions from each evaluation round
test_preds = run_cnn(x_train, y_train, x_test, y_test, vocab_processor)
test_eval = eval_preds(test_preds, y_orig_test)

# Drop the references to this scenario's data
x_train = x_test = y_train = y_test = None
y_orig_train = y_orig_test = vocab_processor = None

Loading data...
max_chunk_length =  264
Vocabulary Size: 6361
Train/Dev split on data (x): 5059/562
Train/Dev split on labels (y): 5059/562
Writing to /home/vslchu/w266/project/code/runs/20170822_0523_UTC

grads_and_vars.shape =  (9, 2)
cnn.out_dir =  /home/vslchu/w266/project/code/runs/20170822_0523_UTC
2017-08-22T05:23:29.083129: step 1, loss 1.81808, acc 0.2
current_step:  1
2017-08-22T05:23:29.218174: step 2, loss 1.34052, acc 0.4
current_step:  2
2017-08-22T05:23:29.351141: step 3, loss 1.30688, acc 0.54
current_step:  3
2017-08-22T05:23:29.480881: step 4, loss 1.58922, acc 0.52
current_step:  4
2017-08-22T05:23:29.610732: step 5, loss 1.64245, acc 0.46
current_step:  5
2017-08-22T05:23:29.740156: step 6, loss 1.57429, acc 0.42
current_step:  6
2017-08-22T05:23:29.869275: step 7, loss 1.33345, acc 0.5
current_step:  7
2017-08-22T05:23:29.999285: step 8, loss 1.35292, acc 0.6
current_step:  8
2017-08-22T05:23:30.128618: step 9, loss 1.39272, acc 0.56
current_step:  9
2017-08-22T05:

In [7]:
############################################################################################################
# Scenario: word-level tokens, data processor v1, stopwords kept, non-alpha words removed
############################################################################################################

(x_train, x_test, y_train, y_test,
 y_orig_train, y_orig_test, vocab_processor) = load_text_data(
    params.data_dir, 1, remove_non_alpha = True)

# Train the CNN, then score the test-set predictions from each evaluation round
test_preds = run_cnn(x_train, y_train, x_test, y_test, vocab_processor)
test_eval = eval_preds(test_preds, y_orig_test)

# Drop the references to this scenario's data
x_train = x_test = y_train = y_test = None
y_orig_train = y_orig_test = vocab_processor = None

Loading data...
max_chunk_length =  320
Vocabulary Size: 20222
Train/Dev split on data (x): 5059/562
Train/Dev split on labels (y): 5059/562
Writing to /home/vslchu/w266/project/code/runs/20170822_0525_UTC

grads_and_vars.shape =  (9, 2)
cnn.out_dir =  /home/vslchu/w266/project/code/runs/20170822_0525_UTC
2017-08-22T05:25:39.936838: step 1, loss 3.53635, acc 0.12
current_step:  1
2017-08-22T05:25:40.091791: step 2, loss 2.57076, acc 0.24
current_step:  2
2017-08-22T05:25:40.248731: step 3, loss 1.8837, acc 0.24
current_step:  3
2017-08-22T05:25:40.402638: step 4, loss 1.43903, acc 0.44
current_step:  4
2017-08-22T05:25:40.558776: step 5, loss 1.18597, acc 0.5
current_step:  5
2017-08-22T05:25:40.712883: step 6, loss 1.2527, acc 0.54
current_step:  6
2017-08-22T05:25:40.865792: step 7, loss 1.47984, acc 0.56
current_step:  7
2017-08-22T05:25:41.020903: step 8, loss 2.05342, acc 0.44
current_step:  8
2017-08-22T05:25:41.176350: step 9, loss 1.91719, acc 0.44
current_step:  9
2017-08-22T0

In [8]:
############################################################################################################
# Scenario: word-level tokens, data processor v2, stopwords kept, non-alpha words removed
############################################################################################################

(x_train, x_test, y_train, y_test,
 y_orig_train, y_orig_test, vocab_processor) = load_text_data(
    params.data_dir, 2, remove_non_alpha = True)

# Train the CNN, then score the test-set predictions from each evaluation round
test_preds = run_cnn(x_train, y_train, x_test, y_test, vocab_processor)
test_eval = eval_preds(test_preds, y_orig_test)

# Drop the references to this scenario's data
x_train = x_test = y_train = y_test = None
y_orig_train = y_orig_test = vocab_processor = None

Loading data...
max_chunk_length =  205
Vocabulary Size: 13764
Train/Dev split on data (x): 5059/562
Train/Dev split on labels (y): 5059/562
Writing to /home/vslchu/w266/project/code/runs/20170822_0528_UTC

grads_and_vars.shape =  (9, 2)
cnn.out_dir =  /home/vslchu/w266/project/code/runs/20170822_0528_UTC
2017-08-22T05:28:14.099204: step 1, loss 1.82973, acc 0.58
current_step:  1
2017-08-22T05:28:14.201874: step 2, loss 1.70845, acc 0.52
current_step:  2
2017-08-22T05:28:14.303171: step 3, loss 1.26824, acc 0.46
current_step:  3
2017-08-22T05:28:14.405191: step 4, loss 1.38592, acc 0.48
current_step:  4
2017-08-22T05:28:14.504166: step 5, loss 1.33992, acc 0.44
current_step:  5
2017-08-22T05:28:14.605351: step 6, loss 1.35852, acc 0.48
current_step:  6
2017-08-22T05:28:14.705420: step 7, loss 1.21025, acc 0.38
current_step:  7
2017-08-22T05:28:14.805403: step 8, loss 1.21597, acc 0.54
current_step:  8
2017-08-22T05:28:14.904699: step 9, loss 1.18647, acc 0.58
current_step:  9
2017-08-2

In [9]:
############################################################################################################
# Scenario: word-level tokens, data processor v3, stopwords kept, non-alpha words removed
############################################################################################################

(x_train, x_test, y_train, y_test,
 y_orig_train, y_orig_test, vocab_processor) = load_text_data(
    params.data_dir, 3, remove_non_alpha = True)

# Train the CNN, then score the test-set predictions from each evaluation round
test_preds = run_cnn(x_train, y_train, x_test, y_test, vocab_processor)
test_eval = eval_preds(test_preds, y_orig_test)

# Drop the references to this scenario's data
x_train = x_test = y_train = y_test = None
y_orig_train = y_orig_test = vocab_processor = None

Loading data...
max_chunk_length =  149
Vocabulary Size: 13762
Train/Dev split on data (x): 5059/562
Train/Dev split on labels (y): 5059/562
Writing to /home/vslchu/w266/project/code/runs/20170822_0529_UTC

grads_and_vars.shape =  (9, 2)
cnn.out_dir =  /home/vslchu/w266/project/code/runs/20170822_0529_UTC
2017-08-22T05:29:59.526612: step 1, loss 3.20563, acc 0.02
current_step:  1
2017-08-22T05:29:59.603924: step 2, loss 1.96764, acc 0.34
current_step:  2
2017-08-22T05:29:59.678169: step 3, loss 1.86751, acc 0.4
current_step:  3
2017-08-22T05:29:59.754751: step 4, loss 1.60973, acc 0.4
current_step:  4
2017-08-22T05:29:59.828186: step 5, loss 1.26315, acc 0.56
current_step:  5
2017-08-22T05:29:59.901422: step 6, loss 1.55993, acc 0.4
current_step:  6
2017-08-22T05:29:59.977269: step 7, loss 1.43233, acc 0.46
current_step:  7
2017-08-22T05:30:00.052249: step 8, loss 1.80465, acc 0.42
current_step:  8
2017-08-22T05:30:00.128540: step 9, loss 1.65463, acc 0.52
current_step:  9
2017-08-22T0