In [0]:
import re
import pandas as pd
import numpy as np
import json
import logging
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [0]:
DATA_IN_PATH = './data_in/'

INPUT_TRAIN_DATA_FILE_NAME='train_input.npy'
LABEL_TRAIN_DATA_FILE_NAME='train_label.npy'
DATA_CONFIGS_FILE_NAME='data_configs.json'


train_input = np.load(open(DATA_IN_PATH + INPUT_TRAIN_DATA_FILE_NAME, 'rb'))
train_label = np.load(open(DATA_IN_PATH + LABEL_TRAIN_DATA_FILE_NAME, 'rb'))

prepro_configs = None

with open(DATA_IN_PATH + DATA_CONFIGS_FILE_NAME, 'r') as f:
  prepro_configs = json.load(f)

In [0]:
TEST_SPLIT = 0.1
RANDOM_SEED = 13371447

input_train, input_eval, label_train, label_eval = train_test_split(train_input, train_label, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

In [0]:
BATCH_SIZE = 16
NUM_EPOCHS = 3

def mapping_fn(X, Y):
  inputs, labels = { 'x' : X }, Y
  return inputs, labels

def train_input_fn():
  dataset = tf.data.Dataset.from_tensor_slices((input_train, label_train))
  dataset = dataset.shuffle(buffer_size=10000)
  dataset = dataset.batch(BATCH_SIZE)
  dataset = dataset.map(mapping_fn)
  dataset.repeat(count=NUM_EPOCHS)
  iterator = dataset.make_one_shot_iterator()
  
  return iterator.get_next()

def eval_input_fn():
  dataset = tf.data.Dataset.from_tensor_slices((input_train, label_train))
  dataset = dataset.map(mapping_fn)
  dataset = dataset.batch(BATCH_SIZE)
  iterator = dataset.make_one_shot_iterator()
  
  return iterator.get_next()


In [0]:
VOCAB_SIZE = prepro_configs['vocab_size']

WORD_EMBEDDING_DIM = 100
HIDDEN_STATE_DIM = 150
DENSE_FEATURE_DIM = 150

learning_rate = 0.001


In [0]:
def model_fn(features, labels, mode):
  TRAIN = mode == tf.estimator.ModeKeys.TRAIN
  EVAL = mode == tf.estimator.ModeKeys.EVAL
  PREDICT = mode == tf.estimator.ModeKeys.PREDICT
  
  embedding_layer = tf.keras.layers.Embedding(
                      VOCAB_SIZE,
                      WORD_EMBEDDING_DIM)(features['x'])
  
  embedding_layer = tf.keras.layers.Dropout(0.2)(embedding_layer)
  
  rnn_layers = [tf.nn.rnn_cell.LSTMCell(size) for size in [HIDDEN_STATE_DIM, HIDDEN_STATE_DIM]]
  
  multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)
  
  outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
                                     inputs=embedding_layer,
                                     dtype=tf.float32)
  
  outputs = tf.keras.layers.Dropout(0.2)(outputs)
  hidden_layer = tf.keras.layers.Dense(DENSE_FEATURE_DIM, activation=tf.nn.tanh)(outputs[:,-1,:])
  hidden_layer = tf.keras.layers.Dropout(0.2)(hidden_layer)
  logits = tf.keras.layers.Dense(1)(hidden_layer)
  logits = tf.squeeze(logits, axis=-1)
  
  sigmoid_logits = tf.nn.sigmoid(logits)
  
  if PREDICT :
    predictions = {'sentiment' : sigmoid_logits}
    return tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=predictions)
  
  loss =  tf.losses.sigmoid_cross_entropy(labels, logits)
  
  if EVAL:
    accuracy = tf.metrics.accuracy(labels, tf.round(sigmoid_logits))
    eval_metric_ops = {'acc':accuracy}
    
    return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=eval_metric_ops)
    
  if TRAIN:
    global_step = tf.train.get_global_step()
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step)
    
    return tf.estimator.EstimatorSpec(
            mode = mode,
            train_op=train_op,
            loss=loss)

In [33]:
est = tf.estimator.Estimator(model_fn, model_dir='./checkpoint')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './checkpoint', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fe14ecfa0b8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [34]:
est.train(train_input_fn)

INFO:tensorflow:Calling model_fn.
Instructions for updating:
Use tf.cast instead.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ./checkpoint/model.ckpt.
INFO:tensorflow:loss = 0.6919471, step = 1
INFO:tensorflow:Saving checkpoints for 88 into ./checkpoint/model.ckpt.
INFO:tensorflow:Loss for final step: 0.6833544.


<tensorflow_estimator.python.estimator.estimator.Estimator at 0x7fe14f834ef0>

In [36]:
est.evaluate(eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-05-20T01:00:39Z
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ./checkpoint/model.ckpt-88
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-05-20-01:00:46
INFO:tensorflow:Saving dict for global step 88: acc = 0.4996426, global_step = 88, loss = 0.6927466
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 88: ./checkpoint/model.ckpt-88


{'acc': 0.4996426, 'global_step': 88, 'loss': 0.6927466}