In [67]:
import pandas as pd
import numpy as np
import json

# Directories for the preprocessed inputs and for everything this notebook writes.
DATA_IN_PATH='./data_in/'
DATA_OUT_PATH='./data_out/'

# File names (relative to DATA_IN_PATH) of the training arrays and the preprocessing config.
INPUT_TRAIN_DATA_FILE_NAME='train_input.npy'
LABEL_TRAIN_DATA_FILE_NAME='train_label.npy'
DATA_CONFIGS_FILE_NAME='data_configs.json'

# Hand the path to np.load directly so NumPy manages the file handle;
# the previous np.load(open(...)) form never closed the handle it opened.
input_data=np.load(DATA_IN_PATH+INPUT_TRAIN_DATA_FILE_NAME)
label_data=np.load(DATA_IN_PATH+LABEL_TRAIN_DATA_FILE_NAME)

# Placeholder for the preprocessing config; overwritten by the JSON load below.
# (Previously misspelled as `prepro_config`, which left a stray unused name.)
prepro_configs=None

with open(DATA_IN_PATH+DATA_CONFIGS_FILE_NAME,'r') as f:
    prepro_configs=json.load(f)

In [68]:
from sklearn.model_selection import train_test_split

# Fraction of the loaded training data held out, and the seed that makes the split repeatable.
TEST_SPLIT = 0.1
RANDOM_SEED = 13371447

(train_input,
 test_input,
 train_label,
 test_label) = train_test_split(
    input_data,
    label_data,
    test_size=TEST_SPLIT,
    random_state=RANDOM_SEED,
)

In [69]:
import tensorflow as tf

# Number of examples per batch produced by the input functions.
BATCH_SIZE = 16
# How many times the training dataset is repeated.
NUM_EPOCHS = 3

def mapping_fn(X, Y):
    """Wrap the features in a dict keyed by 'x' and pass the labels through.

    Returns:
        A ``({'x': X}, Y)`` tuple.
    """
    return {'x': X}, Y

def train_input_fn():
    """Build the training pipeline and return the next-element tensors.

    The (train_input, train_label) slices are shuffled with a 1000-element
    buffer, grouped into batches of BATCH_SIZE, mapped through mapping_fn and
    repeated NUM_EPOCHS times before being exposed through a one-shot iterator.
    """
    pipeline=(
        tf.data.Dataset.from_tensor_slices((train_input, train_label))
        .shuffle(buffer_size=1000)
        .batch(BATCH_SIZE)
        .map(mapping_fn)
        .repeat(count=NUM_EPOCHS)
    )

    return tf.compat.v1.data.make_one_shot_iterator(pipeline).get_next()

def eval_input_fn():
    """Build the evaluation pipeline and return the next-element tensors.

    The (test_input, test_label) slices are mapped through mapping_fn and
    grouped into batches of BATCH_SIZE; there is no shuffling or repetition,
    so the iterator makes a single pass over the held-out data.
    """
    dataset=tf.data.Dataset.from_tensor_slices((test_input,test_label))
    dataset=dataset.map(mapping_fn)
    dataset=dataset.batch(BATCH_SIZE)
    # Use the compat.v1 helper (as train_input_fn does) rather than the
    # deprecated Dataset.make_one_shot_iterator() method, which only exists
    # on v1 dataset objects.
    iterator=tf.compat.v1.data.make_one_shot_iterator(dataset)

    return iterator.get_next()

In [70]:
# Vocabulary size taken from the preprocessing config; used as the embedding input dimension.
VOCAB_SIZE = prepro_configs['vocab_size']

In [71]:
# Model dimensions.
WORD_EMBEDDING_DIM = 100  # output dimension of the embedding layer
HIDDEN_STATE_DIM = 150    # units in each of the two stacked LSTM cells
DENSE_FEATURE_DIM = 150   # units in the tanh dense layer after the recurrent stack

# Learning rate passed to the Adam optimizer.
learning_rate = 0.001

In [72]:
def model_fn(features, labels, mode):
    """Estimator model function: embedding -> two stacked LSTM cells -> dense -> sigmoid.

    Args:
        features: dict whose 'x' entry is fed to the embedding layer
            (presumably integer token ids of shape (batch, seq_len) -- confirm
            against the preprocessing step).
        labels: targets passed to the sigmoid cross-entropy loss and the
            accuracy metric; not used in PREDICT mode.
        mode: a tf.estimator.ModeKeys value selecting which spec to build.

    Returns:
        A tf.estimator.EstimatorSpec:
        * PREDICT: predictions {'sentiment': sigmoid probabilities}
        * EVAL: loss plus the 'acc' accuracy metric
        * TRAIN: loss plus an Adam train_op
        Falls through (returns None) if mode matches none of the three.
    """
    TRAIN=mode==tf.estimator.ModeKeys.TRAIN
    EVAL=mode==tf.estimator.ModeKeys.EVAL
    PREDICT=mode==tf.estimator.ModeKeys.PREDICT

    embedding_layer=tf.keras.layers.Embedding(VOCAB_SIZE,WORD_EMBEDDING_DIM)(features['x'])

    # Pass the mode explicitly to every Dropout layer: without a `training`
    # argument the layers are not tied to the estimator mode, so dropout is
    # not reliably applied during training. With training=TRAIN it is active
    # only for TRAIN and an identity op for EVAL/PREDICT.
    embedding_layer=tf.keras.layers.Dropout(0.2)(embedding_layer,training=TRAIN)

    # Two stacked LSTM cells, both with HIDDEN_STATE_DIM units.
    rnn_layers=[tf.compat.v1.nn.rnn_cell.LSTMCell(size) for size in [HIDDEN_STATE_DIM, HIDDEN_STATE_DIM]]
    multi_rnn_cell=tf.compat.v1.nn.rnn_cell.MultiRNNCell(rnn_layers)

    # Only the per-step outputs are needed; the final state is discarded.
    outputs,_=tf.compat.v1.nn.dynamic_rnn(cell=multi_rnn_cell,inputs=embedding_layer,dtype=tf.float32)

    outputs=tf.keras.layers.Dropout(0.2)(outputs,training=TRAIN)
    # Feed only the last time step's output into the dense head.
    hidden_layer=tf.keras.layers.Dense(DENSE_FEATURE_DIM,activation=tf.nn.tanh)(outputs[:,-1,:])
    hidden_layer=tf.keras.layers.Dropout(0.2)(hidden_layer,training=TRAIN)
    logits=tf.keras.layers.Dense(1)(hidden_layer)
    # Drop the trailing size-1 axis so logits line up with the labels.
    logits=tf.squeeze(logits, axis=-1)

    sigmoid_logits=tf.nn.sigmoid(logits)

    if PREDICT:
        predictions={'sentiment':sigmoid_logits}

        return tf.estimator.EstimatorSpec(mode=mode,predictions=predictions)

    loss=tf.compat.v1.losses.sigmoid_cross_entropy(labels, logits)

    if EVAL:
        # Round the probabilities to 0/1 before comparing them with the labels.
        accuracy=tf.compat.v1.metrics.accuracy(labels, tf.round(sigmoid_logits))
        eval_metric_ops={'acc':accuracy}

        return tf.estimator.EstimatorSpec(mode, loss=loss,eval_metric_ops=eval_metric_ops)

    if TRAIN:
        global_step=tf.compat.v1.train.get_global_step()
        train_op=tf.compat.v1.train.AdamOptimizer(learning_rate).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(mode=mode,train_op=train_op,loss=loss)

In [73]:
import os

# exist_ok=True replaces the check-then-create pair: no race between the
# existence test and the mkdir, and the cell is safe to re-run.
os.makedirs(DATA_OUT_PATH,exist_ok=True)

# Checkpoints are written under DATA_OUT_PATH. As the training log shows
# ("Restoring parameters from ..."), a later train() call resumes from any
# checkpoint already present in this directory.
est=tf.estimator.Estimator(model_fn,model_dir=DATA_OUT_PATH+'checkpoint/rnn')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './data_out/checkpoint/rnn', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [74]:
# Run training until the input function is exhausted.
est.train(input_fn=train_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./data_out/checkpoint/rnn\model.ckpt-4221
Instructions for updating:
Use standard file utilities to get mtimes.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 4221...
INFO:tensorflow:Saving checkpoints for 4221 into ./data_out/checkpoint/rnn\model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 4221...
INFO:tensorflow:loss = 0.69468683, step = 4221
INFO:tensorflow:global_step/sec: 6.58446
INFO:tensorflow:loss = 0.71780556, step = 4321 (15.189 sec)
INFO:tensorflow:global_step/sec: 5.7776
INFO:tensorflow:loss = 0.6612749, step = 4421 (17.308 sec)
INFO:tensorflow:global_step/sec: 5.57587
INFO:tensorflow:loss = 0.70439285, step = 4521 (17.933 sec)
INFO:tensorflow:glo

<tensorflow_estimator.python.estimator.estimator.EstimatorV2 at 0x23db79b1908>

In [75]:
# Evaluate on the held-out split; the returned metrics dict is displayed as the cell output.
est.evaluate(input_fn=eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-12-30T14:09:01Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./data_out/checkpoint/rnn\model.ckpt-8442
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 4.08215s
INFO:tensorflow:Finished evaluation at 2020-12-30-14:09:05
INFO:tensorflow:Saving dict for global step 8442: acc = 0.8056, global_step = 8442, loss = 0.4432413
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 8442: ./data_out/checkpoint/rnn\model.ckpt-8442


{'acc': 0.8056, 'loss': 0.4432413, 'global_step': 8442}

In [77]:
# File name (relative to DATA_IN_PATH) of the test-set input array.
TEST_INPUT_DATA='test_input.npy'

# Give np.load the path so NumPy manages the file handle; the previous
# np.load(open(...)) form left the handle open.
test_input_data=np.load(DATA_IN_PATH+TEST_INPUT_DATA)

In [80]:
# Input function over the test inputs; the 'x' key matches what model_fn
# reads, and shuffle=False keeps the original row order.
predict_input_fn=tf.compat.v1.estimator.inputs.numpy_input_fn(
    x={"x":test_input_data},
    shuffle=False,
)




In [81]:
# Collect the 'sentiment' value from every prediction dict into one array.
predictions=np.array([
    row['sentiment']
    for row in est.predict(input_fn=predict_input_fn)
])

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./data_out/checkpoint/rnn\model.ckpt-8442
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.


In [88]:
# File name (relative to DATA_IN_PATH) of the test-set id array.
TEST_ID_DATA='test_id.npy'

# Give np.load the path so NumPy manages the file handle (the previous
# np.load(open(...)) form left it open).
# NOTE: allow_pickle=True unpickles objects stored in the file, which can run
# arbitrary code -- only load this file from a trusted source.
test_id=np.load(DATA_IN_PATH+TEST_ID_DATA,allow_pickle=True)

# One row per test example: its id and the predicted sentiment probability.
output=pd.DataFrame(data={"id":test_id,"sentiment":list(predictions)})
# quoting=3 is csv.QUOTE_NONE: fields are written without any quoting.
output.to_csv(DATA_OUT_PATH+"rnn_predict.csv",index=False,quoting=3)