In [3]:
import os
from datetime import datetime
import tensorflow as tf
import numpy as np
import json
from sklearn.model_selection import train_test_split

In [4]:
DATA_IN_PATH = './data/'
DATA_OUT_PATH = './data/'
INPUT_TRAIN_DATA = 'nsmc_train_input.npy'
LABEL_TRAIN_DATA = 'nsmc_train_label.npy'
DATA_CONFIGS = 'data_configs.json'

input_data = np.load(open(DATA_IN_PATH + INPUT_TRAIN_DATA, 'rb'))
label_data = np.load(open(DATA_IN_PATH + LABEL_TRAIN_DATA, 'rb'))
prepro_configs = json.load(open(DATA_IN_PATH + DATA_CONFIGS, 'r'))

In [5]:
TEST_SPLIT = 0.1
RNG_SEED = 13371447
VOCAB_SIZE = prepro_configs['vocab_size']
EMB_SIZE = 128
BATCH_SIZE = 16
NUM_EPOCHS = 1

input_train, input_eval, label_train, label_eval = train_test_split(input_data, label_data, test_size=TEST_SPLIT, random_state=RNG_SEED)

In [6]:
def mapping_fn(X, Y):
    input, label = {'x': X}, Y
    return input, label

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((input_train, label_train))
    dataset = dataset.shuffle(buffer_size=len(input_train))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(mapping_fn)
    dataset = dataset.repeat(count=NUM_EPOCHS)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((input_eval, label_eval))
    dataset = dataset.shuffle(buffer_size=len(input_eval))
    dataset = dataset.batch(16)
    dataset = dataset.map(mapping_fn)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

In [7]:
def model_fn(features, labels, mode, params):
    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT

    embedding_layer = tf.keras.layers.Embedding(
                    VOCAB_SIZE + 1,
                    EMB_SIZE)(features['x'])

    dropout_emb = tf.keras.layers.Dropout(rate = 0.2)(embedding_layer)
    
    conv = tf.layers.conv1d(
           inputs=dropout_emb,
           filters=32,
           kernel_size=3,
           padding='same',
           activation=tf.nn.relu)
  
    pool = tf.keras.layers.GlobalMaxPool1D()(conv)

    hidden = tf.keras.layers.Dense(units=250, activation=tf.nn.relu)(pool)   


    dropout_hidden = tf.keras.layers.Dropout(rate=0.2)(hidden, training = TRAIN)
    logits = tf.keras.layers.Dense(units=1)(dropout_hidden)

    if labels is not None:
        labels = tf.reshape(labels, [-1, 1])
        
    if TRAIN:
        global_step = tf.train.get_global_step()
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        train_op = tf.train.AdamOptimizer(0.001).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(mode=mode, train_op=train_op, loss = loss)
    
    elif EVAL:
        loss = tf.losses.sigmoid_cross_entropy(labels, logits)
        pred = tf.nn.sigmoid(logits)
        accuracy = tf.metrics.accuracy(labels, tf.round(pred))
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops={'acc': accuracy})
        
    elif PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={
                'prob': tf.nn.sigmoid(logits),
            }
        )

In [9]:
est = tf.estimator.Estimator(model_fn, model_dir="data/checkpoint/cnn_model")

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'data/checkpoint/cnn_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f69a406e240>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [12]:
time_start = datetime.utcnow()
print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................") 

est.train(train_input_fn)

time_end = datetime.utcnow()
print(".......................................")
print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))

Experiment started at 02:16:56
.......................................
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into data/checkpoint/cnn_model/model.ckpt.
INFO:tensorflow:loss = 0.68810225, step = 1
INFO:tensorflow:global_step/sec: 36.0173
INFO:tensorflow:loss = 0.4824096, step = 101 (2.781 sec)
INFO:tensorflow:global_step/sec: 40.0906
INFO:tensorflow:loss = 0.09750176, step = 201 (2.496 sec)
INFO:tensorflow:global_step/sec: 52.5717
INFO:tensorflow:loss = 0.63134986, step = 301 (1.907 sec)
INFO:tensorflow:global_step/sec: 59.951
INFO:tensorflow:loss = 0.23599344, step = 401 (1.658 sec)
INFO:tensorflow:global_step/sec: 85.8338
INFO:tensorflow:loss = 0.15393466, step = 501 (1.169 sec)
INFO:tensorflow:global_step/sec: 76.1902
INFO:tensorflow:loss = 0.21929

INFO:tensorflow:loss = 0.16489297, step = 7901 (1.248 sec)
INFO:tensorflow:global_step/sec: 82.3864
INFO:tensorflow:loss = 5.46847, step = 8001 (1.213 sec)
INFO:tensorflow:global_step/sec: 83.0669
INFO:tensorflow:loss = 0.1167956, step = 8101 (1.201 sec)
INFO:tensorflow:global_step/sec: 52.4509
INFO:tensorflow:loss = -2576.6685, step = 8201 (1.912 sec)
INFO:tensorflow:global_step/sec: 68.1882
INFO:tensorflow:loss = -849.2069, step = 8301 (1.461 sec)
INFO:tensorflow:global_step/sec: 68.2396
INFO:tensorflow:loss = 0.42143255, step = 8401 (1.469 sec)
INFO:tensorflow:global_step/sec: 55.0109
INFO:tensorflow:loss = 2779.6108, step = 8501 (1.813 sec)
INFO:tensorflow:global_step/sec: 73.4446
INFO:tensorflow:loss = 0.10283862, step = 8601 (1.362 sec)
INFO:tensorflow:global_step/sec: 87.8382
INFO:tensorflow:loss = -178.1823, step = 8701 (1.142 sec)
INFO:tensorflow:global_step/sec: 89.8109
INFO:tensorflow:loss = 0.09579904, step = 8801 (1.112 sec)
INFO:tensorflow:global_step/sec: 60.6131
INFO:te

In [25]:
import pickle

In [15]:
with open('./data/tf_idf_model.pkl', 'wb') as f:
    pickle.dump(est, f)

In [26]:
with open('./data/tf_idf_model.pkl', 'rb') as f:
    test = pickle.load(f)

In [27]:
test

<tensorflow.python.estimator.estimator.Estimator at 0x7f699c4c4c18>

In [28]:
with open('./tf_idf_model.pkl', 'rb') as f:
    ctest = pickle.load(f)



In [29]:
ctest

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.0,
        ngram_range=(1, 5), norm='l2', preprocessor=None, smooth_idf=True,
...nalty='l2', random_state=0,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [10]:
valid = est.evaluate(eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-05-24-03:22:47
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from data/checkpoint/cnn_model/model.ckpt-9400
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-05-24-03:22:52
INFO:tensorflow:Saving dict for global step 9400: acc = 0.90742624, global_step = 9400, loss = -841.4986
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 9400: data/checkpoint/cnn_model/model.ckpt-9400


In [11]:
INPUT_TEST_DATA = 'nsmc_test_input.npy'
LABEL_TEST_DATA = 'nsmc_test_label.npy'

test_input_data = np.load(open(DATA_IN_PATH + INPUT_TEST_DATA, 'rb'))
test_label_data = np.load(open(DATA_IN_PATH + LABEL_TEST_DATA, 'rb'))

In [12]:
def test_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((test_input_data, test_label_data))
    dataset = dataset.batch(16)
    dataset = dataset.map(mapping_fn)
    iterator = dataset.make_one_shot_iterator()
    
    return iterator.get_next()

In [13]:
predict = est.evaluate(test_input_fn) 

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-05-24-03:23:08
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from data/checkpoint/cnn_model/model.ckpt-9400
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-05-24-03:23:33
INFO:tensorflow:Saving dict for global step 9400: acc = 0.90574473, global_step = 9400, loss = -1424.6682
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 9400: data/checkpoint/cnn_model/model.ckpt-9400


In [14]:
predict

{'acc': 0.90574473, 'loss': -1424.6682, 'global_step': 9400}

In [15]:
def predict_pos_neg(review):
    token = tokenize(review)
    tf = term_frequency(token)
    data = np.expand_dims(np.asarray(tf).astype('float32'), axis=0)
    score = float(model.predict(data))
    if(score > 0.5):
        print("[{}]는 {:.2f}% 확률로 긍정 리뷰이지 않을까 추측해봅니다.^^\n".format(review, score * 100))
    else:
        print("[{}]는 {:.2f}% 확률로 부정 리뷰이지 않을까 추측해봅니다.^^;\n".format(review, (1 - score) * 100))

In [17]:
def tokenize(doc):
    # norm은 정규화, stem은 근어로 표시하기를 나타냄
    return ['/'.join(t) for t in okt.pos(doc, norm=True, stem=True)]

In [18]:
def term_frequency(doc):
    return [doc.count(word) for word in selected_words]

In [22]:
from konlpy.tag import Okt
okt = Okt()

In [23]:
predict_pos_neg("올해 최고의 영화! 세 번 넘게 봐도 질리지가 않네요.")
predict_pos_neg("배경 음악이 영화의 분위기랑 너무 안 맞았습니다. 몰입에 방해가 됩니다.")
predict_pos_neg("주연 배우가 신인인데 연기를 진짜 잘 하네요. 몰입감 ㅎㄷㄷ")
predict_pos_neg("믿고 보는 감독이지만 이번에는 아니네요")
predict_pos_neg("주연배우 때문에 봤어요")

NameError: name 'selected_words' is not defined