In [0]:
import os
import tensorflow as tf
import numpy as np
import json
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import warnings
# Silences every warning issued through the `warnings` module for the rest of
# the session. NOTE(review): this also hides deprecation warnings; consider
# narrowing the filter to specific categories once the notebook is stable.
warnings.filterwarnings("ignore")

%matplotlib inline

In [0]:
# Directories for the preprocessed inputs and for generated artifacts.
DATA_IN_PATH = r'/data_in/'
DATA_OUT_PATH = r'/data_out/'

# File names (relative to DATA_IN_PATH) of the training arrays and of the
# JSON file written by the preprocessing step.
TRAIN_INPUT_DATA = 'nsmc_train_input.npy'
TRAIN_LABEL_DATA = 'nsmc_train_label.npy'
DATA_CONFIGS = 'data_configs.json'

# Passing a path (rather than an already-open file object) lets np.load open
# and close the .npy file itself, so no file handle is left dangling.
input_data = np.load(os.path.join(DATA_IN_PATH, TRAIN_INPUT_DATA))
label_data = np.load(os.path.join(DATA_IN_PATH, TRAIN_LABEL_DATA))

# The context manager guarantees the config file is closed even if parsing fails.
with open(os.path.join(DATA_IN_PATH, DATA_CONFIGS), 'r') as config_file:
    prepro_configs = json.load(config_file)

In [0]:
# --- Training hyperparameters -------------------------------------------
EPOCHS = 1
BATCH_SIZE = 16
EMB_SIZE = 128
# One more than the vocabulary size recorded by preprocessing
# (presumably to leave room for a padding index -- TODO confirm).
VOCAB_SIZE = prepro_configs['vocab_size'] + 1

# --- Hold-out split settings --------------------------------------------
RNG_SEED = 13371447   # fixed seed so the split is repeatable
TEST_SPLIT = 0.1      # fraction of the training data held out for evaluation

train_input, eval_input, train_label, eval_label = train_test_split(
    input_data,
    label_data,
    test_size=TEST_SPLIT,
    random_state=RNG_SEED,
)

In [0]:
# Text-CNN binary classifier, declared as an ordered list of layers.
model = tf.keras.Sequential([
    # Token ids -> dense vectors of width EMB_SIZE.
    tf.keras.layers.Embedding(VOCAB_SIZE, EMB_SIZE),
    tf.keras.layers.Dropout(0.2),
    # Width-3 convolution over the sequence axis; 'same' padding keeps the length.
    tf.keras.layers.Conv1D(filters=32,
                           kernel_size=3,
                           padding='same',
                           activation=tf.nn.relu),
    # Keep the strongest activation of each filter across the whole sequence.
    tf.keras.layers.GlobalMaxPool1D(),
    tf.keras.layers.Dense(250, activation=tf.nn.relu),
    tf.keras.layers.Dropout(0.2),
    # Single sigmoid unit: one score in (0, 1) per input row.
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid),
])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         5600896   
_________________________________________________________________
dropout (Dropout)            (None, None, 128)         0         
_________________________________________________________________
conv1d (Conv1D)              (None, None, 32)          12320     
_________________________________________________________________
global_max_pooling1d (Global (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 250)               8250      
_________________________________________________________________
dropout_1 (Dropout)  

In [0]:
# Binary cross-entropy matches the single sigmoid output; accuracy is
# tracked as the reporting metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)

In [0]:
# Train on the training split and score the held-out split after each epoch;
# the returned History object is kept in `history`.
history = model.fit(
    x=train_input,
    y=train_label,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(eval_input, eval_label),
)

Train on 135000 samples, validate on 15000 samples
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


In [0]:
# File names (relative to DATA_IN_PATH) of the held-out test arrays.
TEST_INPUT_DATA = 'nsmc_test_input.npy'
TEST_LABEL_DATA = 'nsmc_test_label.npy'

# Passing a path (rather than an already-open file object) lets np.load open
# and close the .npy file itself, so no file handle is left dangling.
test_input_data = np.load(os.path.join(DATA_IN_PATH, TEST_INPUT_DATA))
test_label_data = np.load(os.path.join(DATA_IN_PATH, TEST_LABEL_DATA))

In [0]:
# Score the trained model on the test arrays.
result = model.evaluate(x=test_input_data, y=test_label_data)



In [0]:
# Display the evaluation output: the loss followed by the compiled metric (accuracy).
result

[0.38803097864151, 0.82388]

In [0]:
# Raw sigmoid scores for the test inputs, one row per example.
model.predict(x=test_input_data)

array([[0.9702003 ],
       [0.53041416],
       [0.6950706 ],
       ...,
       [0.8840578 ],
       [0.38493398],
       [0.6457652 ]], dtype=float32)