In [1]:
import sys
import os
import json

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.utils import to_categorical
from tensorflow.python.keras.layers import Input, Embedding, Activation, Flatten, Dense, Conv1D, MaxPooling1D, Dropout
from tensorflow.python.keras.models import Model

In [2]:
def preprocessing(review): 
    # 불용어 제거는 옵션으로 선택 가능하다.
    
    # 1. HTML 태그 제거
    review_text = BeautifulSoup(review, "html.parser").get_text()	

    # 2. 대문자들을 소문자로 바꾸고 공백단위로 텍스트들 나눠서 리스트로 만든다.
    words = review_text.lower().split()

    return ' '.join(words)

In [3]:
DATA_IN_PATH = './data_in/'

train_df = pd.read_csv( DATA_IN_PATH + 'labeledTrainData.tsv', header = 0, delimiter = '\t', quoting = 3)
reviews = []
for review in train_df['review']:
    reviews.append(preprocessing(review))

FileNotFoundError: [Errno 2] File b'./data_in/labeledTrainData.tsv' does not exist: b'./data_in/labeledTrainData.tsv'

In [None]:
# Comparison between original data and clean data
print(train_df['review'][0])
print()
print(reviews[0])

In [None]:
train_texts = np.array(reviews)

In [None]:
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(train_texts)

In [None]:
tk.get_config()

In [None]:
# Make Character Dict
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
char_dict = {}
for i, char in enumerate(alphabet):
    char_dict[char] = i + 1
tk.word_index = char_dict.copy()
tk.word_index[tk.oov_token] = len(char_dict) + 1

In [None]:
# Make Train data
train_sequence = tk.texts_to_sequences(train_texts)
train_sequence_pad = pad_sequences(train_sequence, maxlen=1014, padding='post')
train_data = np.array(train_sequence_pad, dtype='float32')

In [None]:
train_sequence

In [10]:
# Make label data
train_label_np = train_df['sentiment'].values
train_label_one_hot = to_categorical(train_label_np)

In [12]:
# Parameter
input_size = 1014
VALID_SPLIT = 0.1
dropout_prob = 0.5
embedding_size = 69
num_of_classes = len(train_label_one_hot[0])
characters_size = len(tk.word_index)

In [13]:
train_input, val_input, train_label, val_label = train_test_split(train_data, train_label_one_hot, test_size=VALID_SPLIT, shuffle=True)

In [14]:
train_input

array([[ 9., 12., 13., ..., 25., 39., 44.],
       [18., 15., 14., ...,  1., 20., 44.],
       [ 7., 69.,  3., ..., 19., 39., 44.],
       ...,
       [15., 18., 69., ...,  3., 59., 44.],
       [44., 20.,  8., ...,  0.,  0.,  0.],
       [44.,  9., 69., ...,  0.,  0.,  0.]], dtype=float32)

In [15]:
# Embedding weights
embedding_weights = []  # (70, 69)
embedding_weights.append(np.zeros(characters_size))  # (0, 69)
for char, i in tk.word_index.items():  # from index 1 to 69
    onehot = np.zeros(characters_size)
    onehot[i - 1] = 1
    embedding_weights.append(onehot)
embedding_weights = np.array(embedding_weights)

In [16]:
# Embedding layer Initialization
embedding_layer = Embedding(characters_size + 1,
                            embedding_size,
                            input_length=input_size,
                            weights=[embedding_weights])

In [17]:
# Input
inputs = Input(shape=(input_size,), name='input', dtype='int64')  # shape=(?, 1014)

# Embedding
x = embedding_layer(inputs)

# Convolutional Layer

#-------Layer 1-------#
x = Conv1D(256, 7)(x)
x = Activation('relu')(x)
x = MaxPooling1D(pool_size=3)(x)

#-------Layer 2-------#
x = Conv1D(256, 7)(x)
x = Activation('relu')(x)
x = MaxPooling1D(pool_size=3)(x)

#-------Layer 3-------#
x = Conv1D(256, 7)(x)
x = Activation('relu')(x)

#-------Layer 4-------#
x = Conv1D(256, 7)(x)
x = Activation('relu')(x)

#-------Layer 5-------#
x = Conv1D(256, 7)(x)
x = Activation('relu')(x)

#-------Layer 6-------#
x = Conv1D(256, 7)(x)
x = Activation('relu')(x)
x = MaxPooling1D(pool_size=3)(x)

x = Flatten()(x)  # (None, 8704)

# Fully connected layers

#-------Layer 7-------#
x = Dense(1024)(x)
x = Activation('relu')(x)
x = Dropout(dropout_prob)(x)

#-------Layer 8-------#
x = Dense(1024)(x)
x = Activation('relu')(x)
x = Dropout(dropout_prob)(x)

#-------Layer 9-------#
outputs = Dense(num_of_classes, activation='softmax')(x)

# Build model
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])  

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [18]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 1014)              0         
_________________________________________________________________
embedding (Embedding)        (None, 1014, 69)          4830      
_________________________________________________________________
conv1d (Conv1D)              (None, 1008, 256)         123904    
_________________________________________________________________
activation (Activation)      (None, 1008, 256)         0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 336, 256)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 330, 256)          459008    
_________________________________________________________________
activation_1 (Activation)    (None, 330, 256)          0         
__________

In [19]:
char_cnn = tf.keras.estimator.model_to_estimator(keras_model=model)

train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={'input': train_input},
    y=train_label,
    num_epochs=10,
    shuffle=False
)



INFO:tensorflow:Using default config.
INFO:tensorflow:Using the Keras model provided.
Instructions for updating:
Use tf.cast instead.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\solugate\\AppData\\Local\\Temp\\tmpwsoa_3ns', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000002240C3BA6D8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [22]:
val_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={'input': val_input},
    y=val_label,
    shuffle=False
)

In [20]:
char_cnn.train(input_fn=train_input_fn)

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Warm-starting with WarmStartSettings: WarmStartSettings(ckpt_to_initialize_from='C:\\Users\\solugate\\AppData\\Local\\Temp\\tmpwsoa_3ns\\keras\\keras_model.ckpt', vars_to_warm_start='.*', var_name_to_vocab_info={}, var_name_to_prev_var_name={})
INFO:tensorflow:Warm-starting from: ('C:\\Users\\solugate\\AppData\\Local\\Temp\\tmpwsoa_3ns\\keras\\keras_model.ckpt',)
INFO:tensorflow:Warm-starting variable: embedding/embeddings; prev_var_name: Unchanged
INFO:tensorflow:Warm-starting variable: conv1d/kernel; prev_var_name: Unchanged
INFO:tensorflow:Warm-starting variable: conv1d/bias; prev_var_name: Unchanged
INFO:tensorflow:Warm-starting variable: conv1d_1/kernel; prev_var_name: Unchanged
INFO:tensorflow:Warm-starting variable: conv1d_1/bia

<tensorflow_estimator.python.estimator.estimator.Estimator at 0x225086304a8>

In [23]:
char_cnn.evaluate(input_fn=val_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
Instructions for updating:
Use tf.cast instead.
INFO:tensorflow:Starting evaluation at 2019-03-08T09:02:35Z
INFO:tensorflow:Graph was finalized.
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from C:\Users\solugate\AppData\Local\Temp\tmpwsoa_3ns\model.ckpt-1759
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-03-08-09:02:43
INFO:tensorflow:Saving dict for global step 1759: categorical_accuracy = 0.5164, global_step = 1759, loss = 0.6926784
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1759: C:\Users\solugate\AppData\Local\Temp\tmpwsoa_3ns\model.ckpt-1759


{'categorical_accuracy': 0.5164, 'loss': 0.6926784, 'global_step': 1759}