In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the tab-separated corpus: each usable line is "<review text>\t<integer label>".
# Lines that do not split into exactly two fields are skipped.
with open('../data/Korean_movie_reviews_2016.txt', encoding='utf-8') as f:
    fields_per_line = (line.strip().split('\t') for line in f)
    docs = [
        (fields[0], int(fields[1]))
        for fields in fields_per_line
        if len(fields) == 2
    ]

# Transpose the (text, label) pairs into two parallel tuples.
texts, labels = zip(*docs)

In [3]:
# Whitespace-tokenize every review. A no-argument split() already ignores
# leading/trailing whitespace, so no separate strip() is needed.
words_list = [text.split() for text in texts]

In [4]:
# Sanity check: show the token lists of the first two reviews.
print(words_list[:2])

In [5]:
# Flatten the per-review token lists into one corpus-wide list of tokens.
total_words = [token for tokens in words_list for token in tokens]

In [6]:
from collections import Counter

# Corpus-wide frequency of every token.
c = Counter()
c.update(total_words)

In [7]:
# Vocabulary size: keep only the `max_features` most frequent words.
max_features = 10000
common_words = [entry[0] for entry in c.most_common(max_features)]

In [8]:
# Build the word -> index lookup. Indices start at 1 (most frequent word),
# which leaves index 0 unused by any real word.
words_dic = {
    vocab_word: position
    for position, vocab_word in enumerate(common_words, start=1)
}

In [9]:
# Represent each document as the index numbers of its in-vocabulary words
# (the top `max_features` words held in words_dic).
# Out-of-vocabulary words are dropped on purpose. An explicit membership test
# states that intent directly, whereas a bare `except: pass` would also hide
# unrelated failures (it swallows every exception, including KeyboardInterrupt).
filtered_indexed_words = [
    [words_dic[word] for word in review if word in words_dic]
    for review in words_list
]

In [10]:
# Inspect the index sequence produced for the first review.
filtered_indexed_words[0]

In [11]:
# Number of in-vocabulary tokens kept for each document.
docs_len = list(map(len, filtered_indexed_words))

In [12]:
# Single-column frame of document lengths, used for the distribution checks.
df = pd.DataFrame({"doc_len": docs_len})

In [13]:
# Histogram of the per-document token counts (the "doc_len" column).
df.hist()

In [14]:
# Summary statistics of the document lengths.
# NOTE(review): presumably this informs the max_len chosen for padding — confirm.
df.describe()

In [15]:
from tensorflow.keras.preprocessing import sequence

# Fixed input length for the network. Shorter sequences are zero-padded and
# longer ones are cut, both at the front (the library defaults, spelled out here).
max_len = 40
X = sequence.pad_sequences(
    filtered_indexed_words,
    maxlen=max_len,
    padding='pre',
    truncating='pre',
)

In [16]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the integer labels parsed from the data file.
# NOTE(review): assumes labels are 0/1 so the result has 2 columns, matching
# the 2-unit softmax output layer — confirm against the data.
y_one_hot = to_categorical(labels)

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for the final evaluation.
# A fixed random_state makes the split identical on every Restart & Run All,
# so the reported test accuracy is reproducible and comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_one_hot, test_size=0.1, random_state=42
)

In [18]:
# Number of samples in the held-out test split.
len(y_test)

In [19]:
from tensorflow.keras import layers
from tensorflow.keras import models

In [20]:
# 1D-CNN text classifier: embedding -> two conv/max-pool stages -> dense head.
model = models.Sequential([
    # Vocabulary indices run from 1 to max_features and 0 is the padding value,
    # hence max_features+1 embedding rows.
    layers.Embedding(max_features+1, 128, input_length=max_len),
    layers.Conv1D(32, 5, activation='relu'),
    layers.MaxPool1D(2),
    layers.Conv1D(16, 3, activation='relu'),
    layers.MaxPool1D(2),
    layers.Flatten(),
    layers.Dense(32, activation='relu'),
    # Two output units, one per class of the one-hot encoded labels.
    layers.Dense(2, activation='softmax'),
])
model.summary()

In [21]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Location where the best weights seen during training are written.
checkpoint_filepath = './temp/checkpoint_kr'

# Stop training once val_loss has failed to improve for 3 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', patience=3, verbose=1)

# Save weights only, and only when val_loss reaches a new minimum.
mc = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)

In [22]:
from tensorflow.keras.optimizers import RMSprop

# The model ends in a 2-unit softmax and is trained on one-hot targets, so
# categorical cross-entropy is the matching loss.
# `metrics` is documented as a list; a bare string is not a supported form
# and is rejected by newer Keras releases.
model.compile(
    optimizer=RMSprop(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [23]:
# Train for up to 20 epochs; 10% of the training data is set aside for
# validation, which drives both early stopping and checkpointing.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=20,
    validation_split=0.1,
    callbacks=[es, mc],
)

In [24]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch.
for curve_key in ('loss', 'val_loss'):
    plt.plot(history.history[curve_key])
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train','val'])
plt.show()

In [25]:
# Restore the weights written by the ModelCheckpoint callback, which saved
# only when val_loss improved (save_best_only=True).
model.load_weights(checkpoint_filepath)

In [26]:
# Loss and accuracy of the restored model on the held-out test split.
test_loss, test_acc = model.evaluate(X_test,y_test)

In [27]:
# Display the test accuracy.
test_acc

In [28]:
# Softmax output (one value per class) for the first test sample; the 0:1
# slice keeps the batch dimension.
model.predict(X_test[0:1])

In [29]:
# One-hot ground-truth label of the same first test sample, for comparison.
y_test[0]