### GRU

In [12]:
from tensorflow.keras.datasets import imdb

In [26]:
# pandas is otherwise only imported further down the notebook (next to the CSV
# load), so import it here as well to keep this cell runnable in document order.
import pandas as pd

# Show full cell contents (no truncation of long review text) and up to 100 rows.
# The option key is spelled exactly: the previous 'display.max.colwidth' only
# resolved because pandas treats option names as regex patterns ('.' matched '_').
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)

In [13]:
# Fetch the complete IMDB review corpus (no vocabulary cap) as train/test splits.
(X_train, y_train), (X_test, y_test) = imdb.load_data()

# Split sizes and the number of distinct label values in the training targets.
train_count, test_count = map(len, (X_train, X_test))
num_classes = len(set(y_train))

print('훈련용 리뷰 개수 : {}'.format(train_count))
print('테스트용 리뷰 개수 : {}'.format(test_count))
print('카테고리 : {}'.format(num_classes))

훈련용 리뷰 개수 : 25000
테스트용 리뷰 개수 : 25000
카테고리 : 2


In [14]:
# Number of tokens in each training review.
reviews_length = list(map(len, X_train))

# Count how many training reviews carry each label value.
unique_elements, counts_elements = np.unique(y_train, return_counts=True)
print("각 레이블에 대한 빈도수:")
# Row 0 holds the label values, row 1 their counts.
print(np.vstack((unique_elements, counts_elements)))

각 레이블에 대한 빈도수:
[[    0     1]
 [12500 12500]]


In [15]:
# Invert the IMDB word -> rank mapping into an id -> word lookup.
# Every rank is shifted by 3 (presumably to leave room for special-token ids).
word_to_index = imdb.get_word_index()
index_to_word = {rank + 3: word for word, rank in word_to_index.items()}

In [16]:
# Register the three special tokens under ids 0, 1 and 2.
index_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))

# Decode the first training review back into text as a sanity check.
print(' '.join(index_to_word[token_id] for token_id in X_train[0]))

<sos> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert redford's is an amazing actor and now the same being director norman's father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for retail and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also congratulations to the two little boy's that played the part's of norman and paul they were just brilliant children are often left out of the praising list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and shoul

In [17]:
import tensorflow as tf

In [18]:
import re
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model

# Vocabulary cap handed to the dataset loader, and the padded sequence length.
vocab_size = 10000
max_len = 500

# Reload IMDB restricted to `vocab_size` words; this replaces the X_*/y_* splits
# loaded earlier in the notebook.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)

# Bring both splits to `max_len` token ids per review.
X_train, X_test = (
    pad_sequences(split, maxlen=max_len) for split in (X_train, X_test)
)

In [10]:
# Sizes of the embedding vectors and the GRU state.
embedding_dim = 100
hidden_units = 128

# Embedding -> GRU -> single sigmoid unit (binary classifier).
model = Sequential([
    Embedding(vocab_size, embedding_dim),
    GRU(hidden_units),
    Dense(1, activation='sigmoid'),
])

# Stop once val_loss has gone 10 epochs without improving, and write a
# checkpoint only when val_acc reaches a new maximum.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
mc = ModelCheckpoint('GRU_model.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['acc'],
)

# Train for up to 20 epochs, holding out the last 20% of the training data for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    callbacks=[es, mc],
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/20
Epoch 1: val_acc improved from -inf to 0.50140, saving model to GRU_model.h5
Epoch 2/20
Epoch 2: val_acc improved from 0.50140 to 0.51440, saving model to GRU_model.h5
Epoch 3/20
Epoch 3: val_acc did not improve from 0.51440
Epoch 4/20
Epoch 4: val_acc did not improve from 0.51440
Epoch 5/20
Epoch 5: val_acc improved from 0.51440 to 0.51860, saving model to GRU_model.h5
Epoch 6/20
Epoch 6: val_acc improved from 0.51860 to 0.52340, saving model to GRU_model.h5
Epoch 7/20
Epoch 7: val_acc improved from 0.52340 to 0.52460, saving model to GRU_model.h5
Epoch 8/20
Epoch 8: val_acc improved from 0.52460 to 0.52560, saving model to GRU_model.h5
Epoch 9/20

KeyboardInterrupt: 

In [19]:
# Load a previously saved model from disk.
# NOTE(review): absolute, machine-specific path; its file name also differs from
# the 'GRU_model.h5' checkpoint written by the training cell — confirm provenance.
loaded_model = load_model('/Users/suchan/study/파이널 프로젝트/0506_GRU_model.h5')

# evaluate() returns a list; element 1 is reported as the test accuracy.
test_metrics = loaded_model.evaluate(X_test, y_test)
print("\n 테스트 정확도: %.4f" % test_metrics[1])


 테스트 정확도: 0.8950


In [20]:
def sentiment_predict(new_sentence, threshold=0.6):
    """Classify one English review with ``loaded_model``.

    The text is stripped of everything except ASCII letters, digits and
    spaces, lowercased, split on whitespace and mapped to IMDB token ids
    (``word_to_index[word] + 3``). Words that are missing from
    ``word_to_index``, or whose id falls outside the ``vocab_size``
    vocabulary, are replaced by the ``<unk>`` id (2). The id sequence is
    padded to ``max_len`` and scored by ``loaded_model``.

    Args:
        new_sentence: Raw review text (str).
        threshold: Score cut-off; defaults to 0.6.

    Returns:
        0 if the model score is strictly greater than ``threshold``,
        otherwise 1.

    NOTE(review): the training cell fits a sigmoid unit on the IMDB labels,
    where 1 presumably means "positive"; a high score is therefore mapped to
    0 here. Confirm that consumers of the result expect 0 = high score.
    """
    # Remove everything except letters, digits and spaces, then lowercase.
    new_sentence = re.sub('[^0-9a-zA-Z ]', '', new_sentence).lower()

    # Whitespace tokenisation followed by integer encoding.
    encoded = []
    for word in new_sentence.split():
        rank = word_to_index.get(word)
        # imdb.load_data(num_words=vocab_size) only keeps ids strictly below
        # vocab_size *after* the +3 shift, and the Embedding layer has exactly
        # vocab_size rows. The previous check (rank <= 10000) let ids up to
        # 10003 through, i.e. ids the model was never trained on.
        if rank is not None and rank + 3 < vocab_size:
            encoded.append(rank + 3)
        else:
            # Unknown or out-of-vocabulary word -> <unk>.
            encoded.append(2)

    pad_sequence = pad_sequences([encoded], maxlen=max_len)
    # predict() returns a (1, 1) array here; pull out the scalar explicitly,
    # since float() on a non-0-d array is deprecated in recent NumPy.
    score = float(loaded_model.predict(pad_sequence)[0][0])

    if score > threshold:
        return 0
    return 1

In [21]:
import pandas as pd
# Load the review dataset to be labelled; later cells read its 'reviews' and
# 'title' columns.
# NOTE(review): absolute, machine-specific path — will not resolve elsewhere.
eng = pd.read_csv('/Users/suchan/study/파이널 프로젝트/0505_ENG_Korean.csv')

In [None]:
# senti_final = []
# for i in range(0, 5):
#   test_input = eng['reviews'][i]
#   re_senti = sentiment_predict(test_input)
#   senti_final.append(re_senti)
# senti_final

In [22]:
from tqdm import tqdm

In [23]:
# Score every review. Iterating over the column's values (instead of looking
# each one up with eng['reviews'][i]) does not depend on the frame having a
# default 0..n-1 index, and still gives tqdm a length for the progress bar.
# NOTE: sentiment_predict issues one model.predict call per review, which is
# likely why this cell is slow; batching the padded sequences would help.
pos_neg = [sentiment_predict(review) for review in tqdm(eng['reviews'])]

100%|██████████| 27699/27699 [28:28<00:00, 16.21it/s]


In [24]:
# Attach the predicted labels as a new 'pos_neg' column (mutates `eng` in place).
# Assumes pos_neg holds one entry per row of eng, in row order.
eng['pos_neg'] = pos_neg

In [32]:
# For every distinct title, print how many of its reviews fall into each
# pos_neg class.
for title in eng['title'].unique():
    title_rows = eng.loc[eng['title'] == title]
    print(title)
    print(title_rows['pos_neg'].value_counts())

악의마음을읽는자들
0    121
1     57
Name: pos_neg, dtype: int64
라이프온마스
0    73
1    21
Name: pos_neg, dtype: int64
옷소매붉은끝동
0    886
1    510
Name: pos_neg, dtype: int64
이구역의미친X
0    70
1    31
Name: pos_neg, dtype: int64
어느날우리집현관으로멸망이들어왔다
0    397
1    229
Name: pos_neg, dtype: int64
스물다섯스물하나
0    877
1    502
Name: pos_neg, dtype: int64
무브투헤븐:나는유품정리사입니다
0    155
1     41
Name: pos_neg, dtype: int64
호텔델루나
0    588
1    258
Name: pos_neg, dtype: int64
하늘에서내리는일억개의별
0    159
1     86
Name: pos_neg, dtype: int64
기름진멜로
0    35
1    28
Name: pos_neg, dtype: int64
악의꽃
0    512
1    161
Name: pos_neg, dtype: int64
서른이지만열일곱입니다
0    109
1     21
Name: pos_neg, dtype: int64
킹덤
0    280
1     68
Name: pos_neg, dtype: int64
작은신의아이들
1    11
0     9
Name: pos_neg, dtype: int64
검색어를입력하세요WWW
0    130
1     45
Name: pos_neg, dtype: int64
술꾼도시여자들
0    25
1    21
Name: pos_neg, dtype: int64
한번다녀왔습니다
0    53
1    28
Name: pos_neg, dtype: int64
서른아홉
0    57
1    47
Name: pos_neg, dtype: int64
오월의청춘
0    240
1     8

In [58]:
# Write the frame (including the added 'pos_neg' column) to a CSV in the current
# working directory, omitting the index column.
eng.to_csv('0506_ENG_Korean_plus_Sentiment.csv',index=False)