# IMDB dataset
으로 감성분류(이항분류) - LSTM+Dense, Conv+Dense

In [None]:
from keras.datasets import imdb
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout

# Load the IMDB reviews with the vocabulary capped via num_words=10000.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=10000)

# Shapes recorded from the original run: (25000,) (25000,) (25000,) (25000,)
print(*(split.shape for split in (x_train, y_train, x_test, y_test)))
print(x_train[:1])  # first review, stored as a list of word indices
print(y_train[:1], set(y_train))  # labels are binary: {0, 1}

In [None]:
import matplotlib.pyplot as plt

# Review-length statistics for the training split.
len_data = list(map(len, x_train))
print('요소 최대 크기 : ', np.max(len_data))  # 2494
print('요소 크기 평균 : ', np.mean(len_data))  # 238.71364
# Optional: visualise the length distribution.
# plt.boxplot(len_data)
# plt.show()

# Build the reverse lookup (index -> word) so encoded reviews can be read.
# Indices are shifted by 3 because 0, 1 and 2 are reserved for special tokens.
word_to_index = imdb.get_word_index()
index_to_word = {rank + 3: word for word, rank in word_to_index.items()}

print(index_to_word)  # {34701: 'fawn', 52006: 'tsukino', 52007: 'nunnery', ...
print('빈도수 1등 : ', index_to_word[4])
print('빈도수 100등 : ', index_to_word[103])
print()

# IMDB uses 0 for padding, 1 for start-of-sequence and 2 for unknown words.
# This numbering is fixed by the dataset, so we follow it as-is.
index_to_word.update({0: '<pad>', 1: '<sos>', 2: '<unk>'})

# Decode the first training review back into text.
print(' '.join(index_to_word[token_id] for token_id in x_train[0]))

In [None]:
from keras.utils import pad_sequences
# early stopping주기
from keras.callbacks import EarlyStopping, ModelCheckpoint

max_len = 500  # cap every review at 500 tokens

# Pad (or truncate) every review to exactly max_len tokens.
x_train = pad_sequences(x_train, maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)
# print(x_train[:1])

# Model 1: Embedding -> LSTM -> Dense
model = Sequential()
model.add(Embedding(10001, 200, input_length=max_len))
model.add(LSTM(128, activation='tanh'))
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would emit a stray "None").
model.summary()  # Total params: 2168777 (8.27 MB)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Stop once val_loss has failed to improve for 5 consecutive epochs.
# `baseline` is intentionally not set: Keras treats it as a value the monitored
# metric has to beat, and a val_loss of 0.01 is never reached on this task, so
# the patience counter would never reset and training would always halt after
# exactly `patience` epochs regardless of progress.
es = EarlyStopping(monitor='val_loss', mode='auto', patience=5)
# Keep only the weights of the epoch with the lowest val_loss on disk.
mc = ModelCheckpoint('tf37m1.hdf5', monitor='val_loss', save_best_only=True)

history = model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=100,
    callbacks=[es, mc],
    verbose=2,
)

print('acc: ', history.history['acc'])
print('loss: ', history.history['loss'])
# evaluate() returns [loss, acc] for the in-memory (last-epoch) weights.
print('evaluate: ', model.evaluate(x_test, y_test))



In [None]:
#pred
from keras.models import load_model
# Reload the checkpoint saved as 'tf37m1.hdf5' and score the test set with it.
mymodel = load_model('tf37m1.hdf5')
pred = mymodel.predict(x_test)

# Show the first ten predicted probabilities next to the true labels.
print('예측값 : ', pred[:10].ravel())
print('실제값 : ', y_test[:10])


In [None]:
# 모델 작성 방법2 : Conv1D + Dense
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, Dropout

# Model 2: Embedding -> Conv1D -> GlobalMaxPooling1D -> Dropout -> Dense
model = Sequential()
model.add(Embedding(10001, 200, input_length=max_len))
model.add(Conv1D(filters=128, kernel_size=3, padding='valid', strides=1, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would emit a stray "None").
model.summary()  # Total params: 2085449 (7.96 MB)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

# Stop once val_loss has failed to improve for 5 consecutive epochs.
# `baseline` is intentionally not set: Keras treats it as a value the monitored
# metric has to beat, and a val_loss of 0.01 is never reached on this task, so
# the patience counter would never reset and training would always halt after
# exactly `patience` epochs regardless of progress.
es = EarlyStopping(monitor='val_loss', mode='auto', patience=5)
# Keep only the weights of the epoch with the lowest val_loss on disk.
mc = ModelCheckpoint('tf37m2.hdf5', monitor='val_loss', save_best_only=True)

history = model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=100,
    callbacks=[es, mc],
    verbose=2,
)

print('acc: ', history.history['acc'])
print('loss: ', history.history['loss'])
# evaluate() returns [loss, acc] for the in-memory (last-epoch) weights.
print('evaluate: ', model.evaluate(x_test, y_test))

In [None]:
# Visualisation: training loss vs. validation loss per epoch of the latest fit.
loss = history.history['loss']
vloss = history.history['val_loss']
epoch = np.arange(len(loss))

plt.plot(epoch, vloss, c='red', marker='.', label='val_loss')
plt.plot(epoch, loss, c='blue', marker='s', label='loss')

plt.xlabel('epoch')
plt.ylabel('loss')
plt.grid()
plt.legend()
plt.show()

#전통적인 방법 LSTM빼고 Dense만 써주어야 한다.

In [None]:
# Prediction: reload the checkpoint saved as 'tf37m2.hdf5' and score the test set.
mymodel = load_model('tf37m2.hdf5')
pred = mymodel.predict(x_test)

# Show the first ten predicted probabilities next to the true labels.
print('예측값 : ', pred[:10].ravel())
print('실제값 : ', y_test[:10])

In [None]:
#새로운 영화평 값으로 감성분류 tensorflow -p21
# IMDB dataset 긍부정 예측 함수
import re
def sentiment_predict(new_sentence):
    """Print the predicted sentiment (positive/negative) of a review.

    The text is stripped of every character other than ASCII letters, digits
    and spaces, lower-cased, encoded with the module-level ``word_to_index``,
    padded to ``max_len`` and scored by the module-level ``mymodel``.

    Args:
        new_sentence: Raw review text.

    Returns:
        None. The verdict and its confidence are printed.
    """
    vocab_size = 10000  # must match num_words passed to imdb.load_data
    index_from = 3      # dataset ids are the word-index rank shifted by 3
    unk_id = 2          # id used for <unk>

    cleaned = re.sub('[^0-9a-zA-Z ]', '', new_sentence).lower()

    # Integer-encode the words the same way the training data was encoded:
    # only shifted ids strictly below vocab_size are kept, everything else
    # (rarer words and words missing from the index) becomes <unk>. Comparing
    # the shifted id also keeps every id inside the Embedding input range.
    encoded = []
    for word in cleaned.split():
        rank = word_to_index.get(word)
        if rank is not None and rank + index_from < vocab_size:
            encoded.append(rank + index_from)
        else:
            encoded.append(unk_id)

    pad_new = pad_sequences([encoded], maxlen=max_len)  # pad to model input length

    # predict() returns a (1, 1) array; index the scalar out explicitly because
    # float() on a non-0-d array is deprecated in recent NumPy releases.
    score = float(mymodel.predict(pad_new)[0][0])
    if score > 0.5:
        print("{:.2f}% 확률로 긍정!.".format(score * 100))
    else:
        print("{:.2f}% 확률로 부정!".format((1 - score) * 100))

# Run the positive/negative classifier on a few hand-written reviews.
for temp_str in (
    "This movie was just way too overrated. The fighting was not professional.",
    "good",
    "I was lucky enough to be included in the group to see the advanced screening in Seoul. And,  I need to say a big thank-you to Marvel Studios.",
    "bad",
):
    sentiment_predict(temp_str)

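# CNNs handle natural language well, not just images.
# RNNs handle this kind of task well too.
# Whether to put a CNN or an RNN in front of the Dense layers has to be decided case by case; it depends on the data.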
