**<font size='6' color='red'>ch05. RNN으로 영화평 구분하기</font>**
- 5만개 영화 감상평 : 타겟변수로 부정/긍정

In [1]:
# 1. 패키지 수입
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from time import time # 70.1.1 부터 현재까지 몇초지났는지

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

from sklearn.metrics import confusion_matrix, f1_score

In [2]:
# 2. 하이퍼 파라미터 설정 (이 파라미터를 바꾸면 정확도나 속도에 차이남) 
MY_WORDS = 10000 # imdb 데이터 안의 단어 수
MY_LENGTH = 80 # 영화평 단어 수 80 개만 독립변수로 사용
MY_EMBED = 32 # Embedding 결과 차원
MY_HIDDEN = 64 # LSTM의 units 차원

MY_EPOCH = 10 # 반복 학습수(fit)
MY_BATCH = 200 # 배치 사이즈(fit)

In [3]:
# 3. 데이터 불러오기
(X_train, y_train),(X_test, y_test) = imdb.load_data(num_words=MY_WORDS)

In [4]:
print('학습용 입력데이터(독립변수) 모양 :', X_train.shape)
print('학습용 출력데이터(종속변수) 모양 :', y_train.shape)
print('학습용 입력데이터 샘플 :',len(X_train[0]), '-', X_train[0])
print('학습용 출력데이터 샘플(0:부정/ 1:긍정) :', y_train[0])

print('테스트용 입력데이터(독립변수) 모양 :', X_test.shape)
print('테스트용 출력데이터(종속변수) 모양 :', y_test.shape)
print('테스트용 입력데이터 샘플 :', len(X_test[0]), '-', X_test[0])
print('테스트용 출력데이터 샘플(0:부정/ 1:긍정) :', y_test[0])

학습용 입력데이터(독립변수) 모양 : (25000,)
학습용 출력데이터(종속변수) 모양 : (25000,)
학습용 입력데이터 샘플 : 218 - [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16

In [5]:
# 긍정 갯수
print('학습용 데이터의 긍정 갯수 :', y_train.sum())
print('테스트용 데이터의 긍정 갯수 :', y_test.sum())

학습용 데이터의 긍정 갯수 : 12500
테스트용 데이터의 긍정 갯수 : 12500


In [6]:
# 4. 영화평 학습용 데이터 처음 10개 길이 출력 함수
def show_length():
    print('첫 10개 영화평의 길이')
    for i in range(10):
        print(f'{i+1}번째 {len(X_train[i])}')
show_length()

첫 10개 영화평의 길이
1번째 218
2번째 189
3번째 141
4번째 550
5번째 147
6번째 43
7번째 123
8번째 562
9번째 233
10번째 130


In [7]:
# 5. 문자 단어 -> 정수
word_to_id = imdb.get_word_index() # dict (문자단어 : 정수 id)
print(word_to_id['movie'])
print(word_to_id['film'])
# 정수 -> 문자 단어
id_to_word = {} # dict (정수id : 문자단어)
for key, val in word_to_id.items():
    id_to_word[val] = key
print(id_to_word[17])
print(id_to_word[19])

17
19
movie
film


In [8]:
msg = 'What a wonderful movie'
msg = msg.lower().split()
# 1: 리뷰시작을 알리는 숫자, 2: 문자가 짤려서 잘못 읽어옴, 3:
data = [1] + [word_to_id.get(m, -1)+3 for m in msg]
print('원 후기 내용 :', msg)
print('encoded된 data :', data)
print('data 추정 :', ' '.join([id_to_word.get(d-3, '???') for d in data]))

원 후기 내용 : ['what', 'a', 'wonderful', 'movie']
encoded된 data : [1, 51, 6, 389, 20]
data 추정 : ??? what a wonderful movie


In [9]:
# 6. 숫자 영화평 -> 자연어 영화평 함수
def decoding(review_num):
    decoded = []
    for num in review_num:
        word = id_to_word.get(num-3, '???')
        decoded.append(word)
    print(' '.join(decoded))
decoding(X_train[0])

??? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ??? is an amazing actor and now the same being director ??? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ??? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ??? to the two little boy's that played the ??? of norman and paul they were just brilliant children are often left out of the ??? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done

In [10]:
print('pad_sequence 작업전')
show_length()

pad_sequence 작업전
첫 10개 영화평의 길이
1번째 218
2번째 189
3번째 141
4번째 550
5번째 147
6번째 43
7번째 123
8번째 562
9번째 233
10번째 130


In [11]:
# 7. 모든 영화평의 길이를 동일하게 만들기
X_train = pad_sequences(X_train,  # 2차원
                        padding='post',
                        truncating='post',  # 뒷부분을 자르고 앞부분을 남김
                        maxlen=MY_LENGTH)
X_test = pad_sequences(X_test,
                       padding='post',
                       truncating='post',
                       maxlen=MY_LENGTH)
show_length()

첫 10개 영화평의 길이
1번째 80
2번째 80
3번째 80
4번째 80
5번째 80
6번째 80
7번째 80
8번째 80
9번째 80
10번째 80


In [12]:
# 8. 최종 데이터 shape 확인
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((25000, 80), (25000,), (25000, 80), (25000,))

In [13]:
# 9. 모델 생성 및 구현
model = Sequential()
model.add(Embedding(input_dim=MY_WORDS,
                   output_dim=MY_EMBED,
                   input_length=MY_LENGTH))
model.add(LSTM(units=MY_HIDDEN,
              input_shape=(MY_LENGTH, MY_EMBED)))
model.add(Dense(units=1,
               activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 80, 32)            320000    
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 344,897
Trainable params: 344,897
Non-trainable params: 0
_________________________________________________________________


In [14]:
# 10. 학습 환경 설정 및 학습하기
model.compile(loss='binary_crossentropy',  # 이중분류 시 손실함수
              optimizer='adam', metrics=['accuracy'])
begin = time()  # 70.1.1 ~ 현재시점까지의 초
model.fit(x=X_train, y=y_train, epochs=MY_EPOCH, batch_size=MY_BATCH, verbose=1, validation_split=0.2)
end = time()  # 70.1.1 ~ 현재시점까지의 초
print('총 학습 시간 : {:.2f}'.format(end-begin))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
총 학습 시간 : 24.18


In [15]:
# 11. 모델 평가
loss, acc = model.evaluate(X_test, y_test, verbose=1)
print('test데이터 정확도(acc) :', acc)

test데이터 정확도(acc) : 0.7557600140571594


In [16]:
# 혼돈 행렬
pred = model.predict(X_test)
y_hat = (pred>0.5).astype(int).reshape(-1)
y_hat



array([0, 1, 1, ..., 0, 1, 1])

In [17]:
y_test.shape, y_hat.shape

((25000,), (25000,))

In [18]:
confusion_matrix(y_test, y_hat)

array([[9461, 3039],
       [3067, 9433]], dtype=int64)

In [19]:
# accuracy (전체 중 정답을 맞춘 비율) :
#          = (TN+TP) / (TN+FP+FN+TP)
# precision(정밀도, 민감도 - True로 예측한 것중 맞춘 비율)
#          = TP / FP+TP
# recall(재현율 - 실제값이 True인 것 중 True로 맞춘 비율)
#          = TP / (FN/TP)

In [32]:
# 12. 모델 사용하기
review = """What a wonderful movie
The actors roles were good the content was good and the sound was good 
There was a disadvantage that I wanted to go to the bathroom in the middle because the running time was long
The reservation rate was not as high as I thought so I did a good job with the reservation
I highly recommend it
You won't regret it if you watch it"""
review = review.lower().split()
review = [1]+[word_to_id.get(r, -1)+3 for r in review]
print(len(review),[review])
review = np.array(review).reshape(-1, len(review))
review.shape

71 [[1, 51, 6, 389, 20, 4, 156, 555, 71, 52, 4, 1500, 16, 52, 5, 4, 481, 16, 52, 50, 16, 6, 23667, 15, 13, 473, 8, 140, 8, 4, 3868, 11, 4, 655, 88, 4, 620, 58, 16, 196, 4, 14802, 967, 16, 24, 17, 312, 17, 13, 197, 38, 13, 122, 6, 52, 292, 19, 4, 14802, 13, 545, 386, 12, 25, 528, 2598, 12, 48, 25, 106, 12]]


(1, 71)

In [33]:
input_data = pad_sequences(review,
                          padding='post',
                          maxlen=MY_LENGTH,
                          truncating='post' # 80개 단어 이상일 경우 짜르기
                          )
input_data.shape

(1, 80)

In [35]:
(model.predict(input_data)>0.5).astype('int8')



array([[1]], dtype=int8)