# IMDB 영화 리뷰 감성 분석

In [1]:
import numpy as np 
import tensorflow as tf
# One shared seed for every RNG this notebook touches, so that weight
# initialisation and shuffling are repeatable across kernel restarts.
# `seed` is reused further down as the random_state of the data split.
seed = 2021
for set_seed in (np.random.seed, tf.random.set_seed):
    set_seed(seed)

In [2]:
# Plotting / display setup for the notebook.
# NOTE(review): `plt` and `sns` are not used by any cell visible in this
# notebook section — confirm whether they are still needed.
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Silence *all* Python warnings for the rest of the session; this also hides
# deprecation warnings from TensorFlow/Keras, so disable it when debugging.
warnings.filterwarnings('ignore')

In [3]:
from tensorflow.keras.datasets import imdb

# First pass over the dataset: num_words=None puts no cap on the vocabulary,
# so the raw reviews can be inspected before any truncation is applied.
train_split, test_split = imdb.load_data(num_words=None)
X_train, y_train = train_split
X_test, y_test = test_split

# Each split is a 1-D array holding one variable-length review per entry.
X_train.shape, X_test.shape

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


((25000,), (25000,))

In [4]:
# Invert Keras' word -> index table into index -> word for decoding reviews.
# NOTE(review): imdb.load_data() shifts token ids by `index_from` (default 3,
# per the Keras docs) to reserve ids for padding/start/unknown — confirm and
# subtract that offset before looking ids from X_train up in this dict.
word_to_index = imdb.get_word_index()
index_dict = {idx: word for word, idx in word_to_index.items()}
len(index_dict)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


88584

In [5]:
# Token count of every training review, computed once and reused for both
# statistics; these numbers motivate the max_len chosen for padding later.
review_lengths = [len(review) for review in X_train]
print('영화평 최대 길이:', max(review_lengths))
print('영화평 평균 길이:', sum(review_lengths)/len(review_lengths))

영화평 최대 길이: 2494
영화평 평균 길이: 238.71364


### LSTM으로 IMDB 리뷰 감성 분류
- 사용 단어: 빈도수 상위 5,000개만 사용 (총 단어수: 88,584)
- 리뷰당 단어수: 500단어로 패딩/절단 (최대 길이: 2,494)
- Test data 중 10,000개는 검증 데이터로 사용

In [6]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
# Reload the dataset with the vocabulary capped at 5,000 words.
# This overwrites the uncapped X_train / X_test loaded earlier.
vocab_size = 5000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)

In [8]:
# Words per review: 500. Every review is padded or cut to exactly max_len
# token ids so the splits become rectangular 2-D arrays
# (pad_sequences' default padding/truncating side is used).
max_len = 500
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_len) for sequences in (X_train, X_test)
)

In [9]:
# Carve 10,000 of the 25,000 test reviews out as validation data
# (test_size=0.4), stratified so both parts keep the same label balance.
# NOTE: X_test / y_test are overwritten with the remaining 15,000 rows, so
# re-running this cell without reloading splits the reduced set again.
from sklearn.model_selection import train_test_split

split_options = dict(stratify=y_test, test_size=0.4, random_state=seed)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, **split_options)
X_test.shape, X_val.shape, y_test.shape, y_val.shape

((15000, 500), (10000, 500), (15000,), (10000,))

### 모델 정의/설정/학습/평가

In [10]:
# Binary sentiment classifier, built layer by layer:
#   token ids -> 120-d embeddings -> LSTM(120) -> sigmoid probability.
model = Sequential()
# 5000 must match the vocabulary cap used when loading the data.
model.add(Embedding(5000, 120))
# Only the LSTM's final state is passed on (output shape (None, 120)).
model.add(LSTM(120))
# Single sigmoid unit: probability of the positive class.
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 120)         600000    
_________________________________________________________________
lstm (LSTM)                  (None, 120)               115680    
_________________________________________________________________
dense (Dense)                (None, 1)                 121       
Total params: 715,801
Trainable params: 715,801
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Binary cross-entropy pairs with the single sigmoid output unit;
# accuracy is tracked alongside the loss for reporting.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [12]:
# Training callbacks: checkpoint the best model and stop early on plateau.
import os

model_file = 'model/best_imdb_lstm.h5'
# Create the checkpoint directory from the path itself. exist_ok=True makes
# this safe to re-run and avoids the check-then-create race of
# `if not os.path.exists(...): os.mkdir(...)`.
os.makedirs(os.path.dirname(model_file), exist_ok=True)

# Keep only the weights with the lowest validation loss seen so far.
mc = ModelCheckpoint(
    model_file, monitor='val_loss', save_best_only=True, verbose=1
)
# Stop once val_loss has failed to improve for 5 consecutive epochs.
es = EarlyStopping(monitor='val_loss', patience=5)

In [13]:
# Train on the padded training set and validate on the held-out split.
# epochs=50 is only an upper bound: the early-stopping callback ends training
# sooner. verbose=0 hides the per-batch progress bars, so the only log lines
# are the checkpoint callback's messages.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=100,
    callbacks=[mc, es],
    verbose=0,
)


Epoch 00001: val_loss improved from inf to 0.31794, saving model to model/best_imdb_lstm.h5

Epoch 00002: val_loss improved from 0.31794 to 0.31515, saving model to model/best_imdb_lstm.h5

Epoch 00003: val_loss did not improve from 0.31515

Epoch 00004: val_loss did not improve from 0.31515

Epoch 00005: val_loss did not improve from 0.31515

Epoch 00006: val_loss did not improve from 0.31515

Epoch 00007: val_loss did not improve from 0.31515


In [14]:
# Score the checkpointed best model (lowest val_loss), not the final-epoch
# weights still held by `model`, on the test rows that were never used for
# validation. Returns [loss, accuracy].
best_model = load_model(model_file)
best_model.evaluate(x=X_test, y=y_test)



[0.31531721353530884, 0.8687333464622498]