# 네이버 영화 리뷰 감성 분류

- LSTM 활용
- 출처: 위키독스
- 데이터 다운로드 링크: https://github.com/e9t/nsmc/

In [1]:
# Install KoNLPy, the Korean NLP package whose Okt tagger is used later on
!pip install Konlpy

Collecting Konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 168kB/s 
[?25hCollecting tweepy>=3.7.0
  Downloading https://files.pythonhosted.org/packages/67/c3/6bed87f3b1e5ed2f34bd58bf7978e308c86e255193916be76e5a5ce5dfca/tweepy-3.10.0-py2.py3-none-any.whl
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/de/af/93f92b38ec1ff3091cd38982ed19cea2800fefb609b5801c41fc43c0781e/JPype1-1.2.1-cp36-cp36m-manylinux2010_x86_64.whl (457kB)
[K     |████████████████████████████████| 460kB 55.3MB/s 
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e2270723

In [2]:
# Confirm that KoNLPy imports and display the installed version
import konlpy
konlpy.__version__

'0.5.2'

In [5]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Seed NumPy's and TensorFlow's random generators with the same fixed value
seed = 2021
np.random.seed(seed)
tf.random.set_seed(seed)

In [6]:
# Upload the training file through the Colab file picker and remember the
# name of the first uploaded file.
from google.colab import files
uploaded = files.upload()
filename = list(uploaded)[0]

Saving ratings_train.txt to ratings_train.txt


In [7]:
# Upload the test file the same way and remember the name of the first
# uploaded file.
from google.colab import files
uploaded = files.upload()
testfile = list(uploaded)[0]

Saving ratings_test.txt to ratings_test.txt


## 데이터 전처리

In [42]:
import pandas as pd

# Read both uploaded files with pandas' tab-separated reader, then show
# the (rows, columns) shape of each frame.
train_df, test_df = (pd.read_table(path) for path in (filename, testfile))
train_df.shape, test_df.shape

((150000, 3), (50000, 3))

In [43]:
# Preview the first row of the training frame
train_df.head(1)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0


In [44]:
# Check for duplicates: number of distinct review texts in the training set
train_df['document'].nunique()

146182

In [45]:
# Remove rows whose review text duplicates another row, then show the
# resulting shape.
train_df = train_df.drop_duplicates(subset=['document'])
train_df.shape

(146183, 3)

In [46]:
# Count missing values in each column
train_df.isnull().sum()

id          0
document    1
label       0
dtype: int64

In [47]:
# Drop every row that has a missing value in any column
train_df = train_df.dropna(how='any')
train_df.shape

(146182, 3)

In [48]:
# Number of samples for each of the positive / negative label values
train_df.label.value_counts()

0    73342
1    72840
Name: label, dtype: int64

## 테스트 데이터셋에도 동일한 전처리 적용

In [49]:
# Apply the same cleaning to the test frame, printing the shape after
# each step.

# 1) Remove rows with duplicated review text
test_df = test_df.drop_duplicates(subset=['document'])
print(test_df.shape)

# 2) Remove rows containing any missing value
test_df = test_df.dropna(how='any')
print(test_df.shape)

# 3) Number of samples for each of the positive / negative label values
test_df['label'].value_counts()

(49158, 3)
(49157, 3)


1    24711
0    24446
Name: label, dtype: int64

## 한글 텍스트 전처리

In [50]:
# Remove every character that is not a space, a Hangul jamo (ㄱ-ㅣ) or a
# Hangul syllable (가-힣).
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the text unchanged.
train_df['document'] = train_df['document'].str.replace(r"[^ ㄱ-ㅣ가-힣]", "", regex=True)
train_df.head(1)

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0


In [51]:
# Reviews that became empty strings after cleaning are turned into NaN so
# that dropna can remove them.
# The result is assigned back to the column instead of using inplace=True on
# the column selection: under pandas Copy-on-Write the chained in-place call
# modifies a temporary object and leaves train_df unchanged.
train_df['document'] = train_df['document'].replace("", np.nan)
train_df.isnull().sum()

id            0
document    391
label         0
dtype: int64

In [52]:
# Drop the rows with missing values (the reviews emptied by the cleaning step)
train_df = train_df.dropna(how='any')
train_df.shape

(145791, 3)

In [53]:
# Same text cleaning for the test set: keep only spaces, Hangul jamo and
# Hangul syllables, then mark reviews that became empty as NaN.
# regex=True is explicit because pandas >= 2.0 defaults to literal matching.
test_df['document'] = test_df['document'].str.replace(r"[^ ㄱ-ㅣ가-힣]", "", regex=True)
# Assign back rather than chaining inplace=True on the column selection,
# which does not update test_df under pandas Copy-on-Write.
test_df['document'] = test_df['document'].replace('', np.nan)
test_df.isnull().sum()

id            0
document    162
label         0
dtype: int64

In [54]:
# Drop the rows with missing values (the reviews emptied by the cleaning step)
test_df = test_df.dropna(how='any')
test_df.shape

(48995, 3)

## 한글 형태소 분석

In [65]:
# Tokenization and stopword removal: create the Okt morphological analyzer
# and the list of stopwords filtered out of every tokenized review.
from konlpy.tag import Okt
import tqdm.notebook as tn
okt = Okt()
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

In [85]:
# Tokenize each training review with Okt (stem=True requests stemmed
# morphemes) and drop the stopwords; tqdm shows a progress bar over the
# reviews.
X_train = [
    [token for token in okt.morphs(review, stem=True) if token not in stopwords]
    for review in tn.tqdm(train_df['document'])
]

HBox(children=(FloatProgress(value=0.0, max=145791.0), HTML(value='')))




In [86]:
# Tokenize each test review with Okt (stem=True requests stemmed morphemes)
# and drop the stopwords; tqdm shows a progress bar over the reviews.
X_test = [
    [token for token in okt.morphs(review, stem=True) if token not in stopwords]
    for review in tn.tqdm(test_df['document'])
]

HBox(children=(FloatProgress(value=0.0, max=48995.0), HTML(value='')))




In [87]:
# Show the token lists of the first three training reviews
X_train[:3]

[['아', '더빙', '진짜', '짜증나다', '목소리'],
 ['흠', '포스터', '보고', '초딩', '영화', '줄', '오버', '연기', '조차', '가볍다', '않다'],
 ['너', '무재', '밓었', '다그', '래서', '보다', '추천', '다']]

## 케라스 인코딩

In [88]:
# 정수 인코딩
max_words = 15000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [89]:
# Show the integer-encoded form of the first three training reviews
X_train[:3]

[[50, 454, 16, 260, 659],
 [933, 457, 41, 602, 1, 214, 1449, 24, 961, 675, 19],
 [386, 2444, 2315, 5671, 2, 222, 9]]

In [90]:
# Length distribution of the encoded training reviews: compute every
# review length once, then report the maximum and the mean.
review_lengths = [len(seq) for seq in X_train]
print('리뷰의 최대 길이: ', max(review_lengths))
print('리뷰의 평균 길이: ', sum(review_lengths)/len(review_lengths))

리뷰의 최대 길이:  69
리뷰의 평균 길이:  10.66632370996838


In [86]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.hist([len(s) for s in X_train], bins=)
plt.xlabel('')
plt.ylabel('')
plt.show()

In [91]:
# Give every sample in X_train and X_test the same length of 30:
# pad_sequences pads shorter sequences and truncates longer ones to maxlen.
max_len = 30
X_train, X_test = (
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (X_train, X_test)
)

In [93]:
# Target arrays: the label column of each cleaned frame as a NumPy array
y_train = train_df['label'].values
y_test = test_df['label'].values

## LSTM 모델 정의/설정/학습/평가

In [94]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [114]:
# LSTM classifier: 100-dimensional embeddings over a max_words vocabulary,
# one LSTM layer with 128 units, and a single sigmoid output unit.
model = Sequential([
    Embedding(max_words, 100),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         1500000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 1,617,377
Trainable params: 1,617,377
Non-trainable params: 0
_________________________________________________________________


In [118]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# accuracy reported as the metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [119]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Save the model to disk only on epochs where val_loss improves.
checkpointer = ModelCheckpoint(
    filepath='naver-lstm-best-model.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

In [120]:
# Train for 4 epochs with batches of 60, holding out 20% of the training
# data for validation; the checkpoint callback saves the best epoch.
fit_options = dict(
    epochs=4,
    batch_size=60,
    validation_split=0.2,
    verbose=1,
    callbacks=[checkpointer],
)
history = model.fit(X_train, y_train, **fit_options)

Epoch 1/4

Epoch 00001: val_loss improved from inf to 0.35298, saving model to naver-lstm-best-model.h5
Epoch 2/4

Epoch 00002: val_loss improved from 0.35298 to 0.34117, saving model to naver-lstm-best-model.h5
Epoch 3/4

Epoch 00003: val_loss did not improve from 0.34117
Epoch 4/4

Epoch 00004: val_loss did not improve from 0.34117


In [121]:
# Select the best model: reload the checkpoint file written by the
# ModelCheckpoint callback during training
from tensorflow.keras.models import load_model
best_model = load_model('naver-lstm-best-model.h5')

In [124]:
# Evaluate the reloaded model on the test set; evaluate() returns the loss
# followed by the compiled accuracy metric.
test_loss, acc = best_model.evaluate(X_test, y_test, verbose=2)
print(f'Accuracy: {acc:.4f}')

1532/1532 - 4s - loss: 0.3468 - accuracy: 0.8485
Accuracy: 0.8485


In [123]:
# Validation-split metrics per epoch (the data held out by validation_split
# in model.fit — not the separate test set)
y_vloss = history.history['val_loss']
y_vacc = history.history['val_accuracy']

# Training-set metrics per epoch
y_loss = history.history['loss']
y_acc = history.history['accuracy']

## Simple RNN

In [128]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.callbacks import ModelCheckpoint

# Release any model left over from a previous run of this cell.
# The original `del rnn_model` raised NameError on a fresh kernel, where
# the name does not exist yet; rebinding to None works in both cases.
rnn_model = None

# Same architecture as the LSTM model, with a SimpleRNN layer in place of
# the LSTM layer.
rnn_model = Sequential()
rnn_model.add(Embedding(max_words, 100))
rnn_model.add(SimpleRNN(128))
rnn_model.add(Dense(1, activation='sigmoid'))
rnn_model.summary()

rnn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Save the model to disk only on epochs where val_loss improves.
rnn_checkpointer = ModelCheckpoint(
    filepath='naver-lstm-best-rnn_model.h5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True
)

# Train with 20% of the training data held out for validation.
rnn_history = rnn_model.fit(X_train, y_train,
                    epochs=4,
                    batch_size=60,
                    validation_split=0.2,
                    verbose=1,
                    callbacks=[rnn_checkpointer])

# Select the best model: reload the saved checkpoint and evaluate it on the
# test set (evaluate() returns the loss followed by the accuracy metric).
best_rnn_model = load_model('naver-lstm-best-rnn_model.h5')
acc = best_rnn_model.evaluate(X_test, y_test, verbose=2)[1]
print(f'Accuracy: {acc:.4f}')

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 100)         1500000   
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, 128)               29312     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 129       
Total params: 1,529,441
Trainable params: 1,529,441
Non-trainable params: 0
_________________________________________________________________
Epoch 1/4

Epoch 00001: val_loss improved from inf to 0.38822, saving model to naver-lstm-best-rnn-model.h5
Epoch 2/4

Epoch 00002: val_loss improved from 0.38822 to 0.37067, saving model to naver-lstm-best-rnn-model.h5
Epoch 3/4

Epoch 00003: val_loss did not improve from 0.37067
Epoch 4/4

Epoch 00004: val_loss did not improve from 0.37067
1532/1532 - 5s - loss: 0.3752 

## LSTM + CNN

In [132]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv1D, Dropout, MaxPooling1D
from tensorflow.keras.layers import Embedding, Dense, LSTM, SimpleRNN
from tensorflow.keras.callbacks import ModelCheckpoint

# Release any model left over from a previous run of this cell.
# The original `del cnn_model` raised NameError on a fresh kernel, where
# the name does not exist yet; rebinding to None works in both cases.
cnn_model = None

# Embedding -> dropout -> 1D convolution -> max pooling -> LSTM -> sigmoid.
cnn_model = Sequential()
cnn_model.add(Embedding(max_words, 100))
cnn_model.add(Dropout(0.5))
cnn_model.add(Conv1D(64, 5, padding='valid', activation='relu'))
cnn_model.add(MaxPooling1D(pool_size=4))
cnn_model.add(LSTM(60))
cnn_model.add(Dense(1, activation='sigmoid'))
cnn_model.summary()

cnn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Save the model to disk only on epochs where val_loss improves.
cnn_checkpointer = ModelCheckpoint(
    filepath='naver-lstm-best-cnn_model.h5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True
)

# Train with 20% of the training data held out for validation.
cnn_history = cnn_model.fit(X_train, y_train,
                    epochs=4,
                    batch_size=60,
                    validation_split=0.2,
                    verbose=1,
                    callbacks=[cnn_checkpointer])

# Select the best model: reload the saved checkpoint and evaluate it on the
# test set (evaluate() returns the loss followed by the accuracy metric).
best_cnn_model = load_model('naver-lstm-best-cnn_model.h5')
acc = best_cnn_model.evaluate(X_test, y_test, verbose=2)[1]
print(f'Accuracy: {acc:.4f}')

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, None, 100)         1500000   
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 100)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 64)          32064     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, None, 64)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 60)                30000     
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 61        
Total params: 1,562,125
Trainable params: 1,562,125
Non-trainable params: 0
____________________________________________