### 네이버 영화리뷰 데이터로 감성분석 해보자(RNN)


In [1]:
# 한국어 형태소 분석기 설치
!pip install Konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.2 MB/s 
Collecting JPype1>=0.7.0
  Downloading JPype1-1.4.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (453 kB)
[K     |████████████████████████████████| 453 kB 67.9 MB/s 
Installing collected packages: JPype1, Konlpy
Successfully installed JPype1-1.4.0 Konlpy-0.6.0


In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM, GRU, Dropout, InputLayer #  Dropout: 과대적합 막아줌
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import pickle
from konlpy.tag import Okt
from tqdm import tqdm

### 데이터 로딩

In [8]:
%cd drive/MyDrive/Colab\ Notebooks

/content/drive/MyDrive/Colab Notebooks


In [9]:
%pwd

'/content/drive/MyDrive/Colab Notebooks'

In [12]:
train = pd.read_csv('./data/ratings_train.csv')
test = pd.read_csv('./data/ratings_test.csv')

In [13]:
train.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [15]:
# 결측치 제거
train.dropna(inplace=True)
test.dropna(inplace=True)

In [16]:
X_train = train['document']
y_train = train['label']
X_test = test['document']
y_test = test['label']

In [17]:
(X_train.shape, y_train.shape), (X_test.shape, y_test.shape)

(((149995,), (149995,)), ((49997,), (49997,)))

#### 형태소 추출

In [18]:
okt = Okt() # 형태소 분석기 생성

In [19]:
X_train_morphs = []

for doc in tqdm(X_train): # tqdm을 이용해서 반복횟수 시각화
  tmp = okt.morphs(doc) # 형태소 추출
  X_train_morphs.append(' '.join(tmp))


100%|██████████| 149995/149995 [08:49<00:00, 283.16it/s]


In [21]:
X_test_morphs = []

for doc in tqdm(X_test): # tqdm을 이용해서 반복횟수 시각화
  tmp = okt.morphs(doc) # 형태소 추출
  X_test_morphs.append(' '.join(tmp))

100%|██████████| 49997/49997 [02:49<00:00, 295.39it/s]


In [70]:
with open('./data/X_train_morphs.pkl','wb') as f:
  pickle.dump(X_train_morphs,f)

with open('./data/X_test_morphs.pkl','wb') as f:
  pickle.dump(X_test_morphs,f)

In [71]:
# pickle파일 로딩
with open('./data/X_train_morphs.pkl','rb') as f:
  x_train = pickle.load(f)

In [72]:
# pickle파일 로딩
with open('./data/X_test_morphs.pkl','rb') as f:
  x_test = pickle.load(f)

#### 빈도단위의 인코딩 진행

In [73]:
# 띄어쓰기 단위로 토큰화 및 단어 빈도를 기반으로 랭킹화(인코딩)
tokenizer = Tokenizer()

In [74]:
tokenizer.fit_on_texts(X_train)

In [75]:
tokenizer.word_index # 각 토큰들의 랭킹

{'영화': 1,
 '너무': 2,
 '정말': 3,
 '진짜': 4,
 '이': 5,
 '그냥': 6,
 '왜': 7,
 '이런': 8,
 '더': 9,
 '수': 10,
 '영화를': 11,
 '다': 12,
 '잘': 13,
 '좀': 14,
 '보고': 15,
 'ㅋㅋ': 16,
 '그': 17,
 '영화는': 18,
 '영화가': 19,
 '본': 20,
 '봤는데': 21,
 '최고의': 22,
 '아': 23,
 '이건': 24,
 '내가': 25,
 '없는': 26,
 '없다': 27,
 '드라마': 28,
 '완전': 29,
 '이렇게': 30,
 '참': 31,
 '이거': 32,
 '평점': 33,
 '그리고': 34,
 '이게': 35,
 '좋은': 36,
 '있는': 37,
 '연기': 38,
 '내': 39,
 '평점이': 40,
 '보는': 41,
 '다시': 42,
 '스토리': 43,
 '역시': 44,
 '최고': 45,
 '쓰레기': 46,
 '난': 47,
 '많이': 48,
 'ㅋ': 49,
 '것': 50,
 'ㅋㅋㅋ': 51,
 '한': 52,
 'ㅠㅠ': 53,
 '재밌게': 54,
 '없고': 55,
 '또': 56,
 '하는': 57,
 '아깝다': 58,
 '꼭': 59,
 '보면': 60,
 '가장': 61,
 '마지막': 62,
 '뭐': 63,
 '영화다': 64,
 '무슨': 65,
 '하지만': 66,
 'ㅡㅡ': 67,
 'ㅎㅎ': 68,
 '10점': 69,
 '별로': 70,
 '같은': 71,
 '작품': 72,
 '솔직히': 73,
 '끝까지': 74,
 '볼': 75,
 '넘': 76,
 '안': 77,
 '대한': 78,
 '만든': 79,
 '와': 80,
 '봐도': 81,
 'ㅠ': 82,
 '전혀': 83,
 '그래도': 84,
 '같다': 85,
 '말이': 86,
 '좋다': 87,
 '지금': 88,
 '아주': 89,
 '뭔가': 90,
 '있다': 91,
 '영화의': 92,

In [31]:
tokenizer.word_counts

OrderedDict([('아', 2164),
             ('더빙', 131),
             ('진짜', 6682),
             ('짜증나네요', 18),
             ('목소리', 113),
             ('흠', 190),
             ('포스터보고', 20),
             ('초딩영화줄', 1),
             ('오버연기조차', 1),
             ('가볍지', 17),
             ('않구나', 3),
             ('너무재밓었다그래서보는것을추천한다', 1),
             ('교도소', 4),
             ('이야기구먼', 1),
             ('솔직히', 993),
             ('재미는', 309),
             ('없다', 2019),
             ('평점', 1862),
             ('조정', 11),
             ('사이몬페그의', 1),
             ('익살스런', 2),
             ('연기가', 767),
             ('돋보였던', 23),
             ('영화', 18995),
             ('스파이더맨에서', 1),
             ('늙어보이기만', 1),
             ('했던', 140),
             ('커스틴', 4),
             ('던스트가', 1),
             ('너무나도', 198),
             ('이뻐보였다', 1),
             ('막', 193),
             ('걸음마', 1),
             ('뗀', 2),
             ('3세부터', 1),
             ('초등학교', 55),
             ('1학년생인', 1),
     

In [32]:
# 전체 단어토큰 확인
len(tokenizer.word_index)

296310

#### 빈도가 낮은 제거할 단어들을 알아보자

In [39]:
df = pd.DataFrame(tokenizer.word_counts.items(),
                  columns=['word', 'count'])

Unnamed: 0,word,count
0,영화,18995
1,너무,8563
2,정말,8537
3,진짜,6682
4,이,5418
...,...,...
296305,년이다,1
296306,공산당들은,1
296307,말이필요없는걸작,1
296308,음악이예술,1


In [40]:
df_sorted = df.sort_values(by='count', ascending=False).reset_index()[['word', 'count']]

In [41]:
# 등장빈도가 20번 미만인 데이터는 제거
df_sorted[df_sorted['count']<20]

Unnamed: 0,word,count
6382,느낌이었다,19
6383,경우,19
6384,줘야,19
6385,했다고,19
6386,살아갈,19
...,...,...
296305,년이다,1
296306,공산당들은,1
296307,말이필요없는걸작,1
296308,음악이예술,1


In [47]:
# 20번 미만의 단어들을 제거하기 위해 6382위 단어까지만 남겨놓자
final_tokenizer = Tokenizer(num_words=6382)

In [48]:
final_tokenizer.fit_on_texts(X_train)

In [49]:
# 실제 리뷰를 숫자로 인코딩
X_train_seq = final_tokenizer.texts_to_sequences(X_train)
X_test_seq = final_tokenizer.texts_to_sequences(X_test)
X_train_seq

[[23, 936, 4, 1097],
 [602, 6117],
 [],
 [73, 356, 27, 33],
 [107, 5354, 1, 852, 568],
 [592, 2290, 51, 4218, 409],
 [1098, 2234, 134],
 [111, 1254, 58, 2741, 3],
 [714, 96, 37, 4915, 1],
 [1187, 40, 285, 3285, 2, 928],
 [],
 [655],
 [2235, 6118, 38],
 [87, 150, 60, 518, 283],
 [3647, 4, 1974, 229, 20, 295, 61, 416, 146, 300, 5127],
 [2327],
 [31, 377, 6, 1111],
 [7, 1361, 263],
 [24, 3, 13, 432],
 [257, 190],
 [174, 4082, 37, 113, 6, 18, 190, 307],
 [162, 165, 253],
 [116, 176, 71, 4083, 76, 3744, 230, 418, 24, 70, 3286, 1283, 70, 2939, 67],
 [190, 767, 19, 148],
 [3068, 310, 3186, 232],
 [12, 233, 10, 59],
 [],
 [640],
 [],
 [1034, 3003, 668, 233, 800],
 [42, 167, 1476, 768, 2564, 1885, 38, 3, 506, 29],
 [23, 3964],
 [447, 2615, 569],
 [199, 3965, 25],
 [145, 5, 673, 45],
 [78, 5587, 5588],
 [282],
 [2565, 4551],
 [4219, 5828],
 [5, 19, 7, 30, 5829, 248],
 [6119, 1],
 [130, 55, 55, 244, 186, 29, 1, 1362, 76, 409, 29, 5830],
 [],
 [3, 428, 233, 56, 2566, 56, 41, 4552, 479],
 [221, 130

#### 리뷰 시퀀스의 길이를 알아보자

In [51]:
X_train_len = [len(doc) for doc in X_train_seq]

In [52]:
print("최대 : ", max(X_train_len))
print("최소 : ", min(X_train_len))
print("평균 : ", np.mean(X_train_len))
print("중앙 : ", np.median(X_train_len))

최대 :  53
최소 :  0
평균 :  4.481342711423714
중앙 :  3.0


In [54]:
# 평균과 중앙 사이값으로 맞춰보자
X_train_pad_seq = sequence.pad_sequences(X_train_seq, maxlen=4)
X_test_pad_seq = sequence.pad_sequences(X_test_seq, maxlen=4)

In [55]:
X_train_pad_seq.shape, X_test_pad_seq.shape

((149995, 4), (49997, 4))

In [57]:
X_train_pad_seq[10]

array([0, 0, 0, 0], dtype=int32)

#### 모델링
- input_shape 고려
- 출력층 고려
- 임베딩 레이어
- LSTM or GRU
- 출력층 고려 (2진분류)
- loss, optimizer 고려
- 검증셋 분리해서 학습
- 모델체크포인트, 얼리스타핑 설정
- 학습결과 시각화
- test데이터로 모델 평가

In [83]:
naver_movie_model = Sequential()
naver_movie_model.add(Embedding(6382,50)) # 사용하는 단어사전의 수, 각 단어를 표현할 수(자기맘)
naver_movie_model.add(Dropout(0.4)) # 과대적합을 막기위해 일부로 중간중간 신경망을 끊어버림(epoch마다 꺼지는 신경망 다름), 0.4=40%끔
naver_movie_model.add(LSTM(128,return_sequences=True)) # Dropout은 바로 밑에층에만 적용됨 
naver_movie_model.add(LSTM(128))
naver_movie_model.add(Dense(1,activation='sigmoid'))
naver_movie_model.compile(loss='binary_crossentropy',
            optimizer=Adam(),
            metrics=['accuracy']
            )

mkpt = ModelCheckpoint(filepath='./data/naver_movie_model/naver_movie_model_{epoch:04d}_{val_loss:.3f}.hdf5', 
                       monitor='val_loss', 
                       verbose=1,  # 모델 저장 로그
                       save_best_only=True) 

early = EarlyStopping(monitor='val_loss',
                      verbose=1,
                      patience=10) # 성능개선 안될시 기다려주는 횟수(그 이내에 개선되면 횟수 리셋, 개선안되면 멈춤)

h1 = naver_movie_model.fit(X_train_pad_seq,y_train,epochs=100, validation_split=0.2,callbacks=[mkpt, early])

Epoch 1/100
Epoch 1: val_loss improved from inf to 0.44656, saving model to ./data/naver_movie_model/naver_movie_model_0001_0.447.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 0.44656 to 0.44027, saving model to ./data/naver_movie_model/naver_movie_model_0002_0.440.hdf5
Epoch 3/100
Epoch 3: val_loss did not improve from 0.44027
Epoch 4/100
Epoch 4: val_loss did not improve from 0.44027
Epoch 5/100
Epoch 5: val_loss did not improve from 0.44027
Epoch 6/100
Epoch 6: val_loss did not improve from 0.44027
Epoch 7/100
Epoch 7: val_loss did not improve from 0.44027
Epoch 8/100
Epoch 8: val_loss did not improve from 0.44027
Epoch 9/100
Epoch 9: val_loss did not improve from 0.44027
Epoch 10/100
Epoch 10: val_loss did not improve from 0.44027
Epoch 11/100
Epoch 11: val_loss did not improve from 0.44027
Epoch 12/100
Epoch 12: val_loss did not improve from 0.44027
Epoch 12: early stopping


In [81]:
plt.figure(figsize=(15,5))
plt.plot(h1.history['accuracy'], label='train_acc')
plt.plot(h1.history['val_accuracy'], label='val_acc')
plt.legend()
plt.show()

NameError: ignored

<Figure size 1080x360 with 0 Axes>