## Naver Movie Review Data에 대한 이해

In [None]:
import pandas as pd
# Load the train and test rating files from the current working directory.
# read_table parses them as tab-separated text by default.
train_data= pd.read_table('./ratings_train.txt')
test_data= pd.read_table('./ratings_test.txt')

In [None]:
# Number of reviews (rows) in the training set.
print(train_data.shape[0])

In [None]:
# Show the first five training rows.
train_data.head()

In [None]:
# Number of reviews (rows) in the test set.
print(test_data.shape[0])

In [None]:
# Show the first five test rows.
test_data.head()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
train_data['label'].value_counts().plot(kind='bar')

In [None]:
# Tabulate the number of training rows per label as a two-column frame.
rows_per_label = train_data.groupby('label').size()
print(rows_per_label.reset_index(name='count'))

In [None]:
# True if any cell anywhere in the training frame is missing.
has_missing = train_data.isnull().values.any()
print(has_missing)

In [None]:
# Count the missing values in each column of the training frame.
missing_per_column = train_data.isnull().sum()
print(missing_per_column)

In [None]:
# Display the training rows whose review text is missing.
document_is_missing = train_data['document'].isnull()
train_data.loc[document_is_missing]

In [None]:
# Remove every training row that contains a missing value, then confirm
# that no missing values remain.
train_data = train_data.dropna(how='any')
still_has_missing = train_data.isnull().values.any()
print(still_has_missing)

In [None]:
# Number of training rows currently in the frame.
print(train_data.shape[0])

In [None]:
# Strip everything except Hangul (jamo ㄱ-ㅎ / ㅏ-ㅣ and syllables 가-힣) and spaces.
# regex=True must be passed explicitly: since pandas 2.0 Series.str.replace
# treats the pattern as a literal string by default, which would silently leave
# every review unchanged.
train_data['document'] = train_data['document'].str.replace(
    r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True
)
train_data[:5]

In [None]:
# Tokens to discard after morphological tokenization (stopword list).
stopwords = [
    '의', '가', '이', '은', '들', '는', '좀', '잘', '걍',
    '과', '도', '를', '으로', '자', '에', '와', '한', '하다',
]

In [None]:
import konlpy
from konlpy.tag import Okt
# Single shared Okt tagger instance; the tokenization cells call okt.morphs on it.
okt = Okt()

In [None]:
# Tokenize every training review into stemmed morphemes and drop stopwords.
# The stopword list is turned into a set once so that each membership test is
# O(1) instead of a linear scan per token.
stopword_set = set(stopwords)
X_train = []
for sentence in train_data['document']:
    tokens = okt.morphs(sentence, stem=True)  # morpheme tokenization with stemming
    X_train.append([word for word in tokens if word not in stopword_set])

In [None]:
# Show the token lists of the first three training reviews.
print(X_train[:3])

In [None]:
# Apply the same preprocessing to the test set as to the training set.
test_data = test_data.dropna(how='any')  # drop rows with missing values
# Keep only Hangul and spaces. regex=True must be passed explicitly: since
# pandas 2.0 Series.str.replace treats the pattern as a literal string by
# default, which would silently leave every review unchanged.
test_data['document'] = test_data['document'].str.replace(
    r"[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True
)

# Tokenize into stemmed morphemes and drop stopwords; the set gives O(1) lookups.
stopword_set = set(stopwords)
X_test = []
for sentence in test_data['document']:
    tokens = okt.morphs(sentence, stem=True)  # morpheme tokenization with stemming
    X_test.append([word for word in tokens if word not in stopword_set])

In [None]:
from keras.preprocessing.text import Tokenizer
# Vocabulary size cap; the same value is used as the Embedding input size later.
max_words = 35000
# NOTE(review): the Keras docs say num_words keeps the (num_words - 1) most
# frequent words, i.e. slightly fewer than 35,000 — confirm for the installed version.
tokenizer = Tokenizer(num_words=max_words) # keep only the most frequent words
# The vocabulary is fitted on the training tokens only.
tokenizer.fit_on_texts(X_train)
# Both splits are then encoded with that same vocabulary; X_train and X_test
# are rebound from token lists to the resulting sequences.
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [None]:
# Show the first three training reviews after integer encoding.
print(X_train[:3])

In [None]:
# Length (in tokens) of every encoded training review, computed once and
# reused for the summary statistics and the histogram.
review_lengths = [len(sequence) for sequence in X_train]

print('리뷰의 최대 길이 :', max(review_lengths))
print('리뷰의 평균 길이 :', sum(review_lengths)/len(X_train))

# Histogram of review lengths.
plt.hist(review_lengths, bins=50)
plt.xlabel('length of Data')
plt.ylabel('number of Data')
plt.show()

## LSTM으로 네이버 영화 리뷰 감성 분류하기

In [None]:
# Target labels for each split, taken from the 'label' column of the frames.
y_train, y_test = train_data['label'], test_data['label']

In [None]:
from keras.layers import Embedding, Dense, LSTM
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences

In [None]:
max_len=30
# Bring every sequence in both splits to a fixed length of 30.
# NOTE(review): no padding/truncating arguments are passed, so the library
# defaults apply (documented by Keras as 'pre' for both) — confirm for the
# installed version.
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

In [None]:
# Binary sentiment classifier: embedding -> LSTM -> single sigmoid unit.
model = Sequential([
    Embedding(max_words, 100),
    LSTM(128),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)

# Train for 4 epochs, holding out 20% of the training data for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=4,
    batch_size=60,
    validation_split=0.2,
)

In [None]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is
# the value reported as test accuracy.
test_scores = model.evaluate(X_test, y_test)
print("\n 테스트 정확도: %.4f" % (test_scores[1]))