In [1]:
import numpy as np
import pandas as pd

In [2]:
train_df = pd.read_csv('../static/data/NaverMovie/ratings_train.txt', sep='\t')
test_df = pd.read_csv('../static/data/NaverMovie/ratings_test.txt', sep='\t')
train_df.head(3)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


In [69]:
train_df.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 솔직히 재미는 없다평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...,1


In [3]:
train_df.shape, test_df.shape

((150000, 3), (50000, 3))

### Train데이터셋 전처리

In [4]:
# 중복 여부 확인
train_df['document'].nunique()

146182

In [5]:
# 중복 샘플 제거
train_df.drop_duplicates(subset=['document'], inplace=True)
train_df.shape

(146183, 3)

In [6]:
# 긍정/부정 분포
train_df.label.value_counts()

0    73342
1    72841
Name: label, dtype: int64

In [7]:
# Null값 확인
train_df.isnull().sum()

id          0
document    1
label       0
dtype: int64

In [8]:
train_df.loc[train_df.document.isnull()]

Unnamed: 0,id,document,label
25857,2172111,,1


In [9]:
# Null값 제거
train_df = train_df.dropna(how='any')
train_df.shape

(146182, 3)

#### 텍스트 전처리

In [13]:
# 한글과 공백을 제외하고 모두 제거
train_df['document'] = train_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
train_df.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 솔직히 재미는 없다평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화스파이더맨에서 늙어보이기만 했던 커스틴 던...,1


In [14]:
# 빈 데이터가 있으면 nan으로 변경 후 확인 (이후에 dropna 진행)
train_df['document'].replace('', np.nan, inplace=True)
train_df.isnull().sum()

id            0
document    391
label         0
dtype: int64

In [15]:
# nan제거 / 최종 학습에 들어갈 데이터
train_df = train_df.dropna(how='any')
train_df.shape

(145791, 3)

### Test데이터셋 전처리

In [10]:
# 중복 여부 확인
test_df['document'].nunique()

49157

In [11]:
# 중복 제거
test_df.drop_duplicates(subset=['document'], inplace=True)
test_df.shape

(49158, 3)

In [12]:
# Null값 제거
test_df = test_df.dropna(how='any')
test_df.shape

(49157, 3)

#### 텍스트 전처리

In [16]:
test_df['document'] = test_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
test_df['document'].replace('', np.nan, inplace=True)
test_df.isnull().sum()

id            0
document    162
label         0
dtype: int64

In [17]:
# nan제거 / 최종 학습에 들어갈 데이터
test_df = test_df.dropna(how='any')
test_df.shape

(48995, 3)

In [18]:
train_df.to_csv('../static/data/NaverMovie/train.tsv', sep='\t')
test_df.to_csv('../static/data/NaverMovie/test.tsv', sep='\t')

### 토큰화

In [19]:
from konlpy.tag import Okt

okt = Okt()

In [9]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다', '을']

In [21]:
okt.morphs('교도소 이야기구먼 솔직히 재미는 없다평점 조정', stem=True)

['교도소', '이야기', '구먼', '솔직하다', '재미', '는', '없다', '평점', '조정']

In [22]:
    from tqdm import tqdm_notebook
    X_train = []
    for sentence in tqdm_notebook(train_df['document']):
        morphs = okt.morphs(sentence, stem=True) # 토큰화
        temp_X = ' '.join([word for word in morphs if not word in stopwords]) # 불용어 제거
        X_train.append(temp_X)

HBox(children=(FloatProgress(value=0.0, max=145791.0), HTML(value='')))




In [23]:
X_train[:3]

['아 더빙 진짜 짜증나다 목소리',
 '흠 포스터 보고 초딩 영화 줄 오버 연기 조차 가볍다 않다',
 '너 무재 밓었 다그 래서 보다 추천 다']

In [24]:
from tqdm import tqdm_notebook
X_test = []
for sentence in tqdm_notebook(test_df['document']):
    morphs = okt.morphs(sentence, stem=True) # 토큰화
    temp_X = ' '.join([word for word in morphs if not word in stopwords]) # 불용어 제거
    X_test.append(temp_X)

HBox(children=(FloatProgress(value=0.0, max=48995.0), HTML(value='')))




In [25]:
X_test[:3]

['굳다 ㅋ', '뭐 야 평점 나쁘다 않다 점 짜다 리 더 더욱 아니다', '지루하다 않다 완전 막장 임 돈 주다 보기 에는']

In [26]:
y_train = train_df.label.values
y_test = test_df.label.values

In [27]:
X_train[0]

'아 더빙 진짜 짜증나다 목소리'

In [66]:
print(len(X_train))
print(len(X_test))

145791
48995


### CountVectorizer, LogisticRegression을 이용해서 감성분석하기

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [29]:
cvecter = CountVectorizer()
cvecter.fit(X_train)
X_train_cvect = cvecter.transform(X_train)
X_test_cvect = cvecter.transform(X_test)

In [30]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train_cvect, y_train)
pred = lr_clf.predict(X_test_cvect)
accuracy_score(y_test, pred)

0.8261659352995203

### 실제 테스트

In [31]:
review1 = '진짜 개노잼이다.. 1편이랑 같은 감독맞나?러닝타임도 길어서 개지루함 ㄹㅇ'
review2 = '이런 사랑영화가 다시 나올 수 있을까?'

In [32]:
import re

review1 = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', review1)
review2 = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', review2)
review1, review2

('진짜 개노잼이다 편이랑 같은 감독맞나러닝타임도 길어서 개지루함 ㄹㅇ', '이런 사랑영화가 다시 나올 수 있을까')

In [33]:
morphs = okt.morphs(review1)
review = ' '.join([word for word in morphs if not word in stopwords])
review

'진짜 개 노잼 이다 편이 랑 같은 감독 맞나 러닝 타임 길어서 개 지루함 ㄹㅇ'

In [34]:
# 그냥 review를 넣으면 shape(차원)이 맞지 않기때문에, 리스트로 변환하여 넣어야한다.
review_cvect = cvecter.transform([review])

In [35]:
pred = lr_clf.predict(review_cvect)

In [36]:
pred[0]

0

In [37]:
morphs = okt.morphs(review2)
review = ' '.join([word for word in morphs if not word in stopwords])
review_cvect = cvecter.transform([review])
pred = lr_clf.predict(review_cvect)
pred[0]

1

### GridSearchCV로 최적 파라미터 찾기

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [39]:
pipeline = Pipeline([
    ('count_vect', CountVectorizer(stop_words='english')),
    ('lr_clf', LogisticRegression())
])

In [42]:
params = {
    'count_vect__ngram_range': [(1,2), (1,3)],
    'count_vect__max_df': [9000, 9100, 9200, 9300],
    'lr_clf__C': [1, 1.2, 1.5]
}

grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_pipe.fit(X_train, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed: 31.5min finished
{'count_vect__max_df': 9300, 'count_vect__ngram_range': (1, 2), 'lr_clf__C': 1.2} 0.8411767530231633


In [44]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train_cvect, y_train)
pred = nb.predict(X_test_cvect)
accuracy_score(y_test, pred)

0.8270231656291458

In [45]:
pipeline_cn = Pipeline([
    ('count_vect', CountVectorizer(stop_words='english')),
    ('nb', MultinomialNB())
])

In [54]:
params_cn = {
    'count_vect__ngram_range': [(1,4), (1,5)],
    'count_vect__max_df': [8000, 8500, 9000]
}

grid_pipe_cn = GridSearchCV(pipeline_cn, param_grid=params_cn, cv=3, scoring='accuracy', verbose=1)
grid_pipe_cn.fit(X_train, y_train)
print(grid_pipe_cn.best_params_, grid_pipe_cn.best_score_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  4.6min finished
{'count_vect__max_df': 8000, 'count_vect__ngram_range': (1, 4)} 0.8434814220356538


In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
pipeline_tl = Pipeline([
    ('tfidf_vect', CountVectorizer(stop_words='english')),
    ('lr_clf', LogisticRegression())
])

In [59]:
params_tl = {
    'tfidf_vect__ngram_range': [(1,2), (1,3)],
    'tfidf_vect__max_df': [9300, 9400, 9500, 10000],
    'lr_clf__C': [1, 1.2, 1.5]
}

grid_pipe_tl = GridSearchCV(pipeline_tl, param_grid=params_tl, cv=3, scoring='accuracy', verbose=1)
grid_pipe_tl.fit(X_train, y_train)
print(grid_pipe_tl.best_params_, grid_pipe_tl.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed: 31.2min finished
{'lr_clf__C': 1.2, 'tfidf_vect__max_df': 9300, 'tfidf_vect__ngram_range': (1, 2)} 0.8411767530231633


In [60]:
pipeline_tn = Pipeline([
    ('tfidf_vect', CountVectorizer(stop_words='english')),
    ('nb', MultinomialNB())
])

In [67]:
params_tn = {
    'tfidf_vect__ngram_range': [(1,1), (1,2), (1,3)],
    'tfidf_vect__max_df': [100, 200, 300]
}

grid_pipe_tn = GridSearchCV(pipeline_tn, param_grid=params_tn, cv=3, scoring='accuracy', verbose=1)
grid_pipe_tn.fit(X_train, y_train)
print(grid_pipe_tn.best_params_, grid_pipe_tn.best_score_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  2.9min finished
{'tfidf_vect__max_df': 300, 'tfidf_vect__ngram_range': (1, 3)} 0.8190423277157026


In [68]:
import joblib
joblib.dump(grid_pipe, '../static/model/naver_countlr.pkl')
joblib.dump(grid_pipe_cn, '../static/model/naver_countnb.pkl')
joblib.dump(grid_pipe_tn, '../static/model/naver_tfidfnb.pkl')
joblib.dump(grid_pipe_tl, '../static/model/naver_tfidflr.pkl')

['../static/model/naver_tfidflr.pkl']

In [3]:
import joblib
naver_countlr = joblib.load('../static/model/naver_countlr.pkl')

In [4]:
review = '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나'

In [5]:
import re

review = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', review)

In [10]:
from konlpy.tag import Okt

okt = Okt()

morphs = okt.morphs(review)ㄴ
review = ' '.join([word for word in morphs if not word in stopwords])
review

'흠 포스터 보고 초딩 영화 줄 오버 연기 조차 가볍지 않구나'

In [12]:
pred = naver_countlr.predict(review)

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'