# 네이버 영화평 감성 분석 - TfidfVectorizer

In [1]:
import numpy as np
import pandas as pd

In [2]:
train_df = pd.read_csv('../00.data/NaverMovie/train.tsv', sep='\t')
test_df = pd.read_csv('../00.data/NaverMovie/test.tsv', sep='\t')

In [22]:
train_df.head(3)

Unnamed: 0.1,Unnamed: 0,id,document,label
0,0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,1,3819312,흠포스터보고 초딩영화줄오버연기조차 가볍지 않구나,1
2,2,10265843,너무재밓었다그래서보는것을추천한다,0


### Tokenizer 함수 정의

In [3]:
from konlpy.tag import Okt

okt=Okt()
def tw_tokenizer(text):
    tokens_ko = okt.morphs(text)
    return tokens_ko

### TfidfVectorizer로 학습/변환

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

tvecter = TfidfVectorizer(tokenizer=tw_tokenizer, ngram_range=(1,2), min_df=3, max_df=0.9)

In [7]:
%time tvecter.fit(train_df.document)

Wall time: 8min 40s


TfidfVectorizer(max_df=0.9, min_df=3, ngram_range=(1, 2),
                tokenizer=<function tw_tokenizer at 0x00000229E76025E0>)

In [8]:
X_train_tvect = tvecter.transform(train_df['document'])

In [9]:
%time X_test_tvect = tvecter.transform(test_df['document'])

Wall time: 2min 50s


In [11]:
y_train = train_df.label.values
y_test = test_df.label.values

### LogisticRegression으로 학습/예측/평가

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [13]:
lr_clf = LogisticRegression(C=3.5)
lr_clf.fit(X_train_tvect, y_train)
pred = lr_clf.predict(X_test_tvect)
accuracy_score(y_test, pred)

0.8584753546280233

### 실제 테스트

In [14]:
review1 = '진짜 개노잼이다.. 1편이랑 같은 감독맞나?러닝타임도 길어서 개지루함 ㄹㅇ'
review2 = '이런 사랑영화가 다시 나올 수 있을까?'

In [15]:
import re

review1 = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', review1)
review2 = re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', review2)
review1, review2

('진짜 개노잼이다 편이랑 같은 감독맞나러닝타임도 길어서 개지루함 ㄹㅇ', '이런 사랑영화가 다시 나올 수 있을까')

In [35]:
review_tvect = tvecter.transform([review1])
pred = lr_clf.predict(review_tvect)
pred[0]

0

In [36]:
review_tvect = tvecter.transform([review2])
pred = lr_clf.predict(review_tvect)
pred[0]

1

In [37]:
reviews = ['진짜 개노잼이다.. 1편이랑 같은 감독맞나?러닝타임도 길어서 개지루함 ㄹㅇ', '이런 사랑영화가 다시 나올 수 있을까?']

In [38]:
reviews = list(map(lambda x: re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", x), reviews))

In [39]:
review_tvect = tvecter.transform(reviews)
pred = lr_clf.predict(review_tvect)
pred[0], pred[1]

(0, 1)

### GridSearchCV로 최적 파라미터 찾기

In [48]:
from sklearn.model_selection import GridSearchCV

In [51]:
params = {
    'C': [4,4.1,4.2,4.3]
}

grid_cv = GridSearchCV(lr_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_cv.fit(X_train_tvect, y_train)
print(grid_cv.best_params_, grid_cv.best_score_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:  1.1min finished
{'C': 4.2} 0.8553888786001881


In [53]:
pred = lr_clf.predict(X_test_tvect)
acc = accuracy_score(y_test, pred)
print(f'Count Vectorizer + Logistic Regression 정확도 : {acc:.4f}')

Count Vectorizer + Logistic Regression 정확도 : 0.8585
