# 네이버 영화평 감상 분석 - TfidfVectorizer

In [1]:
import numpy as np
import pandas as pd

In [2]:
train_df = pd.read_csv('naver_movie_train.tsv', sep='\t')   #index=False
test_df = pd.read_csv('naver_movie_train.tsv', sep='\t')
train_df.head(2)

Unnamed: 0.1,Unnamed: 0,id,document,label
0,0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1


# Tokenizer 함수 정의

In [3]:
from konlpy.tag import Okt
okt = Okt()

In [4]:
stopwords = ['의','가','이','은','들','는','좀','잘','을','를','으로','걍','과','더','도','저','하다','ㅋㅋ','ㅠㅠ','ㅎㅎ']

In [5]:
def okt_tokenizer(text):
    tokens = okt.morphs(text, stem=True)
    tokens = [word for word in tokens if not word in stopwords]
    return tokens

# TfidfVectorizer로 변환

In [7]:
import warnings
warnings.filterwarnings(action='ignore')

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tvect = TfidfVectorizer(
    tokenizer=okt_tokenizer, ngram_range=(1,2),
    max_df=0.9
)

In [9]:
tvect.fit(train_df.document)

TfidfVectorizer(max_df=0.9, ngram_range=(1, 2),
                tokenizer=<function okt_tokenizer at 0x0000025549251790>)

In [10]:
X_train_tv = tvect.transform(train_df.document)

In [11]:
X_test_tv = tvect.transform(test_df.document)

In [12]:
y_train = train_df.label.values
y_test = test_df.label.values

# Naive Bayes 분류기로 학습/예측/평가

In [14]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

In [15]:
nb.fit(X_train_tv, y_train)

MultinomialNB()

In [16]:
from sklearn.metrics import accuracy_score
pred = nb.predict(X_test_tv)
accuracy_score(y_test, pred)

0.9446922329698595

# 실제 테스트

In [17]:
reviews = ['아름다운 음악과 아름다운 풍광~ 그렇지 못한 현실이 찡하네요~',
            '메시지와 작위성의 불협화음!!!']

In [18]:
reviews_tv = tvect.transform(reviews)
pred = nb.predict(reviews_tv)
pred

array([1, 0], dtype=int64)