In [1]:
import pandas as pd
import numpy as np

In [2]:
import konlpy

In [3]:
# Load the train/test review files; they are tab-separated, which is the
# delimiter read_table uses by default and is spelled out explicitly here.
train_df = pd.read_csv('../data18/data_list/ratings_train.txt', sep='\t')
test_df = pd.read_csv('../data18/data_list/ratings_test.txt', sep='\t')

In [4]:
# Display the loaded training frame (columns: id, document, label); pandas truncates the middle rows.
train_df

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
...,...,...,...
149995,6222902,인간이 문제지.. 소는 뭔죄인가..,0
149996,8549745,평점이 너무 낮아서...,1
149997,9311800,이게 뭐요? 한국인은 거들먹거리고 필리핀 혼혈은 착하다?,0
149998,2376369,청춘 영화의 최고봉.방황과 우울했던 날들의 자화상,1


In [5]:
# Count the rows per label value to check how balanced the two classes are.
train_df['label'].value_counts()

0    75173
1    74827
Name: label, dtype: int64

In [6]:
# Drop rows whose review text is missing. The explicit .copy() makes each result
# an independent DataFrame, so the later in-place assignment to the 'document'
# column does not trigger pandas' SettingWithCopyWarning.
train_df = train_df.dropna(subset=['document']).copy()
test_df = test_df.dropna(subset=['document']).copy()

In [7]:
import re
import warnings
warnings.filterwarnings(action='ignore')

In [8]:
# Remove every run of characters that is not a space, a Hangul compatibility
# jamo (ㄱ-ㅣ) or a precomposed Hangul syllable (가-힣). The pattern is compiled
# once and reused for both splits.
non_korean = re.compile(r'[^ ㄱ-ㅣ가-힣]+')
train_df['document'] = train_df['document'].map(lambda doc: non_korean.sub("", doc))
test_df['document'] = test_df['document'].map(lambda doc: non_korean.sub("", doc))

In [9]:
# %pip install scikit-learn   (the PyPI distribution is "scikit-learn"; the "sklearn" name is a deprecated alias)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from konlpy.tag import Okt

In [11]:
# Create the KoNLPy Okt analyzer; its `morphs` method is passed to the TF-IDF vectorizer as the tokenizer.
okt = Okt()



In [12]:
# TF-IDF vectorizer over Okt morphemes, using unigrams and bigrams.
# min_df=3 drops terms seen in fewer than 3 documents; max_df=0.9 drops terms
# seen in more than 90% of documents.
tfidf = TfidfVectorizer(tokenizer= okt.morphs,
                        ngram_range=(1,2),
                        min_df=3,
                        max_df=0.9)

In [13]:
# Learn the vocabulary and IDF weights from the training reviews only.
tfidf.fit(train_df['document'])

TfidfVectorizer(max_df=0.9, min_df=3, ngram_range=(1, 2),
                tokenizer=<bound method Okt.morphs of <konlpy.tag._okt.Okt object at 0x1663cf0a0>>)

In [14]:
# Vectorize the training reviews with the fitted vectorizer; the result is a sparse matrix, not a DataFrame.
train_tfidf_df = tfidf.transform(train_df['document'])
train_tfidf_df

<149995x115612 sparse matrix of type '<class 'numpy.float64'>'
	with 2703859 stored elements in Compressed Sparse Row format>

In [15]:
# Vectorize the test reviews with the vocabulary learned from the training set.
test_tfidf_df = tfidf.transform(test_df['document'])
test_tfidf_df

<49997x115612 sparse matrix of type '<class 'numpy.float64'>'
	with 871912 stored elements in Compressed Sparse Row format>

In [16]:
# Preview only the first few rows as a dense array. Calling .toarray() on the
# whole 49997 x 115612 sparse matrix would allocate roughly 46 GB of float64
# values just to display a truncated repr.
test_tfidf_df[:5].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression sentiment classifier; random_state is fixed so runs are repeatable.
SA_lr = LogisticRegression(random_state=0)

In [18]:
# Train the classifier on the training TF-IDF matrix against the 0/1 labels.
SA_lr.fit(train_tfidf_df, train_df['label'])

LogisticRegression(random_state=0)

In [19]:
# Predict a label for every vectorized test review.
test_predict = SA_lr.predict(test_tfidf_df)

In [20]:
from sklearn.metrics import accuracy_score

# Compare the predictions with the true test labels and report accuracy to three decimals.
test_accuracy = accuracy_score(test_df['label'], test_predict)
print('감성 분석 정확도 : ', round(test_accuracy, 3))


감성 분석 정확도 :  0.852


In [21]:
# Sample sentence containing emoticons, digits and punctuation that the cleaning step below strips out.
st = "웃자 ^o^ 오늘은 좋은 날이 될 것 같은 예감100%! ^^*"

In [22]:
# Clean the sample with the same pattern that was applied to the training data,
# so inference-time text is normalized exactly like the text the vectorizer was
# fitted on. Do not write the class as 'ㄱ-ㅎ|가-힣': inside [...] the '|' is a
# literal pipe (it would be kept), and ㄱ-ㅎ leaves out the vowel jamo ㅏ-ㅣ.
st2 = re.sub(r'[^ ㄱ-ㅣ가-힣]+', "", st)
st2

'웃자  오늘은 좋은 날이 될 것 같은 예감 '

In [23]:
# Vectorize the cleaned sentence; transform expects an iterable of documents, hence the one-element list.
st_tfidf_df = tfidf.transform([st2])
st_tfidf_df

<1x115612 sparse matrix of type '<class 'numpy.float64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [24]:
# Predict the label for the single sample; show both the result array and its only element.
st2_predict = SA_lr.predict(st_tfidf_df)
st2_predict, st2_predict[0]

(array([1]), 1)

In [25]:
# Label 0 is reported as negative sentiment, any other label as positive.
st2_sentiment = '==> 부정 감정' if st2_predict[0] == 0 else '==> 긍정 감정'
print(st2, st2_sentiment)

웃자  오늘은 좋은 날이 될 것 같은 예감  ==> 긍정 감정


In [37]:
# Second sample sentence, used to exercise the classifier again.
st3 = "짱나! 진짜야"

In [38]:
# Clean the sample with the same pattern that was applied to the training data,
# so inference-time text is normalized exactly like the text the vectorizer was
# fitted on. Do not write the class as 'ㄱ-ㅎ|가-힣': inside [...] the '|' is a
# literal pipe (it would be kept), and ㄱ-ㅎ leaves out the vowel jamo ㅏ-ㅣ.
st4 = re.sub(r'[^ ㄱ-ㅣ가-힣]+', "", st3)
st4

'짱나 진짜야'

In [39]:
# 1) Vectorize the input text into TF-IDF features
st4_tfidf = tfidf.transform([st4])

# 2) Apply the trained sentiment model to the vectorized text to get its predicted label
st4_predict2 = SA_lr.predict(st4_tfidf)

In [40]:
# Show the predicted label for the single input sentence.
st4_predict2[0]

0

In [41]:
# Label 0 is reported as negative sentiment, any other label as positive.
# The original (uncleaned) sentence st3 is what gets printed next to the verdict.
st4_sentiment = '==> 부정 감정' if st4_predict2[0] == 0 else '==> 긍정 감정'
print(st3, st4_sentiment)

짱나! 진짜야 ==> 부정 감정


In [42]:
import pickle 
import joblib

In [43]:
# `tfidf` was already fitted on the training reviews earlier in the notebook, and
# fit() returns the vectorizer itself. Re-fitting on the same data only repeats
# the full tokenization of the training set, so alias the fitted vectorizer.
tfidf_fit = tfidf
tfidf_fit

TfidfVectorizer(max_df=0.9, min_df=3, ngram_range=(1, 2),
                tokenizer=<bound method Okt.morphs of <konlpy.tag._okt.Okt object at 0x1663cf0a0>>)

In [44]:
# Serialize the trained classifier to an in-memory bytes object.
saved_model = pickle.dumps(SA_lr)

In [45]:
# Restore the classifier from the bytes produced above and re-score the first sample sentence.
# pickle.loads is only safe on trusted data; these bytes were created in this same session.
lr_from_pickle = pickle.loads(saved_model)
lr_from_pickle.predict(st_tfidf_df[0])

array([1])

In [46]:
# Persist the trained classifier to disk with joblib.
# NOTE(review): only the classifier is saved; no visible cell persists the fitted
# TF-IDF vectorizer, which is needed to turn new text into features for this model.
joblib.dump(SA_lr, 'movie_SA_lr.pkl')

['movie_SA_lr.pkl']

In [47]:
# Load the classifier back from the file written above (only load model files from trusted sources).
lr_from_joblib = joblib.load('movie_SA_lr.pkl')
lr_from_joblib

LogisticRegression(random_state=0)

In [48]:
# Score the first sample sentence with the reloaded classifier to check the round trip.
lr_from_joblib.predict(st_tfidf_df[0])


array([1])