In [5]:
import konlpy
import pandas as pd
import numpy as np



In [7]:
df_train = pd.read_csv('./data/ratings_train.txt',
                       delimiter='\t', keep_default_na=False)

In [8]:
df_train.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [9]:
X_train = df_train['document'].values
y_train = df_train['label'].values

In [12]:
df_test = pd.read_csv('./data/ratings_test.txt', 
                      delimiter='\t', keep_default_na=False)

X_test = df_test['document'].values
y_test = df_test['label'].values

In [13]:
print(len(X_train), np.bincount(y_train))

150000 [75173 74827]


In [14]:
print(len(X_test), np.bincount(y_test))

50000 [24827 25173]


In [None]:
# tokenizer for Korean text
from konlpy.tag import Okt

okt = Okt()
print(X_train[4])
print(okt.morphs(X_train[4]))




사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다
['사이', '몬페', '그', '의', '익살스런', '연기', '가', '돋보였던', '영화', '!', '스파이더맨', '에서', '늙어', '보이기만', '했던', '커스틴', '던스트', '가', '너무나도', '이뻐', '보였다']


#### took 10min on my laptop

In [None]:
import os
from scipy.sparse import save_npz, load_npz
from sklearn.feature_extraction.text import TfidfVectorizer

if not os.path.isfile('okt_train.npz'):
    tfidf = TfidfVectorizer(ngram_range=(1, 2), 
                            min_df=3,
                            max_df=0.9,
                            tokenizer=okt.morphs, 
                            token_pattern=None)
    tfidf.fit(X_train)
    X_train_okt = tfidf.transform(X_train)
    X_test_okt = tfidf.transform(X_test)
    save_npz('okt_train.npz', X_train_okt)
    save_npz('okt_test.npz', X_test_okt)
else:
    X_train_okt = load_npz('okt_train.npz')
    X_test_okt = load_npz('okt_test.npz')



In [105]:
X_train[:3]

array(['아 더빙.. 진짜 짜증나네요 목소리', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나',
       '너무재밓었다그래서보는것을추천한다'], dtype=object)

In [108]:
X_train_okt[:3].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
# from sklearn.utils.fixes import loguniform
from scipy.stats import loguniform

sgd = SGDClassifier(loss='log_loss', random_state=1)

full_pipeline = Pipeline([
    ('vect', tfidf),
    ('clf', SGDClassifier(loss='log_loss', random_state=1))
])
# param_dist = {'alpha': loguniform(0.0001, 100.0)}

param_dist_pipeline = {
    'vect__tokenizer': [okt.morphs],
    'clf__penalty': ['l1', 'l2', 'elasticnet'],
    'clf__alpha': loguniform(0.0001, 100.0),
    'clf__max_iter': [1000]
}

rsv_okt = RandomizedSearchCV(estimator=full_pipeline,
                             param_distributions=param_dist_pipeline,
                             n_iter=50,
                             random_state=1,
                             verbose=1,
                             n_jobs=1)
rsv_okt.fit(X_train, y_train)



Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [88]:
print(rsv_okt.best_score_)
print(rsv_okt.best_params_)

0.8251533333333334
{'alpha': 0.00010015813955858975}


In [109]:
rsv_okt.best_estimator_

0,1,2
,loss,'log_loss'
,penalty,'l2'
,alpha,0.00010015813955858975
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [None]:
import joblib
joblib.dump(rsv_okt.best_estimator_, 'ted_naver_movie_sgd_model.joblib')

['ted_naver_movie_sgd_model.joblib']

In [91]:
rsv_okt.score(X_test_okt, y_test)

0.8189

In [95]:
X_test[:10]

array(['굳 ㅋ', 'GDNTOPCLASSINTHECLUB',
       '뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아',
       '지루하지는 않은데 완전 막장임... 돈주고 보기에는....',
       '3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??',
       '음악이 주가 된, 최고의 음악영화', '진정한 쓰레기',
       '마치 미국애니에서 튀어나온듯한 창의력없는 로봇디자인부터가,고개를 젖게한다',
       '갈수록 개판되가는 중국영화 유치하고 내용없음 폼잡다 끝남 말도안되는 무기에 유치한cg남무 아 그립다 동사서독같은 영화가 이건 3류아류작이다',
       '이별의 아픔뒤에 찾아오는 새로운 인연의 기쁨 But, 모든 사람이 그렇지는 않네..'], dtype=object)

In [93]:
rsv_okt.predict(X_test_okt[:10])

array([1, 0, 0, 0, 0, 1, 0, 0, 0, 1])

In [None]:
from hmac import new
from json import load

import tensorflow as tf


loaded_model = joblib.load('ted_naver_movie_sgd_model.joblib')
print(type(loaded_model))
new_data = ["영화가 너무 재미 없어요.", "나도 그렇게 재미있게 보지 못함"]
predictions = loaded_model.predict(new_data)
print(predictions)


<class 'sklearn.linear_model._stochastic_gradient.SGDClassifier'>


ValueError: X has 105674 features, but SGDClassifier is expecting 130333 features as input.

In [29]:
!pip install --upgrade soynlp

Collecting soynlp
  Using cached soynlp-0.0.493-py3-none-any.whl.metadata (24 kB)
Using cached soynlp-0.0.493-py3-none-any.whl (416 kB)
Installing collected packages: soynlp
Successfully installed soynlp-0.0.493


In [30]:
from soynlp.tokenizer import LTokenizer

In [32]:
lto = LTokenizer()
print(X_train[4])
print(lto.tokenize(X_train[4]))

사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다
['사이몬페그의', '익살스런', '연기가', '돋보였던', '영화!스파이더맨에서', '늙어보이기만', '했던', '커스틴', '던스트가', '너무나도', '이뻐보였다']


In [33]:
from soynlp.word import WordExtractor

In [34]:
word_ext = WordExtractor()
word_ext.train(X_train)
scores = word_ext.word_scores()

training was done. used memory 1.468 Gbse memory 1.250 Gb
all cohesion probabilities was computed. # words = 85683
all branching entropies was computed # words = 101540
all accessor variety was computed # words = 101540


In [35]:
import math

score_dict = {key: scores[key].cohesion_forward *
              math.exp(scores[key].right_branching_entropy) 
              for key in scores}

In [36]:
lto = LTokenizer(scores=score_dict)

In [37]:
print(lto.tokenize(X_train[4]))

['사이', '몬페그의', '익살스', '런', '연기', '가', '돋보', '였던', '영화', '!스파이더맨에서', '늙어', '보이기만', '했던', '커스틴', '던스트가', '너무', '나도', '이뻐', '보였다']


In [38]:
if not os.path.isfile('soy_train.npz'):
    tfidf = TfidfVectorizer(ngram_range=(1, 2),
                            min_df=3,
                            max_df=0.9,
                            tokenizer=lto.tokenize, 
                            token_pattern=None)
    tfidf.fit(X_train)
    X_train_soy = tfidf.transform(X_train)
    X_test_soy = tfidf.transform(X_test)
    save_npz('soy_train.npz', X_train_soy)
    save_npz('soy_test.npz', X_test_soy)
else:
    X_train_soy = load_npz('soy_train.npz')
    X_test_soy = load_npz('soy_test.npz')

In [39]:
rsv_soy = RandomizedSearchCV(estimator=sgd,
                             param_distributions=param_dist,
                             n_iter=50,
                             random_state=1,
                             verbose=1,
                             n_jobs=1)
rsv_soy.fit(X_train_soy, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


0,1,2
,estimator,SGDClassifier...andom_state=1)
,param_distributions,{'alpha': <scipy.stats....t 0x1a12f2390>}
,n_iter,50
,scoring,
,n_jobs,1
,refit,True
,cv,
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,1

0,1,2
,loss,'log_loss'
,penalty,'l2'
,alpha,0.00010015813955858975
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [41]:
print(rsv_soy.best_score_)
print(rsv_soy.best_params_)

0.8141066666666665
{'alpha': 0.00010015813955858975}


In [43]:
import joblib
best_model = rsv_okt.best_estimator_
model_filename = 'ted_naver_movie_soy_sgd_model.joblib'
joblib.dump(best_model, model_filename)

['ted_naver_movie_soy_sgd_model.joblib']

In [42]:
rsv_soy.score(X_test_soy, y_test)

0.8085