```
# Title: Sentiment Analysis for movie review
# Writer: Ted Jung
# Updated: 2, Aug 2025
# Description:
#       without pre-processing of data
```

In [27]:
import konlpy
import pandas as pd
import numpy as np



In [48]:
# Load the training split (tab-separated file). keep_default_na=False stops
# pandas from parsing empty or "NA"-like strings as NaN.
df_train = pd.read_csv(
    './data/ratings_train.txt',
    sep='\t',
    keep_default_na=False,
)

In [49]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [50]:
# Review texts (features) and sentiment labels (targets) as arrays.
X_train, y_train = df_train['document'].values, df_train['label'].values

In [44]:
# Load the test split (tab-separated file). keep_default_na=False stops
# pandas from parsing empty or "NA"-like strings as NaN.
df_test = pd.read_csv(
    './data/ratings_test.txt',
    sep='\t',
    keep_default_na=False,
)

# Review texts (features) and sentiment labels (targets) as arrays.
X_test, y_test = df_test['document'].values, df_test['label'].values

In [45]:
# Number of training reviews and how many fall under each label value.
print(len(X_train), np.bincount(y_train))

150000 [75173 74827]


In [33]:
# Number of test reviews and how many fall under each label value.
print(len(X_test), np.bincount(y_test))

50000 [24827 25173]


In [34]:
# Tokenizer for Korean text (KoNLPy's Okt tagger).
from konlpy.tag import Okt

okt = Okt()
# Show one raw review followed by the token list okt.morphs produces for it.
print(X_train[4])
print(okt.morphs(X_train[4]))

사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다
['사이', '몬페', '그', '의', '익살스런', '연기', '가', '돋보였던', '영화', '!', '스파이더맨', '에서', '늙어', '보이기만', '했던', '커스틴', '던스트', '가', '너무나도', '이뻐', '보였다']


#### Took 10 minutes on my laptop

In [35]:
import os
from scipy.sparse import save_npz, load_npz
from sklearn.feature_extraction.text import TfidfVectorizer

# Tokenizing every review with Okt is slow, so the TF-IDF matrices are cached
# on disk. They are rebuilt when EITHER cache file is missing; checking only
# the train file would crash in load_npz if the test file had been removed.
if os.path.isfile('okt_train.npz') and os.path.isfile('okt_test.npz'):
    X_train_okt = load_npz('okt_train.npz')
    X_test_okt = load_npz('okt_test.npz')
else:
    # Convert text data into numerical representations.
    tfidf = TfidfVectorizer(ngram_range=(1, 2),    # unigrams and bigrams
                            min_df=3,              # drop terms seen in fewer than 3 reviews
                            max_df=0.9,            # drop terms seen in more than 90% of reviews
                            tokenizer=okt.morphs,
                            token_pattern=None)    # not used when a custom tokenizer is given
    # fit_transform learns the vocabulary/IDF weights and vectorizes in a single
    # pass, so the training reviews are tokenized once instead of twice
    # (fit followed by transform gives the same matrix but tokenizes twice).
    X_train_okt = tfidf.fit_transform(X_train)
    X_test_okt = tfidf.transform(X_test)
    save_npz('okt_train.npz', X_train_okt)
    save_npz('okt_test.npz', X_test_okt)



In [36]:
# First three raw training reviews.
X_train[:3]

array(['아 더빙.. 진짜 짜증나네요 목소리', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나',
       '너무재밓었다그래서보는것을추천한다'], dtype=object)

In [37]:
# Labels of the first three training reviews.
y_train[:3]

array([0, 1, 0])

In [38]:
# Dense view of the first three TF-IDF rows (converted from the sparse matrix).
X_train_okt[:3].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

It took 139 minutes (2 hours 19 minutes) on my Mac.

In [57]:
import joblib

from konlpy.tag import Okt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
# from sklearn.utils.fixes import loguniform
from scipy.stats import loguniform
from sklearn.model_selection import cross_val_score

# --- STEP 1: Top-level definitions ---
# Create the Okt instance at the top level of the notebook so the tokenizer
# function below can reference it as a global.
okt = Okt()

# Define the tokenizer function at the top level.
# This makes it a globally accessible and picklable object.
def okt_tokenizer(text):
    """Tokenize `text` with the notebook-level Okt instance via `okt.morphs`."""
    return okt.morphs(text)

# Model initialization: linear classifier trained with SGD on log loss.
sgd = SGDClassifier(loss='log_loss', random_state=1, tol=1e-2)

# Building the pipeline: TF-IDF vectorization followed by the classifier.
full_pipeline = Pipeline([
    ('vect', TfidfVectorizer(tokenizer=okt_tokenizer,lowercase=False, token_pattern=None, max_features=10000)), # Okt tokens, no lowercasing, at most 10000 features
    ('clf', sgd)                                     # the SGDClassifier defined above
])

# Defining the parameter search space.
# Keys use the '<step name>__<parameter>' form to address a pipeline step.
param_dist = {
    # 'vect__tokenizer': [okt.morphs],            # Your custom tokenizers
    'clf__alpha': loguniform(0.0001, 100.0),      # Regularization strength (smaller: more prone to overfitting, larger: more prone to underfitting)
    # 'clf__penalty': ['l1', 'l2', 'elasticnet'], # Alternative: also let the search try elasticnet
    'clf__penalty': ['l1', 'l2'],                 # Penalty type; the search samples one of these two per candidate
    'clf__max_iter': [100]                        # Good practice to set max_iter for SGDClassifier
}

# Randomized hyperparameter search over the whole pipeline (10 sampled candidates).
rsv_okt = RandomizedSearchCV(estimator=full_pipeline,
                             param_distributions=param_dist,
                             n_iter=10,
                             random_state=1,
                             n_jobs=4,
                             verbose=1)



# Force joblib's threading backend for the search below.
# NOTE(review): presumably chosen so the Okt-based tokenizer does not have to be
# pickled and sent to worker processes — confirm.

# X_train holds the input reviews (features); y_train holds the correct labels,
# i.e. the "ground truth" the model is trying to learn.
# NOTE(review): dill is imported but not referenced by name in this cell —
# confirm whether the import is still needed.
import dill

with joblib.parallel_backend("threading", n_jobs=4):
    rsv_okt.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [58]:
# Best mean cross-validation score and the parameter set that achieved it.
print(rsv_okt.best_score_)
print(rsv_okt.best_params_)

0.7808133333333334
{'clf__alpha': 0.0013108749615263343, 'clf__max_iter': 100, 'clf__penalty': 'l2'}


In [59]:
# Display the pipeline selected by the search.
rsv_okt.best_estimator_

0,1,2
,steps,"[('vect', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,False
,preprocessor,
,tokenizer,<function okt...t 0x1a4043ce0>
,analyzer,'word'
,stop_words,
,token_pattern,

0,1,2
,loss,'log_loss'
,penalty,'l2'
,alpha,0.0013108749615263343
,l1_ratio,0.15
,fit_intercept,True
,max_iter,100
,tol,0.01
,shuffle,True
,verbose,0
,epsilon,0.1


In [60]:
import joblib
# Persist the best pipeline (vectorizer + classifier) found by the Okt search.
# NOTE(review): the pipeline references okt_tokenizer; presumably that function
# must be defined before joblib.load can restore this file — confirm.
joblib.dump(rsv_okt.best_estimator_, 'ted_naver_movie_sgd_model.joblib')

['ted_naver_movie_sgd_model.joblib']

In [75]:
# Score of the best Okt pipeline on the raw test reviews.
rsv_okt.score(X_test, y_test)

0.77478

In [91]:
# A handful of raw test reviews to inspect by eye.
X_test[11:20]

array(['한국독립영화의 한계 그렇게 아버지가 된다와 비교됨',
       '청춘은 아름답다 그 아름다움은 이성을 흔들어 놓는다. 찰나의 아름다움을 잘 포착한 섬세하고 아름다운 수채화같은 퀴어영화이다.',
       '눈에 보이는 반전이었지만 영화의 흡인력은 사라지지 않았다.',
       '"스토리, 연출, 연기, 비주얼 등 영화의 기본 조차 안된 영화에 무슨 평을 해. 이런 영화 찍고도 김문옥 감독은 ""내가 영화 경력이 몇OO인데 조무래기들이 내 영화를 평론해?"" 같은 마인드에 빠져있겠지?"',
       '소위 ㅈ문가라는 평점은 뭐냐?', '최고!!!!!!!!!!!!!!!!',
       '발연기 도저히 못보겠다 진짜 이렇게 연기를 못할거라곤 상상도 못했네', '나이스',
       '별 재미도없는거 우려먹어 .... 챔프에서 방송 몇번했더라 ? ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ'], dtype=object)

In [92]:
# Predicted labels for the same test reviews (X_test[11:20]).
rsv_okt.predict(X_test[11:20])

array([1, 1, 1, 0, 0, 1, 0, 0, 0])

In [None]:
# Author's note: some predictions below are off. The suspected cause is too few
# training epochs, so increasing the number of epochs is a possible next step.

# Reload the pipeline saved as 'ted_naver_movie_sgd_model.joblib' and print it.
loaded_model = joblib.load('ted_naver_movie_sgd_model.joblib')
print(loaded_model)
# Hand-written reviews used as a quick sanity check of the reloaded model.
new_data = ["영화보지 마세요", 
            "이걸 영화라고 만들었나요?", 
            "재미있어요", 
            "강추", 
            "영화한번 보세요", 
            "영화가 너무 재미 없어요.", 
            "나도 그렇게 재미있게 보지 못함", 
            "재미있습니다", 
            "보지 말기를 강추", 
            "보지 말기 강추",
            "보지마세요",
            "한번볼만한 영화입니다.",
            "나이스",
            "베스트"]

# The pipeline takes raw strings: it vectorizes them and then classifies.
predictions = loaded_model.predict(new_data)
print(predictions)


Pipeline(steps=[('vect',
                 TfidfVectorizer(lowercase=False, max_features=10000,
                                 token_pattern=None,
                                 tokenizer=<function okt_tokenizer at 0x1a4043ce0>)),
                ('clf',
                 SGDClassifier(alpha=0.0013108749615263343, loss='log_loss',
                               max_iter=100, random_state=1, tol=0.01))])
[0 0 1 1 1 0 0 1 1 1 0 1 0 1]


In [95]:
# other Tokenizer sonylp

!pip install --upgrade soynlp



In [None]:
from soynlp.tokenizer import LTokenizer

In [None]:
# LTokenizer created without word scores, applied to one sample review.
lto = LTokenizer()
print(X_train[4])
print(lto.tokenize(X_train[4]))

사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다
['사이몬페그의', '익살스런', '연기가', '돋보였던', '영화!스파이더맨에서', '늙어보이기만', '했던', '커스틴', '던스트가', '너무나도', '이뻐보였다']


In [None]:
from soynlp.word import WordExtractor

In [None]:
# Train soynlp's WordExtractor on the raw training reviews, then collect the
# per-word scores it computed.
word_ext = WordExtractor()
word_ext.train(X_train)
scores = word_ext.word_scores()

training was done. used memory 1.468 Gbse memory 1.250 Gb
all cohesion probabilities was computed. # words = 85683
all branching entropies was computed # words = 101540
all accessor variety was computed # words = 101540


In [None]:
import math

# One score per extracted word:
# cohesion_forward * exp(right_branching_entropy).
score_dict = {
    word: stats.cohesion_forward * math.exp(stats.right_branching_entropy)
    for word, stats in scores.items()
}

In [None]:
# Rebuild the tokenizer, this time passing the word scores in score_dict.
lto = LTokenizer(scores=score_dict)

In [None]:
# Tokenize the sample review with the current `lto` tokenizer.
print(lto.tokenize(X_train[4]))

['사이', '몬페그의', '익살스', '런', '연기', '가', '돋보', '였던', '영화', '!스파이더맨에서', '늙어', '보이기만', '했던', '커스틴', '던스트가', '너무', '나도', '이뻐', '보였다']


In [None]:
# The soynlp TF-IDF matrices are cached on disk. They are rebuilt when EITHER
# cache file is missing; checking only the train file would crash in load_npz
# if the test file had been removed.
if os.path.isfile('soy_train.npz') and os.path.isfile('soy_test.npz'):
    X_train_soy = load_npz('soy_train.npz')
    X_test_soy = load_npz('soy_test.npz')
else:
    tfidf = TfidfVectorizer(ngram_range=(1, 2),    # unigrams and bigrams
                            min_df=3,              # drop terms seen in fewer than 3 reviews
                            max_df=0.9,            # drop terms seen in more than 90% of reviews
                            tokenizer=lto.tokenize,
                            token_pattern=None)    # not used when a custom tokenizer is given
    # fit_transform learns the vocabulary/IDF weights and vectorizes in a single
    # pass, so the training reviews are tokenized once instead of twice
    # (fit followed by transform gives the same matrix but tokenizes twice).
    X_train_soy = tfidf.fit_transform(X_train)
    X_test_soy = tfidf.transform(X_test)
    save_npz('soy_train.npz', X_train_soy)
    save_npz('soy_test.npz', X_test_soy)

In [None]:
# Hyperparameter search on the pre-computed soynlp TF-IDF matrix.
#
# The estimator here is a bare SGDClassifier, not a Pipeline, so parameter names
# must NOT carry the 'clf__' step prefix used by `param_dist`; passing that dict
# makes the search fail with an invalid-parameter error. The search space and
# classifier are therefore defined locally. They match the recorded output of
# this cell: only `alpha` is searched, with default max_iter and tol.
soy_param_dist = {'alpha': loguniform(0.0001, 100.0)}

rsv_soy = RandomizedSearchCV(estimator=SGDClassifier(loss='log_loss', random_state=1),
                             param_distributions=soy_param_dist,
                             n_iter=50,
                             random_state=1,
                             verbose=1,
                             n_jobs=1)
rsv_soy.fit(X_train_soy, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


0,1,2
,estimator,SGDClassifier...andom_state=1)
,param_distributions,{'alpha': <scipy.stats....t 0x1a12f2390>}
,n_iter,50
,scoring,
,n_jobs,1
,refit,True
,cv,
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,1

0,1,2
,loss,'log_loss'
,penalty,'l2'
,alpha,0.00010015813955858975
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [None]:
# Best mean cross-validation score and the parameter set that achieved it.
print(rsv_soy.best_score_)
print(rsv_soy.best_params_)

0.8141066666666665
{'alpha': 0.00010015813955858975}


In [None]:
import joblib

# Persist the best model from the soynlp search. The file name says "soy", so
# the estimator must come from rsv_soy, not rsv_okt.
# Note: rsv_soy was fit on the pre-vectorized X_train_soy matrix, so this file
# holds only the classifier; new text must be vectorized the same way first.
best_model = rsv_soy.best_estimator_
model_filename = 'ted_naver_movie_soy_sgd_model.joblib'
joblib.dump(best_model, model_filename)

['ted_naver_movie_soy_sgd_model.joblib']

In [None]:
# Score of the best soynlp model on the vectorized test reviews.
rsv_soy.score(X_test_soy, y_test)

0.8085