# 네이버 영화평 감성 분석 - TfidfVectorizer

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
train_data = pd.read_csv('../00.data/github.com/nsmc/train.tsv', sep='\t')
train_data.head(1)

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0


In [3]:
test_data = pd.read_csv('../00.data/github.com/nsmc/test.tsv', sep='\t')
test_data.head(1)

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1


In [4]:
train_data.shape, test_data.shape

((2977, 3), (989, 3))

## Tokenizer 함수 정의

In [5]:
from konlpy.tag import Okt
okt = Okt()
def koTokenizer(text):
    return okt.morphs(text)

## TfidfVectorizer 로 학습/변환

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(
    tokenizer=koTokenizer, 
    ngram_range=(1,2),
    min_df=3,
    max_df=0.9
)

In [8]:
%%time
tv.fit(train_data['document'])

CPU times: user 43.6 s, sys: 2.18 s, total: 45.8 s
Wall time: 55.4 s


TfidfVectorizer(max_df=0.9, min_df=3, ngram_range=(1, 2),
                tokenizer=<function koTokenizer at 0x1260bef70>)

In [9]:
%%time
X_train = tv.transform(train_data['document'])

CPU times: user 13.1 s, sys: 473 ms, total: 13.6 s
Wall time: 24.2 s


In [10]:
%%time
X_test = tv.transform(test_data['document'])

CPU times: user 4.28 s, sys: 159 ms, total: 4.44 s
Wall time: 7.59 s


In [11]:
y_train = train_data['label'].values
y_test = test_data['label'].values

## LogisticRegression으로 학습/예측/평가

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression(C=3.5)
lr.fit(X_train, y_train)
pred = lr.predict(X_test)

accuracy_score(y_test, pred)

0.7593528816986855

## 최적 하이퍼 파라미터 도출

In [16]:
def randints(low=0, high=10, size=3, replace=False):
    randlist = np.random.randint(low, high, size=size).tolist()
    randbool = [True if randlist.count(n) > 1 else False for n in randlist]
    if randbool.count(True):
        return randints(low, high, size, replace)
    return np.random.choice(randlist, size=size, replace=replace).tolist()

def randfloats(low=0, high=1, size=3, ndigits=3, replace=False):
    randlist = np.random.uniform(low, high, size=size).tolist()
    randlist = list(map(lambda x: round(x, ndigits), randlist))
    randbool = [True if randlist.count(n) > 1 else False for n in randlist]
    if randbool.count(True):
        return randfloat(low, high, size, ndigits, replace)
    return np.random.choice(randlist, size=size, replace=replace).tolist()

In [22]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Set up the grid search with 5-fold cross validation
rs = RandomizedSearchCV(
    estimator=LogisticRegression(solver='saga'),
    param_distributions={
        'C': randints(0, 10, size=10)
    },
    cv=10,
    scoring='accuracy',
    verbose=10, 
    n_jobs=-1,
)
rs.fit(X_train, y_train)
print('best_score_: {}'.format(rs.best_score_))
print('best_params_: {}'.format(rs.best_params_))

estimator = rs.best_estimator_
pred = estimator.predict(X_test)
score = accuracy_score(y_test, pred)
print('accuracy_score: {}'.format(score))

Fitting 10 folds for each of 10 candidates, totalling 100 fits
best_score_: 0.7604851648475808
best_params_: {'C': 2}
accuracy_score: 0.7603640040444893


In [24]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Extension of sag that also allows for L1 regularization. Should generally train faster than sag
pipe = Pipeline([
    ('tv', TfidfVectorizer(
        tokenizer=koTokenizer,
        # ngram_range=(1,2)
    )),
    ('lr', LogisticRegression(solver='saga'))
])

# Set up the grid search with 5-fold cross validation
rs = RandomizedSearchCV(
    estimator=pipe,
    param_distributions={
        # 'tv__ngram_range': [(1, n) for n in randints(0, 10, size=3)],
        # 'tv__min_df': randfloats(0, 10, size=3, ndigits=1),
        # 'tv__max_df': randints(0, 10, size=3),
        'lr__C': randints(0, 10, size=3)
    },
    cv=5,
    scoring='accuracy',
    verbose=10, 
    n_jobs=-1,
)
rs.fit(X_train, y_train)
print('best_score_: {}'.format(rs.best_score_))
print('best_params_: {}'.format(rs.best_params_))

estimator = rs.best_estimator_
pred = estimator.predict(X_test)
score = accuracy_score(y_test, pred)
print('accuracy_score: {}'.format(score))

Fitting 5 folds for each of 3 candidates, totalling 15 fits


PicklingError: Could not pickle the task to send it to the workers.