In [11]:
import numpy as np
import konlpy

In [61]:
with open('data/ratings_train.txt') as f:
    f.readline()
    ratings_train = np.array([l.rstrip().split('\t')[1:] for l in f.readlines()])
text_train, y_train = ratings_train[:,0], ratings_train[:,1].astype('int64')

In [63]:
with open('data/ratings_test.txt') as f:
    f.readline()
    ratings_test = np.array([l.rstrip().split('\t')[1:] for l in f.readlines()])
text_test, y_test = ratings_test[:,0], ratings_test[:,1].astype('int64')

In [64]:
len(text_train), np.bincount(y_train)

(150000, array([75173, 74827]))

In [65]:
len(text_test), np.bincount(y_test)

(50000, array([24827, 25173]))

In [70]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(min_df=5).fit(text_train)
X_train = vect.transform(text_train)

In [71]:
feature_names = vect.get_feature_names()
print("Number of features: {}".format(len(feature_names)))
print("First 20 features:\n{}".format(feature_names[:20]))
print("Features 20010 to 20030:\n{}".format(feature_names[20010:20030]))
print("Every 2000th feature:\n{}".format(feature_names[::2000]))

Number of features: 24160
First 20 features:
['007', '007시리즈', '007을', '007의', '007이', '02', '04', '0개', '0개는', '0은', '0점', '0점도', '0점없나', '0점은', '0점을', '0점이', '0점이다', '0점주고싶다', '0점짜리', '10']
Features 20010 to 20030:
['주연은', '주연을', '주연의', '주연이', '주연인', '주연인데', '주연한', '주옥같은', '주온', '주온보다', '주원', '주위', '주위를', '주위에', '주위에서', '주윤발', '주윤발은', '주윤발의', '주윤발이', '주의']
Every 2000th feature:
['007', '결혼도', '나타난', '된다면', '몰입', '봉태규', '슈퍼', '애기들', '왕가위의', '자극적이지', '주연', '탕웨이가', '흘러간']


In [72]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

Mean cross-validation accuracy: 0.80


In [79]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(TfidfVectorizer(min_df=3, norm=None),
                     LogisticRegression())
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}

grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(text_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

Best cross-validation score: 0.81


In [78]:
pipe = make_pipeline(TfidfVectorizer(min_df=5), LogisticRegression())
# running the grid-search takes a long time because of the
# relatively large grid and the inclusion of trigrams
param_grid = {'logisticregression__C': [0.1, 1, 10],
              "tfidfvectorizer__ngram_range": [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)]}

grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(text_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters:\n{}".format(grid.best_params_))

Best cross-validation score: 0.80
Best parameters:
{'tfidfvectorizer__ngram_range': (1, 3), 'logisticregression__C': 1}


In [77]:
# extract scores from grid_search
scores = grid.cv_results_['mean_test_score'].reshape(-1, 3).T
# visualize heatmap
heatmap = mglearn.tools.heatmap(
    scores, xlabel="C", ylabel="ngram_range", cmap="viridis", fmt="%.3f",
    xticklabels=param_grid['logisticregression__C'],
    yticklabels=param_grid['tfidfvectorizer__ngram_range'])
plt.colorbar(heatmap)

NameError: name 'mglearn' is not defined

In [4]:
from konlpy.tag import Twitter
tag = Twitter()

In [9]:
tokens_ko = tag.morphs(ratings_train[0].split('\t')[1:2])

In [10]:
tokens_ko

['아', '더빙', '..', '진짜', '짜증', '나네', '요', '목소리']