In [1]:
#import MeCab

#def tokenizer(sentence):
#    m = MeCab.Tagger("-Owakati")
#    kekka = m.parse(sentence)
#    return kekka.split(" ")

#pythonanywhereにMeCabインストール方法が分からなかったため、janomeに変更

In [1]:
from janome.tokenizer import Tokenizer

def tokenizer(sentence):
    t = Tokenizer()
    return t.tokenize(sentence, wakati = True)

In [2]:
import pandas as pd
import numpy as np

#csvファイルからデータ読み込み、加工
review = pd.read_csv("data2010.csv",names=('comment','star'))
review = review[review['star'] != '-']
review['star'] = np.where(review['star'].astype(float) > 3.8, 1, 0 )

#「3.8」に設定すると、ちょうど半分に分かれる

In [3]:
from sklearn.model_selection import train_test_split

#訓練用とテスト用にデータを分ける
X_train, X_test, y_train, y_test \
    = train_test_split(review['comment'].values, review['star'].values, \
    test_size=0.25, random_state=1, stratify=review['star'])

In [4]:
from sklearn.feature_extraction.text import  CountVectorizer, TfidfTransformer

#ベクトルの準備
count = CountVectorizer(tokenizer=tokenizer)
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)

In [14]:
from sklearn.ensemble import RandomForestClassifier
from numpy.core.umath_tests import inner1d

#classifierの準備
#ランダムフォレスト
#調整要  n_estimators(大きいほど良い)
#        max_features(n_featuresぐらい)
#        max_depth (<5ぐらい、あまり影響なし)
forest = RandomForestClassifier(criterion='gini', n_jobs=2, random_state=1)

In [6]:
from sklearn.pipeline import Pipeline

#パイプライン作成
pipe_forest = Pipeline([('count', count), \
                        ('vect', tfidf), \
                        ('clf', forest)])

In [13]:
#グリッドサーチ用のキー名を検索
pipe_forest.get_params().keys()

dict_keys(['memory', 'steps', 'vect', 'clf', 'vect__norm', 'vect__smooth_idf', 'vect__sublinear_tf', 'vect__use_idf', 'clf__bootstrap', 'clf__class_weight', 'clf__criterion', 'clf__max_depth', 'clf__max_features', 'clf__max_leaf_nodes', 'clf__min_impurity_decrease', 'clf__min_impurity_split', 'clf__min_samples_leaf', 'clf__min_samples_split', 'clf__min_weight_fraction_leaf', 'clf__n_estimators', 'clf__n_jobs', 'clf__oob_score', 'clf__random_state', 'clf__verbose', 'clf__warm_start'])

In [80]:
from sklearn.model_selection import GridSearchCV

#グリッドサーチ作成
param_grid = [{'clf__max_depth':[50,60,70,80],
               'clf__n_estimators':[10,50,100]}]
gs = GridSearchCV(estimator=pipe_forest,
                  param_grid=param_grid,
                  cv=2)

In [81]:
これを流すと時間がかかるので、普段の調査は以下のforest2を使用して調査すること

#from sklearn.model_selection import cross_val_score
#scores = cross_val_score(estimator=gs, X=X_train, y=y_train, cv=5)

In [82]:
print(scores)

[0.70799347 0.69004894 0.69168026 0.69820555 0.70751634]


グリッドサーチは時間がかかる。    
ちょっとした調査は↓で実行する。

In [7]:
from sklearn.pipeline import Pipeline
forest2 = RandomForestClassifier(criterion='gini', n_jobs=-1, random_state=1, n_estimators=10, max_depth=5, max_features=10)

#パイプライン作成
pipe_forest2 = Pipeline([('count', count), \
                         ('vect', tfidf), \
                         ('clf', forest2)])

In [238]:
pipe_forest2.fit(X_train, y_train)
print(pipe_forest2.score(X_test, y_test))
print(pipe_forest2.score(X_train, y_train))

0.5557461406518011
0.5568896512292739


In [301]:
from sklearn.linear_model import LogisticRegression
lr2 = LogisticRegression(C=0.1,random_state=1)
#3.8以上にすると、C-0.1あたりがいい性能をだせる

#パイプライン作成
pipe_lr2 = Pipeline([('count', count), \
                     ('vect', tfidf), \
                     ('clf', lr2)])

In [302]:
pipe_lr2.fit(X_train, y_train)
print(pipe_lr2.score(X_test, y_test))
print(pipe_lr2.score(X_train, y_train))

0.6166380789022299
0.6626643796455117


In [247]:
from sklearn.svm import SVC
svm = SVC(kernel='linear', C=0.3, random_state=1)

#パイプライン作成
pipe_svm = Pipeline([('count', count), \
                     ('vect', tfidf), \
                     ('clf', svm)])

In [248]:
pipe_svm.fit(X_train, y_train)
print(pipe_svm.score(X_test, y_test))
print(pipe_svm.score(X_train, y_train))

0.6783876500857633
0.7790165809033733


In [273]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(solver='lbfgs', random_state=0, hidden_layer_sizes=[10,10], alpha=18.1)
#3.8以上のデータは、alpha=30で設定すると

#mlp = MLPClassifier(solver='lbfgs', random_state=0, hidden_layer_sizes=[2])

#パイプライン作成
pipe_mlp = Pipeline([('count', count), \
                     ('vect', tfidf), \
                     ('clf', mlp)])

In [274]:
pipe_mlp.fit(X_train, y_train)
print(pipe_mlp.score(X_train, y_train))
print(pipe_mlp.score(X_test, y_test))

0.9997141223556318
0.6329331046312179


ここから下は、いろいろテスト用

In [236]:
a = np.bincount(y_train)
print("train:{}".format(a[0]/a.sum()))
b = np.bincount(y_test)
print("test:{}".format(b[0]/b.sum()))

train:0.5563178959405375
test:0.5557461406518011
