# 交差検証時の警告が出ないようなデータを抽出

引き続き、テストデータとして「test_daikin_conversation.csv」を使用するものとします。

ただし、交差検証時に警告が出ないよう、サンプル数が K-fold 分割数に満たないクラスを除外してデータをクレンジングします。

## (1) テストデータ／環境準備

In [1]:
'''
    Resolve the prototyping directory and the Bot library directory, then
    make the Bot library importable.
'''
import sys
import os

# NOTE: both paths are derived from the current working directory, and this
# cell changes that directory below, so re-running it climbs further up.
prototype_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
learning_dir = os.path.abspath(os.path.join(prototype_dir, '..'))

# Work from the library root and expose it on the import path (only once).
os.chdir(learning_dir)
if learning_dir not in sys.path:
    sys.path.append(learning_dir)

print('prototype_dir=%s\nlearning_dir=%s\nsys.path=%s' % (prototype_dir, learning_dir, sys.path))

prototype_dir=/Users/makmorit/GitHub/donusagi-bot/learning/prototype
learning_dir=/Users/makmorit/GitHub/donusagi-bot/learning
sys.path=['', '/Library/Frameworks/Python.framework/Versions/3.5/lib/python35.zip', '/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5', '/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/plat-darwin', '/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/lib-dynload', '/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages', '/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/IPython/extensions', '/Users/makmorit/.ipython', '/Users/makmorit/GitHub/donusagi-bot/learning']


In [2]:
'''
    Copy the existing training data to a separate location before using it.
    The test data file is the one named by csv_file_name.
'''
import shutil

csv_file_name = 'test_daikin_conversation.csv'
original_csv_dir = os.path.join(learning_dir, 'learning/tests/engine/fixtures/')
original_file_path = os.path.join(original_csv_dir, csv_file_name)

csv_dir = os.path.join(prototype_dir, 'resources')

# Make sure the destination is a directory: if it did not exist, copy2 would
# create a *file* named 'resources' and the path built below would be invalid.
os.makedirs(csv_dir, exist_ok=True)

shutil.copy2(original_file_path, csv_dir)
copied_csv_file_path = os.path.join(csv_dir, csv_file_name)

print('CSV file for test=[%s]' % copied_csv_file_path)

CSV file for test=[/Users/makmorit/GitHub/donusagi-bot/learning/prototype/resources/test_daikin_conversation.csv]


## (2) 訓練データを生成

In [3]:
'''
    Initial settings: learning parameters, data file and its encoding.
    Modelled on learn.py.
'''
from learning.core.learn.learning_parameter import LearningParameter

bot_id = 8888

# Alternatives kept for experimentation:
#   algorithm            -> LearningParameter.ALGORITHM_NAIVE_BAYES
#   params_for_algorithm -> { 'C': 200 }
attr = dict(
    include_failed_data=False,
    include_tag_vector=False,
    classify_threshold=None,
    algorithm=LearningParameter.ALGORITHM_LOGISTIC_REGRESSION,
    params_for_algorithm={},
)
learning_parameter = LearningParameter(attr)

csv_file_encoding = 'utf-8'
csv_file_path = copied_csv_file_path

In [4]:
'''
    Build the training data (TF-IDF processing is executed internally)
'''
from learning.core.training_set.training_message_from_csv import TrainingMessageFromCsv
# Describe the CSV-backed training set with the bot id, file path, learning
# parameters and encoding configured in the previous cell.
training_set = TrainingMessageFromCsv(bot_id, csv_file_path, learning_parameter, encoding=csv_file_encoding)
# The built object exposes the feature rows as .x and the labels as .y,
# which the following cells inspect and filter.
build_training_set_from_csv = training_set.build()

TrainingMessageFromCsv#__build_learning_training_messages count of learning data: 17443
2017/03/03 PM 05:47:16 TrainingMessageFromCsv#__build_learning_training_messages count of learning data: 17443
TextArray#__init__ start
2017/03/03 PM 05:47:16 TextArray#__init__ start
TextArray#to_vec start
2017/03/03 PM 05:47:16 TextArray#to_vec start
TextArray#to_vec end
2017/03/03 PM 05:47:35 TextArray#to_vec end
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
2017/03/03 PM 05:47:35 [[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [5]:
# Inspect the feature container: its type and its number of rows.
print(type(build_training_set_from_csv.x), len(build_training_set_from_csv.x))

<class 'numpy.ndarray'> 17443


In [6]:
# Inspect the scalar type of a single feature value (row 1, column 0).
print(type(build_training_set_from_csv.x[1][0]))

<class 'numpy.float64'>


In [7]:
# Inspect the label container: its type and its number of labels.
print(type(build_training_set_from_csv.y), len(build_training_set_from_csv.y))

<class 'pandas.core.series.Series'> 17443


In [8]:
# Inspect the scalar type of a single label (the entry looked up with [0]).
print(type(build_training_set_from_csv.y[0]))

<class 'numpy.int64'>


## (3) K-fold 分割数以上のクラスを抽出してクレンジング

In [9]:
'''
    Checks performed before splitting the training data

    Extracted from the internals of scikit-learn/sklearn/cross_validation.py
'''
import numpy as np
# Labels as a plain ndarray.
y = np.asarray(build_training_set_from_csv.y)
# Number of labelled samples.
n_samples = y.shape[0]
# unique_labels: the distinct labels; y_inversed: for every sample, the index
# of its label inside unique_labels.
unique_labels, y_inversed = np.unique(y, return_inverse=True)

In [10]:
def bincount(x, weights=None, minlength=None):
    """Count occurrences of each non-negative integer in ``x``.

    Thin wrapper around ``np.bincount`` that also accepts an empty ``x`` and
    ``minlength=None``.

    Args:
        x: 1-D sequence of non-negative integers.
        weights: Optional weights, same length as ``x``; passed through to
            ``np.bincount``.
        minlength: Minimum number of bins in the result; ``None`` means no
            minimum.

    Returns:
        For non-empty ``x`` the result of ``np.bincount``; for empty ``x`` an
        array of ``minlength`` zeros (dtype ``np.intp``).
    """
    if len(x) > 0:
        if minlength is None:
            # Omit the argument instead of forwarding None: current NumPy
            # rejects minlength=None, and older releases rejected 0.
            return np.bincount(x, weights)
        return np.bincount(x, weights, minlength)

    if minlength is None:
        return np.zeros(0, dtype=np.intp)
    # np.asscalar has been removed from NumPy; .item() is the equivalent.
    n_bins = np.asarray(minlength, dtype=np.intp).item()
    return np.zeros(n_bins, dtype=np.intp)

# Number of samples per class, aligned position-by-position with unique_labels.
label_counts = bincount(y_inversed)

In [11]:
'''
    Collect the classes whose sample count is below the number of K-fold splits
'''
n_folds = 3
# label_counts[i] is the sample count of unique_labels[i], so walk both in lockstep.
warning_class_ids = [
    label
    for label, count in zip(unique_labels, label_counts)
    if count < n_folds
]

warning_class_ids[0:10]

[3931, 3933, 3935, 3938, 3973, 3976, 3977, 3979, 3981, 3984]

In [12]:
'''
    Keep only the training data whose class has at least as many samples
    as the number of K-fold splits
'''
# A set gives constant-time membership tests; the check runs once per sample.
excluded_class_ids = set(warning_class_ids)

list_x = []
list_y = []
# Pair features and labels by position. Indexing the label Series with a
# running counter would be an index-label lookup, not a positional one, and
# naming the loop variable `y` would overwrite the module-level label array.
for features, label in zip(build_training_set_from_csv.x, build_training_set_from_csv.y):
    if label not in excluded_class_ids:
        list_x.append(features)
        list_y.append(label)

In [13]:
# Stack the kept feature rows into a single float64 matrix and display it.
X = np.array(list_x, dtype=np.float64)
X

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [14]:
# Collect the kept labels into a single int64 vector and display it.
y = np.array(list_y, dtype=np.int64)
y

array([3397, 3398, 3399, ..., 4665, 4667, 4671])

## (4) クレンジングされたデータセットを永続化

（次回以降使用するため）

In [15]:
# Persist the cleansed feature matrix (read back below as 'training_set_from_csv_X.npy').
np.save('training_set_from_csv_X', X)

In [16]:
# Persist the cleansed labels (read back below as 'training_set_from_csv_y.npy').
np.save('training_set_from_csv_y', y)

## (5) 動作確認：クレンジング後のデータセットにより学習実行

In [17]:
# Reload the persisted feature matrix and display it.
X = np.load('training_set_from_csv_X.npy')
X

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [18]:
# Reload the persisted labels and display them.
y = np.load('training_set_from_csv_y.npy')
y

array([3397, 3398, 3399, ..., 4665, 4667, 4671])

In [19]:
# GridSearchCV lives in sklearn.model_selection on current scikit-learn;
# sklearn.grid_search is the older location and has since been removed.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Candidate values of C to search over.
params = {'C': [10, 100, 140, 200]}
# Pin the fold count to n_folds: the data set was cleansed for exactly that
# many folds, and the library's default has differed between releases.
grid = GridSearchCV(LogisticRegression(), param_grid=params, cv=n_folds)
grid.fit(X, y)
# Estimator chosen by the search.
estimator = grid.best_estimator_



In [20]:
# Display the selected estimator together with its hyper-parameters.
estimator

LogisticRegression(C=200, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## (6) 動作確認：学習結果を評価（既存のAccuracyによるクロスバリデーション）

In [21]:
from learning.core.evaluator import Evaluator
evaluator = Evaluator()
# Evaluate the tuned estimator on the cleansed data set.
# NOTE(review): classify_threshold was configured as None, yet the log reports
# threshold 0.0 — presumably None is mapped to 0.0 inside the project code; confirm.
evaluator.evaluate(estimator, X, y, threshold=learning_parameter.classify_threshold)

self.threshold: 0.0
2017/03/03 PM 05:50:42 self.threshold: 0.0
Evaluator#evaluate#elapsed time: 24329.417944 ms
2017/03/03 PM 05:51:07 Evaluator#evaluate#elapsed time: 24329.417944 ms
accuracy: 0.984562211982
2017/03/03 PM 05:51:07 accuracy: 0.984562211982


ちなみに「クラスのサンプル数 ＜ K-fold分割数」のデータを含んだ結果（<b><a href="01.ipynb">レポートはこちらをご参照</a></b>）は
~~~~
2017/03/01 PM 03:26:21 accuracy: 0.984177940839
~~~~
であったため、accuracy は 0.98418 → 0.98456 と、若干ではあるが向上している様子。