# 【問題1】train_test_splitのスクラッチ
- scikit-learnのtrain_test_splitを自作する。 

本家：[sklearn.model_selection.train_test_split — scikit-learn 0.20.0 documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)  
自家製：[split.train_test_split()](https://github.com/y9dai/diveintocode-ml/blob/master/ml-scratch/utils/split.py)

In [0]:
# For Colab: create /ml-scratch/utils and copy the hand-written split.py into it.
# NOTE(review): only split.py is copied here; the c_pipeline and r_pipeline
# modules imported later in this notebook presumably need the same treatment
# on Colab — confirm.
!mkdir -p /ml-scratch/utils
!cp split.py /ml-scratch/utils/

In [6]:
import sys

# Make the hand-written utility modules importable from this notebook.
sys.path.append('../ml-scratch/utils')

import numpy as np
from split import train_test_split

# Toy data: a 10x10 matrix holding 1..100 and a 10x1 column holding 1..10,
# so each row is easy to recognise in the printed splits.
X = np.arange(1, 101).reshape([10, 10])
y = np.arange(1, 11).reshape(10, 1)

# Hand-written splitter, called with 0.7 as its third positional argument
# (the recorded output shows 7 training rows and 3 test rows).
X_train, X_test, y_train, y_test = train_test_split(X, y, 0.7)

print('自家製')
for part in (X_train, y_train, X_test, y_test):
    print(part)

# From this point on, `train_test_split` is rebound to scikit-learn's
# implementation so the two results can be compared side by side.
from sklearn.model_selection import train_test_split

# Rebuild the same toy data before running the reference implementation.
X = np.arange(1, 101).reshape([10, 10])
y = np.arange(1, 11).reshape(10, 1)

# Reference splitter with its default arguments.
X_train, X_test, y_train, y_test = train_test_split(X, y)

print('本家')
for part in (X_train, y_train, X_test, y_test):
    print(part)

自家製
[[ 71  72  73  74  75  76  77  78  79  80]
 [ 31  32  33  34  35  36  37  38  39  40]
 [ 91  92  93  94  95  96  97  98  99 100]
 [ 81  82  83  84  85  86  87  88  89  90]
 [ 51  52  53  54  55  56  57  58  59  60]
 [ 41  42  43  44  45  46  47  48  49  50]
 [ 21  22  23  24  25  26  27  28  29  30]]
[[ 9]
 [ 6]
 [ 8]
 [10]
 [ 1]
 [ 2]
 [ 4]]
[[61 62 63 64 65 66 67 68 69 70]
 [ 1  2  3  4  5  6  7  8  9 10]
 [11 12 13 14 15 16 17 18 19 20]]
[[3]
 [5]
 [7]]
本家
[[ 51  52  53  54  55  56  57  58  59  60]
 [ 61  62  63  64  65  66  67  68  69  70]
 [ 91  92  93  94  95  96  97  98  99 100]
 [ 11  12  13  14  15  16  17  18  19  20]
 [ 81  82  83  84  85  86  87  88  89  90]
 [  1   2   3   4   5   6   7   8   9  10]
 [ 41  42  43  44  45  46  47  48  49  50]]
[[ 6]
 [ 7]
 [10]
 [ 2]
 [ 9]
 [ 1]
 [ 5]]
[[21 22 23 24 25 26 27 28 29 30]
 [71 72 73 74 75 76 77 78 79 80]
 [31 32 33 34 35 36 37 38 39 40]]
[[3]
 [8]
 [4]]


# 【問題2】 分類パイプラインの作成
- ロジスティック回帰
- SVM
- 決定木

をそれぞれ扱うためのパイプラインを作成する。  
データセットは下記を使用

In [0]:
# Container for the datasets used in this problem, keyed by an integer index.
data = {}

# Dataset 0: scikit-learn's iris data, wrapped in pandas DataFrames.
from sklearn.datasets import load_iris
import pandas as pd

iris = load_iris()

# Feature table, with columns named after the iris feature names.
X = pd.DataFrame(iris.data, columns=iris.feature_names)

# Target as a single-column DataFrame named 'species'.
y = pd.DataFrame(iris.target, columns=['species'])

data[0] = dict(X=X, y=y)


import numpy as np

# Dataset 1: two 2-D Gaussian clusters sharing one covariance matrix,
# labelled +1 (first cluster) and -1 (second cluster).
np.random.seed(seed=0)  # fixed seed so the generated samples are reproducible
n_samples = 500
half = n_samples // 2  # number of samples drawn per class
mean0 = [-1, 2]
mean1 = [2, -1]
cov = [[1.0, 0.8], [0.8, 1.0]]

# Draw order is unchanged (first cluster, then second) so the seeded random
# stream yields the same points as before.
f0 = np.random.multivariate_normal(mean0, cov, half)
f1 = np.random.multivariate_normal(mean1, cov, half)

X = np.concatenate((f0, f1))
# `np.int` was only an alias for the builtin `int` and has been removed from
# NumPy (deprecated in 1.20, removed in 1.24); use the builtin directly.
y = np.concatenate((np.ones(half), -np.ones(half))).astype(int)

# Shuffle features and labels with the same permutation to keep them aligned.
random_index = np.random.permutation(np.arange(n_samples))
X = X[random_index]
y = y[random_index]

# Register the shuffled synthetic dataset built above under key 1.
data[1] = {'X': X, 'y': y}


# Dataset 2: 40 hard-coded samples with two features each (one row per sample).
X = np.array([[-0.44699 , -2.8073  ],[-1.4621  , -2.4586  ],
       [ 0.10645 ,  1.9242  ],[-3.5944  , -4.0112  ],
       [-0.9888  ,  4.5718  ],[-3.1625  , -3.9606  ],
       [ 0.56421 ,  0.72888 ],[-0.60216 ,  8.4636  ],
       [-0.61251 , -0.75345 ],[-0.73535 , -2.2718  ],
       [-0.80647 , -2.2135  ],[ 0.86291 ,  2.3946  ],
       [-3.1108  ,  0.15394 ],[-2.9362  ,  2.5462  ],
       [-0.57242 , -2.9915  ],[ 1.4771  ,  3.4896  ],
       [ 0.58619 ,  0.37158 ],[ 0.6017  ,  4.3439  ],
       [-2.1086  ,  8.3428  ],[-4.1013  , -4.353   ],
       [-1.9948  , -1.3927  ],[ 0.35084 , -0.031994],
       [ 0.96765 ,  7.8929  ],[-1.281   , 15.6824  ],
       [ 0.96765 , 10.083   ],[ 1.3763  ,  1.3347  ],
       [-2.234   , -2.5323  ],[-2.9452  , -1.8219  ],
       [ 0.14654 , -0.28733 ],[ 0.5461  ,  5.8245  ],
       [-0.65259 ,  9.3444  ],[ 0.59912 ,  5.3524  ],
       [ 0.50214 , -0.31818 ],[-3.0603  , -3.6461  ],
       [-6.6797  ,  0.67661 ],[-2.353   , -0.72261 ],
       [ 1.1319  ,  2.4023  ],[-0.12243 ,  9.0162  ],
       [-2.5677  , 13.1779  ],[ 0.057313,  5.4681  ]])
# Binary labels: a run of 0s followed by a run of 1s.
# NOTE(review): expected to hold exactly one label per row of X — confirm.
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

# Register the hard-coded dataset under key 2.
data[2] = {'X': X, 'y': y}

#print(data)

パイプラインは下記を作成した。  
[c_pipeline.c_pipeline()](https://github.com/y9dai/diveintocode-ml/blob/master/ml-scratch/utils/c_pipeline.py)

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# One estimator per method, keyed by a display name. Insertion order is
# logistic regression, SVM, decision tree.
clf_dict = {
    'LogisticRegression': LogisticRegression(
        random_state=0,
        solver='lbfgs',
        multi_class='multinomial',
    ),
    'SVM': SVC(gamma='auto'),
    'DecisionTree': DecisionTreeClassifier(random_state=0),
}

# Run the hand-written classification pipeline on the datasets and estimators.
# NOTE(review): the recorded confusion matrices have different sizes per
# classifier (38, 125 and 10 samples), which suggests each estimator is scored
# on a different dataset rather than on all of them — confirm in c_pipeline.
from c_pipeline import c_pipeline
accuracy_dict, confusion_matrix_dict = c_pipeline.c_pipeline(data, clf_dict)

# Report the accuracy and confusion matrix stored under each estimator name.
for clf_key in clf_dict:
    header = '{} : '.format(clf_key)
    accuracy_line = 'accuracy : {}'.format(accuracy_dict[clf_key])
    print(header)
    print(accuracy_line)
    print(confusion_matrix_dict[clf_key])

LogisticRegression : 
accuracy : 0.9473684210526315
[[ 9  0  0]
 [ 0 13  1]
 [ 0  1 14]]
SVM : 
accuracy : 1.0
[[66  0]
 [ 0 59]]
DecisionTree : 
accuracy : 0.8
[[5 1]
 [1 3]]


  y = column_or_1d(y, warn=True)


# 【問題3】 回帰パイプラインの作成
- 線形回帰  

を扱うパイプラインを作成する。
- データセットはHouse Pricesコンペティション
- 目的変数としてSalePrice、説明変数として、GrLivAreaとYearBuiltを使用

パイプラインは下記を作成した。  
[r_pipeline.r_pipeline()](https://github.com/y9dai/diveintocode-ml/blob/master/ml-scratch/utils/r_pipeline.py)

In [0]:
# Imported here so this cell also runs on its own (e.g. after a kernel
# restart) instead of relying on the `pd` binding from an earlier cell.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# House Prices training data; 'train.csv' is resolved relative to the current
# working directory.
df = pd.read_csv('train.csv')

# Explanatory variables GrLivArea and YearBuilt, target SalePrice (kept as a
# one-column DataFrame). The former `data = {}` placeholder was dropped because
# it was overwritten immediately by this assignment.
data = {'X': df.loc[:, ['GrLivArea', 'YearBuilt']], 'y': df.loc[:, ['SalePrice']]}

# The name `clf` is kept from the original notebook although this estimator
# is a regressor.
clf = LinearRegression()

# Run the hand-written regression pipeline; its return value is stored as `mse`.
from r_pipeline import r_pipeline
mse = r_pipeline.r_pipeline(data, clf)

print(mse)

1741766026.2166011
