# Sprint
## 機械学習スクラッチ入門
今後の機械学習スクラッチ課題で作成するモデルを、scikit-learnを用いて一度動かしておきます。これまでの復習を兼ねたスクラッチ課題の準備です。

# 【問題1】
## train_test_splitのスクラッチ
スクラッチの練習として、scikit-learnのtrain_test_splitを自作してみます。以下の雛形をベースとして関数を完成させてください。

[sklearn.model_selection.train_test_split — scikit-learn 0.21.3 documentation](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

なお、作成した関数がscikit-learnのtrain_test_splitと同じ動作をしているか必ず確認をするようにしましょう。

In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit
def _resolve_split_sizes(train_size, test_size):
    """Validate the requested fractions and normalise them.

    Returns a ``(train_size, test_size)`` pair where ``train_size`` is a float
    and ``test_size`` is either a float or ``None``; ``None`` means "everything
    that is not in the training set".
    """
    # Nothing requested: fall back to a 75% / 25% split.
    if train_size is None and test_size is None:
        return 0.75, None

    # Only train_size requested.
    if test_size is None:
        if not (0 < train_size < 1):
            raise ValueError('train_size must be range(0,1)')
        return train_size, None

    # Only test_size requested: the training set is the complement.
    if train_size is None:
        if not (0 < test_size < 1):
            raise ValueError('test_size must be range(0,1)')
        return 1 - test_size, None

    # Both requested. A sum of exactly 1 (e.g. 0.8 / 0.2) is a valid request.
    total = train_size + test_size
    if not (0 < total <= 1):
        raise ValueError('(train_size+test_size) must be in (0, 1]')

    train_ok = 0 < train_size < 1
    test_ok = 0 < test_size < 1
    if not train_ok and not test_ok:
        raise ValueError('train_size or test_size must be range(0,1)')
    if not train_ok:
        # Lenient fallback kept from the original implementation: an unusable
        # train_size is replaced by the complement of a usable test_size.
        return 1 - test_size, None
    if not test_ok or total == 1:
        # An unusable test_size is ignored (as before); an exact complement
        # is expressed as None so float rounding cannot over-allocate rows.
        return train_size, None
    return train_size, test_size


def scratch_train_test_split(X, y, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None):
    """
    Split features and targets into a training part and a validation part.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
      Feature matrix.
    y : ndarray, shape (n_samples, )
      Target values.
    test_size : float (0<test_size<1)
      Fraction of the samples to put in the test set.
    train_size : float (0<train_size<1)
      Fraction of the samples to put in the training set.
      When neither size is given, train_size defaults to 0.75.
    random_state : int
      Seed used to make the shuffled split reproducible.
    shuffle: bool
      Shuffle the samples before splitting. With shuffle=False the first rows
      become the training set and the following rows the test set, and
      stratify must be None.
    stratify: ndarray, shape (n_samples, )
      Class labels used for stratified sampling (the class ratio is
      preserved in both parts).

    Returns
    ----------
    X_train : ndarray, shape (n_train, n_features)
      Training features.
    X_test : ndarray, shape (n_test, n_features)
      Validation features.
    y_train : ndarray, shape (n_train, )
      Training targets.
    y_test : ndarray, shape (n_test, )
      Validation targets.

    Raises
    ----------
    ValueError
      If the requested sizes are invalid, if stratify is given together with
      shuffle=False, or if a sequential split would leave one part empty.
    """
    train_size, test_size = _resolve_split_sizes(train_size, test_size)

    # Kept for backward compatibility: the original also seeded NumPy's global
    # generator whenever a seed was supplied.
    if random_state is not None:
        np.random.seed(random_state)

    n_samples = X.shape[0]

    if not shuffle:
        if stratify is not None:
            raise ValueError('stratify must be None when shuffle=False')
        # Sequential split: leading rows train, the rows right after them test.
        n_train = int(np.floor(train_size * n_samples))
        if test_size is None:
            n_test = n_samples - n_train
        else:
            n_test = min(int(np.ceil(test_size * n_samples)), n_samples - n_train)
        if n_train == 0 or n_test == 0:
            raise ValueError('the requested sizes leave the train or test set empty')
        stop = n_train + n_test
        return X[:n_train], X[n_train:stop], y[:n_train], y[n_train:stop]

    # Shuffled split: delegate index generation to scikit-learn's splitters.
    if stratify is not None:
        splitter = StratifiedShuffleSplit(n_splits=1, train_size=train_size,
                                          test_size=test_size, random_state=random_state)
    else:
        splitter = ShuffleSplit(n_splits=1, train_size=train_size,
                                test_size=test_size, random_state=random_state)
    # Only the number of rows matters to the splitter, so a dummy array is passed.
    train_index, test_index = next(splitter.split(np.zeros(n_samples), stratify))
    return X[train_index, :], X[test_index, :], y[train_index], y[test_index]

In [3]:
# Load the iris dataset (used to verify the scratch splitter)
from sklearn.datasets import load_iris

data = load_iris()

# Explanatory variables: the four flower measurements
X = pd.DataFrame(
    data['data'],
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)

# Target variable: the species code
y = pd.DataFrame(data['target'], columns=['Species'])

In [4]:
# Behaviour check: run the scratch splitter and scikit-learn's splitter with
# identical arguments and show the (sorted) results of each for comparison.
from sklearn.model_selection import train_test_split

iris_features = X.values
iris_labels = y.values.ravel()
split_kwargs = dict(train_size=0.79, random_state=0, stratify=iris_labels)

X_train, X_test, y_train, y_test = scratch_train_test_split(iris_features, iris_labels, **split_kwargs)
print(f'scratch: {X_train.shape, X_test.shape, y_train.shape, y_test.shape}')
display(np.sort(X_train), np.sort(X_test), np.sort(y_train), np.sort(y_test))

X_train, X_test, y_train, y_test = train_test_split(iris_features, iris_labels, **split_kwargs)
print(f'sklearn: {X_train.shape, X_test.shape, y_train.shape, y_test.shape}')
display(np.sort(X_train), np.sort(X_test), np.sort(y_train), np.sort(y_test))

scratch: ((118, 4), (32, 4), (118,), (32,))


array([[1.8, 2.8, 4.8, 6.2],
       [0.6, 1.6, 3.5, 5. ],
       [1.3, 2.9, 4.6, 6.6],
       [2.3, 2.6, 6.9, 7.7],
       [0.3, 1.3, 3.5, 5. ],
       [0.2, 1.4, 2.9, 4.4],
       [1.8, 3. , 4.8, 6. ],
       [0.2, 1.3, 3. , 4.4],
       [1. , 2.2, 4. , 6. ],
       [2.3, 3.2, 5.9, 6.8],
       [1.6, 3.4, 4.5, 6. ],
       [1.5, 3. , 4.5, 5.4],
       [0.2, 1.2, 4. , 5.8],
       [2.3, 3.1, 5.1, 6.9],
       [1.3, 2.8, 4. , 6.1],
       [0.2, 1.2, 3.2, 5. ],
       [1.4, 2.8, 4.8, 6.8],
       [0.2, 1.4, 3.4, 5.2],
       [2.2, 3. , 5.8, 6.5],
       [0.4, 1.6, 3.4, 5. ],
       [1.4, 3. , 4.6, 6.1],
       [1.2, 3. , 4.2, 5.7],
       [1. , 2.3, 3.3, 5. ],
       [1.3, 2.9, 4.2, 5.7],
       [0.4, 1.5, 3.4, 5.4],
       [2.1, 3. , 5.5, 6.8],
       [1.8, 3.2, 6. , 7.2],
       [2.2, 3.8, 6.7, 7.7],
       [0.1, 1.5, 4.1, 5.2],
       [2.4, 3.1, 5.6, 6.7],
       [1.3, 2.3, 4.4, 6.3],
       [1.8, 2.9, 6.3, 7.3],
       [2.1, 3.3, 5.7, 6.7],
       [2.3, 3.4, 5.4, 6.2],
       [0.2, 1

array([[0.2, 1.9, 3.4, 4.8],
       [1.2, 2.6, 4.4, 5.5],
       [0.2, 1.5, 3.1, 4.6],
       [1.8, 2.9, 5.6, 6.3],
       [0.4, 1.5, 3.7, 5.1],
       [1.4, 3.1, 4.4, 6.7],
       [2. , 2.5, 5. , 5.7],
       [0.2, 1.5, 3.7, 5.3],
       [0.1, 1.5, 3.1, 4.9],
       [0.2, 1.4, 3.5, 5.1],
       [1.4, 2.6, 5.6, 6.1],
       [1. , 2.7, 4.1, 5.8],
       [1.5, 3.2, 4.5, 6.4],
       [1.4, 3.2, 4.7, 7. ],
       [1.5, 2.8, 4.6, 6.5],
       [2.1, 3.1, 5.4, 6.9],
       [2. , 2.8, 4.9, 5.6],
       [1. , 2.4, 3.7, 5.5],
       [1.5, 3.1, 4.7, 6.7],
       [0.3, 1.5, 3.8, 5.1],
       [0.5, 1.7, 3.3, 5.1],
       [1.3, 2.8, 4.1, 5.7],
       [2.5, 3.3, 6. , 6.3],
       [2. , 3. , 5.2, 6.5],
       [0.3, 1.4, 3.4, 4.6],
       [1.3, 3. , 4.1, 5.6],
       [1.3, 2.8, 4.5, 5.7],
       [2.4, 3.4, 5.6, 6.3],
       [1.6, 3. , 5.8, 7.2],
       [0.2, 1.5, 3.4, 5.1],
       [0.2, 1.3, 3.5, 5.5],
       [1.9, 2.7, 5.1, 5.8]])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

sklearn: ((118, 4), (32, 4), (118,), (32,))


array([[1.8, 2.8, 4.8, 6.2],
       [0.6, 1.6, 3.5, 5. ],
       [1.3, 2.9, 4.6, 6.6],
       [2.3, 2.6, 6.9, 7.7],
       [0.3, 1.3, 3.5, 5. ],
       [0.2, 1.4, 2.9, 4.4],
       [1.8, 3. , 4.8, 6. ],
       [0.2, 1.3, 3. , 4.4],
       [1. , 2.2, 4. , 6. ],
       [2.3, 3.2, 5.9, 6.8],
       [1.6, 3.4, 4.5, 6. ],
       [1.5, 3. , 4.5, 5.4],
       [0.2, 1.2, 4. , 5.8],
       [2.3, 3.1, 5.1, 6.9],
       [1.3, 2.8, 4. , 6.1],
       [0.2, 1.2, 3.2, 5. ],
       [1.4, 2.8, 4.8, 6.8],
       [0.2, 1.4, 3.4, 5.2],
       [2.2, 3. , 5.8, 6.5],
       [0.4, 1.6, 3.4, 5. ],
       [1.4, 3. , 4.6, 6.1],
       [1.2, 3. , 4.2, 5.7],
       [1. , 2.3, 3.3, 5. ],
       [1.3, 2.9, 4.2, 5.7],
       [0.4, 1.5, 3.4, 5.4],
       [2.1, 3. , 5.5, 6.8],
       [1.8, 3.2, 6. , 7.2],
       [2.2, 3.8, 6.7, 7.7],
       [0.1, 1.5, 4.1, 5.2],
       [2.4, 3.1, 5.6, 6.7],
       [1.3, 2.3, 4.4, 6.3],
       [1.8, 2.9, 6.3, 7.3],
       [2.1, 3.3, 5.7, 6.7],
       [2.3, 3.4, 5.4, 6.2],
       [0.2, 1

array([[0.2, 1.9, 3.4, 4.8],
       [1.2, 2.6, 4.4, 5.5],
       [0.2, 1.5, 3.1, 4.6],
       [1.8, 2.9, 5.6, 6.3],
       [0.4, 1.5, 3.7, 5.1],
       [1.4, 3.1, 4.4, 6.7],
       [2. , 2.5, 5. , 5.7],
       [0.2, 1.5, 3.7, 5.3],
       [0.1, 1.5, 3.1, 4.9],
       [0.2, 1.4, 3.5, 5.1],
       [1.4, 2.6, 5.6, 6.1],
       [1. , 2.7, 4.1, 5.8],
       [1.5, 3.2, 4.5, 6.4],
       [1.4, 3.2, 4.7, 7. ],
       [1.5, 2.8, 4.6, 6.5],
       [2.1, 3.1, 5.4, 6.9],
       [2. , 2.8, 4.9, 5.6],
       [1. , 2.4, 3.7, 5.5],
       [1.5, 3.1, 4.7, 6.7],
       [0.3, 1.5, 3.8, 5.1],
       [0.5, 1.7, 3.3, 5.1],
       [1.3, 2.8, 4.1, 5.7],
       [2.5, 3.3, 6. , 6.3],
       [2. , 3. , 5.2, 6.5],
       [0.3, 1.4, 3.4, 4.6],
       [1.3, 3. , 4.1, 5.6],
       [1.3, 2.8, 4.5, 5.7],
       [2.4, 3.4, 5.6, 6.3],
       [1.6, 3. , 5.8, 7.2],
       [0.2, 1.5, 3.4, 5.1],
       [0.2, 1.3, 3.5, 5.5],
       [1.9, 2.7, 5.1, 5.8]])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [5]:
# Re-seeding the global generator with the same value replays the same draws,
# so both printed arrays are identical.
for _ in range(2):
    np.random.seed(0)
    print(np.random.normal(loc = 0, scale = 1, size = 10))

[ 1.76405235  0.40015721  0.97873798  2.2408932   1.86755799 -0.97727788
  0.95008842 -0.15135721 -0.10321885  0.4105985 ]
[ 1.76405235  0.40015721  0.97873798  2.2408932   1.86755799 -0.97727788
  0.95008842 -0.15135721 -0.10321885  0.4105985 ]


scikit-learnを使ったコードを作成していきます。

検証データの分割には問題1で作成した自作の関数を用いてください。クロスバリデーションではなくホールドアウト法で構いません。

分類は3種類の手法をスクラッチします。

- ロジスティック回帰
- SVM
- 決定木

データセットは3種類用意します。

1つ目は事前学習期間同様にirisデータセットです。

2値分類としたいため、以下の2つの目的変数のみ利用します。特徴量は4種類全て使います。

- versicolorとvirginica

残り2つは特徴量が2つのデータセットを人工的に用意します。以下のコードで説明変数X,目的変数yが作成可能です。「シンプルデータセット1」「シンプルデータセット2」とします。特徴量が2つであるため可視化が容易です。

In [6]:
# Simple dataset 1: two 2-D Gaussian clusters sharing one covariance matrix.
# The first cluster is labelled +1 and the second -1; rows are then shuffled.
np.random.seed(seed=0)
n_samples = 500
n_half = n_samples // 2
mean0 = [-1, 2]
mean1 = [2, -1]
cov = [[1.0,0.8], [0.8, 1.0]]
# Draw order matters for reproducibility: cluster 0, cluster 1, then the permutation.
f0 = np.random.multivariate_normal(mean0, cov, n_half)
f1 = np.random.multivariate_normal(mean1, cov, n_half)
X1 = np.concatenate((f0, f1))
# Builtin int replaces the np.int alias, which recent NumPy releases no longer provide.
y1 = np.concatenate((np.ones(n_half), np.ones(n_half) * (-1))).astype(int)
random_index = np.random.permutation(np.arange(n_samples))
X1 = X1[random_index]
y1 = y1[random_index]

In [7]:
# Simple dataset 2: 40 hand-written 2-D points, one row per sample.
X2 = np.array([
    [-0.44699, -2.8073],
    [-1.4621, -2.4586],
    [0.10645, 1.9242],
    [-3.5944, -4.0112],
    [-0.9888, 4.5718],
    [-3.1625, -3.9606],
    [0.56421, 0.72888],
    [-0.60216, 8.4636],
    [-0.61251, -0.75345],
    [-0.73535, -2.2718],
    [-0.80647, -2.2135],
    [0.86291, 2.3946],
    [-3.1108, 0.15394],
    [-2.9362, 2.5462],
    [-0.57242, -2.9915],
    [1.4771, 3.4896],
    [0.58619, 0.37158],
    [0.6017, 4.3439],
    [-2.1086, 8.3428],
    [-4.1013, -4.353],
    [-1.9948, -1.3927],
    [0.35084, -0.031994],
    [0.96765, 7.8929],
    [-1.281, 15.6824],
    [0.96765, 10.083],
    [1.3763, 1.3347],
    [-2.234, -2.5323],
    [-2.9452, -1.8219],
    [0.14654, -0.28733],
    [0.5461, 5.8245],
    [-0.65259, 9.3444],
    [0.59912, 5.3524],
    [0.50214, -0.31818],
    [-3.0603, -3.6461],
    [-6.6797, 0.67661],
    [-2.353, -0.72261],
    [1.1319, 2.4023],
    [-0.12243, 9.0162],
    [-2.5677, 13.1779],
    [0.057313, 5.4681],
])
# Labels: the first 20 rows are class 0, the remaining 20 rows are class 1.
y2 = np.repeat([0, 1], 20)

# 【問題2】
## 分類問題を解くコードの作成
上記3種類の手法で3種類のデータセットを学習・推定するコードを作成してください。

In [8]:
# Hold-out split of the three datasets with the scratch splitter (default sizes).
# For iris, rows 50 onward are kept so that only two target classes remain.
iris_binary_X = X.values[50:, :]
iris_binary_y = y.values.ravel()[50:]
X_train, X_test, y_train, y_test = scratch_train_test_split(iris_binary_X, iris_binary_y)
X1_train, X1_test, y1_train, y1_test = scratch_train_test_split(X1, y1)
X2_train, X2_test, y2_train, y2_test = scratch_train_test_split(X2, y2)

# One line of shapes per dataset: train X, test X, train y, test y
for parts in (
    (X_train, X_test, y_train, y_test),
    (X1_train, X1_test, y1_train, y1_test),
    (X2_train, X2_test, y2_train, y2_test),
):
    print(*(part.shape for part in parts))

(75, 4) (25, 4) (75,) (25,)
(375, 2) (125, 2) (375,) (125,)
(30, 2) (10, 2) (30,) (10,)


In [9]:
# Logistic regression trained with SGD; the same estimator object is refit
# on each dataset in turn.
# NOTE(review): newer scikit-learn releases appear to have renamed loss="log"
# to "log_loss" — confirm against the installed version.
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDClassifier
logistic = SGDClassifier(loss="log")

# iris
y_pred = logistic.fit(X_train, y_train).predict(X_test)
print(f'iris/true: {y_test}')
print(f'iris/pred: {y_pred}')
print(confusion_matrix(y_test, y_pred))

# simple dataset 1
y1_pred = logistic.fit(X1_train, y1_train).predict(X1_test)
print(f'simple dataset 1/true: {y1_test}')
print(f'simple dataset 1/pred: {y1_pred}')
print(confusion_matrix(y1_test, y1_pred))

# simple dataset 2
y2_pred = logistic.fit(X2_train, y2_train).predict(X2_test)
print(f'simple dataset 2/true: {y2_test}')
print(f'simple dataset 2/pred: {y2_pred}')
print(confusion_matrix(y2_test, y2_pred))

iris/true: [2 2 2 2 2 1 2 1 1 1 1 2 1 1 2 2 2 1 2 1 1 1 2 1 1]
iris/pred: [2 2 2 1 2 1 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 2 1 1]
[[13  0]
 [ 4  8]]
simple dataset 1/true: [ 1 -1 -1 -1 -1  1  1  1 -1  1  1 -1  1  1  1  1 -1 -1  1 -1  1 -1  1 -1
 -1  1  1 -1 -1  1  1  1 -1  1  1 -1 -1  1  1 -1  1 -1  1  1 -1  1 -1 -1
  1 -1 -1  1  1 -1 -1  1  1  1 -1  1 -1 -1 -1  1 -1  1 -1  1 -1  1 -1 -1
  1  1 -1  1  1  1  1  1 -1 -1 -1 -1  1 -1  1  1 -1 -1 -1 -1 -1  1  1  1
 -1 -1  1 -1 -1  1 -1  1  1  1  1 -1 -1 -1  1  1  1 -1 -1 -1 -1  1 -1 -1
  1 -1  1  1 -1]
simple dataset 1/pred: [ 1 -1 -1 -1 -1  1  1  1 -1  1  1 -1  1  1  1  1 -1 -1  1 -1  1 -1  1 -1
 -1  1  1 -1 -1  1  1  1 -1  1  1 -1 -1  1  1 -1  1 -1  1  1 -1  1 -1 -1
  1 -1 -1  1  1 -1 -1  1  1  1 -1  1 -1 -1 -1  1 -1  1 -1  1 -1  1 -1 -1
  1  1 -1  1  1  1  1  1 -1 -1 -1 -1  1 -1  1  1 -1 -1 -1 -1 -1  1  1  1
 -1 -1  1 -1 -1  1 -1  1  1  1  1 -1 -1 -1  1  1  1 -1 -1 -1 -1  1 -1 -1
  1 -1  1  1 -1]
[[62  0]
 [ 0 63]]
simple dataset 2/true: [1 

In [10]:
# Support vector classifier with default settings; the same estimator object
# is refit on each dataset in turn.
from sklearn.svm import SVC
svc = SVC()

# iris
y_pred = svc.fit(X_train, y_train).predict(X_test)
print(f'iris/true: {y_test}')
print(f'iris/pred: {y_pred}')
print(confusion_matrix(y_test, y_pred))

# simple dataset 1
y1_pred = svc.fit(X1_train, y1_train).predict(X1_test)
print(f'simple dataset 1/true: {y1_test}')
print(f'simple dataset 1/pred: {y1_pred}')
print(confusion_matrix(y1_test, y1_pred))

# simple dataset 2
y2_pred = svc.fit(X2_train, y2_train).predict(X2_test)
print(f'simple dataset 2/true: {y2_test}')
print(f'simple dataset 2/pred: {y2_pred}')
print(confusion_matrix(y2_test, y2_pred))

iris/true: [2 2 2 2 2 1 2 1 1 1 1 2 1 1 2 2 2 1 2 1 1 1 2 1 1]
iris/pred: [2 2 2 1 2 1 1 1 1 1 1 1 1 1 2 2 1 1 2 1 1 1 2 1 1]
[[13  0]
 [ 4  8]]
simple dataset 1/true: [ 1 -1 -1 -1 -1  1  1  1 -1  1  1 -1  1  1  1  1 -1 -1  1 -1  1 -1  1 -1
 -1  1  1 -1 -1  1  1  1 -1  1  1 -1 -1  1  1 -1  1 -1  1  1 -1  1 -1 -1
  1 -1 -1  1  1 -1 -1  1  1  1 -1  1 -1 -1 -1  1 -1  1 -1  1 -1  1 -1 -1
  1  1 -1  1  1  1  1  1 -1 -1 -1 -1  1 -1  1  1 -1 -1 -1 -1 -1  1  1  1
 -1 -1  1 -1 -1  1 -1  1  1  1  1 -1 -1 -1  1  1  1 -1 -1 -1 -1  1 -1 -1
  1 -1  1  1 -1]
simple dataset 1/pred: [ 1 -1 -1 -1 -1  1  1  1 -1  1  1 -1  1  1  1  1 -1 -1  1 -1  1 -1  1 -1
 -1  1  1 -1 -1  1  1  1 -1  1  1 -1 -1  1  1 -1  1 -1  1  1 -1  1 -1 -1
  1 -1 -1  1  1 -1 -1  1  1  1 -1  1 -1 -1 -1  1 -1  1 -1  1 -1  1 -1 -1
  1  1 -1  1  1  1  1  1 -1 -1 -1 -1  1 -1  1  1 -1 -1 -1 -1 -1  1  1  1
 -1 -1  1 -1 -1  1 -1  1  1  1  1 -1 -1 -1  1  1  1 -1 -1 -1 -1  1 -1 -1
  1 -1  1  1 -1]
[[62  0]
 [ 0 63]]
simple dataset 2/true: [1 

In [11]:
# Decision tree with default settings; the same estimator object is refit
# on each dataset in turn.
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()

# iris
y_pred = dtree.fit(X_train, y_train).predict(X_test)
print(f'iris/true: {y_test}')
print(f'iris/pred: {y_pred}')
print(confusion_matrix(y_test, y_pred))

# simple dataset 1
y1_pred = dtree.fit(X1_train, y1_train).predict(X1_test)
print(f'simple dataset 1/true: {y1_test}')
print(f'simple dataset 1/pred: {y1_pred}')
print(confusion_matrix(y1_test, y1_pred))

# simple dataset 2
y2_pred = dtree.fit(X2_train, y2_train).predict(X2_test)
print(f'simple dataset 2/true: {y2_test}')
print(f'simple dataset 2/pred: {y2_pred}')
print(confusion_matrix(y2_test, y2_pred))

iris/true: [2 2 2 2 2 1 2 1 1 1 1 2 1 1 2 2 2 1 2 1 1 1 2 1 1]
iris/pred: [1 2 2 1 2 1 1 1 1 1 1 2 1 1 2 2 1 1 2 1 1 1 2 1 1]
[[13  0]
 [ 4  8]]
simple dataset 1/true: [ 1 -1 -1 -1 -1  1  1  1 -1  1  1 -1  1  1  1  1 -1 -1  1 -1  1 -1  1 -1
 -1  1  1 -1 -1  1  1  1 -1  1  1 -1 -1  1  1 -1  1 -1  1  1 -1  1 -1 -1
  1 -1 -1  1  1 -1 -1  1  1  1 -1  1 -1 -1 -1  1 -1  1 -1  1 -1  1 -1 -1
  1  1 -1  1  1  1  1  1 -1 -1 -1 -1  1 -1  1  1 -1 -1 -1 -1 -1  1  1  1
 -1 -1  1 -1 -1  1 -1  1  1  1  1 -1 -1 -1  1  1  1 -1 -1 -1 -1  1 -1 -1
  1 -1  1  1 -1]
simple dataset 1/pred: [ 1 -1 -1 -1 -1  1  1  1 -1  1  1 -1  1  1  1  1 -1 -1  1 -1  1 -1  1 -1
 -1  1  1 -1 -1  1  1  1 -1  1  1 -1 -1  1  1 -1  1 -1  1  1 -1  1 -1 -1
  1 -1 -1  1  1 -1 -1  1  1  1 -1  1 -1 -1 -1  1 -1  1 -1  1 -1  1 -1 -1
  1  1 -1  1  1  1  1  1 -1 -1 -1 -1  1 -1  1  1 -1 -1 -1 -1 -1  1  1  1
 -1 -1  1 -1 -1  1 -1  1  1  1  1 -1 -1 -1  1  1  1 -1 -1 -1 -1  1 -1 -1
  1 -1  1  1 -1]
[[62  0]
 [ 0 63]]
simple dataset 2/true: [1 

回帰は1種類をスクラッチします。

- 線形回帰

線形回帰は勾配降下法を用いて計算するSGDRegressorクラスを利用してください。

データセットは事前学習期間同様にHouse Pricesコンペティションのものを使います。

train.csvをダウンロードし、目的変数としてSalePrice、説明変数として、GrLivAreaとYearBuiltを使います。

# 【問題3】
## 回帰問題を解くコードの作成
線形回帰でHouse Pricesデータセットを学習・推定するコードを作成してください。

In [12]:
# Read the training CSV and select the two explanatory columns and the target column
train_df = pd.read_csv('train.csv')

feature_columns = ['GrLivArea', 'YearBuilt']
X_df = train_df[feature_columns]
y_df = train_df['SalePrice']

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

scaler = StandardScaler()

# Hold-out split with the scratch splitter (default sizes)
X_train, X_valid, y_train, y_valid = scratch_train_test_split(X_df.values, y_df.values)

# Standardise both parts with statistics learned from the training rows only
X_train_scale = scaler.fit_transform(X_train)
X_valid_scale = scaler.transform(X_valid)

# Fit on the training part and report its scores
sgd = SGDRegressor()
y_train_pred = sgd.fit(X_train_scale, y_train).predict(X_train_scale)
print(f'R2/train: {sgd.score(X_train_scale, y_train)}')
print(f'RMSE/train: {np.sqrt(mean_squared_error(y_train, y_train_pred))}')

# Predict on the validation part and report its scores
y_valid_pred = sgd.predict(X_valid_scale)
print(f'R2/valid: {sgd.score(X_valid_scale, y_valid)}')
print(f'RMSE/valid: {np.sqrt(mean_squared_error(y_valid, y_valid_pred))}')

R2/train: 0.6383140947789696
RMSE/train: 48185.10572778466
R2/valid: 0.7014270953652211
RMSE/valid: 42193.66937229907


In [14]:
# Predict on the competition test file and write the submission CSV
test_df = pd.read_csv('test.csv')
ids = test_df['Id'].values
X_test = test_df[['GrLivArea', 'YearBuilt']].values

# Reuse the scaler and the regressor fitted on the training data
X_test_scale = scaler.transform(X_test)
y_test_pred = sgd.predict(X_test_scale)

# Put ids and predictions side by side as two columns; the combined array is
# cast back to an integer Id column afterwards.
submit_arr = np.column_stack((ids, y_test_pred))
submit_df = pd.DataFrame(submit_arr, columns=['Id', 'SalePrice'])
submit_df['Id'] = submit_df['Id'].astype(np.int32)
display(submit_df)
submit_df.info()
submit_df.to_csv('submit.csv', index=False)

Unnamed: 0,Id,SalePrice
0,1461,112538.849678
1,1462,148945.286732
2,1463,218595.067468
3,1464,217387.242330
4,1465,181228.819372
...,...,...
1454,2915,140220.453737
1455,2916,140220.453737
1456,2917,141493.147935
1457,2918,152841.338624


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         1459 non-null   int32  
 1   SalePrice  1459 non-null   float64
dtypes: float64(1), int32(1)
memory usage: 17.2 KB
