# 特徴量のスケーリング

## ライブラリーのインポート

In [94]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

## データセットの読み込み

In [80]:
# Load the Iris dataset bundled with scikit-learn.
dataset = datasets.load_iris()
# Print the feature names, then the distinct class labels found in the target.
print('特徴量:', dataset.feature_names)
print('クラス:', np.unique(dataset.target))

特徴量: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
クラス: [0 1 2]


## データセットの確認

In [81]:
# Remove the column cap so DataFrame output shows every column.
pd.set_option('display.max_columns', None)

# Build a table from the feature matrix (one named column per feature) and
# append the class labels as a trailing 'target' column.
df = pd.DataFrame(
    dataset.data, columns=dataset.feature_names
).assign(target=dataset.target)

# Preview the first rows; kept as the last expression so the notebook shows it.
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## 説明変数と目的変数に分割

In [82]:
# Explanatory variables: every column except the trailing 'target' column.
X = df.drop(columns='target').to_numpy()
# Objective variable: the 'target' column holding the class labels.
y = df['target'].to_numpy()

## 訓練データとテストデータに分割

In [83]:
# Hold out 30% of the samples as test data. random_state=0 fixes the seed
# passed to the splitter, and stratify=y requests a split stratified on the
# class labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)

In [84]:
# Show the unscaled training features.
print(X_train)

[[6.7 3.3 5.7 2.5]
 [7.7 3.8 6.7 2.2]
 [6.4 3.2 5.3 2.3]
 [4.6 3.6 1.  0.2]
 [5.2 3.4 1.4 0.2]
 [4.4 3.  1.3 0.2]
 [4.9 2.4 3.3 1. ]
 [6.4 2.7 5.3 1.9]
 [6.1 3.  4.6 1.4]
 [6.8 3.  5.5 2.1]
 [4.8 3.  1.4 0.3]
 [6.3 3.3 4.7 1.6]
 [5.1 2.5 3.  1.1]
 [6.6 2.9 4.6 1.3]
 [4.9 3.  1.4 0.2]
 [5.  3.4 1.5 0.2]
 [7.7 3.  6.1 2.3]
 [5.6 2.5 3.9 1.1]
 [5.5 2.4 3.8 1.1]
 [6.  2.2 5.  1.5]
 [7.9 3.8 6.4 2. ]
 [5.7 2.9 4.2 1.3]
 [5.  3.6 1.4 0.2]
 [5.4 3.4 1.5 0.4]
 [5.  2.3 3.3 1. ]
 [5.8 2.7 3.9 1.2]
 [5.4 3.9 1.7 0.4]
 [6.1 2.9 4.7 1.4]
 [6.4 3.1 5.5 1.8]
 [6.8 3.2 5.9 2.3]
 [6.7 3.1 5.6 2.4]
 [6.  2.7 5.1 1.6]
 [6.2 2.8 4.8 1.8]
 [6.4 2.8 5.6 2.2]
 [4.4 2.9 1.4 0.2]
 [5.4 3.7 1.5 0.2]
 [4.3 3.  1.1 0.1]
 [6.1 2.8 4.7 1.2]
 [5.5 4.2 1.4 0.2]
 [5.7 4.4 1.5 0.4]
 [7.4 2.8 6.1 1.9]
 [6.  2.2 4.  1. ]
 [6.  3.  4.8 1.8]
 [5.4 3.9 1.3 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 4.1 1.5 0.1]
 [6.9 3.1 4.9 1.5]
 [6.6 3.  4.4 1.4]
 [5.  3.4 1.6 0.4]
 [5.7 3.  4.2 1.2]
 [5.7 2.6 3.5 1. ]
 [5.  2.  3.5 1. ]
 [6.7 2.5 5.

In [85]:
# Show the unscaled test features.
print(X_test)

[[6.3 3.4 5.6 2.4]
 [5.8 2.7 5.1 1.9]
 [5.1 3.4 1.5 0.2]
 [5.1 3.8 1.9 0.4]
 [7.  3.2 4.7 1.4]
 [5.1 3.3 1.7 0.5]
 [5.5 2.6 4.4 1.2]
 [5.9 3.  5.1 1.8]
 [5.1 3.8 1.6 0.2]
 [5.7 2.8 4.5 1.3]
 [5.1 3.7 1.5 0.4]
 [6.5 3.  5.2 2. ]
 [4.6 3.2 1.4 0.2]
 [6.3 2.7 4.9 1.8]
 [5.8 2.7 4.1 1. ]
 [6.1 2.6 5.6 1.4]
 [6.2 2.2 4.5 1.5]
 [6.7 3.1 4.4 1.4]
 [6.4 3.2 4.5 1.5]
 [5.1 3.5 1.4 0.2]
 [6.5 2.8 4.6 1.5]
 [6.3 2.9 5.6 1.8]
 [4.8 3.4 1.9 0.2]
 [5.5 2.4 3.7 1. ]
 [7.2 3.  5.8 1.6]
 [7.1 3.  5.9 2.1]
 [5.7 2.5 5.  2. ]
 [6.3 3.3 6.  2.5]
 [5.5 2.5 4.  1.3]
 [6.5 3.2 5.1 2. ]
 [5.7 2.8 4.1 1.3]
 [5.1 3.8 1.5 0.3]
 [4.6 3.4 1.4 0.3]
 [5.6 3.  4.1 1.3]
 [6.7 3.1 4.7 1.5]
 [5.6 2.8 4.9 2. ]
 [6.  3.4 4.5 1.6]
 [4.9 3.6 1.4 0.1]
 [4.9 3.1 1.5 0.1]
 [5.9 3.  4.2 1.5]
 [5.5 3.5 1.3 0.2]
 [7.2 3.6 6.1 2.5]
 [4.6 3.1 1.5 0.2]
 [4.8 3.  1.4 0.1]
 [6.9 3.1 5.4 2.1]]


## 標準化を行う場合

In [86]:
# Create the standardization scaler and fit it on the training split only,
# so no information from the test split enters the scaling parameters.
scaler = StandardScaler().fit(X_train)
# Apply the fitted scaler to the training data.
X_train_std = scaler.transform(X_train)
# Apply the same training-fitted scaler to the test data.
X_test_std = scaler.transform(X_test)

In [87]:
# Show the training features after StandardScaler.
print(X_train_std)

[[ 9.75431527e-01  5.69896463e-01  1.08080020e+00  1.70475496e+00]
 [ 2.14194987e+00  1.67393943e+00  1.63709442e+00  1.31106155e+00]
 [ 6.25476025e-01  3.49087870e-01  8.58282512e-01  1.44229269e+00]
 [-1.47425699e+00  1.23232224e+00 -1.53378264e+00 -1.31356119e+00]
 [-7.74345984e-01  7.90705055e-01 -1.31126495e+00 -1.31356119e+00]
 [-1.70756066e+00 -9.25293150e-02 -1.36689437e+00 -1.31356119e+00]
 [-1.12430149e+00 -1.41738087e+00 -2.54305930e-01 -2.63712094e-01]
 [ 6.25476025e-01 -7.54955092e-01  8.58282512e-01  9.17368138e-01]
 [ 2.75520522e-01 -9.25293150e-02  4.68876558e-01  2.61212453e-01]
 [ 1.09208336e+00 -9.25293150e-02  9.69541356e-01  1.17983041e+00]
 [-1.24095332e+00 -9.25293150e-02 -1.31126495e+00 -1.18233005e+00]
 [ 5.08824191e-01  5.69896463e-01  5.24505980e-01  5.23674727e-01]
 [-8.90997819e-01 -1.19657228e+00 -4.21194196e-01 -1.32480957e-01]
 [ 8.58779693e-01 -3.13337907e-01  4.68876558e-01  1.29981317e-01]
 [-1.12430149e+00 -9.25293150e-02 -1.31126495e+00 -1.31356119e

In [88]:
# Show the test features after StandardScaler (fitted on the training split).
print(X_test_std)

[[ 5.08824191e-01  7.90705055e-01  1.02517078e+00  1.57352382e+00]
 [-7.44349799e-02 -7.54955092e-01  7.47023668e-01  9.17368138e-01]
 [-8.90997819e-01  7.90705055e-01 -1.25563553e+00 -1.31356119e+00]
 [-8.90997819e-01  1.67393943e+00 -1.03311784e+00 -1.05109892e+00]
 [ 1.32538703e+00  3.49087870e-01  5.24505980e-01  2.61212453e-01]
 [-8.90997819e-01  5.69896463e-01 -1.14437668e+00 -9.19867779e-01]
 [-4.24390482e-01 -9.75763685e-01  3.57617713e-01 -1.24982035e-03]
 [ 4.22168542e-02 -9.25293150e-02  7.47023668e-01  7.86137001e-01]
 [-8.90997819e-01  1.67393943e+00 -1.20000611e+00 -1.31356119e+00]
 [-1.91086814e-01 -5.34146500e-01  4.13247136e-01  1.29981317e-01]
 [-8.90997819e-01  1.45313083e+00 -1.25563553e+00 -1.05109892e+00]
 [ 7.42127859e-01 -9.25293150e-02  8.02653090e-01  1.04859927e+00]
 [-1.47425699e+00  3.49087870e-01 -1.31126495e+00 -1.31356119e+00]
 [ 5.08824191e-01 -7.54955092e-01  6.35764824e-01  7.86137001e-01]
 [-7.44349799e-02 -7.54955092e-01  1.90729447e-01 -2.63712094e

## 正規化を行う場合

In [89]:
# Create the min-max scaler and fit it on the training split only,
# so no information from the test split enters the scaling parameters.
scaler = MinMaxScaler().fit(X_train)
# Apply the fitted scaler to the training data.
X_train_norm = scaler.transform(X_train)
# Apply the same training-fitted scaler to the test data.
X_test_norm = scaler.transform(X_test)

In [90]:
# Show the training features after MinMaxScaler.
print(X_train_norm)

[[0.66666667 0.54166667 0.79661017 1.        ]
 [0.94444444 0.75       0.96610169 0.875     ]
 [0.58333333 0.5        0.72881356 0.91666667]
 [0.08333333 0.66666667 0.         0.04166667]
 [0.25       0.58333333 0.06779661 0.04166667]
 [0.02777778 0.41666667 0.05084746 0.04166667]
 [0.16666667 0.16666667 0.38983051 0.375     ]
 [0.58333333 0.29166667 0.72881356 0.75      ]
 [0.5        0.41666667 0.61016949 0.54166667]
 [0.69444444 0.41666667 0.76271186 0.83333333]
 [0.13888889 0.41666667 0.06779661 0.08333333]
 [0.55555556 0.54166667 0.62711864 0.625     ]
 [0.22222222 0.20833333 0.33898305 0.41666667]
 [0.63888889 0.375      0.61016949 0.5       ]
 [0.16666667 0.41666667 0.06779661 0.04166667]
 [0.19444444 0.58333333 0.08474576 0.04166667]
 [0.94444444 0.41666667 0.86440678 0.91666667]
 [0.36111111 0.20833333 0.49152542 0.41666667]
 [0.33333333 0.16666667 0.47457627 0.41666667]
 [0.47222222 0.08333333 0.6779661  0.58333333]
 [1.         0.75       0.91525424 0.79166667]
 [0.38888889 

In [91]:
# Show the test features after MinMaxScaler (fitted on the training split).
print(X_test_norm)

[[0.55555556 0.58333333 0.77966102 0.95833333]
 [0.41666667 0.29166667 0.69491525 0.75      ]
 [0.22222222 0.58333333 0.08474576 0.04166667]
 [0.22222222 0.75       0.15254237 0.125     ]
 [0.75       0.5        0.62711864 0.54166667]
 [0.22222222 0.54166667 0.11864407 0.16666667]
 [0.33333333 0.25       0.57627119 0.45833333]
 [0.44444444 0.41666667 0.69491525 0.70833333]
 [0.22222222 0.75       0.10169492 0.04166667]
 [0.38888889 0.33333333 0.59322034 0.5       ]
 [0.22222222 0.70833333 0.08474576 0.125     ]
 [0.61111111 0.41666667 0.71186441 0.79166667]
 [0.08333333 0.5        0.06779661 0.04166667]
 [0.55555556 0.29166667 0.66101695 0.70833333]
 [0.41666667 0.29166667 0.52542373 0.375     ]
 [0.5        0.25       0.77966102 0.54166667]
 [0.52777778 0.08333333 0.59322034 0.58333333]
 [0.66666667 0.45833333 0.57627119 0.54166667]
 [0.58333333 0.5        0.59322034 0.58333333]
 [0.22222222 0.625      0.06779661 0.04166667]
 [0.61111111 0.33333333 0.61016949 0.58333333]
 [0.55555556 

## RobustScalerを使う場合(外れ値が多い小さなデータセットや、過学習しやすいデータセット向け)

In [95]:
# Create the robust scaler and fit it on the training split only,
# so no information from the test split enters the scaling parameters.
scaler = RobustScaler().fit(X_train)
# Apply the fitted scaler to the training data.
X_train_rs = scaler.transform(X_train)
# Apply the same training-fitted scaler to the test data.
X_test_rs = scaler.transform(X_test)

In [96]:
# Show the training features after RobustScaler.
print(X_train_rs)

[[ 0.69230769  0.6         0.4         0.8       ]
 [ 1.46153846  1.6         0.68571429  0.6       ]
 [ 0.46153846  0.4         0.28571429  0.66666667]
 [-0.92307692  1.2        -0.94285714 -0.73333333]
 [-0.46153846  0.8        -0.82857143 -0.73333333]
 [-1.07692308  0.         -0.85714286 -0.73333333]
 [-0.69230769 -1.2        -0.28571429 -0.2       ]
 [ 0.46153846 -0.6         0.28571429  0.4       ]
 [ 0.23076923  0.          0.08571429  0.06666667]
 [ 0.76923077  0.          0.34285714  0.53333333]
 [-0.76923077  0.         -0.82857143 -0.66666667]
 [ 0.38461538  0.6         0.11428571  0.2       ]
 [-0.53846154 -1.         -0.37142857 -0.13333333]
 [ 0.61538462 -0.2         0.08571429  0.        ]
 [-0.69230769  0.         -0.82857143 -0.73333333]
 [-0.61538462  0.8        -0.8        -0.73333333]
 [ 1.46153846  0.          0.51428571  0.66666667]
 [-0.15384615 -1.         -0.11428571 -0.13333333]
 [-0.23076923 -1.2        -0.14285714 -0.13333333]
 [ 0.15384615 -1.6         0.2 

In [97]:
# Show the test features after RobustScaler (fitted on the training split).
print(X_test_rs)

[[ 0.38461538  0.8         0.37142857  0.73333333]
 [ 0.         -0.6         0.22857143  0.4       ]
 [-0.53846154  0.8        -0.8        -0.73333333]
 [-0.53846154  1.6        -0.68571429 -0.6       ]
 [ 0.92307692  0.4         0.11428571  0.06666667]
 [-0.53846154  0.6        -0.74285714 -0.53333333]
 [-0.23076923 -0.8         0.02857143 -0.06666667]
 [ 0.07692308  0.          0.22857143  0.33333333]
 [-0.53846154  1.6        -0.77142857 -0.73333333]
 [-0.07692308 -0.4         0.05714286  0.        ]
 [-0.53846154  1.4        -0.8        -0.6       ]
 [ 0.53846154  0.          0.25714286  0.46666667]
 [-0.92307692  0.4        -0.82857143 -0.73333333]
 [ 0.38461538 -0.6         0.17142857  0.33333333]
 [ 0.         -0.6        -0.05714286 -0.2       ]
 [ 0.23076923 -0.8         0.37142857  0.06666667]
 [ 0.30769231 -1.6         0.05714286  0.13333333]
 [ 0.69230769  0.2         0.02857143  0.06666667]
 [ 0.46153846  0.4         0.05714286  0.13333333]
 [-0.53846154  1.         -0.82