# 最大最小值归一化 

In [33]:
import numpy as np
from sklearn import datasets

In [2]:
# Load the iris dataset
iris = datasets.load_iris()

In [3]:
# Work on a copy of the feature matrix: the cells below overwrite columns of
# X in place, and without the copy those writes would also change iris.data.
X = iris.data.copy()
y = iris.target

In [4]:
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [5]:
np.max(X[:,0])

7.9

In [6]:
np.min(X[:,0])

4.3

In [7]:
# Min-max scale the first feature column into [0, 1], overwriting it in place.
col0_min = np.min(X[:, 0])
col0_max = np.max(X[:, 0])
X[:, 0] = (X[:, 0] - col0_min) / (col0_max - col0_min)

In [8]:
X[:5,0]

array([0.22222222, 0.16666667, 0.11111111, 0.08333333, 0.19444444])

In [9]:
# Min-max scale the remaining feature columns (index 1 onward) in one
# vectorized step instead of repeating the formula once per column.
# axis=0 reduces over the rows, so every column gets its own min and max,
# and broadcasting applies them column-wise.
other_cols = X[:, 1:]
col_min = other_cols.min(axis=0)
col_max = other_cols.max(axis=0)
X[:, 1:] = (other_cols - col_min) / (col_max - col_min)

In [10]:
X[:5]

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667]])

# 零均值归一化

In [4]:
# Reload the dataset and take a copy of the feature matrix: the cells below
# overwrite columns of X in place, and without the copy those writes would
# also change iris.data.
iris = datasets.load_iris()
X = iris.data.copy()
y = iris.target

In [5]:
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [6]:
np.mean(X[:,0])

5.843333333333334

In [7]:
np.std(X[:,0])

0.8253012917851409

In [8]:
# Standardize the first feature column in place: subtract its mean, then
# divide by its standard deviation.
col0_mean = np.mean(X[:, 0])
col0_std = np.std(X[:, 0])
X[:, 0] = (X[:, 0] - col0_mean) / col0_std

In [9]:
X[:5,0]

array([-0.90068117, -1.14301691, -1.38535265, -1.50652052, -1.02184904])

In [10]:
np.mean(X[:,0])

-4.736951571734001e-16

In [11]:
np.std(X[:,0])

1.0

In [12]:
# Standardize the remaining feature columns with the same formula used for
# column 0: subtract the column mean, then divide by the column standard
# deviation. Looping replaces three copy-pasted statements and covers
# however many columns X has.
for col in range(1, X.shape[1]):
    X[:, col] = (X[:, col] - np.mean(X[:, col])) / np.std(X[:, col])

In [13]:
X[:5]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ]])

# scikit-learn 中的 StandardScaler

In [14]:
# Reload the dataset so this section starts from the raw, unscaled feature values.
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [15]:
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
standard_scaler = StandardScaler()

In [18]:
# Compute the per-feature statistics from X; they are inspected below via mean_ and scale_.
standard_scaler.fit(X)

In [19]:
standard_scaler.mean_

array([5.84333333, 3.05733333, 3.758     , 1.19933333])

In [20]:
standard_scaler.scale_

array([0.82530129, 0.43441097, 1.75940407, 0.75969263])

In [21]:
# Apply the fitted scaling. Note that X is rebound to the standardized array,
# so the raw values are no longer reachable through the name X.
X = standard_scaler.transform(X)

In [22]:
X[:5]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ]])

In [23]:
np.std(X[:,0])

1.0

# 使用归一化

In [24]:
from sklearn.model_selection import train_test_split

# Use 80% of the samples for training; the fixed random_state keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    train_size=0.8,
    random_state=666,
)

In [25]:
from sklearn.preprocessing import StandardScaler

In [26]:
standard_scaler = StandardScaler()

In [27]:
# Fit on the training split only, so the statistics come from X_train alone.
standard_scaler.fit(X_train)

In [28]:
standard_scaler.mean_

array([5.83416667, 3.08666667, 3.70833333, 1.17      ])

In [29]:
# Standard deviation of each feature in the original (unscaled) training data
standard_scaler.scale_

array([0.81019502, 0.44327067, 1.76401924, 0.75317107])

In [36]:
# fit() above only computed the statistics; transform() is what actually rescales the data.
X_train_standard = standard_scaler.transform(X_train)

In [37]:
# The test set has to be scaled as well before it can be used for evaluation;
# the scaler fitted on X_train is reused here.
X_test_standard = standard_scaler.transform(X_test)

In [38]:
# After scaling, check the standard deviation of the first transformed training feature.
X_train_standard[:, 0].std()

1.0000000000000002

In [38]:
from sklearn.neighbors import KNeighborsClassifier

In [39]:
knn_classifier = KNeighborsClassifier(n_neighbors=5)

In [40]:
# Train the classifier on the standardized training features.
knn_classifier.fit(X_train_standard,y_train)

In [41]:
# Evaluate on the test features, which were scaled with the same scaler as the training data.
knn_classifier.score(X_test_standard, y_test)

1.0