In [4]:
from sklearn import preprocessing
import numpy as np

## 标准化

- preprocessing.StandardScaler
  - 可以处理 scipy.sparse 稀疏矩阵，但需要设置 with_mean=False（中心化会破坏稀疏结构）
- preprocessing.MinMaxScaler
- preprocessing.MaxAbsScaler
  - 处理稀疏矩阵
- preprocessing.RobustScaler
  - 对离群值（outlier）稳健：使用中位数和四分位距（IQR）进行缩放

In [5]:
# Toy 3x3 training matrix: one sample per row, one feature per column.
X_train = np.array(
    [
        [1.0, -1.0, 2.0],
        [2.0, 0.0, 0.0],
        [0.0, 1.0, -1.0],
    ]
)

In [10]:
# Standardization (z-score scaling): subtract each feature's mean and divide
# by its standard deviation, so every column ends up with zero mean and unit
# variance.
# Build a scaler and learn the per-feature statistics from X_train.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
print(f'scaler.mean_: {scaler.mean_}')  # per-feature mean of the original data
# scale_ is the per-feature standard deviation (the divisor used by
# transform), not the variance; the variance is exposed as scaler.var_.
print(f'scaler.scale_: {scaler.scale_}')
X_scaled = scaler.transform(X_train)
X_scaled

scaler.mean_: [1.         0.         0.33333333]
scaler.scale_: [0.81649658 0.81649658 1.24721913]


array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [12]:
# Sanity check on the scaled data: every column should now have mean 0 and
# standard deviation 1 (hence unit variance as well).
np.mean(X_scaled, axis=0), np.std(X_scaled, axis=0)

(array([0., 0., 0.]), array([1., 1., 1.]))

In [2]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Generate a synthetic classification dataset (seeded for reproducibility).
X, y = make_classification(random_state=42)

# Hold out part of the data for evaluation, using the same fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# Chain the two steps into one estimator: standardize, then logistic regression.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(),
)

# Fitting learns the scaling statistics and the classifier from the training
# split only.
pipe.fit(X_train, y_train)

# Scoring reuses the statistics learned during fit to scale the test split,
# so nothing from the test data leaks into preprocessing.
pipe.score(X_test, y_test)


0.96

## Normalization

In [9]:
# Rescale each row of X to unit Euclidean (L2) length with the function-style API.
X = [
    [1.0, -1.0, 2.0],
    [2.0, 0.0, 0.0],
    [0.0, 1.0, -1.0],
]
X_normalized = preprocessing.normalize(X, norm='l2')
X_normalized

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [None]:
# Normalizer is the estimator-style counterpart of preprocessing.normalize:
# it rescales each sample (row) independently to unit norm. This is
# normalization, not regularization.
# NOTE(review): per the scikit-learn docs the estimator is stateless, so fit()
# learns nothing from X and exists for Pipeline/API compatibility.
normalizer = preprocessing.Normalizer().fit(X)

# Normalize the data the normalizer was fitted on. The result is kept in a
# name because a notebook cell only displays its last expression; a bare
# transform call in the middle of the cell would be silently discarded.
X_unit = normalizer.transform(X)

# Normalize a previously unseen sample with the same transformer.
unseen_unit = normalizer.transform([[-1.,  1., 0.]])

# Show both results.
X_unit, unseen_unit

## Encoding categorical features

OrdinalEncoder will also passthrough missing values that are indicated by np.nan.

In [11]:
# Encode each categorical column as integer codes, one code per distinct value.
X = [
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
]
# fit() returns the encoder itself, so construction and fitting can be chained.
enc = preprocessing.OrdinalEncoder().fit(X)
enc.transform([['female', 'from US', 'uses Safari']])

array([[0., 1., 1.]])

One-hot encoding (OneHotEncoder)

In [17]:
# One-hot encode three categorical columns; the categories are inferred from
# the values seen during fit.
X = [
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
    ['male', 'from Europe', 'uses Edge'],
]
# fit() returns the encoder itself, so construction and fitting can be chained.
enc = preprocessing.OneHotEncoder().fit(X)

samples = [
    ['female', 'from US', 'uses Safari'],
    ['male', 'from Europe', 'uses Safari'],
]
# Convert the encoded result to a dense array so the indicator columns are
# shown in full.
enc.transform(samples).toarray()

array([[1., 0., 0., 1., 0., 0., 1.],
       [0., 1., 1., 0., 0., 0., 1.]])

In [18]:
# Categories learned for each input column during fit; their order defines
# the layout of the one-hot output columns shown above.
enc.categories_

[array(['female', 'male'], dtype=object),
 array(['from Europe', 'from US'], dtype=object),
 array(['uses Edge', 'uses Firefox', 'uses Safari'], dtype=object)]

In [19]:
# Declare the full set of possible values per column up front, instead of
# letting the encoder infer them from the training data.
genders = ['female', 'male']
locations = ['from Africa', 'from Asia', 'from Europe', 'from US']
browsers = ['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari']
enc = preprocessing.OneHotEncoder(
    categories=[genders, locations, browsers],
)

# The training data below does not contain every declared category for the
# 2nd and 3rd features (e.g. 'from Asia' and 'uses Chrome' never appear).
X = [
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
]
enc.fit(X)

# Values absent from the training data still encode correctly because they
# were declared in `categories`.
enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray()


array([[1., 0., 0., 1., 0., 0., 1., 0., 0., 0.]])

## LabelEncoder

In [4]:
from sklearn import preprocessing
# Fit a label encoder on the target values; fit() returns the encoder itself.
le = preprocessing.LabelEncoder().fit([1, 2, 2, 6])
# Distinct labels seen during fit.
print(f'{le.classes_!r}')
# Forward mapping: original labels -> integer codes.
print(f'{le.transform([1, 1, 2, 6])!r}')
# Reverse mapping: integer codes -> original labels.
print(f'{le.inverse_transform([0, 0, 1, 2])!r}')

array([1, 2, 6])
array([0, 0, 1, 2])
array([1, 1, 2, 6])
