In [25]:
# 导入库
from sklearn import preprocessing
import numpy as np
from sklearn.impute import SimpleImputer

# Build the 3x3 two-dimensional float array used as training data throughout
# the notebook (one sample per row, one feature per column).
x_train = np.array(
    [
        [1.0, -1.0, 2.0],
        [2.0, 0.0, 2.0],
        [0.0, 1.0, -1.0],
    ]
)

In [26]:
# Standardization: scale every column of x_train with preprocessing.scale.
x_scaled = preprocessing.scale(x_train)

# Column-wise mean of the scaled data. This value is computed but not shown,
# because a notebook cell only echoes its final expression.
np.mean(x_scaled, axis=0)

# Column-wise standard deviation of the scaled data -- this is the cell output.
np.std(x_scaled, axis=0)

array([1., 1., 1.])

In [27]:
# Standardization through the StandardScaler class: fit once on the training
# data so the learned statistics can be reused on other data later.
scaler = preprocessing.StandardScaler()
scaler.fit(x_train)

# Learned per-feature statistics (evaluated but not echoed; only the final
# expression of the cell is displayed).
scaler.mean_
scaler.scale_

# Apply the fitted scaler to the training data -- this is the cell output.
scaler.transform(x_train)

array([[ 0.        , -1.22474487,  0.70710678],
       [ 1.22474487,  0.        ,  0.70710678],
       [-1.22474487,  1.22474487, -1.41421356]])

In [28]:
# Standardize a test sample with the scaler that was fitted on the training
# data. x_test is a plain nested list holding a single three-feature row.
x_test = [[-1.0, 1.0, 0.0]]
# Only transform() is called here (no re-fit), so the statistics learned from
# x_train are reused for the test sample.
scaler.transform(x_test)


array([[-2.44948974,  1.22474487, -0.70710678]])

In [29]:
# Range scaling with MinMaxScaler: learn each column's min/max from the
# training data and rescale the training data with them.
min_max_scaler = preprocessing.MinMaxScaler()
x_train_minmax = min_max_scaler.fit_transform(x_train)

# Scale the test sample with the statistics learned from x_train.
# transform() is used instead of fit_transform(): re-fitting on the test row
# would throw away the training min/max and scale the row against itself,
# which gives a meaningless result for a single sample.
x_test = np.array([[-3.0, -1.0, 4.0]])  # already shape (1, 3); no reshape needed
x_test_minmax = min_max_scaler.transform(x_test)

# Same pattern with MaxAbsScaler: fit on the training data only, then reuse
# the fitted scaler for the test sample.
max_abs_scaler = preprocessing.MaxAbsScaler()
X_train_maxabs = max_abs_scaler.fit_transform(x_train)
X_test_maxabs = max_abs_scaler.transform(x_test)

In [30]:
# Normalization: rescale the samples of x_train using the L2 norm.
x_normalized = preprocessing.normalize(x_train, norm='l2')

# Binarization with the default threshold. The result of this transform is
# evaluated but not echoed, since it is not the last expression of the cell.
binarizer = preprocessing.Binarizer()
binarizer.fit(x_train)
binarizer.transform(x_train)

# Binarization with a custom threshold of 1.1: values above the threshold
# become 1, everything else becomes 0. This is the cell output.
binarizer1 = preprocessing.Binarizer(threshold=1.1)
binarizer1.transform(x_train)

array([[0., 0., 1.],
       [1., 0., 1.],
       [0., 0., 0.]])

In [31]:
# One-hot encoding of categorical (qualitative) features.
enc = preprocessing.OneHotEncoder()

# First fit: four samples with three categorical columns. The encoded sample
# is converted to a dense array but not echoed (not the last expression).
enc.fit(
    [
        [0, 0, 3],
        [1, 1, 0],
        [0, 2, 1],
        [1, 0, 2],
    ]
).transform([[0, 1, 3]]).toarray()

# Re-fit the same encoder on a different, smaller dataset; this replaces the
# categories learned above. The dense encoding below is the cell output.
enc.fit(
    [
        [1, 0, 3],
        [0, 2, 0],
    ]
).transform([[1, 0, 0]]).toarray()

array([[0., 1., 1., 0., 1., 0.]])

In [32]:
# Missing-value imputation: NaN entries are replaced using the 'mean'
# strategy, with the statistics learned from the data passed to fit().
imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(
    [
        [1, 2],
        [np.nan, 3],
        [7, 6],
    ]
)

# Data with missing entries to fill in using the fitted imputer.
X = [
    [np.nan, 2],
    [6, np.nan],
    [7, 6],
]
imp.transform(X)

array([[4.        , 2.        ],
       [6.        , 3.66666667],
       [7.        , 6.        ]])

In [33]:
# Polynomial feature generation.

# Degree-2 expansion of a 3x2 integer matrix. The result is evaluated but not
# echoed, because it is not the last expression of the cell.
X = np.reshape(np.arange(6), (3, 2))
poly = preprocessing.PolynomialFeatures(degree=2)
poly.fit_transform(X)

# Interaction-only expansion up to degree 3 of a 3x3 integer matrix.
# X and poly are rebound here; the result below is the cell output.
X = np.reshape(np.arange(9), (3, 3))
poly = preprocessing.PolynomialFeatures(degree=3, interaction_only=True)
poly.fit_transform(X)


array([[  1.,   0.,   1.,   2.,   0.,   0.,   2.,   0.],
       [  1.,   3.,   4.,   5.,  12.,  15.,  20.,  60.],
       [  1.,   6.,   7.,   8.,  42.,  48.,  56., 336.]])

In [34]:
# Custom function transform: wrap np.log1p in a FunctionTransformer and apply
# it element-wise to a small 2x2 integer matrix.
X = np.array(
    [
        [0, 1],
        [2, 3],
    ]
)
transformer = preprocessing.FunctionTransformer(func=np.log1p)
transformer.transform(X)

array([[0.        , 0.69314718],
       [1.09861229, 1.38629436]])