## pipeline

## 特征提取

## 1.数据预处理

### 标准化

* 使数据服从零均值、单位方差的正态分布

In [1]:
from sklearn import preprocessing
import numpy as np

# Toy training matrix with three rows and three columns.
X_train = np.array([
    [1.0, -1.0, 2.0],
    [2.0, 0.0, 0.0],
    [0.0, 1.0, -1.0],
])

# Standardize the matrix; the bare name on the last line is what the
# notebook cell displays as its output.
X_scaled = preprocessing.scale(X_train)
X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

### 规范化

* 使数据规范化到一个指定的区间范围内，比如：[0, 1]

### 缩放稀疏数据

### 利用异常值缩放

### 核矩阵中心化

### 映射数据到[0,1]范围的均匀分布

### 映射数据到Gaussian分布

> * 很多模型都希望特征服从正态分布；
> * Power转换可以将特征转换为接近正态分布；
> * 得到平稳的方差和减小分布的偏度；


* Yeo-Johnson变换
$$
\begin{split}x_i^{(\lambda)} =
\begin{cases}
 [(x_i + 1)^\lambda - 1] / \lambda & \text{if } \lambda \neq 0, x_i \geq 0, \\[8pt]
\ln{(x_i + 1)} & \text{if } \lambda = 0, x_i \geq 0 \\[8pt]
-[(-x_i + 1)^{2 - \lambda} - 1] / (2 - \lambda) & \text{if } \lambda \neq 2, x_i < 0, \\[8pt]
 - \ln (- x_i + 1) & \text{if } \lambda = 2, x_i < 0
\end{cases}\end{split}
$$

* Box-Cox变换

$$
\begin{split}x_i^{(\lambda)} =
\begin{cases}
\dfrac{x_i^\lambda - 1}{\lambda} & \text{if } \lambda \neq 0, \\[8pt]
\ln{(x_i)} & \text{if } \lambda = 0,
\end{cases}\end{split}
$$

In [None]:
# Box-Cox transform
# Draw a reproducible 3x3 sample from a log-normal distribution (seed 616).
X_lognormal = np.random.RandomState(616).lognormal(size=(3, 3))

# standardize=False keeps the raw Box-Cox output instead of additionally
# rescaling it to zero mean / unit variance.
pt = preprocessing.PowerTransformer(method="box-cox", standardize=False)
pt.fit_transform(X_lognormal)

In [None]:
# Quantile transform: map each feature onto a normal output distribution.
# The scikit-learn class is `QuantileTransformer` and its fitted attribute is
# `quantiles_`; the previous spellings raised AttributeError.
# The variable name `quantitle_transformer` is kept unchanged in case other
# cells refer to it.
quantitle_transformer = preprocessing.QuantileTransformer(
    # Match the number of quantiles to the (tiny) sample count so that
    # scikit-learn does not have to clamp the default of 1000 with a warning.
    n_quantiles=len(X_train),
    output_distribution="normal",
    random_state=0,
)
# `X_train` (defined in the standardization cell) is used because `X` is not
# yet defined at this point when the notebook is run top to bottom.
X_trans = quantitle_transformer.fit_transform(X_train)
quantitle_transformer.quantiles_

### 正规化(Normalization)

* Normalization is the process of scaling individual samples to have unit norm
* 适用于使用二次型（如点积）或其他核函数来度量任意一对样本之间相似度的场景

In [7]:
from sklearn import preprocessing

# Three samples, one per row, given as plain nested lists.
X = [
    [1.0, -1.0, 2.0],
    [2.0, 0.0, 0.0],
    [0.0, 1.0, -1.0],
]

# Rescale using the L2 norm; the bare name on the last line is what the
# notebook cell displays.
X_normalized = preprocessing.normalize(X, norm="l2")
X_normalized

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [9]:
# Estimator-style API: build the Normalizer, fit it on X, then reuse it.
normalizer = preprocessing.Normalizer()
normalizer.fit(X)

# Normalized version of the data the estimator was fitted on.
print(normalizer.transform(X))

# The same fitted object applied to a new, single-row input.
normalizer.transform([[-1.0, 1.0, 0.0]])

[[ 0.40824829 -0.40824829  0.81649658]
 [ 1.          0.          0.        ]
 [ 0.          0.70710678 -0.70710678]]


array([[-0.70710678,  0.70710678,  0.        ]])

### 类别型变量重编码

* 将类别型变量重编码为整数；

In [None]:
# OrdinalEncoder: encode each categorical column as integer codes.
enc = preprocessing.OrdinalEncoder()
X = [['male', 'from US', 'uses Safari'],
     ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)
enc.transform(X)
# transform() requires 2-D input of shape (n_samples, n_features); a single
# sample must therefore be wrapped in an outer list. Passing the flat list
# raised "Expected 2D array, got 1D array instead".
enc.transform([['female', 'from US', 'uses Safari']])

In [None]:
# OneHotEncoder: encode each categorical column as one-hot indicator columns.
enc = preprocessing.OneHotEncoder()
X = [['male', 'from US', 'uses Safari'],
     ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)
enc.transform(X)
# transform() takes ONE 2-D argument; the two samples are rows of a single
# list rather than two separate positional arguments (which raised TypeError).
# .toarray() converts the sparse result into a dense array.
enc.transform([['female', 'from US', 'uses Safari'],
               ['male', 'from Europe', 'uses Safari']]).toarray()
enc.categories_

# With handle_unknown='ignore', categories not seen during fit() do not raise
# an error when transforming.
enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)
enc.transform(X)
# A single sample must still be passed as a 2-D (1 x n_features) input; the
# flat list previously raised "Expected 2D array, got 1D array instead".
enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray()

### 连续型变量离散化

In [None]:
# K-bins discretization
X = np.array([
    [-3.0, 5.0, 15],
    [0.0, 6.0, 14],
    [6.0, 3.0, 11],
])

# One bin count per column (3, 2 and 2); encode='ordinal' requests integer
# bin identifiers as the output encoding.
est = preprocessing.KBinsDiscretizer(n_bins=[3, 2, 2], encode="ordinal")
est.fit(X)
est.transform(X)

In [None]:
# Feature binarization


### 缺失值填充

### 生成多项式特征

* 在模型中引入数据的非线性特征
* 多用在核方法中的多项式核函数中

In [22]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# 3x2 input holding the integers 0..5, i.e. rows (a, b).
X = np.arange(6).reshape((3, 2))

# Degree-2 expansion; per the output shown below each row (a, b) becomes
# [1, a, b, a^2, a*b, b^2].
poly = PolynomialFeatures(degree=2)
poly.fit_transform(X)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [23]:
# 3x3 input holding the integers 0..8, i.e. rows (a, b, c).
X = np.arange(9).reshape((3, 3))

# interaction_only=True drops pure powers; per the output shown below each
# row (a, b, c) becomes [1, a, b, c, a*b, a*c, b*c, a*b*c].
poly = PolynomialFeatures(degree=3, interaction_only=True)
poly.fit_transform(X)

array([[  1.,   0.,   1.,   2.,   0.,   0.,   2.,   0.],
       [  1.,   3.,   4.,   5.,  12.,  15.,  20.,  60.],
       [  1.,   6.,   7.,   8.,  42.,  48.,  56., 336.]])