# 特征工程

## 异常值处理

In [18]:
import pandas as pd
import numpy as np
from  matplotlib import pyplot as plt
import seaborn as sns
import scipy.stats as ss
%matplotlib inline

In [19]:
# Toy frame for the cleaning demos below: column A repeats the key 'a1',
# and columns B, C and E each contain one missing entry (None).
df1 = pd.DataFrame(dict(
    A=['a0', 'a1', 'a1', 'a2', 'a3', 'a4'],
    B=['b0', 'b1', 'b2', 'b2', 'b3', None],
    C=[1, 2, None, 3, 4, 5],
    D=[0.1, 10.2, 11.4, 8.9, 9.1, 12],
    E=[10, 19, 32, 25, 8, None],
    F=['f0', 'f2', 'g2', 'f3', 'f4', 'f5'],
))
df1

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f2
2,a1,b2,,11.4,32.0,g2
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


In [20]:
# Detect missing values: returns a same-shaped boolean frame that is True
# wherever a cell is null.
df1.isnull()

Unnamed: 0,A,B,C,D,E,F
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,True,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,True,False,False,True,False


In [21]:
# Drop missing values: removes every row that has a null in any column
# (rows 2 and 5 in the output below).
df1.dropna()

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f2
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4


In [22]:
# Drop only the rows whose column B is null; nulls in other columns are kept.
df1.dropna(subset=['B'])

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f2
2,a1,b2,,11.4,32.0,g2
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4


In [23]:
# Detect duplicates, comparing column A only. The first occurrence of a
# value is not flagged; only the later repeat is (row 2 in the output).
df1.duplicated(['A'])

0    False
1    False
2     True
3    False
4    False
5    False
dtype: bool

In [24]:
# Drop duplicates judged on column A, keeping the first row of each group.
df1.drop_duplicates(['A'])

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f2
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


In [25]:
# keep=False discards every member of a duplicate group, so both 'a1' rows
# (1 and 2) are removed.
df1.drop_duplicates(['A'],keep=False)

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


In [26]:
# Fill missing values with the placeholder string 'b*'.
# NOTE(review): the call is made on the whole frame, so the nulls in C and E
# are replaced too (see the output below), not only the one in column B.
# df1.fillna({'B': 'b*'}) would restrict the fill to column B.
df1.fillna('b*')

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1,0.1,10,f0
1,a1,b1,2,10.2,19,f2
2,a1,b2,b*,11.4,32,g2
3,a2,b2,3,8.9,25,f3
4,a3,b3,4,9.1,8,f4
5,a4,b*,5,12.0,b*,f5


In [27]:
# Fill missing values with the mean of column E (18.8).
# NOTE(review): the fill is applied frame-wide, so the nulls in B and C also
# become 18.8 (see the output below). df1.fillna({'E': df1['E'].mean()})
# would touch column E only.
df1.fillna(df1['E'].mean())

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f2
2,a1,b2,18.8,11.4,32.0,g2
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,18.8,5.0,12.0,18.8,f5


In [28]:
# Fill by interpolation: a trailing null takes the value just above it
# (row 5 becomes 8.0 here); an interior null gets the mean of its two
# neighbours.
# NOTE(review): the original note also said a leading null takes the value
# below it; this data has no leading null, and pandas' default fill
# direction presumably leaves leading nulls unfilled -- confirm.
df1['E'].interpolate()

0    10.0
1    19.0
2    32.0
3    25.0
4     8.0
5     8.0
Name: E, dtype: float64

In [29]:
# Interior gap: the null between 1 and 4 is filled with their midpoint, 2.5.
pd.Series([1,None,4,5,20]).interpolate()

0     1.0
1     2.5
2     4.0
3     5.0
4    20.0
dtype: float64

In [30]:
# Cubic (order-3) spline interpolation. The trailing null is extrapolated
# from the fitted curve, which here gives about -20.14 (see output).
df1['E'].interpolate(method='spline',order=3)

0    10.000000
1    19.000000
2    32.000000
3    25.000000
4     8.000000
5   -20.143603
Name: E, dtype: float64

In [31]:
# Filter column D with the interquartile-range rule: keep the rows whose D
# lies strictly between Q1 - k*IQR and Q3 + k*IQR.
upper_q = df1['D'].quantile(0.75)   # Q3
lower_q = df1['D'].quantile(0.25)   # Q1
q_int = upper_q - lower_q           # IQR
k = 1.5                             # fence multiplier
# Build one combined mask. Chaining two selections (df[m1][m2]) applies a
# full-length mask to an already-filtered frame, which makes pandas reindex
# the mask and emit a warning.
df1[(df1['D'] > lower_q - k * q_int) & (df1['D'] < upper_q + k * q_int)]

  


Unnamed: 0,A,B,C,D,E,F
1,a1,b1,2.0,10.2,19.0,f2
2,a1,b2,,11.4,32.0,g2
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


In [32]:
# Filter out anomalous values in column F: keep only the rows whose F starts
# with 'f'. The vectorized string method replaces a Python-level list
# comprehension; na=False treats a missing F as "no match" instead of
# raising on a non-string value.
df1[df1['F'].str.startswith('f', na=False)]

Unnamed: 0,A,B,C,D,E,F
0,a0,b0,1.0,0.1,10.0,f0
1,a1,b1,2.0,10.2,19.0,f2
3,a2,b2,3.0,8.9,25.0,f3
4,a3,b3,4.0,9.1,8.0,f4
5,a4,,5.0,12.0,,f5


## 特征预处理

### 特征选择
特征选择时使用的是样本数据作为估计量，而正式建模时使用的是全量数据。

In [35]:
# Random demo data for feature selection: A, B and C are standard-normal
# draws (10 each); D is a random 0/1 label. No seed is set, so the values
# change on every run.
df2 = pd.DataFrame({name: ss.norm.rvs(size=10) for name in ('A', 'B', 'C')})
df2['D'] = np.random.randint(low=0, high=2, size=10)
df2

Unnamed: 0,A,B,C,D
0,0.888145,0.543618,-0.960614,1
1,0.462175,0.034925,-0.234244,0
2,-0.064108,1.743161,-0.792092,0
3,0.943182,-0.074923,-0.879533,0
4,-0.268589,0.254662,0.688653,1
5,1.075152,-1.233661,-0.036948,0
6,0.594596,-1.114602,0.003042,1
7,0.677053,0.240538,1.740412,1
8,0.945887,0.367086,0.165444,0
9,-0.395973,-1.135481,0.409995,0


In [48]:
# Import the SVR regressor and the decision-tree regressor.
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
# Import one selector for each of the three feature-selection approaches
# used below (filter, wrapper, embedded).
from sklearn.feature_selection import SelectKBest,RFE,SelectFromModel
# Features (columns A, B, C) and label (column D).
X = df2.loc[:,['A','B','C']]
Y = df2.loc[:,'D']
Y

0    1
1    0
2    0
3    0
4    1
5    0
6    1
7    1
8    0
9    0
Name: D, dtype: int32

In [50]:
# 1. Filter approach: score each feature against the label with the
# selector's default scoring function and keep the k=2 best. In the
# recorded output the kept columns are A and C.
skb = SelectKBest(k=2)
skb.fit(X,Y)
skb.transform(X)

array([[ 0.88814511, -0.96061358],
       [ 0.46217474, -0.23424413],
       [-0.06410792, -0.79209155],
       [ 0.94318202, -0.87953263],
       [-0.26858863,  0.68865281],
       [ 1.07515191, -0.03694783],
       [ 0.59459605,  0.00304236],
       [ 0.67705281,  1.74041243],
       [ 0.94588657,  0.16544364],
       [-0.39597342,  0.4099947 ]])

In [51]:
# 2. Wrapper approach: recursive feature elimination driven by a linear-kernel
# SVR, dropping one feature per step (step=1) until two remain. In the
# recorded output the kept columns are B and C.
rfe = RFE(estimator=SVR(kernel='linear'),n_features_to_select=2,step=1)
rfe.fit_transform(X,Y)

array([[ 0.54361779, -0.96061358],
       [ 0.03492471, -0.23424413],
       [ 1.74316121, -0.79209155],
       [-0.07492272, -0.87953263],
       [ 0.25466202,  0.68865281],
       [-1.23366086, -0.03694783],
       [-1.11460221,  0.00304236],
       [ 0.24053831,  1.74041243],
       [ 0.36708556,  0.16544364],
       [-1.1354814 ,  0.4099947 ]])

In [52]:
# 3. Embedded approach: select features using the importances learned by a
# decision-tree regressor.
# threshold=0.1 is the importance cut-off used for the selection.
# NOTE(review): the tree is unseeded, so the selected columns may presumably
# differ between runs -- confirm.
sfm = SelectFromModel(estimator=DecisionTreeRegressor(),threshold=0.1)
sfm.fit_transform(X,Y)

array([[ 0.54361779, -0.96061358],
       [ 0.03492471, -0.23424413],
       [ 1.74316121, -0.79209155],
       [-0.07492272, -0.87953263],
       [ 0.25466202,  0.68865281],
       [-1.23366086, -0.03694783],
       [-1.11460221,  0.00304236],
       [ 0.24053831,  1.74041243],
       [ 0.36708556,  0.16544364],
       [-1.1354814 ,  0.4099947 ]])

### 特征变换-离散化

In [54]:
# Sample values used by the binning (discretization) examples below.
lst = [6,8,10,15,16,24,15,40,67]
lst

[6, 8, 10, 15, 16, 24, 15, 40, 67]

In [55]:
# Equal-depth (equal-frequency) binning: q=3 cuts at quantiles, so each of
# the three bins receives three of the nine values.
pd.qcut(lst,q=3)

[(5.999, 13.333], (5.999, 13.333], (5.999, 13.333], (13.333, 18.667], (13.333, 18.667], (18.667, 67.0], (13.333, 18.667], (18.667, 67.0], (18.667, 67.0]]
Categories (3, interval[float64]): [(5.999, 13.333] < (13.333, 18.667] < (18.667, 67.0]]

In [57]:
# Same equal-frequency bins, reported with named labels instead of intervals.
pd.qcut(lst,q=3,labels=['low','medium','high'])

[low, low, low, medium, medium, high, medium, high, high]
Categories (3, object): [low < medium < high]

In [58]:
# Equal-width binning: bins=3 splits the value range into three intervals of
# equal width, so bin counts can be very uneven (7 / 1 / 1 here).
pd.cut(lst,bins=3)

[(5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (5.939, 26.333], (26.333, 46.667], (46.667, 67.0]]
Categories (3, interval[float64]): [(5.939, 26.333] < (26.333, 46.667] < (46.667, 67.0]]

In [59]:
# Same equal-width bins, reported with named labels instead of intervals.
pd.cut(lst,bins=3,labels=['low','medium','high'])

[low, low, low, low, low, low, low, medium, high]
Categories (3, object): [low < medium < high]

### 特征变换-归一化和标准化

In [60]:
# 引入标准化和归一化的函数
from sklearn.preprocessing import MinMaxScaler,StandardScaler

In [61]:
# Min-max scaling: maps the values onto [0, 1] (1 -> 0.0, 21 -> 1.0).
# reshape(-1,1): -1 leaves the row count to be inferred, 1 forces a single
# column, giving the 2-D column shape passed to the scaler.
MinMaxScaler().fit_transform(np.array([1,4,10,15,21]).reshape(-1,1))

array([[0.  ],
       [0.15],
       [0.45],
       [0.7 ],
       [1.  ]])

In [62]:
# Standardization (zero mean, unit variance): with four 1s and four 0s the
# values map to exactly +1 and -1.
StandardScaler().fit_transform(np.array([1,1,1,1,0,0,0,0]).reshape(-1,1))

array([[ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [-1.],
       [-1.],
       [-1.],
       [-1.]])

In [63]:
# Standardization on imbalanced data: the single 1 lands far from the mean
# (about 2.65) while the seven 0s sit at about -0.38.
StandardScaler().fit_transform(np.array([1,0,0,0,0,0,0,0]).reshape(-1,1))

array([[ 2.64575131],
       [-0.37796447],
       [-0.37796447],
       [-0.37796447],
       [-0.37796447],
       [-0.37796447],
       [-0.37796447],
       [-0.37796447]])

### 特征变换-数值化

In [64]:
# 引入标签化和独热编码
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [66]:
# Label encoding: map each distinct string to an integer code
# ('Down' -> 0, 'Up' -> 1).
# LabelEncoder works on a 1-D array of labels, so the array is passed as-is;
# reshaping it to a column vector only triggers a shape-conversion warning.
LabelEncoder().fit_transform(np.array(['Down','Up','Down','Up']))

array([0, 1, 0, 1], dtype=int64)

In [67]:
# Codes follow the sorted (alphabetical) order of the labels, not their
# meaning: 'High' -> 0, 'Low' -> 1, 'Medium' -> 2.
# The 1-D label array is passed directly; LabelEncoder does not need (and
# warns about) a column-vector input.
LabelEncoder().fit_transform(np.array(['Low','Medium','High','Medium','Low']))

array([1, 2, 0, 2, 1], dtype=int64)

In [69]:
# One-hot encoding, step 1: label-encode the colour strings to integer codes.
# NOTE(review): the original note says label encoding must precede one-hot
# encoding. The library message recorded after the next cell states that
# OneHotEncoder can now be used directly, so this step may be unnecessary
# on current scikit-learn -- confirm against the installed version.
lb_encoder = LabelEncoder()
lb_tran_f = lb_encoder.fit_transform(np.array(['Red','Yellow','Blue','Green']))
lb_tran_f

array([2, 3, 0, 1], dtype=int64)

In [75]:
# One-hot encoding, step 2: fit the encoder on the integer codes, reshaped
# to a single column.
oht_encoder = OneHotEncoder().fit(lb_tran_f.reshape(-1,1))
# Encode new colours: strings -> integer codes (same label encoder) ->
# one-hot rows. .toarray() converts the transform result into the array
# shown in the output.
oht_encoder.transform(lb_encoder.transform(np.array(['Yellow','Blue','Green','Green','Red']))
                      .reshape(-1,1)).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.]])

### 特征变换-正规化

In [76]:
# 导入正规化包
from sklearn.preprocessing import Normalizer

In [78]:
# L1 normalization (row-wise scaling, not regularization): each value is
# divided by the sum of absolute values of the row (8 here, so 1 -> 0.125).
# reshape(1,-1) turns the 1-D array into a single row.
Normalizer(norm='l1').fit_transform(np.array([1,1,3,-1,2]).reshape(1,-1))

array([[ 0.125,  0.125,  0.375, -0.125,  0.25 ]])

In [80]:
# L2 normalization: each value is divided by the Euclidean length of the row
# (sqrt(1+1+9+1+4) = 4 here, so 1 -> 0.25). The nested list already gives a
# single-row 2-D array.
Normalizer(norm='l2').fit_transform(np.array([[1,1,3,-1,2]]))

array([[ 0.25,  0.25,  0.75, -0.25,  0.5 ]])

## 特征降维-LDA

In [81]:
# 导入LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [83]:
# LDA: supervised dimensionality reduction that aims to keep samples with the
# same label close together and samples with different labels far apart.
# Six 2-D points in two classes are projected onto a single component.
# Note: X and Y are rebound here, replacing the feature-selection data above.
X = np.array([[-1,-1],[-2,-1],[-3,-2],[1,1],[2,1],[3,2]])
Y = np.array([1,1,1,2,2,2])
LinearDiscriminantAnalysis(n_components=1).fit_transform(X,Y)

array([[-1.73205081],
       [-1.73205081],
       [-3.46410162],
       [ 1.73205081],
       [ 1.73205081],
       [ 3.46410162]])

In [84]:
# LDA can also be used as a classifier (also known as Fisher's linear
# discriminant): fit on the labelled points, then predict the class of a new
# point. [0.8, 1] is assigned to class 2 in the recorded output.
clf = LinearDiscriminantAnalysis(n_components=1).fit(X,Y)
clf.predict([[0.8,1]])

array([2])