# 一、数据准备

### 1.数据缩放

将每个特征缩放到 [0, 1] 区间。适用于以下场景：使用梯度下降等优化方法的算法；对输入进行加权的算法（如回归、神经网络）；基于距离度量的算法（如 KNN）。

# Sklearn Preprocessing

关于为什么需要预处理数据，下面这个链接有一些介绍。
该链接主要比较了各种缩放方法在含有异常值的数据上的效果：

http://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#results

下面是对preprocessing 的介绍：

http://scikit-learn.org/stable/modules/preprocessing.html


In [19]:
import pandas as pd
import scipy as sp
import numpy as np
import sklearn.preprocessing as pp

# Load the Pima Indians diabetes CSV. The file has no header row, so the
# column names are supplied explicitly.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(
    filename,
    delimiter=',',
    header=None,
    names=names,
)

# Columns 0-7 are the input features; column 8 ('class') is the label.
data_array = dataframe.values
X, Y = data_array[:, :8], data_array[:, 8]

# Min-max scale each feature column into the range [0, 1].
scaler = pp.MinMaxScaler(feature_range=(0, 1))
newX = scaler.fit_transform(X)

# Show three decimals so the printed matrix stays readable.
np.set_printoptions(precision=3)
print(newX.shape)
print(newX)

(768L, 8L)
[[ 0.353  0.744  0.59  ...,  0.501  0.234  0.483]
 [ 0.059  0.427  0.541 ...,  0.396  0.117  0.167]
 [ 0.471  0.92   0.525 ...,  0.347  0.254  0.183]
 ..., 
 [ 0.294  0.608  0.59  ...,  0.39   0.071  0.15 ]
 [ 0.059  0.633  0.492 ...,  0.449  0.116  0.433]
 [ 0.059  0.467  0.574 ...,  0.453  0.101  0.033]]


### 2.标准化数据

假定各特征近似服从正态分布，将每个特征变换为均值为 0、标准差为 1（StandardScaler 只做平移和缩放，不会改变分布的形状），适合线性回归、逻辑回归、线性判别分析

In [1]:
import pandas as pd
import scipy as sp
import numpy as np
import sklearn.preprocessing as pp
import matplotlib.pyplot as plt
%matplotlib inline

# Load the Pima Indians diabetes CSV. The file has no header row, so the
# column names are supplied explicitly.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(
    filename,
    delimiter=',',
    header=None,
    names=names,
)

# Columns 0-7 are the input features; column 8 ('class') is the label.
data_array = dataframe.values
X, Y = data_array[:, :8], data_array[:, 8]

# Standardize each feature column with StandardScaler's default settings
# (centre on the mean, scale to unit variance).
scaler = pp.StandardScaler()
newX = scaler.fit_transform(X)

# Show three decimals so the printed matrix stays readable.
np.set_printoptions(precision=3)
print(newX.shape)
print(newX)


(768, 8)
[[ 0.64   0.848  0.15  ...  0.204  0.468  1.426]
 [-0.845 -1.123 -0.161 ... -0.684 -0.365 -0.191]
 [ 1.234  1.944 -0.264 ... -1.103  0.604 -0.106]
 ...
 [ 0.343  0.003  0.15  ... -0.735 -0.685 -0.276]
 [-0.845  0.16  -0.471 ... -0.24  -0.371  1.171]
 [-0.845 -0.873  0.046 ... -0.202 -0.474 -0.871]]


### 3.归一化数据

将每一行（每个样本）缩放为长度（此处为 L2 范数）等于 1 的向量。适用于向量空间模型和稀疏数据，常用于文本分类与聚类，以及神经网络、KNN 等基于距离度量的算法。

In [22]:
import pandas as pd
import scipy as sp
import numpy as np
import sklearn.preprocessing as pp

# Load the Pima Indians diabetes CSV. The file has no header row, so the
# column names are supplied explicitly.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(
    filename,
    delimiter=',',
    header=None,
    names=names,
)

# Columns 0-7 are the input features; column 8 ('class') is the label.
data_array = dataframe.values
X, Y = data_array[:, :8], data_array[:, 8]

# Rescale every row (sample) to unit L2 norm. Unlike the column-wise
# scalers, this operates on each sample independently.
scaler = pp.Normalizer(norm='l2')
newX = scaler.fit_transform(X)

# Show three decimals so the printed matrix stays readable.
np.set_printoptions(precision=3)
print(newX.shape)
print(newX)

(768L, 8L)
[[ 0.034  0.828  0.403 ...,  0.188  0.004  0.28 ]
 [ 0.008  0.716  0.556 ...,  0.224  0.003  0.261]
 [ 0.04   0.924  0.323 ...,  0.118  0.003  0.162]
 ..., 
 [ 0.027  0.651  0.388 ...,  0.141  0.001  0.161]
 [ 0.007  0.838  0.399 ...,  0.2    0.002  0.313]
 [ 0.008  0.736  0.554 ...,  0.241  0.002  0.182]]


### 4.二值化数据

按给定阈值将数值特征转换为 0/1：大于阈值的值记为 1，否则记为 0。在特征工程、文本处理中使用

In [24]:
import pandas as pd
import scipy as sp
import numpy as np
import sklearn.preprocessing as pp

# Load the Pima Indians diabetes CSV. The file has no header row, so the
# column names are supplied explicitly.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(
    filename,
    delimiter=',',
    header=None,
    names=names,
)

# Columns 0-7 are the input features; column 8 ('class') is the label.
data_array = dataframe.values
X, Y = data_array[:, :8], data_array[:, 8]

# Threshold every value at 0.0: values above the threshold become 1,
# everything else becomes 0.
scaler = pp.Binarizer(threshold=0.0)
newX = scaler.fit_transform(X)

# Show three decimals so the printed matrix stays readable.
np.set_printoptions(precision=3)
print(newX.shape)
print(newX)

(768L, 8L)
[[ 1.  1.  1. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  1.  1.  1.]
 ..., 
 [ 1.  1.  1. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  1.  1.  1.]
 [ 1.  1.  1. ...,  1.  1.  1.]]


# 二、特征选择

通过选择特征可以降低过拟合风险、提升准确度、减少训练时间

### 1.变量选择

In [5]:
import pandas as pd
import scipy as sp
import numpy as np
import sklearn.feature_selection as fs

# Load the Pima Indians diabetes CSV. The file has no header row, so the
# column names are supplied explicitly.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(
    filename,
    delimiter=',',
    header=None,
    names=names,
)

data_array = dataframe.values
print(data_array.shape)

# Columns 0-7 are the input features; column 8 ('class') is the label.
X, Y = data_array[:, :8], data_array[:, 8]

# Univariate selection: score each feature against the label with the
# chi-squared statistic (score_func=chi2) and keep the 4 best-scoring ones.
selector = fs.SelectKBest(score_func=fs.chi2, k=4)
X_best = selector.fit_transform(X, Y)

# One score per input feature, in column order.
print(selector.scores_)

np.set_printoptions(precision=3)
print(X_best.shape)
print(X_best)

(768, 9)
[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]
(768, 4)
[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 ...
 [121.  112.   26.2  30. ]
 [126.    0.   30.1  47. ]
 [ 93.    0.   30.4  23. ]]


### 2.递归特征删除

In [12]:
import pandas as pd
import scipy as sp
import numpy as np
import sklearn.feature_selection as fs

from sklearn.linear_model import LogisticRegression

# Load the Pima Indians diabetes CSV. The file has no header row, so the
# column names are supplied explicitly.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(
    filename,
    delimiter=',',
    header=None,
    names=names,
)

# Columns 0-7 are the input features; column 8 ('class') is the label.
data_array = dataframe.values
X, y = data_array[:, :8], data_array[:, 8]

# Recursive feature elimination: repeatedly fit the logistic-regression
# model and drop one feature per round (step=1) until 3 remain.
model = LogisticRegression()
selector = fs.RFE(estimator=model, n_features_to_select=3, step=1)
X_best = selector.fit_transform(X, y)

# Report how many features were kept, which ones (boolean mask), and the
# ranking of every feature (selected features have rank 1).
print('number of features : {}'.format(selector.n_features_))
print('feature supports : {}'.format(selector.support_))
print('feature ranking : {}'.format(selector.ranking_))

np.set_printoptions(precision=3)

# Print one original sample next to the reduced matrix for comparison.
print(X[1])
print(X_best)

number of features : 3
feature supports : [ True False False False False  True  True False]
feature ranking : [1 2 3 5 6 1 1 4]
[ 1.    85.    66.    29.     0.    26.6    0.351 31.   ]
[[ 6.    33.6    0.627]
 [ 1.    26.6    0.351]
 [ 8.    23.3    0.672]
 ...
 [ 5.    26.2    0.245]
 [ 1.    30.1    0.349]
 [ 1.    30.4    0.315]]


### 3.主成分分析

In [14]:
import pandas as pd
import scipy as sp
import numpy as np


from sklearn.decomposition import PCA

# Load the Pima Indians diabetes CSV. The file has no header row, so the
# column names are supplied explicitly.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(filename, delimiter=',', header=None, names=names)

# Columns 0-7 are the input features; column 8 ('class') is the label.
data_array = dataframe.values
X = data_array[:,0:8]
Y = data_array[:,8]

# Configure print precision BEFORE printing. The original set it after the
# prints, so it only took effect when an earlier cell had already run it.
np.set_printoptions(precision=3)

# Fit a 5-component PCA on the raw (unscaled) features.
pca = PCA(n_components=5)
pca.fit(X)

# The printed values are explained_variance_ratio_: the fraction of total
# variance captured by each component.
print ('explained variance : {}'.format(pca.explained_variance_ratio_))

# One row per principal component, one column per original feature.
print (pca.components_)

explained variance : [0.889 0.062 0.026 0.013 0.007]
[[-2.022e-03  9.781e-02  1.609e-02  6.076e-02  9.931e-01  1.401e-02
   5.372e-04 -3.565e-03]
 [-2.265e-02 -9.722e-01 -1.419e-01  5.786e-02  9.463e-02 -4.697e-02
  -8.168e-04 -1.402e-01]
 [-2.246e-02  1.434e-01 -9.225e-01 -3.070e-01  2.098e-02 -1.324e-01
  -6.400e-04 -1.255e-01]
 [-4.905e-02  1.198e-01 -2.627e-01  8.844e-01 -6.555e-02  1.928e-01
   2.699e-03 -3.010e-01]
 [ 1.516e-01 -8.794e-02 -2.322e-01  2.600e-01 -1.723e-04  2.147e-02
   1.641e-03  9.205e-01]]
[[  6.    148.     72.    ...  33.6     0.627  50.   ]
 [  1.     85.     66.    ...  26.6     0.351  31.   ]
 [  8.    183.     64.    ...  23.3     0.672  32.   ]
 ...
 [  5.    121.     72.    ...  26.2     0.245  30.   ]
 [  1.    126.     60.    ...  30.1     0.349  47.   ]
 [  1.     93.     70.    ...  30.4     0.315  23.   ]]


### 4.重要特征

In [50]:
import pandas as pd
import scipy as sp
import numpy as np


from sklearn.ensemble import ExtraTreesClassifier

# Load the Pima Indians diabetes CSV. The file has no header row, so the
# column names are supplied explicitly.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pd.read_csv(filename, delimiter=',', header=None, names=names)

# Columns 0-7 are the input features; column 8 ('class') is the label.
data_array = dataframe.values
X = data_array[:,0:8]
Y = data_array[:,8]

# Fit an extra-trees ensemble of 10 trees. No random_state is set, so the
# importances vary from run to run.
etc = ExtraTreesClassifier(n_estimators=10)

# Fit against this cell's own label vector `Y`. The original passed
# lowercase `y`, which is not defined in this cell and only existed as
# leftover state from another cell (NameError on a fresh kernel).
etc.fit(X, Y)

np.set_printoptions(precision=3)

# One importance score per input feature, in column order.
print (etc.feature_importances_)


[ 0.098  0.247  0.103  0.08   0.076  0.142  0.112  0.142]
