In [1]:
from pandas import read_csv
from numpy import set_printoptions

In [2]:
from sklearn.feature_selection import SelectKBest, chi2

In [3]:
target_url=r'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
data=read_csv(target_url,header=None,names=['sepal length', 'sepal width','petal length', 'petal width', 'class'])
array=data.values

In [4]:
set_printoptions(precision=3)

In [5]:
X=array[:,:-1]
y=array[:,-1]

In [25]:
X[:3,:]

array([[4.9, 3.0, 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2]], dtype=object)

In [27]:
y[:3]

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa'], dtype=object)

# 单变量特征选定

卡方检验

In [6]:
test=SelectKBest(score_func=chi2,k='all')

In [7]:
fit=test.fit(X,y)

In [8]:
fit.scores_

array([ 10.715,   3.535, 114.2  ,  66.04 ])

In [9]:
features=fit.transform(X)

In [10]:
features

array([[4.9, 3.0, 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5.0, 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5.0, 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3.0, 1.4, 0.1],
       [4.3, 3.0, 1.1, 0.1],
       [5.8, 4.0, 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1.0, 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5.0, 3.0, 1.6, 0.2],
       [5.0, 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.0, 3

# 递归特征消除

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [12]:
solver='liblinear'
clf=LogisticRegression(solver='liblinear',multi_class='auto')

In [13]:
rfe=RFE(clf,3)
fit=rfe.fit(X,y)

In [14]:
'特征个数：'
fit.n_features_

3

In [15]:
'被选定的特征：'
fit.support_

array([False,  True,  True,  True])

In [16]:
'特征排名：'
fit.ranking_

array([2, 1, 1, 1])

# PCA主成分分析算法

In [17]:
from sklearn.decomposition import PCA

In [18]:
pca=PCA(n_components=4)
newX=pca.fit_transform(X)
fit=pca.fit(X)

In [19]:
'解释方差：%s'%fit.explained_variance_ratio_

'解释方差：[0.924 0.053 0.017 0.005]'

In [20]:
fit.components_

array([[ 0.363, -0.081,  0.856,  0.359],
       [ 0.656,  0.73 , -0.177, -0.075],
       [-0.581,  0.596,  0.073,  0.549],
       [ 0.317, -0.324, -0.48 ,  0.751]])

# 特征重要性

决策树

In [21]:
from sklearn.ensemble import ExtraTreesClassifier

In [22]:
clf=ExtraTreesClassifier(n_estimators=10)

In [23]:
fit=clf.fit(X,y)

In [24]:
fit.feature_importances_

array([0.091, 0.071, 0.435, 0.403])