### 逻辑回归中二分类问题与多分类问题中的特征选择

主要使用的类：***SelectFromModel***（位于 `sklearn.feature_selection`）


In [55]:
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_breast_cancer,load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
import warnings
# Install a blanket "ignore" filter so no warnings are shown for the rest of the session.
# NOTE(review): this presumably also hides any scikit-learn ConvergenceWarning raised by
# the LogisticRegression fits in this notebook — confirm those fits actually converge.
warnings.filterwarnings('ignore')

#### 1. 二分类问题

In [56]:
# Load scikit-learn's breast-cancer dataset; this is a binary classification problem.
data = load_breast_cancer()
data  # bare last expression: the notebook displays the loaded object

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [57]:
# Pull the feature matrix and the label vector out of the loaded dataset.
x, y = data['data'], data['target']
# The raw data has 30 features (second entry of the shape).
x.shape

(569, 30)

In [58]:
# Build and fit the logistic-regression classifier on all features;
# fit() returns the estimator itself, so construction and fitting are chained.
clf = LogisticRegression(random_state=78).fit(x, y)
# Score on the very same data the model was fitted on (training score).
clf.score(x, y)

0.9472759226713533

In [59]:
# Feature selection driven by the classifier's coefficients.
sm = SelectFromModel(estimator=clf, norm_order=1)
x_embedded = sm.fit_transform(x, y)
# After selection the feature dimension drops to 10.
x_embedded.shape

(569, 10)

In [60]:
# Refit the same classifier on the reduced feature set, then score it on that
# same data; fit() returns the estimator, so the two calls chain into one expression.
clf.fit(x_embedded, y).score(x_embedded, y)

0.9560632688927944

经过特征选择，特征维度从 30 降到 10，并且训练集上的准确率甚至略有提高（约 0.947 → 0.956）。注意：这里的分数是在训练数据本身上计算的，若要评估泛化能力，应改用交叉验证（例如已导入的 `cross_val_score`）。

In [62]:
# Feature names of the dataset; indexed below with the selector's boolean support mask.
features = data['feature_names']
# Coefficients of the estimator fitted inside the selector.
print('lr模型系数：', sm.estimator_.coef_)
# Threshold the selector applied to the feature importances.
print('特征选择器阈值:', sm.threshold_)
# Names of the features the selector kept.
print('模型选择的特征：', features[sm.get_support()])

lr模型系数： [[ 0.93992188  0.45363814  0.28289519 -0.01623003 -0.03488654 -0.16449716
  -0.23033234 -0.09699644 -0.04832641 -0.00963572  0.04078182  0.37027066
   0.14274215 -0.10932016 -0.00315728 -0.03538223 -0.04926916 -0.01263282
  -0.01166827 -0.00328427  0.99914523 -0.50254973 -0.24960281 -0.01373331
  -0.06331502 -0.51312369 -0.63860072 -0.18666133 -0.15329368 -0.0492862 ]]
特征选择器阈值: 0.21417268043549473
模型选择的特征： ['mean radius' 'mean texture' 'mean perimeter' 'mean concavity'
 'texture error' 'worst radius' 'worst texture' 'worst perimeter'
 'worst compactness' 'worst concavity']


In [63]:
# Show the first three rows before and after feature selection for comparison.
print('原始x采样：', x[:3])
print('选择x采样：', x_embedded[:3])

原始x采样： [[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
  1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
  6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
  1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
  4.601e-01 1.189e-01]
 [2.057e+01 1.777e+01 1.329e+02 1.326e+03 8.474e-02 7.864e-02 8.690e-02
  7.017e-02 1.812e-01 5.667e-02 5.435e-01 7.339e-01 3.398e+00 7.408e+01
  5.225e-03 1.308e-02 1.860e-02 1.340e-02 1.389e-02 3.532e-03 2.499e+01
  2.341e+01 1.588e+02 1.956e+03 1.238e-01 1.866e-01 2.416e-01 1.860e-01
  2.750e-01 8.902e-02]
 [1.969e+01 2.125e+01 1.300e+02 1.203e+03 1.096e-01 1.599e-01 1.974e-01
  1.279e-01 2.069e-01 5.999e-02 7.456e-01 7.869e-01 4.585e+00 9.403e+01
  6.150e-03 4.006e-02 3.832e-02 2.058e-02 2.250e-02 4.571e-03 2.357e+01
  2.553e+01 1.525e+02 1.709e+03 1.444e-01 4.245e-01 4.504e-01 2.430e-01
  3.613e-01 8.758e-02]]
选择x采样： [[1.799e+01 1.038e+01 1.228e+02 3.001e-01 9.053e-01 

可见，系数绝对值小于阈值的特征被全部剔除，只保留了系数绝对值不小于阈值的 10 个特征。



#### 2. 多分类问题

In [64]:
# 加载鸢尾花数据集 多分类问题
data=load_iris()
data

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [65]:
# Pull the feature matrix and the label vector out of the loaded dataset.
x, y = data['data'], data['target']
# Original feature dimension: 4 (second entry of the shape).
x.shape

(150, 4)

In [66]:
# Build and fit a fresh logistic-regression classifier on all four features;
# fit() returns the estimator itself, so construction and fitting are chained.
clf = LogisticRegression(random_state=78).fit(x, y)
# Score on the very same data the model was fitted on (training score).
clf.score(x, y)

0.9733333333333334

In [67]:
# Feature selection with the selector's default settings.
sm = SelectFromModel(estimator=clf)
x_embedded = sm.fit_transform(x, y)
# Shape of the data after selection.
x_embedded.shape

(150, 2)

In [68]:
# Refit the same classifier on the selected features, then score it on that
# same data; fit() returns the estimator, so the two calls chain into one expression.
clf.fit(x_embedded, y).score(x_embedded, y)

0.9666666666666667

经过特征选择，模型在训练集上的分数略有下降（约 0.973 → 0.967）。其实此例不太适合做特征选择：原始数据只有 4 维特征，没有必要降维。

In [76]:
# Report, in order: the coefficients of the estimator fitted inside the selector,
# the threshold the selector used, and the boolean mask of kept features.
for caption, value in (
    ('lr系数：', sm.estimator_.coef_),
    ('特征选择阈值:', sm.threshold_),
    ('特征选择掩码:', sm.get_support()),
):
    print(caption, value)

lr系数： [[-0.41813284  0.96633399 -2.52102777 -1.08409539]
 [ 0.53029432 -0.31442411 -0.19947604 -0.94816287]
 [-0.11216148 -0.65190988  2.72050381  2.03225827]]
特征选择阈值: 3.1246951929888924
特征选择掩码: [False False  True  True]


因为是多分类问题，故系数矩阵 `coef_` 的形状为 `[n_classes, n_features]`，此例中为 `(3, 4)`。

In [77]:
# First three rows of the original feature matrix.
print('原始x采样:', x[:3])

原始x采样: [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]]


In [78]:
# First three rows of the feature matrix after selection.
print('选择x采样:', x_embedded[:3])

选择x采样: [[1.4 0.2]
 [1.4 0.2]
 [1.3 0.2]]


下面手动复现多分类问题中 SelectFromModel 根据系数选择特征的处理过程：先对每个特征在各类别上的系数求 L1 范数，得到该特征的重要性；再与阈值（此例中恰为各特征重要性的均值）比较，不小于阈值的特征被保留。

In [73]:
# Work on an independent copy of the coefficient matrix held by the selector's estimator.
cf = np.array(sm.estimator_.coef_)
# L1 norm of each feature column: sum of absolute coefficients over axis 0
# (SelectFromModel's default is norm_order=1).
np.sum(np.abs(cf), axis=0)

array([1.06058863, 1.93266799, 5.44100762, 4.06451653])

In [75]:
# 比较求得的L1范数与划分阈值，得到的结果与特征选择器的掩码一致
# Compare the per-feature L1 norms with the selector's threshold; the resulting
# boolean array matches the selector's support mask.
np.greater_equal(np.sum(np.abs(cf), axis=0), sm.threshold_)

array([False, False,  True,  True])