# Seleção de Atributos (variáveis)

## Seleção de Atributos - Utilizando RFE - Recursive Feature Elimination

In [1]:
# Gerando o dataset
import pandas as pd
from sklearn.datasets import load_boston

boston = load_boston() 
dataset = pd.DataFrame(boston.data, columns = boston.feature_names)
dataset['target'] = boston.target
dataset.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [2]:
print('Número de Atributos:', len(dataset.columns) -1)

Número de Atributos: 13


In [3]:
# aplicando a RFE
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

X = dataset.iloc[:,:-1].values
y = dataset['target'].values

num_atributos_relevantes = 8
estimator = SVR(kernel="linear")
selector = RFE(estimator, num_atributos_relevantes, step=1)
selector = selector.fit(X, y)

print("Num Features: ", selector.n_features_)
print("Selected Features: ", selector.support_)
print("Feature Ranking: ", selector.ranking_)

Num Features:  8
Selected Features:  [ True False  True  True  True  True False  True False False  True False
  True]
Feature Ranking:  [1 2 1 1 1 1 3 1 5 6 1 4 1]


In [4]:
dfatributos = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': selector.ranking_ })
                             
dfatributos = dfatributos.sort_values(by='Importancia', ascending=True)
dfatributos

Unnamed: 0,Atributo,Importancia
0,CRIM,1
2,INDUS,1
3,CHAS,1
4,NOX,1
5,RM,1
7,DIS,1
10,PTRATIO,1
12,LSTAT,1
1,ZN,2
6,AGE,3


## Seleção de Atributos -  Seleciona os atributos pela variância

In [5]:
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold()
selector.fit_transform(X)
selector.variances_

array([7.38403597e+01, 5.42861840e+02, 4.69714297e+01, 6.43854770e-02,
       1.34010989e-02, 4.92695216e-01, 7.90792473e+02, 4.42525226e+00,
       7.56665313e+01, 2.83486236e+04, 4.67772630e+00, 8.31828042e+03,
       5.08939794e+01])

In [6]:
dfatributos2 = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': selector.variances_  })
                             
dfatributos2 = dfatributos2.sort_values(by='Importancia', ascending=True)
dfatributos2

Unnamed: 0,Atributo,Importancia
4,NOX,0.013401
3,CHAS,0.064385
5,RM,0.492695
7,DIS,4.425252
10,PTRATIO,4.677726
2,INDUS,46.97143
12,LSTAT,50.893979
0,CRIM,73.84036
8,RAD,75.666531
1,ZN,542.86184


## Seleção de Atributos - Utilizando Árvores de Decisão 

In [7]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators=30,max_depth=6)
clf = clf.fit(X, y)
clf.feature_importances_           

array([0.03245332, 0.00060974, 0.00384296, 0.001299  , 0.02558246,
       0.43132481, 0.00800649, 0.06634847, 0.00368559, 0.01151136,
       0.01570849, 0.00624076, 0.39338655])

In [8]:
dfatributos3 = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': clf.feature_importances_  })
                             
dfatributos3 = dfatributos3.sort_values(by='Importancia', ascending=False)
dfatributos3

Unnamed: 0,Atributo,Importancia
5,RM,0.431325
12,LSTAT,0.393387
7,DIS,0.066348
0,CRIM,0.032453
4,NOX,0.025582
10,PTRATIO,0.015708
9,TAX,0.011511
6,AGE,0.008006
11,B,0.006241
2,INDUS,0.003843


In [9]:
dfatributos.sort_values(by=['Importancia', 'Atributo'])

Unnamed: 0,Atributo,Importancia
3,CHAS,1
0,CRIM,1
7,DIS,1
2,INDUS,1
12,LSTAT,1
4,NOX,1
10,PTRATIO,1
5,RM,1
1,ZN,2
6,AGE,3


## Seleção de Atributos - Extra Trees Classifier

In [10]:
# Feature Importance with Extra Trees Classifier

from sklearn.ensemble import ExtraTreesRegressor

# feature extraction
model = ExtraTreesRegressor()
model.fit(X, y)
print(model.feature_importances_)

[0.02706877 0.00301489 0.06708798 0.00935502 0.03263592 0.31506678
 0.01654456 0.03876032 0.01003995 0.0476678  0.04100575 0.01930966
 0.37244259]


In [11]:
dfatributos4 = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': model.feature_importances_ })
                             
dfatributos4 = dfatributos3.sort_values(by='Importancia', ascending=False)
dfatributos4

Unnamed: 0,Atributo,Importancia
5,RM,0.431325
12,LSTAT,0.393387
7,DIS,0.066348
0,CRIM,0.032453
4,NOX,0.025582
10,PTRATIO,0.015708
9,TAX,0.011511
6,AGE,0.008006
11,B,0.006241
2,INDUS,0.003843
