# Seleção de Atributos (variáveis)

# Métricas

### Selecionar os melhores atributos, depois treinar o modelo e aplicar as métricas

In [1]:
# Criando um modelo
from sklearn import linear_model

def calcula_metricas(dataset, atributos_selecionados):

    X = dataset.loc[:, atributos_selecionados ].values
    y = dataset['target'].values

    modelo = linear_model.LinearRegression() 
    modelo.fit(X,y)

    # MAE - Mean Absolute Error
    from sklearn import model_selection
    kfold = 10
    resultado = model_selection.cross_val_score(modelo, X, y, cv = kfold, scoring = 'neg_mean_absolute_error')

    # Print do resultado
    print("MAE: %.3f (%.3f)" % (resultado.mean(), resultado.std()))

    # MSE - Mean Squared Error
    # Definindo os valores para o número de folds
    num_folds = 10
    num_instances = len(X)
    seed = 7

    # Separando os dados em folds
    kfold = model_selection.KFold(num_folds, True, random_state = seed)

    resultado = model_selection.cross_val_score(modelo, X, y, cv = kfold, scoring = 'neg_mean_squared_error')

    # Print do resultado
    print("MSE: %.3f (%.3f)" % (resultado.mean(), resultado.std()))

    # RMSE (Root Mean Squared Error
    from math import sqrt
    print("RMSE: %.3f " % (sqrt(abs(resultado.mean()))))

    # R2
    resultado = model_selection.cross_val_score(modelo, X, y, cv = kfold, scoring = 'r2')

    # Print do resultado
    print("R^2: %.3f (%.3f)" % (resultado.mean(), resultado.std()))

# Seleciona os atributos
atributos_selecionados = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', # 'DIS', 
                              'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'] # 'AGE',

#calcula_metricas(dataset, atributos_selecionados)

## Carregando o Dataset Boston Houses

1. CRIM: per capita crime rate by town 
2. ZN: proportion of residential land zoned for lots over 25,000 sq.ft. 
3. INDUS: proportion of non-residential acres per town 
4. CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) 
5. NOX: nitric oxides concentration (parts per 10 million) 
6. RM: average number of rooms per dwelling 
7. AGE: proportion of owner-occupied units built prior to 1940 
8. DIS: weighted distances to five Boston employment centres 
9. RAD: index of accessibility to radial highways 
10. TAX: full-value property-tax rate per 10,000 
11. PTRATIO: pupil-teacher ratio by town 
12. B: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town 
13. LSTAT: % lower status of the population 
14. TARGET: Median value of owner-occupied homes in $1000's

## Seleção de Atributos - Utilizando RFE - Recursive Feature Elimination

Esta técnica para seleção de atributos, que recursivamente remove os atributos e constrói o modelo com os atributos remanescentes. <br> Esta técnica utiliza a acurácia do modelo para identificar os atributos que mais contribuem para prever a variável alvo. <br> Em inglês esta técnica é chamada Recursive Feature Elimination (RFE).


In [2]:
# Gerando o dataset
import pandas as pd
from sklearn.datasets import load_boston

boston = load_boston() 
dataset = pd.DataFrame(boston.data, columns = boston.feature_names)
dataset['target'] = boston.target
dataset.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [3]:
print('Número de Atributos:', len(dataset.columns) -1)

Número de Atributos: 13


In [4]:
dataset.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'target'],
      dtype='object')

In [5]:
# aplicando a RFE
from sklearn.feature_selection import RFE
from sklearn.svm import SVR


X = dataset.iloc[:,:-1].values
y = dataset['target'].values

num_atributos_relevantes = 8
estimator = SVR(kernel="linear")
selector = RFE(estimator, num_atributos_relevantes, step=1)
selector = selector.fit(X, y)

print("Num Features: ", selector.n_features_)
print("Selected Features: ", selector.support_)
print("Feature Ranking: ", selector.ranking_)



Num Features:  8
Selected Features:  [ True False  True  True  True  True False  True False False  True False
  True]
Feature Ranking:  [1 2 1 1 1 1 3 1 5 6 1 4 1]


In [6]:
dfatributos = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': selector.ranking_ })
                             
dfatributos = dfatributos.sort_values(by='Importancia', ascending=True)
dfatributos

Unnamed: 0,Atributo,Importancia
0,CRIM,1
2,INDUS,1
3,CHAS,1
4,NOX,1
5,RM,1
7,DIS,1
10,PTRATIO,1
12,LSTAT,1
1,ZN,2
6,AGE,3


In [7]:
# Seleciona os atributos
atributos_selecionados = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM',  'DIS', 
                              'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'] #'AGE',

calcula_metricas(dataset, atributos_selecionados)

MAE: -3.939 (1.991)
MSE: -23.558 (11.116)
RMSE: 4.854 
R^2: 0.721 (0.098)




## Seleção de Atributos -  Seleciona os atributos pela variância

In [8]:
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold()
selector.fit_transform(X)
selector.variances_

array([7.38403597e+01, 1.00000000e+02, 2.72800000e+01, 6.43854770e-02,
       1.34010989e-02, 4.92695216e-01, 9.71000000e+01, 4.42525226e+00,
       2.30000000e+01, 5.24000000e+02, 4.67772630e+00, 3.96580000e+02,
       3.62400000e+01])

In [9]:
dfatributos2 = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': selector.variances_  })
                             
dfatributos2 = dfatributos2.sort_values(by='Importancia', ascending=True)
dfatributos2

Unnamed: 0,Atributo,Importancia
4,NOX,0.013401
3,CHAS,0.064385
5,RM,0.492695
7,DIS,4.425252
10,PTRATIO,4.677726
8,RAD,23.0
2,INDUS,27.28
12,LSTAT,36.24
0,CRIM,73.84036
6,AGE,97.1


In [10]:
# Seleciona os atributos
atributos_selecionados = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', # 'DIS', 
                              'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'] # 'AGE',

calcula_metricas(dataset, atributos_selecionados)

MAE: -4.149 (2.069)
MSE: -26.130 (12.988)
RMSE: 5.112 
R^2: 0.692 (0.112)




## Seleção de Atributos - Utilizando Árvores de Decisão 

Bagged Decision Trees, como o algoritmo RandomForest, podem ser usados para estimar a importância de cada atributo. Esse método retorna um score para cada atributo.

Quanto maior o score, maior a importância do atributo.

In [11]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators=30,max_depth=6)
clf = clf.fit(X, y)
clf.feature_importances_           

array([0.03113753, 0.00062585, 0.00395266, 0.00053151, 0.01643529,
       0.43191119, 0.00827766, 0.06177954, 0.00234545, 0.01313099,
       0.01716502, 0.00992891, 0.40277839])

In [12]:
dfatributos3 = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': clf.feature_importances_  })
                             
dfatributos3 = dfatributos3.sort_values(by='Importancia', ascending=False)
dfatributos3

Unnamed: 0,Atributo,Importancia
5,RM,0.431911
12,LSTAT,0.402778
7,DIS,0.06178
0,CRIM,0.031138
10,PTRATIO,0.017165
4,NOX,0.016435
9,TAX,0.013131
11,B,0.009929
6,AGE,0.008278
2,INDUS,0.003953


In [13]:
dfatributos.sort_values(by=['Importancia', 'Atributo'])

Unnamed: 0,Atributo,Importancia
3,CHAS,1
0,CRIM,1
7,DIS,1
2,INDUS,1
12,LSTAT,1
4,NOX,1
10,PTRATIO,1
5,RM,1
1,ZN,2
6,AGE,3


In [14]:
# Seleciona os atributos
atributos_selecionados = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', # 'DIS', 
                              'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'] # 'AGE',

calcula_metricas(dataset, atributos_selecionados)

MAE: -4.149 (2.069)
MSE: -26.130 (12.988)
RMSE: 5.112 
R^2: 0.692 (0.112)


## Seleção de Atributos - Extra Trees Classifier

In [15]:
# Feature Importance with Extra Trees Classifier

from sklearn.ensemble import ExtraTreesRegressor

# feature extraction
model = ExtraTreesRegressor()
model.fit(X, y)
print(model.feature_importances_)

[0.02829395 0.00511372 0.04042292 0.01279403 0.03868176 0.30002933
 0.0186173  0.03520413 0.01711507 0.03997533 0.04062731 0.01842087
 0.40470427]


In [16]:
dfatributos4 = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': model.feature_importances_ })
                             
dfatributos4 = dfatributos3.sort_values(by='Importancia', ascending=False)
dfatributos4

Unnamed: 0,Atributo,Importancia
5,RM,0.431911
12,LSTAT,0.402778
7,DIS,0.06178
0,CRIM,0.031138
10,PTRATIO,0.017165
4,NOX,0.016435
9,TAX,0.013131
11,B,0.009929
6,AGE,0.008278
2,INDUS,0.003953


In [17]:
# Seleciona os atributos
atributos_selecionados = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', # 'DIS', 
                              'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'] # 'AGE',

calcula_metricas(dataset, atributos_selecionados)

MAE: -4.149 (2.069)
MSE: -26.130 (12.988)
RMSE: 5.112 
R^2: 0.692 (0.112)
