# Seleção de Atributos (variáveis)

# Métricas

### Selecionar os melhores atributos, depois treinar o modelo e aplicar as métricas

In [1]:
import warnings
warnings.filterwarnings('ignore')

# Criando um modelo
from sklearn import linear_model

def calcula_metricas(dataset, atributos_selecionados):

    X = dataset.loc[:, atributos_selecionados ].values
    y = dataset['target'].values

    modelo = linear_model.LinearRegression() 
    modelo.fit(X,y)

    # MAE - Mean Absolute Error
    from sklearn import model_selection
    kfold = 10
    resultado = model_selection.cross_val_score(modelo, X, y, cv = kfold, 
                                                scoring = 'neg_mean_absolute_error')

    # Print do resultado
    print("MAE: %.3f (%.3f)" % (resultado.mean(), resultado.std()))

    # MSE - Mean Squared Error
    # Definindo os valores para o número de folds
    num_folds = 10
    num_instances = len(X)
    seed = 7

    # Separando os dados em folds
    kfold = model_selection.KFold(num_folds, random_state = seed, shuffle=True)

    resultado = model_selection.cross_val_score(modelo, X, y, cv = kfold, scoring = 'neg_mean_squared_error')

    # Print do resultado
    print("MSE: %.3f (%.3f)" % (resultado.mean(), resultado.std()))

    # RMSE (Root Mean Squared Error
    from math import sqrt
    print("RMSE: %.3f " % (sqrt(abs(resultado.mean()))))

    # R2
    resultado = model_selection.cross_val_score(modelo, X, y, cv = kfold, scoring = 'r2')

    # Print do resultado
    print("R^2: %.3f (%.3f)" % (resultado.mean(), resultado.std()))

# Seleciona os atributos
atributos_selecionados = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', # 'DIS', 
                              'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'] # 'AGE',

#calcula_metricas(dataset, atributos_selecionados)

## Carregando o Dataset Boston Houses

1. CRIM: per capita crime rate by town 
2. ZN: proportion of residential land zoned for lots over 25,000 sq.ft. 
3. INDUS: proportion of non-residential acres per town 
4. CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) 
5. NOX: nitric oxides concentration (parts per 10 million) 
6. RM: average number of rooms per dwelling 
7. AGE: proportion of owner-occupied units built prior to 1940 
8. DIS: weighted distances to five Boston employment centres 
9. RAD: index of accessibility to radial highways 
10. TAX: full-value property-tax rate per 10,000 
11. PTRATIO: pupil-teacher ratio by town 
12. B: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town 
13. LSTAT: % lower status of the population 
14. TARGET: Median value of owner-occupied homes in $1000's

## Seleção de Atributos - Utilizando RFE - Recursive Feature Elimination

Esta técnica para seleção de atributos, que recursivamente remove os atributos e constrói o modelo com os atributos remanescentes. <br> Esta técnica utiliza a acurácia do modelo para identificar os atributos que mais contribuem para prever a variável alvo. <br> Em inglês esta técnica é chamada Recursive Feature Elimination (RFE).


In [3]:
# Gerando o dataset
import pandas as pd
from sklearn.datasets import load_boston

boston = load_boston() 
dataset = pd.DataFrame(boston.data, columns = boston.feature_names)
dataset['target'] = boston.target
dataset.head()
dataset.to_csv('boston_houses.csv', index=False)

In [4]:
print('Número de Atributos:', len(dataset.columns) -1)

Número de Atributos: 13


In [5]:
dataset.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'target'],
      dtype='object')

In [6]:
!ls -lah *.csv

-rw-r--r--@ 1 valencar  staff    38K May 29 20:10 boston_houses.csv
-rw-r--r--@ 1 valencar  staff    33K May 29 18:30 ckd-dataset-v2.csv


In [7]:
!head  boston_houses.csv

CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2
0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21,28.7
0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43,22.9
0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15,27.1
0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93,16.5


# Aplicando a RFE para Seleção dos Melhores Atributos

In [8]:
%%time 

# aplicando a RFE
from sklearn.feature_selection import RFE
from sklearn import linear_model


X = dataset.iloc[:,:-1].values
y = dataset['target'].values

num_atributos_relevantes = 8
estimator = modelo = linear_model.LinearRegression() #normalize = False, fit_intercept = True)
selector = RFE(estimator, n_features_to_select=num_atributos_relevantes, step=1)
selector = selector.fit(X, y)

print("Num Features: ", selector.n_features_)

print(dataset.columns)
print("Selected Features: ", selector.support_)
print("Feature Ranking: ", selector.ranking_)

Num Features:  8
Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'target'],
      dtype='object')
Selected Features:  [ True False False  True  True  True False  True  True False  True False
  True]
Feature Ranking:  [1 3 2 1 1 1 6 1 1 4 1 5 1]
CPU times: user 39 ms, sys: 21.7 ms, total: 60.7 ms
Wall time: 1.73 s


In [9]:
dfatributos = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': selector.ranking_ })
                             
dfatributos = dfatributos.sort_values(by='Importancia', ascending=True)
dfatributos

Unnamed: 0,Atributo,Importancia
0,CRIM,1
3,CHAS,1
4,NOX,1
5,RM,1
7,DIS,1
8,RAD,1
10,PTRATIO,1
12,LSTAT,1
2,INDUS,2
1,ZN,3


In [10]:
%%time 

# aplicando a RFE
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

modelo = RandomForestRegressor(n_estimators=35, max_depth=8)


X = dataset.iloc[:,:-1].values
y = dataset['target'].values

num_atributos_relevantes = 8
estimator = modelo
selector = RFE(estimator, n_features_to_select=num_atributos_relevantes, step=1)
selector = selector.fit(X, y)

print("Num Features: ", selector.n_features_)

print(dataset.columns)
print("Selected Features: ", selector.support_)
print("Feature Ranking: ", selector.ranking_)

Num Features:  8
Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'target'],
      dtype='object')
Selected Features:  [ True False False False  True  True  True  True False  True  True False
  True]
Feature Ranking:  [1 6 4 5 1 1 1 1 3 1 1 2 1]
CPU times: user 483 ms, sys: 37 ms, total: 520 ms
Wall time: 3.96 s


In [11]:
dfatributos = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': selector.ranking_ })
                             
dfatributos = dfatributos.sort_values(by='Importancia', ascending=True)
dfatributos

Unnamed: 0,Atributo,Importancia
0,CRIM,1
4,NOX,1
5,RM,1
6,AGE,1
7,DIS,1
9,TAX,1
10,PTRATIO,1
12,LSTAT,1
11,B,2
8,RAD,3


In [12]:
%%time 

# aplicando a RFE
from sklearn.feature_selection import RFE
from sklearn.svm import SVR


X = dataset.iloc[:,:-1].values
y = dataset['target'].values

num_atributos_relevantes = 8
estimator = SVR(kernel="linear")
selector = RFE(estimator, n_features_to_select=num_atributos_relevantes, step=1)
selector = selector.fit(X, y)

print("Num Features: ", selector.n_features_)

print(dataset.columns)
print("Selected Features: ", selector.support_)
print("Feature Ranking: ", selector.ranking_)

Num Features:  8
Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'target'],
      dtype='object')
Selected Features:  [ True False  True  True  True  True False  True False False  True False
  True]
Feature Ranking:  [1 2 1 1 1 1 3 1 5 6 1 4 1]
CPU times: user 6.99 s, sys: 60.1 ms, total: 7.05 s
Wall time: 7.38 s


In [20]:
dfatributos = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': selector.ranking_ })
                             
dfatributos = dfatributos.sort_values(by='Importancia', ascending=True)
dfatributos

Unnamed: 0,Atributo,Importancia
0,CRIM,1
2,INDUS,1
3,CHAS,1
4,NOX,1
5,RM,1
7,DIS,1
10,PTRATIO,1
12,LSTAT,1
1,ZN,2
6,AGE,3


In [26]:
dfatributos[ dfatributos.Importancia == 1.0 ]['Atributo'].values

array(['CRIM', 'INDUS', 'CHAS', 'NOX', 'RM', 'DIS', 'PTRATIO', 'LSTAT'],
      dtype=object)

# Comparação de Modelos

In [27]:
print("Modelo com 13 atributos")
atributos_selecionados = ['CRIM', 'INDUS', 'CHAS', 'NOX', 'RM', 'DIS', 'PTRATIO', 'LSTAT',
       'ZN', 'AGE', 'B', 'RAD', 'TAX']
calcula_metricas(dataset, atributos_selecionados)

print("\nModelo com atributos selecionados com RFE")
# Seleciona os atributos
atributos_selecionados = ['CRIM', 'INDUS', 'CHAS', 'NOX', 'RM', 'DIS', 'PTRATIO', 'LSTAT']
       #'ZN', 'AGE', 'B', 'RAD', 'TAX']

calcula_metricas(dataset, atributos_selecionados)

Modelo com 13 atributos
MAE: -4.005 (2.084)
MSE: -23.747 (11.143)
RMSE: 4.873 
R^2: 0.718 (0.099)

Modelo com atributos selecionados com RFE
MAE: -3.872 (1.752)
MSE: -24.967 (11.602)
RMSE: 4.997 
R^2: 0.701 (0.104)


## Seleção de Atributos -  Seleciona os atributos pela variância

In [28]:
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold()
selector.fit_transform(X)
selector.variances_

array([7.38403597e+01, 1.00000000e+02, 2.72800000e+01, 6.43854770e-02,
       1.34010989e-02, 4.92695216e-01, 9.71000000e+01, 4.42525226e+00,
       2.30000000e+01, 5.24000000e+02, 4.67772630e+00, 3.96580000e+02,
       3.62400000e+01])

In [29]:
dfatributos2 = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': selector.variances_  })
                             
dfatributos2 = dfatributos2.sort_values(by='Importancia', ascending=True).reset_index(drop=True)
dfatributos2

Unnamed: 0,Atributo,Importancia
0,NOX,0.013401
1,CHAS,0.064385
2,RM,0.492695
3,DIS,4.425252
4,PTRATIO,4.677726
5,RAD,23.0
6,INDUS,27.28
7,LSTAT,36.24
8,CRIM,73.84036
9,AGE,97.1


In [30]:
dfatributos2.Atributo.values

array(['NOX', 'CHAS', 'RM', 'DIS', 'PTRATIO', 'RAD', 'INDUS', 'LSTAT',
       'CRIM', 'AGE', 'ZN', 'B', 'TAX'], dtype=object)

In [31]:
print("Modelo com 13 atributos")
atributos_selecionados = ['NOX', 'CHAS', 'RM', 'DIS', 'PTRATIO', 'RAD', 'INDUS', 'LSTAT',
       'CRIM', 'AGE', 'ZN', 'B', 'TAX']
calcula_metricas(dataset, atributos_selecionados)

print("\nModelo com atributos selecionados com VarianceThreshold")
# Seleciona os atributos
atributos_selecionados = ['NOX', 'CHAS', 'RM', 'DIS', 'PTRATIO', 'RAD', 'INDUS', 'LSTAT',
       'CRIM', 'AGE'] #'ZN', 'B', 'TAX']

calcula_metricas(dataset, atributos_selecionados)

Modelo com 13 atributos
MAE: -4.005 (2.084)
MSE: -23.747 (11.143)
RMSE: 4.873 
R^2: 0.718 (0.099)

Modelo com atributos selecionados com VarianceThreshold
MAE: -4.126 (1.883)
MSE: -24.843 (11.326)
RMSE: 4.984 
R^2: 0.704 (0.101)


## Seleção de Atributos - Utilizando Árvores de Decisão 

Bagged Decision Trees, como o algoritmo RandomForest, podem ser usados para estimar a importância de cada atributo. Esse método retorna um score para cada atributo.

Quanto maior o score, maior a importância do atributo.

In [32]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators=30, max_depth=6)
clf = clf.fit(X, y)
clf.feature_importances_           

array([3.69270713e-02, 1.31465313e-04, 2.88823985e-03, 1.29966494e-05,
       2.25686487e-02, 4.63482899e-01, 8.54632101e-03, 6.32979552e-02,
       2.99235146e-03, 1.18958678e-02, 1.19626946e-02, 8.64256485e-03,
       3.66650924e-01])

In [33]:
dfatributos3 = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': clf.feature_importances_  })
                             
dfatributos3 = dfatributos3.sort_values(by='Importancia', ascending=False)
dfatributos3

Unnamed: 0,Atributo,Importancia
5,RM,0.463483
12,LSTAT,0.366651
7,DIS,0.063298
0,CRIM,0.036927
4,NOX,0.022569
10,PTRATIO,0.011963
9,TAX,0.011896
11,B,0.008643
6,AGE,0.008546
8,RAD,0.002992


# Atividade 1-A: Criar o modelo e as métricas, comparando com o modelo original

## Seleção de Atributos - Extra Trees Classifier

In [34]:
# Feature Importance with Extra Trees Classifier

from sklearn.ensemble import ExtraTreesRegressor

# feature extraction
model = ExtraTreesRegressor()
model.fit(X, y)
print(model.feature_importances_)

[0.03152993 0.00358358 0.04165332 0.01328916 0.04038686 0.32114342
 0.01932855 0.03539281 0.02073321 0.03606272 0.04498046 0.0181595
 0.37375645]


In [35]:
dfatributos4 = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': model.feature_importances_ })
                             
dfatributos4 = dfatributos3.sort_values(by='Importancia', ascending=False).reset_index(drop=True)
dfatributos4

Unnamed: 0,Atributo,Importancia
0,RM,0.463483
1,LSTAT,0.366651
2,DIS,0.063298
3,CRIM,0.036927
4,NOX,0.022569
5,PTRATIO,0.011963
6,TAX,0.011896
7,B,0.008643
8,AGE,0.008546
9,RAD,0.002992


In [36]:
# Seleciona os atributos
atributos_selecionados = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', # 'DIS', 
                              'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'] # 'AGE',

calcula_metricas(dataset, atributos_selecionados)

MAE: -4.149 (2.069)
MSE: -26.130 (12.988)
RMSE: 5.112 
R^2: 0.692 (0.112)


# Atividade 1-B: Criar o modelo e as métricas, comparando com o modelo original