# Seleção de Atributos (variáveis)

# Métricas

### Selecionar os melhores atributos, depois treinar o modelo e aplicar as métricas

In [53]:
import warnings
warnings.filterwarnings('ignore')

# Criando um modelo
from sklearn import linear_model

def calcula_metricas(dataset, atributos_selecionados):

    X = dataset[atributos_selecionados ].values
    y = dataset['TARGET'].values

    modelo = linear_model.LinearRegression() 
    modelo.fit(X,y)

    # MAE - Mean Absolute Error
    from sklearn import model_selection
    kfold = 10
    resultado = model_selection.cross_val_score(modelo, X, y, cv = kfold, 
                                                scoring = 'neg_mean_absolute_error')

    # Print do resultado
    print("MAE: %.3f (%.3f)" % (resultado.mean(), resultado.std()))

    # MSE - Mean Squared Error
    # Definindo os valores para o número de folds
    num_folds = 10
    num_instances = len(X)
    seed = 7

    # Separando os dados em folds
    kfold = model_selection.KFold(num_folds, random_state = seed, shuffle=True)

    resultado = model_selection.cross_val_score(modelo, X, y, cv = kfold, scoring = 'neg_mean_squared_error')

    # Print do resultado
    print("MSE: %.3f (%.3f)" % (resultado.mean(), resultado.std()))

    # RMSE (Root Mean Squared Error
    from math import sqrt
    print("RMSE: %.3f " % (sqrt(abs(resultado.mean()))))

    # R2
    resultado = model_selection.cross_val_score(modelo, X, y, cv = kfold, scoring = 'r2')

    # Print do resultado
    print("R^2: %.3f (%.3f)" % (resultado.mean(), resultado.std()))

# Seleciona os atributos
atributos_selecionados = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'TAX', 'PTRATIO', 'B', 'LSTAT'] # 'AGE',

calcula_metricas(dataset, atributos_selecionados)

MAE: -4.094 (2.137)
MSE: -27.032 (13.097)
RMSE: 5.199 
R^2: 0.678 (0.114)


## Carregando o Dataset Boston Houses

1. CRIM: per capita crime rate by town 
2. ZN: proportion of residential land zoned for lots over 25,000 sq.ft. 
3. INDUS: proportion of non-residential acres per town 
4. CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) 
5. NOX: nitric oxides concentration (parts per 10 million) 
6. RM: average number of rooms per dwelling 
7. AGE: proportion of owner-occupied units built prior to 1940 
8. DIS: weighted distances to five Boston employment centres 
9. RAD: index of accessibility to radial highways 
10. TAX: full-value property-tax rate per 10,000 
11. PTRATIO: pupil-teacher ratio by town 
12. B: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town 
13. LSTAT: % lower status of the population 
14. TARGET (MEDV): Median value homes in $1000's

In [54]:
!wget https://raw.githubusercontent.com/selva86/datasets/refs/heads/master/BostonHousing.csv

--2024-10-28 18:53:11--  https://raw.githubusercontent.com/selva86/datasets/refs/heads/master/BostonHousing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8002::154, 2606:50c0:8001::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8002::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 35735 (35K) [text/plain]
Saving to: ‘BostonHousing.csv.2’


2024-10-28 18:53:12 (45.4 MB/s) - ‘BostonHousing.csv.2’ saved [35735/35735]



In [55]:
!head BostonHousing.csv

"crim","zn","indus","chas","nox","rm","age","dis","rad","tax","ptratio","b","lstat","medv"
0.00632,18,2.31,"0",0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24
0.02731,0,7.07,"0",0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
0.02729,0,7.07,"0",0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
0.03237,0,2.18,"0",0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
0.06905,0,2.18,"0",0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
0.02985,0,2.18,"0",0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
0.08829,12.5,7.87,"0",0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9
0.14455,12.5,7.87,"0",0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1
0.21124,12.5,7.87,"0",0.524,5.631,100,6.0821,5,311,15.2,386.63,29.93,16.5


In [56]:
lower = ["crim","zn","indus","chas","nox","rm","age","dis","rad","tax","ptratio","b","lstat","medv"]
upper = [x.upper() for x in lower]
print(upper)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']


In [57]:
import pandas as pd

file = 'BostonHousing.csv'
df = pd.read_csv(file)
df.columns = upper
df = df.rename(columns={'MEDV': 'TARGET'})
dataset = df.copy()
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,TARGET
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


## Seleção de Atributos - Utilizando RFE - Recursive Feature Elimination

Esta técnica para seleção de atributos, que recursivamente remove os atributos e constrói o modelo com os atributos remanescentes. <br> Esta técnica utiliza a acurácia do modelo para identificar os atributos que mais contribuem para prever a variável alvo. <br> Em inglês esta técnica é chamada Recursive Feature Elimination (RFE).


In [58]:
print('Número de Atributos:', len(dataset.columns) -1)

Número de Atributos: 13


In [59]:
print(list(dataset.columns))

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'TARGET']


# Aplicando a RFE para Seleção dos Melhores Atributos

In [60]:
%%time 
# aplicando a RFE
from sklearn.feature_selection import RFE
from sklearn import linear_model

X = dataset.iloc[:,:-1].values
y = dataset['TARGET'].values

num_atributos_relevantes = 8
estimator = modelo = linear_model.LinearRegression() #normalize = False, fit_intercept = True)
selector = RFE(estimator, n_features_to_select=num_atributos_relevantes, step=1)
selector = selector.fit(X, y)

print("Num Features: ", selector.n_features_)

print(dataset.columns)
print("Selected Features: ", selector.support_)
print("Feature Ranking: ", selector.ranking_)

Num Features:  8
Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'TARGET'],
      dtype='object')
Selected Features:  [ True False False  True  True  True False  True  True False  True False
  True]
Feature Ranking:  [1 3 2 1 1 1 6 1 1 4 1 5 1]
CPU times: user 9.64 ms, sys: 2.7 ms, total: 12.3 ms
Wall time: 15.6 ms


In [61]:
dfatributos = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': selector.ranking_ })
                             
dfatributos = dfatributos.sort_values(by='Importancia', ascending=True)
dfatributos

Unnamed: 0,Atributo,Importancia
0,CRIM,1
3,CHAS,1
4,NOX,1
5,RM,1
7,DIS,1
8,RAD,1
10,PTRATIO,1
12,LSTAT,1
2,INDUS,2
1,ZN,3


In [62]:
%%time 

# aplicando a RFE
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor

modelo = RandomForestRegressor(n_estimators=35, max_depth=8)


X = dataset.iloc[:,:-1].values
y = dataset['TARGET'].values

num_atributos_relevantes = 8
estimator = modelo
selector = RFE(estimator, n_features_to_select=num_atributos_relevantes, step=1)
selector = selector.fit(X, y)

print("Num Features: ", selector.n_features_)

print(dataset.columns)
print("Selected Features: ", selector.support_)
print("Feature Ranking: ", selector.ranking_)

Num Features:  8
Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'TARGET'],
      dtype='object')
Selected Features:  [ True False False False  True  True False  True False  True  True  True
  True]
Feature Ranking:  [1 5 3 6 1 1 2 1 4 1 1 1 1]
CPU times: user 788 ms, sys: 22 ms, total: 810 ms
Wall time: 1.03 s


In [63]:
dfatributos = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': selector.ranking_ })
                             
dfatributos = dfatributos.sort_values(by='Importancia', ascending=True)
dfatributos

Unnamed: 0,Atributo,Importancia
0,CRIM,1
4,NOX,1
5,RM,1
7,DIS,1
9,TAX,1
10,PTRATIO,1
11,B,1
12,LSTAT,1
6,AGE,2
2,INDUS,3


In [64]:
%%time 

# aplicando a RFE
from sklearn.feature_selection import RFE
from sklearn.svm import SVR


X = dataset.iloc[:,:-1].values
y = dataset['TARGET'].values

num_atributos_relevantes = 8
estimator = SVR(kernel="linear")
selector = RFE(estimator, n_features_to_select=num_atributos_relevantes, step=1)
selector = selector.fit(X, y)

print("Num Features: ", selector.n_features_)

print(dataset.columns)
print("Selected Features: ", selector.support_)
print("Feature Ranking: ", selector.ranking_)

Num Features:  8
Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'TARGET'],
      dtype='object')
Selected Features:  [ True False  True  True  True  True False  True False False  True False
  True]
Feature Ranking:  [1 2 1 1 1 1 3 1 5 6 1 4 1]
CPU times: user 4.07 s, sys: 64.9 ms, total: 4.13 s
Wall time: 4.45 s


In [65]:
dfatributos = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': selector.ranking_ })
                             
dfatributos = dfatributos.sort_values(by='Importancia', ascending=True)
dfatributos

Unnamed: 0,Atributo,Importancia
0,CRIM,1
2,INDUS,1
3,CHAS,1
4,NOX,1
5,RM,1
7,DIS,1
10,PTRATIO,1
12,LSTAT,1
1,ZN,2
6,AGE,3


# Comparação de Modelos

In [66]:
print("Modelo com 13 atributos")
atributos_selecionados = ['CRIM', 'INDUS', 'CHAS', 'NOX', 'RM', 'DIS', 'PTRATIO', 'LSTAT', 'ZN', 'AGE', 'B', 'RAD', 'TAX']
calcula_metricas(dataset, atributos_selecionados)

print("\nModelo com atributos selecionados com RFE")
# Seleciona os atributos
atributos_selecionados = ['CRIM', 'INDUS', 'CHAS', 'NOX', 'RM', 'DIS', 'PTRATIO', 'LSTAT']
       #'ZN', 'AGE', 'B', 'RAD', 'TAX']

calcula_metricas(dataset, atributos_selecionados)

Modelo com 13 atributos
MAE: -4.005 (2.084)
MSE: -23.747 (11.143)
RMSE: 4.873 
R^2: 0.718 (0.099)

Modelo com atributos selecionados com RFE
MAE: -3.872 (1.752)
MSE: -24.967 (11.602)
RMSE: 4.997 
R^2: 0.701 (0.104)


## Seleção de Atributos -  Seleciona os atributos pela variância

In [68]:
np.set_printoptions(suppress=True)

In [69]:
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold()
selector.fit_transform(X)
selector.variances_

array([ 73.84035967, 100.        ,  27.28      ,   0.06438548,
         0.0134011 ,   0.49269522,  97.1       ,   4.42525226,
        23.        , 524.        ,   4.6777263 , 396.58      ,
        36.24      ])

In [70]:
dfatributos2 = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': selector.variances_  })
                             
dfatributos2 = dfatributos2.sort_values(by='Importancia', ascending=True).reset_index(drop=True)
dfatributos2

Unnamed: 0,Atributo,Importancia
0,NOX,0.013401
1,CHAS,0.064385
2,RM,0.492695
3,DIS,4.425252
4,PTRATIO,4.677726
5,RAD,23.0
6,INDUS,27.28
7,LSTAT,36.24
8,CRIM,73.84036
9,AGE,97.1


In [71]:
dfatributos2.Atributo.values

array(['NOX', 'CHAS', 'RM', 'DIS', 'PTRATIO', 'RAD', 'INDUS', 'LSTAT',
       'CRIM', 'AGE', 'ZN', 'B', 'TAX'], dtype=object)

In [72]:
print("Modelo com 13 atributos")
atributos_selecionados = ['NOX', 'CHAS', 'RM', 'DIS', 'PTRATIO', 'RAD', 'INDUS', 'LSTAT',
       'CRIM', 'AGE', 'ZN', 'B', 'TAX']
calcula_metricas(dataset, atributos_selecionados)

print("\nModelo com atributos selecionados com VarianceThreshold")
# Seleciona os atributos
atributos_selecionados = ['NOX', 'CHAS', 'RM', 'DIS', 'PTRATIO', 'RAD', 'INDUS', 'LSTAT',
       'CRIM', 'AGE'] #'ZN', 'B', 'TAX']

calcula_metricas(dataset, atributos_selecionados)

Modelo com 13 atributos
MAE: -4.005 (2.084)
MSE: -23.747 (11.143)
RMSE: 4.873 
R^2: 0.718 (0.099)

Modelo com atributos selecionados com VarianceThreshold
MAE: -4.126 (1.883)
MSE: -24.843 (11.326)
RMSE: 4.984 
R^2: 0.704 (0.101)


## Seleção de Atributos - Utilizando Árvores de Decisão 

Bagged Decision Trees, como o algoritmo RandomForest, podem ser usados para estimar a importância de cada atributo. Esse método retorna um score para cada atributo.

Quanto maior o score, maior a importância do atributo.

In [73]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_estimators=30, max_depth=6)
clf = clf.fit(X, y)
clf.feature_importances_           

array([0.03640266, 0.00050954, 0.00429636, 0.00010724, 0.02087586,
       0.41541683, 0.00688622, 0.06037615, 0.00256316, 0.01244706,
       0.01678201, 0.00601979, 0.41731712])

In [74]:
dfatributos3 = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': clf.feature_importances_  })
                             
dfatributos3 = dfatributos3.sort_values(by='Importancia', ascending=False)
dfatributos3

Unnamed: 0,Atributo,Importancia
12,LSTAT,0.417317
5,RM,0.415417
7,DIS,0.060376
0,CRIM,0.036403
4,NOX,0.020876
10,PTRATIO,0.016782
9,TAX,0.012447
6,AGE,0.006886
11,B,0.00602
2,INDUS,0.004296


# Atividade 1-A: Criar o modelo e as métricas, comparando com o modelo original

## Seleção de Atributos - Extra Trees Classifier

In [None]:
# Feature Importance with Extra Trees Classifier

from sklearn.ensemble import ExtraTreesRegressor

# feature extraction
model = ExtraTreesRegressor()
model.fit(X, y)
print(model.feature_importances_)

In [None]:
dfatributos4 = pd.DataFrame( {'Atributo': dataset.columns[:-1],
                              'Importancia': model.feature_importances_ })
                             
dfatributos4 = dfatributos3.sort_values(by='Importancia', ascending=False).reset_index(drop=True)
dfatributos4

In [None]:
# Seleciona os atributos
atributos_selecionados = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', # 'DIS', 
                              'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'] # 'AGE',

calcula_metricas(dataset, atributos_selecionados)

# Atividade 1-B: Criar o modelo e as métricas, comparando com o modelo original