In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.base import clone
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import RFE, RFECV, SelectFromModel, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures

# Load data

In [3]:
# Read the Bunch fields by name instead of unpacking `.values()` positionally:
# the positional form breaks as soon as the Bunch gains, loses or reorders a key,
# and it also rebinds `_`, which IPython uses for the last cell output.
# NOTE(review): load_boston is deprecated in recent scikit-learn releases and
# removed from the newest ones -- confirm the pinned version still ships it.
boston = load_boston()
X, y, feature_names = boston.data, boston.target, boston.feature_names

In [4]:
# Hold out 20% of the rows for testing (seed 1). Features are wrapped in a
# DataFrame with the dataset's column names, the target in a Series named 'label'.
X_train, X_test, y_train, y_test = train_test_split(
    pd.DataFrame(X, columns=feature_names),
    pd.Series(y, name='label'),
    test_size=0.2,
    random_state=1,
)

In [5]:
# Standardise the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to the test split.
# NOTE: X_train / X_test are rebound here, so re-running this cell scales
# already-scaled data rather than the original split.
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train: Linear Regression

In [270]:
# Cross-validated recursive feature elimination (5 folds, scored by R^2).
# A fresh LinearRegression is passed instead of `model_lr`: that variable is
# only created in a later cell, so referencing it here fails on a clean
# Restart & Run All. RFECV works on clones of the estimator it is given, so an
# unfitted instance is all it needs.
rfe = RFECV(estimator=LinearRegression(), cv=5, scoring='r2')

# Keep only the selected columns; the test split is reduced with the same mask.
X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

In [271]:
# Expand the feature matrix with degree-2 interaction terms
# (interaction_only=True, no bias column). The expansion is fitted on the
# training split and reused unchanged on the test split.
pf = PolynomialFeatures(interaction_only=True, include_bias=False, degree=2)

X_train = pf.fit_transform(X_train)
X_test = pf.transform(X_test)

In [41]:
# LinearRegression is imported with the other scikit-learn imports at the top
# of the notebook, so no import is needed in this cell.
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)

pred_lr = model_lr.predict(X_test)
# Test-set metrics, printed in this order: score(), MAE, MAPE.
print(model_lr.score(X_test, y_test))
print(mean_absolute_error(y_test, pred_lr))
print(mean_absolute_percentage_error(y_test, pred_lr))

0.668759493535632
3.1890919658878483
0.16866394539378712


# Train: Random Forest

In [6]:
# Random forest with a fixed seed; this one `model` object is reused as the
# estimator for every selector in the cells below.
model = RandomForestRegressor(random_state=42)

In [7]:
# Re-split the raw data for the random-forest experiments. Same 80/20 split as
# before but with seed 42, so the rows differ from the linear-regression split.
X_train, X_test, y_train, y_test = train_test_split(
    pd.DataFrame(X, columns=feature_names),
    pd.Series(y, name='label'),
    test_size=0.2,
    random_state=42,
)

In [8]:
# Fresh scaler for the new split: fitted on the training rows only, then
# applied to the test rows. Rebinds X_train / X_test, so the cell is not
# idempotent.
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [284]:
# Cross-validated RFE scored by negative mean absolute error (5 folds).
# NOTE(review): the estimator here is `model_lr` (the linear model) even though
# this is the Random Forest section -- confirm that is intended and not a
# leftover from the previous section.
rfe = RFECV(estimator=model_lr, cv=5, scoring='neg_mean_absolute_error')

X_train = rfe.fit_transform(X_train, y_train)
X_test = rfe.transform(X_test)

In [287]:
# Per-feature ranking from the fitted selector; rank 1 marks a selected feature.
rfe.ranking_

array([1, 3, 4, 2, 1, 1, 5, 1, 1, 1, 1, 1, 1])

In [285]:
# Degree-2 interaction terms (interaction_only=True, no bias column) on top of
# the current feature matrix; fitted on train, reused on test.
pf = PolynomialFeatures(interaction_only=True, include_bias=False, degree=2)

X_train = pf.fit_transform(X_train)
X_test = pf.transform(X_test)

In [27]:
# Fit the forest on the current training matrix, then show its test-set score.
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.8920995891343227

In [42]:
# Three feature selectors built around the same forest.
# NOTE(review): only `rfe` is fitted in the following cells; `sfm` and `sfs`
# are rebound with different arguments further down before they are used.
rfe = RFECV(estimator=model, cv=5, scoring='r2')
sfm = SelectFromModel(estimator=model, threshold='median')
sfs = SequentialFeatureSelector(estimator=model, n_features_to_select=7, scoring='r2')

In [43]:
# Run the cross-validated elimination with the forest as the estimator.
rfe.fit(X_train, y_train)

RFECV(cv=5, estimator=RandomForestRegressor(random_state=42), scoring='r2')

In [44]:
# Drop the columns RFECV rejected, using the same fitted mask for both splits.
X_train = rfe.transform(X_train)
X_test = rfe.transform(X_test)

In [46]:
# Re-fit the existing `pf` expansion (created in an earlier cell) on the reduced
# training matrix and apply it to the test matrix.
X_train = pf.fit_transform(X_train)
X_test = pf.transform(X_test)

## Recursive Feature Elimination

In [86]:
# Plain (non-cross-validated) recursive feature elimination with default settings.
rfe = RFE(estimator=model)

In [87]:
# Fit the eliminator and display the resulting per-feature ranking
# (rank 1 marks a selected feature).
rfe.fit(X_train, y_train)

rfe.ranking_

array([1, 7, 5, 8, 2, 1, 3, 1, 6, 1, 1, 4, 1])

In [92]:
# Fit the forest once on the full training matrix; its feature_importances_
# are used in the next cell to rebuild the RFE ranking by hand.
model.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

In [93]:
# Rebuild an RFE-style ranking from a single fit of `model`:
#   * the top half of the features by importance get rank 1,
#   * the remaining ones get ranks 2, 3, ... as importance decreases.
# The arrays are sized from the fitted model rather than from `feature_names`:
# earlier cells may already have dropped or added columns in X_train, in which
# case `feature_names.size` no longer matches the number of importances and the
# index assignments below would fail.
importances = model.feature_importances_
n_features = importances.size
result = np.zeros(n_features)
rk = np.argsort(importances)  # feature indices, least important first
ch = n_features // 2          # number of features that receive rank 1
result[rk[-ch:]] = 1

# Least important feature gets the largest rank (n_features - ch + 1), down to 2.
result[rk[:-ch]] = np.arange(n_features - ch + 1, 1, -1)

In [94]:
# The hand-built ranking should match the one RFE computed.
# np.array_equal replaces np.alltrue, which is deprecated and no longer exists
# in NumPy 2.0; it also returns False instead of raising on a shape mismatch.
np.array_equal(result, rfe.ranking_)

True

## Sequential Feature Selection: forward

In [95]:
# Sequential selector around the forest, scored with 3-fold cross-validation;
# every other option is left at its default.
sfs = SequentialFeatureSelector(estimator=model, cv=3)

In [97]:
# Run the selection and display the boolean mask of chosen features.
sfs.fit(X_train, y_train)

sfs.support_

array([False, False, False, False,  True,  True, False, False,  True,
        True,  True, False,  True])

In [120]:
# Selection mask for the hand-written forward search: one flag per column of
# X_train, all False to begin with.
features_support = np.zeros(X_train.shape[1], dtype=bool)

In [121]:
# Search settings: pick half of the columns, score candidates with 3-fold CV.
n_select = X_train.shape[1] // 2
n_cv = 3

In [122]:
# Greedy forward selection: each round adds the single not-yet-selected feature
# whose inclusion gives the best mean cross-validation score.
for _ in range(n_select):
    non_selected = np.flatnonzero(~features_support)
    dict_score = {}
    for f in non_selected:
        # Trial mask = everything selected so far plus candidate feature f.
        candidates = features_support.copy()
        candidates[f] = True
        cvs = cross_val_score(
            estimator=model,
            X=X_train[:, candidates],
            y=y_train,
            cv=n_cv,
        ).mean()
        dict_score[f] = cvs
    # On ties, max() returns the first candidate in insertion (index) order.
    selected = max(dict_score, key=dict_score.get)
    features_support[selected] = True

In [125]:
# The hand-written search should select exactly the features SFS selected.
# np.array_equal replaces np.alltrue, which is deprecated and no longer exists
# in NumPy 2.0; it also returns False instead of raising on a shape mismatch.
np.array_equal(features_support, sfs.support_)

True

## Select from model

In [55]:
# Importance-threshold selector around the forest, with the threshold set to
# the mean feature importance.
sfm = SelectFromModel(estimator=model, threshold='mean')

In [56]:
# Fit the selector and display the boolean mask of retained features.
sfm.fit(X_train, y_train)

sfm.get_support()

array([False, False, False, False, False,  True, False, False, False,
       False, False, False,  True])

In [25]:
# Numeric threshold the fitted selector actually used.
sfm.threshold_

0.07692307692307693

In [28]:
# Mean importance of the fitted forest, shown for comparison with the
# selector's threshold_ above.
model.feature_importances_.mean()

0.07692307692307693

In [51]:
class SFM:
    """Minimal re-implementation of importance-threshold feature selection.

    A clone of ``estimator`` is fitted, one importance value per feature is
    read from it, and the features whose importance reaches the threshold are
    reported as selected.

    Parameters
    ----------
    estimator : estimator object
        Model that exposes ``feature_importances_`` or ``coef_`` once fitted.
        It is cloned, so the object passed in is never modified.
    strategy : {'mean', 'median'}, number or array-like, default='mean'
        ``'mean'`` / ``'median'`` use that statistic of the importances as the
        threshold; anything else is interpreted as the numeric threshold itself.

    Attributes
    ----------
    importance : ndarray
        Per-feature importances, set by :meth:`get_support`.
    threshold : float or ndarray
        Threshold used by the last :meth:`get_support` call.
    """

    def __init__(self, estimator, strategy='mean'):
        self.estimator = clone(estimator)
        self.strategy = strategy

    def fit(self, X, y):
        """Fit the cloned estimator on ``X``, ``y`` and return ``self``."""
        self.estimator.fit(X, y)
        return self

    def get_support(self):
        """Return a boolean mask that is True for every selected feature.

        Raises
        ------
        AttributeError
            If the estimator has neither ``feature_importances_`` nor ``coef_``
            (for example because it was never fitted).
        ValueError
            If ``strategy`` is a string other than ``'mean'`` or ``'median'``.
        """
        try:
            self.importance = np.asarray(self.estimator.feature_importances_)
        except AttributeError:
            # Linear-style models: fall back to coefficient magnitudes. Only
            # AttributeError is caught so that unrelated failures are not hidden.
            coef = np.asarray(self.estimator.coef_)
            if coef.ndim <= 1:
                self.importance = np.abs(coef).ravel()
            else:
                # One row of coefficients per output: collapse the rows so there
                # is exactly one value per feature (sum of absolute values).
                self.importance = np.abs(coef).sum(axis=0)

        if isinstance(self.strategy, str):
            if self.strategy == 'mean':
                self.threshold = self.importance.mean()
            elif self.strategy == 'median':
                self.threshold = np.median(self.importance)
            else:
                raise ValueError(
                    "strategy must be 'mean', 'median' or a numeric threshold, "
                    "got %r" % (self.strategy,)
                )
        else:
            # Numeric threshold. Plain Python numbers have no .copy(), so convert
            # through numpy instead; array-like thresholds keep working too.
            threshold = np.asarray(self.strategy, dtype=float)
            self.threshold = float(threshold) if threshold.ndim == 0 else threshold

        # Inclusive comparison, matching SelectFromModel, which keeps features
        # whose importance is greater than or equal to the threshold.
        return self.importance >= self.threshold

In [52]:
# Hand-written selector configured the same way as `sfm` (mean threshold).
sfm0 = SFM(estimator=model, strategy='mean')

In [53]:
# Fit the hand-written selector and display its boolean feature mask.
sfm0.fit(X_train, y_train)

sfm0.get_support()

array([False, False, False, False, False,  True, False, False, False,
       False, False, False,  True])

In [57]:
# The hand-written selector should agree with SelectFromModel on every feature.
# np.array_equal replaces np.alltrue, which is deprecated and no longer exists
# in NumPy 2.0; it also returns False instead of raising on a shape mismatch.
np.array_equal(sfm0.get_support(), sfm.get_support())

True