# Notebook 2: Feature Selection and Importance Analysis

In Notebook 1, we observed that there are many potentially irrelevant features, which can decrease the accuracy of the model we build and lead to more overfitting. In this notebook, I'll use several techniques to determine whether feature selection is necessary.

In [40]:
import pandas as pd
import math 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')


from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression, mutual_info_regression, RFE
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.model_selection import cross_val_score 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor

from feature_engine import selection 

import lightgbm as lgb 
import xgboost as xgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


# Notebook author metadata
__author__ = "Vusal Babashov"
__email__ = "vbabashov@gmail.com"
__website__ = 'https://vbabashov.github.io'

In [41]:
#conda update scikit-learn

In [42]:
#import sklearn
#print(sklearn.__version__)

In [43]:
# Load the train features/target, the test features and the test target file into
# pandas DataFrames.
# The data directory can be overridden through the HOUSE_PRICES_DATA_DIR environment
# variable; when it is unset the original location is used, so existing setups keep working.
import os

file_path = os.environ.get("HOUSE_PRICES_DATA_DIR",
                           "/Users/vusalbabashov/Desktop/house-prices/data/")
# os.path.join tolerates a directory given with or without a trailing slash.
df_train = pd.read_csv(os.path.join(file_path, "train.csv"))
df_test_feature = pd.read_csv(os.path.join(file_path, "test.csv"))
df_test_target = pd.read_csv(os.path.join(file_path, "sample_submission.csv"))

In [44]:
# Attach the target column to the test features by matching rows on 'Id'.
# NOTE(review): the test "targets" are read from sample_submission.csv -- confirm that file
# holds genuine sale prices rather than placeholder predictions before trusting these rows.
df_merged = df_test_feature.merge(df_test_target, how='inner', on='Id')
# Stack the train rows on top of the merged test rows, then renumber the index from 0.
df_raw = pd.concat([df_train, df_merged])
df_new = df_raw.reset_index(drop=True)

In [45]:
# From EDA: 'Alley', 'PoolQC', 'Fence' and 'MiscFeature' are missing in a large proportion
# of rows, so they are dropped together with the 'Id' column.
sparse_or_id_cols = ['Id','Alley', 'PoolQC', 'Fence', 'MiscFeature']
df_new = df_new.drop(columns=sparse_or_id_cols)

In [46]:
# Dimensions after removing 'Id' and the four sparse columns
df_new.shape

(2919, 76)

In [47]:
# Remove the 5 outlier rows whose GrLivArea exceeds 4000 sq ft
outlier_rows = df_new.index[df_new['GrLivArea'] > 4000]
df_new = df_new.drop(index=outlier_rows)

In [48]:
# Dimensions after removing the GrLivArea outliers
df_new.shape

(2914, 76)

In [49]:
# Columns grouped by variable type.
# Unordered categorical features ('Alley' and 'MiscFeature' are not listed: they were dropped).
nominal_features = ['MSSubClass', 'MSZoning', 'Street', 'LandContour', 'LotConfig', 
                   'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
                   'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 
                   'Foundation', 'Heating', 'CentralAir', 'GarageType', 'MoSold',
                   'SaleType', 'SaleCondition']

# Ordered categorical features ('PoolQC' and 'Fence' are not listed: they were dropped).
ordinal_features = ['LotShape', 'Utilities', 'LandSlope', 'OverallQual', 'OverallCond', 
                   'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 
                   'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'Electrical', 'KitchenQual', 
                   'Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond',
                   'PavedDrive']


# Numeric features ('Id' is not listed: it was dropped).
# 'SalePrice' is the target and is deliberately kept as the LAST entry: the encoded frame
# is built with these columns at the end, and later cells take the final column as y.
numeric_features = ['LotFrontage','LotArea','YearBuilt','YearRemodAdd','MasVnrArea','BsmtFinSF1',
                  'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea',
                  'BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr', 'TotRmsAbvGrd',
                  'Fireplaces','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch',
                  '3SsnPorch','ScreenPorch','PoolArea','MiscVal', 'GarageYrBlt', 'YrSold','SalePrice']

In [50]:
# Keep only the first of any rows that agree on every listed feature (target included)
dedup_cols = nominal_features + ordinal_features + numeric_features
df_new = df_new.drop_duplicates(subset=dedup_cols)

In [51]:
# Dimensions after de-duplication
df_new.shape

(2914, 76)

In [52]:
# Ordinal category values, each list ordered from the lowest level to the highest.
# NOTE(review): several lists start with the literal 'NA'. pandas.read_csv treats the string
# 'NA' as a missing value by default, and the imputation step fills missing categoricals with
# the most frequent level, so the 'NA' level presumably never reaches the encoder -- verify.
lot_shape = ['IR3','IR2','IR1','Reg']
utilities = ['ELO', 'NoSeWa', 'NoSewr','AllPub']
land_slope = ['Sev','Mod','Gtl']
overall_qual = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # already numeric and ordered
overall_cond = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # already numeric and ordered
exter_qual = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
exter_cond = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
bsmt_qual  = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
bsmt_cond  = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
bsmt_exposure  = ['NA', 'No', 'Mn', 'Av', 'Gd']
bsmt_fin_type1 = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
bsmt_fin_type2 = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
heating_qual = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
electrical = ['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr']
kitchen_qual = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
functional = ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ']
fire_place_qual = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
garage_finish = ['NA', 'Unf', 'RFn', 'Fin']
garage_qual = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
garage_cond = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
paved_drive = ['N', 'P', 'Y']
# Levels for the dropped 'PoolQC' and 'Fence' columns, kept for reference only:
#pool_qc = ['NA', 'Fa', 'TA', 'Gd', 'Ex'] 
#fence = ['NA', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']

# One entry per ordinal feature. The order must match `ordinal_features`, because this list
# is passed as `categories=` to the OrdinalEncoder that is fitted on those columns.
ordinal_categories_list = [lot_shape , utilities, land_slope, overall_qual, overall_cond, exter_qual, exter_cond, bsmt_qual, 
                          bsmt_cond, bsmt_exposure, bsmt_fin_type1, bsmt_fin_type2, heating_qual, electrical, kitchen_qual,
                          functional, fire_place_qual, garage_finish, garage_qual, garage_cond, paved_drive]  

In [53]:
def impute_missing_val_df (df, nominal_features, ordinal_features, numeric_features):
    '''Return a copy of ``df`` with missing values filled in, column by column.

    Columns named in ``nominal_features`` or ``ordinal_features`` are filled with their most
    frequent value (the smallest one when several values are tied). Every other column that
    has missing values is filled with its median. Columns without missing values are left
    untouched. Columns that are almost entirely missing are expected to have been dropped
    by the caller beforehand.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified; the imputed data is returned as a new frame.
    nominal_features, ordinal_features : list of str
        Names of the categorical columns.
    numeric_features : list of str
        Accepted for interface compatibility. It is not consulted: any column that is not
        categorical is treated as numeric.

    Returns
    -------
    pandas.DataFrame
        Imputed copy of ``df``.
    '''
    # Build the membership set once instead of concatenating the lists for every column.
    categorical_cols = set(nominal_features) | set(ordinal_features)
    # Work on a copy so re-running the calling cell always starts from the same input.
    imputed = df.copy()
    for col in imputed.columns:
        column = imputed[col]
        if not column.isnull().any():
            continue
        if col in categorical_cols:
            modes = column.mode(dropna=True)
            if modes.empty:
                # Entirely-missing column: there is no most frequent value to fill with.
                continue
            fill_value = modes.iloc[0]
        else:
            fill_value = column.median()
        imputed[col] = column.fillna(fill_value)
    return imputed

In [54]:
# Fill the missing values: most frequent level for nominal/ordinal columns, median otherwise
df_imputed = impute_missing_val_df (df_new, nominal_features, ordinal_features, numeric_features)

In [64]:
# Cast nominal and ordinal columns to pandas 'category' and the numeric columns to float
categorical_cols = nominal_features + ordinal_features
df_imputed[categorical_cols] = df_imputed[categorical_cols].astype('category')
df_imputed[numeric_features] = df_imputed[numeric_features].astype('float')

In [65]:
# Dimensions after imputation and dtype conversion
df_imputed.shape

(2914, 76)

In [66]:
# Preview the first rows of the imputed frame
df_imputed.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450.0,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0.0,0.0,0.0,0.0,0.0,2,2008.0,WD,Normal,208500.0
1,20,RL,80.0,9600.0,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0.0,0.0,0.0,0.0,0.0,5,2007.0,WD,Normal,181500.0
2,60,RL,68.0,11250.0,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0.0,0.0,0.0,0.0,0.0,9,2008.0,WD,Normal,223500.0
3,70,RL,60.0,9550.0,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272.0,0.0,0.0,0.0,0.0,2,2006.0,WD,Abnorml,140000.0
4,60,RL,84.0,14260.0,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0.0,0.0,0.0,0.0,0.0,12,2008.0,WD,Normal,250000.0


In [57]:
# Every column except the last one is a feature; the last column is the target
feature_cols = df_imputed.columns[:-1]
X_imputed = df_imputed[feature_cols]
y_imputed = df_imputed[df_imputed.columns[-1]]

In [58]:
# Seeded random forest used as the estimator inside the recursive elimination
rfm = RandomForestRegressor(n_estimators=500, random_state=0)

# feature_engine selector settings: 3-fold CV scored with negative RMSE
rfe_settings = dict(
    estimator=rfm,
    scoring='neg_root_mean_squared_error',
    cv=3,
    threshold=0.001,
    variables=None,
)
rfe = selection.RecursiveFeatureElimination(**rfe_settings)

In [59]:
# Fit the recursive eliminator on the imputed (not yet encoded) frame.
# NOTE(review): the recorded output lists MSSubClass/OverallQual/OverallCond among the
# variables, yet an earlier cell casts those columns to 'category'. The cell numbers (In [64]
# for the cast, In [59] here) suggest the recorded run happened before the cast, so a
# top-to-bottom run may consider a different set of variables -- confirm.
rfe.fit(X_imputed, y_imputed)

RecursiveFeatureElimination(estimator=RandomForestRegressor(n_estimators=500,
                                                            random_state=0),
                            scoring='neg_root_mean_squared_error',
                            threshold=0.001,
                            variables=['MSSubClass', 'LotFrontage', 'LotArea',
                                       'OverallQual', 'OverallCond',
                                       'YearBuilt', 'YearRemodAdd',
                                       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
                                       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
                                       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
                                       'BsmtFullBath', 'BsmtHalfBath',
                                       'FullBath', 'HalfBath', 'BedroomAbvGr',
                                       'KitchenAbvGr', 'TotRmsAbvGrd',
                                       'Fireplaces', 'Ga

In [60]:
# Features the fitted eliminator marked for removal
rfe.features_to_drop_

['MasVnrArea',
 'TotalBsmtSF',
 '1stFlrSF',
 'LowQualFinSF',
 'HalfBath',
 'BedroomAbvGr',
 'PoolArea',
 'YrSold']

In [18]:
# One-hot encode the nominal features into a dense array.
# The encoder is left at its default (sparse) output and densified explicitly: the `sparse=`
# constructor argument was renamed to `sparse_output=` in newer scikit-learn releases, while
# calling .toarray() on the result works on old and new versions alike.
ohe = OneHotEncoder()
T = ohe.fit_transform(df_imputed[nominal_features]).toarray()
#T

In [19]:
# Column names for the one-hot matrix, in the form '<feature>_<level>'.
# Newer scikit-learn releases replaced get_feature_names() with get_feature_names_out(),
# so use whichever one the installed version provides.
if hasattr(ohe, "get_feature_names_out"):
    nominal_feature_names = ohe.get_feature_names_out(nominal_features)
else:
    nominal_feature_names = ohe.get_feature_names(nominal_features)

In [20]:
nominal_feature_names  # one-hot column names, one per nominal category level

array(['MSSubClass_20', 'MSSubClass_30', 'MSSubClass_40', 'MSSubClass_45',
       'MSSubClass_50', 'MSSubClass_60', 'MSSubClass_70', 'MSSubClass_75',
       'MSSubClass_80', 'MSSubClass_85', 'MSSubClass_90',
       'MSSubClass_120', 'MSSubClass_150', 'MSSubClass_160',
       'MSSubClass_180', 'MSSubClass_190', 'MSZoning_C (all)',
       'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM',
       'Street_Grvl', 'Street_Pave', 'LandContour_Bnk', 'LandContour_HLS',
       'LandContour_Low', 'LandContour_Lvl', 'LotConfig_Corner',
       'LotConfig_CulDSac', 'LotConfig_FR2', 'LotConfig_FR3',
       'LotConfig_Inside', 'Neighborhood_Blmngtn', 'Neighborhood_Blueste',
       'Neighborhood_BrDale', 'Neighborhood_BrkSide',
       'Neighborhood_ClearCr', 'Neighborhood_CollgCr',
       'Neighborhood_Crawfor', 'Neighborhood_Edwards',
       'Neighborhood_Gilbert', 'Neighborhood_IDOTRR',
       'Neighborhood_MeadowV', 'Neighborhood_Mitchel',
       'Neighborhood_NAmes', 'Neighborhood_NPkVill'

In [21]:
# Encode the ordinal features using the explicit level ordering defined earlier
ore = OrdinalEncoder(categories=ordinal_categories_list)
ordinal_frame = df_imputed[ordinal_features]
Z = ore.fit_transform(ordinal_frame)
#Z

In [22]:
ore.categories_  # level ordering used by the fitted encoder, one array per ordinal feature

[array(['IR3', 'IR2', 'IR1', 'Reg'], dtype=object),
 array(['ELO', 'NoSeWa', 'NoSewr', 'AllPub'], dtype=object),
 array(['Sev', 'Mod', 'Gtl'], dtype=object),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array(['Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NA', 'No', 'Mn', 'Av', 'Gd'], dtype=object),
 array(['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'], dtype=object),
 array(['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'], dtype=object),
 array(['Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'], dtype=object),
 array(['Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
       dtype=object),
 array(['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype

In [23]:
# Assemble the encoded frame column-wise: one-hot nominal block, ordinal codes, numeric columns.
# The parts are held in `encoded_frames` rather than in a variable called `list`, which
# would shadow the built-in `list` for the rest of the session.
# Frames built from the encoder arrays already carry a 0..n-1 index; the numeric slice is
# re-indexed to match, because df_imputed has gaps in its index after rows were dropped.
encoded_frames = [
    pd.DataFrame(T, columns=nominal_feature_names),
    pd.DataFrame(Z, columns=ordinal_features),
    df_imputed[numeric_features].reset_index(drop=True),
]
df_encoded = pd.concat(encoded_frames, axis=1)

In [25]:
# Dimensions of the fully encoded frame (features plus target)
df_encoded.shape

(2914, 236)

In [26]:
# Preview the first rows of the encoded frame
df_encoded.head()

Unnamed: 0,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,GarageYrBlt,YrSold,SalePrice
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,61,0,0,0,0,0,2003.0,2008,208500.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,298,0,0,0,0,0,0,1976.0,2007,181500.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,42,0,0,0,0,0,2001.0,2008,223500.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,35,272,0,0,0,0,1998.0,2006,140000.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,192,84,0,0,0,0,0,2000.0,2008,250000.0


In [27]:
# The target sits in the last column of the encoded frame; everything before it is a feature
target_name = df_encoded.columns[-1]
X = df_encoded.drop(columns=target_name)
y = df_encoded[target_name]

In [28]:
# Feature matrix dimensions
X.shape

(2914, 235)

In [29]:
# Target vector length
y.shape

(2914,)

### 1. Correlation Feature Selection, SelectKBest 

In [35]:
# Univariate selector: rank features with f_regression and keep the 30 highest-scoring ones
fs = SelectKBest(f_regression, k=30)

In [36]:
# Score every feature against the target.
# NOTE(review): X and y hold the train and test rows combined, so the selection sees the
# rows that would later be used for evaluation -- confirm this is intended.
f_bestK = fs.fit(X, y) #use the entire dataset

In [38]:
# Print each feature's score, followed by the names of the selected features
for feature_name, f_score in zip(X.columns, f_bestK.scores_):
    print('Feature %s: %f' % (feature_name, f_score))
selected_mask = f_bestK.get_support()
print ("Selected Features: %s" % X.columns[selected_mask]) # Top 30 important Features

Feature MSSubClass_20: 5.337222
Feature MSSubClass_30: 119.340491
Feature MSSubClass_40: 0.929564
Feature MSSubClass_45: 15.346155
Feature MSSubClass_50: 35.258845
Feature MSSubClass_60: 292.197033
Feature MSSubClass_70: 0.876965
Feature MSSubClass_75: 1.144184
Feature MSSubClass_80: 0.411581
Feature MSSubClass_85: 4.977926
Feature MSSubClass_90: 7.864528
Feature MSSubClass_120: 0.272428
Feature MSSubClass_150: 0.264593
Feature MSSubClass_160: 36.733723
Feature MSSubClass_180: 18.595984
Feature MSSubClass_190: 10.166585
Feature MSZoning_C (all): 17.756391
Feature MSZoning_FV: 4.363566
Feature MSZoning_RH: 9.204038
Feature MSZoning_RL: 142.885495
Feature MSZoning_RM: 170.787894
Feature Street_Grvl: 3.099171
Feature Street_Pave: 3.099171
Feature LandContour_Bnk: 14.749796
Feature LandContour_HLS: 16.499173
Feature LandContour_Low: 6.104743
Feature LandContour_Lvl: 1.878049
Feature LotConfig_Corner: 0.163378
Feature LotConfig_CulDSac: 46.693773
Feature LotConfig_FR2: 0.005021
Feature LotC

In [39]:
#features_train = fit.transform(X_train) # reduce the feature matrix to the important ones

In [40]:
#features_test = fit.transform(X_test)

### 2. Mutual Information Statistics Feature Selection, SelectKBest

In [41]:
# Univariate selector based on mutual information, keeping the 30 highest-scoring features.
# mutual_info_regression adds small random noise to continuous features, so without a fixed
# seed the scores (and possibly the selected set) change from run to run. The seed is bound
# with functools.partial because SelectKBest calls the score function as score_func(X, y).
from functools import partial

mi = SelectKBest(score_func=partial(mutual_info_regression, random_state=0), k=30)

In [42]:
# Compute the mutual-information scores on the full encoded data set
mi_bestK = mi.fit(X, y)

In [43]:
# Show only the names of the features kept by the mutual-information selector
mi_selected = X.columns[mi_bestK.get_support()]
print ("Selected Features: %s" % mi_selected)

Selected Features: Index(['MSSubClass_60', 'MSZoning_RL', 'MSZoning_RM', 'Foundation_PConc',
       'GarageType_Detchd', 'OverallQual', 'ExterQual', 'BsmtQual',
       'BsmtFinType1', 'HeatingQC', 'KitchenQual', 'GarageFinish',
       'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
       'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
       'GarageArea', 'OpenPorchSF', 'GarageYrBlt'],
      dtype='object')


### 3a. Recursive Feature Elimination with LightGBM

In [44]:
# LightGBM regressor with a fixed seed; used as the estimator for RFE in the next cells
model = LGBMRegressor(random_state=1)

In [45]:
# Recursive feature elimination down to 30 features.
# n_features_to_select is passed by keyword: newer scikit-learn releases no longer accept
# it as a positional argument, and the keyword form works on older releases too.
rfe = RFE(estimator=model, n_features_to_select=30)

In [46]:
# Run the elimination with the LightGBM estimator on the full encoded data set
rfe_lgb = rfe.fit(X, y)

In [47]:
# Show only the names of the features kept by RFE with LightGBM
lgb_selected = X.columns[rfe_lgb.support_]
print ("Selected Features: %s" % lgb_selected)

Selected Features: Index(['OverallQual', 'OverallCond', 'ExterQual', 'BsmtQual', 'BsmtExposure',
       'BsmtFinType1', 'HeatingQC', 'KitchenQual', 'Functional', 'FireplaceQu',
       'GarageFinish', 'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
       '2ndFlrSF', 'GrLivArea', 'BedroomAbvGr', 'TotRmsAbvGrd', 'GarageArea',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'GarageYrBlt', 'YrSold'],
      dtype='object')


### 3b. Recursive Feature Elimination (RFE) with XGBoost

In [48]:
# Recursive feature elimination down to 30 features, using a default XGBRegressor
xgb_estimator = XGBRegressor()
rfe = RFE(estimator=xgb_estimator, n_features_to_select=30)
rfe_xgb = rfe.fit(X,y)
xgb_selected = X.columns[rfe_xgb.get_support()]
print ("Selected Features: %s" % xgb_selected)

Selected Features: Index(['Exterior1st_AsphShn', 'Exterior1st_BrkComm', 'Heating_GasA',
       'MoSold_1', 'OverallQual', 'KitchenQual', 'FireplaceQu', 'GarageFinish',
       'PavedDrive', 'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BedroomAbvGr', 'KitchenAbvGr',
       'TotRmsAbvGrd', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'MiscVal', 'GarageYrBlt'],
      dtype='object')


### 4. Sequential Feature Selection - Backward Selection

In [53]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsRegressor

# Backward sequential selection: start from all features and remove them one at a time
# until 30 remain, scoring candidates with a 3-nearest-neighbours regressor.
knn = KNeighborsRegressor(n_neighbors=3)
sfs = SequentialFeatureSelector(knn, direction = 'backward', n_features_to_select=30)
back_bestK=sfs.fit(X, y)
# Only the selected names are reported here. The per-column True/False listing is printed by
# the following cell; the copy of that loop in this cell iterated over `X_encoded`, a name
# that is never defined, and raised NameError on a fresh kernel.
print("Features selected by backward selection: "  
      f"{X.columns[back_bestK.get_support()].tolist()}")    

Features selected by backward selection: ['MoSold_4', 'SaleCondition_Normal', 'OverallCond', 'ExterQual', 'KitchenQual', 'PavedDrive', 'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'GarageYrBlt']


In [57]:
# List every column together with whether backward selection kept it
for column_name, is_selected in zip(X.columns, back_bestK.support_):
    print('Column: %s, Selected: %s' % (column_name, is_selected))

Column: MSSubClass_20, Selected: False
Column: MSSubClass_30, Selected: False
Column: MSSubClass_40, Selected: False
Column: MSSubClass_45, Selected: False
Column: MSSubClass_50, Selected: False
Column: MSSubClass_60, Selected: False
Column: MSSubClass_70, Selected: False
Column: MSSubClass_75, Selected: False
Column: MSSubClass_80, Selected: False
Column: MSSubClass_85, Selected: False
Column: MSSubClass_90, Selected: False
Column: MSSubClass_120, Selected: False
Column: MSSubClass_150, Selected: False
Column: MSSubClass_160, Selected: False
Column: MSSubClass_180, Selected: False
Column: MSSubClass_190, Selected: False
Column: MSZoning_C (all), Selected: False
Column: MSZoning_FV, Selected: False
Column: MSZoning_RH, Selected: False
Column: MSZoning_RL, Selected: False
Column: MSZoning_RM, Selected: False
Column: Street_Grvl, Selected: False
Column: Street_Pave, Selected: False
Column: LandContour_Bnk, Selected: False
Column: LandContour_HLS, Selected: False
Column: LandContour_Low, 

### 5. Sequential Feature Selection - Forward Selection

In [50]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsRegressor

# Forward sequential selection: start from no features and add them one at a time
# until 30 are chosen, scoring candidates with a 3-nearest-neighbours regressor.
knn = KNeighborsRegressor(n_neighbors=3)
forward_bestK = SequentialFeatureSelector(knn, direction = 'forward', n_features_to_select=30)
forward_bestK.fit(X, y)
forward_selected = X.columns[forward_bestK.get_support()].tolist()
print("Features selected by forward selection: "
      f"{forward_selected}")

Features selected by forward selection: ['MSSubClass_150', 'MSSubClass_160', 'MSSubClass_180', 'Neighborhood_NPkVill', 'Neighborhood_NWAmes', 'Condition2_RRAe', 'Condition2_RRAn', 'HouseStyle_2.5Unf', 'RoofStyle_Flat', 'RoofMatl_Tar&Grv', 'Exterior1st_Stucco', 'Exterior2nd_Brk Cmn', 'Exterior2nd_CBlock', 'Exterior2nd_Stone', 'Heating_Floor', 'Heating_GasW', 'Heating_OthW', 'Heating_Wall', 'MoSold_2', 'MoSold_12', 'SaleType_COD', 'SaleType_CWD', 'SaleType_Con', 'SaleType_ConLD', 'SaleType_ConLI', 'SaleType_ConLw', 'SaleType_Oth', 'SaleCondition_Abnorml', 'SaleCondition_Alloca', 'PoolArea']


In [58]:
conda install -c conda-forge feature_engine

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/vusalbabashov/opt/anaconda3

  added / updated specs:
    - feature_engine


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    feature_engine-1.0.2       |     pyhd8ed1ab_0          49 KB  conda-forge
    openssl-1.1.1j             |       hbcf498f_0         1.9 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         1.9 MB

The following NEW packages will be INSTALLED:

  feature_engine     conda-forge/noarch::feature_engine-1.0.2-pyhd8ed1ab_0

The following packages will be SUPERSEDED by a higher-priority channel:

  ca-certificates    pkgs/main::ca-certificates-2021.1.19-~ --> conda-forge::ca-certificates-2020.12.5-h033912b_0
  openssl              pkgs/main::openssl-1.1.

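Although each method selects a different set of important features, several features (mostly numeric columns, e.g. `LotArea`, `YearBuilt` and `2ndFlrSF`) are selected by most of the methods. Forward selection is the notable exception: it picks almost exclusively one-hot encoded columns.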