# Notebook 2: Feature Selection

From the EDA, we observed many potentially irrelevant features, which can reduce the accuracy of the model we build and increase overfitting. In this notebook, I use several techniques to identify the most important features.

In [98]:
import math
import warnings
from functools import partial

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

warnings.filterwarnings('ignore')


from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression, mutual_info_regression, RFE
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor


import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


#your info here
__author__ = "Vusal Babashov"
__email__ = "vbabashov@gmail.com"
__website__ = 'https://vbabashov.github.io'

In [99]:
#conda update scikit-learn

In [100]:
#import sklearn
#print(sklearn.__version__)

In [101]:
# Read the train features+target, the test features, and the sample submission
# into pandas DataFrames.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere without editing it.
# NOTE(review): the third file is later merged in as the test rows' SalePrice.
# If it is the competition's sample submission, those values are placeholder
# predictions rather than observed prices -- confirm before trusting any
# target-based ranking computed on the combined data.
file_path = "/Users/vusalbabashov/Desktop/house-prices/data/"
df_train, df_test_feature, df_test_target = (
    pd.read_csv(file_path + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [102]:
# Join the test features with the test-target file on 'Id' (inner join), then
# stack the training rows on top and renumber the rows 0..n-1 so the index is
# unique across both sources.
df_merged = df_test_feature.merge(df_test_target, how='inner', on='Id')
df_raw = pd.concat([df_train, df_merged])
df_new = df_raw.reset_index(drop=True)

In [103]:
# EDA showed that 'Alley', 'PoolQC', 'Fence' and 'MiscFeature' are mostly missing;
# drop them together with the 'Id' row identifier.
sparse_or_id_columns = ['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature']
df_new = df_new.drop(columns=sparse_or_id_columns)

In [104]:
# Check the frame's dimensions after removing the five columns.
df_new.shape

(2919, 76)

In [105]:
# Remove the outlier houses whose above-ground living area exceeds 4000 sq ft.
# Rows with a missing GrLivArea compare False and are therefore kept.
oversized = df_new['GrLivArea'] > 4000
df_new = df_new.drop(index=df_new.index[oversized])

In [106]:
# Re-check the row count after filtering out the GrLivArea outliers.
df_new.shape

(2914, 76)

In [107]:
# Columns grouped by variable type. Each group is encoded differently further down.

# Unordered categories; these are one-hot encoded.
nominal_features = ['MSSubClass', 'MSZoning', 'Street', 'LandContour', 'LotConfig', 
                   'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
                   'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 
                   'Foundation', 'Heating', 'CentralAir', 'GarageType',
                   'SaleType', 'SaleCondition'] # Alley and MiscFeature were dropped earlier

# Ordered categories; these are integer-encoded. The order of this list must match
# the order of ordinal_categories_list, which supplies the levels for each column.
ordinal_features = ['LotShape', 'Utilities', 'LandSlope', 'OverallQual', 'OverallCond', 
                   'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 
                   'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'Electrical', 'KitchenQual', 
                   'Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond',
                   'PavedDrive'] # PoolQC and Fence were dropped earlier


# Numeric columns, passed through without encoding. 'Id' was dropped earlier.
# 'SalePrice' is the target, not a predictor; it is listed last so that it ends up
# as the final column of the encoded frame.
numeric_features = ['LotFrontage','LotArea','YearBuilt','YearRemodAdd','MasVnrArea','BsmtFinSF1',
                  'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea',
                  'BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr', 'TotRmsAbvGrd',
                  'Fireplaces','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch',
                  '3SsnPorch','ScreenPorch','PoolArea','MiscVal', 'GarageYrBlt', 'YrSold', 'MoSold' , 'SalePrice']

In [108]:
# Drop rows that are identical across every listed feature column (the target
# included, since 'SalePrice' is part of numeric_features); the first occurrence
# of each duplicate group is kept.
all_listed_columns = nominal_features + ordinal_features + numeric_features
df_new = df_new.drop_duplicates(subset=all_listed_columns)

In [109]:
# Row count after de-duplication; compare with the previous shape to see how many rows were removed.
df_new.shape

(2914, 76)

In [110]:
# Levels of each ordinal feature, listed in ascending order. OrdinalEncoder
# assigns integer codes by list position, so the first level becomes 0.
# NOTE(review): several lists start with an 'NA' level. pandas.read_csv treats the
# literal string "NA" as a missing value by default, and the imputation step
# replaces missing categoricals with the most frequent level, so the 'NA' level
# is presumably never seen by the encoder -- confirm whether "feature absent"
# should instead be kept as its own level.
lot_shape = ['IR3','IR2','IR1','Reg']
utilities = ['ELO', 'NoSeWa', 'NoSewr','AllPub']
land_slope = ['Sev','Mod','Gtl']
overall_qual = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # already numeric and ordered
overall_cond = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # already numeric and ordered
exter_qual = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
exter_cond = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
bsmt_qual  = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
bsmt_cond  = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
bsmt_exposure  = ['NA', 'No', 'Mn', 'Av', 'Gd']
bsmt_fin_type1 = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
bsmt_fin_type2 = ['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
heating_qual = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
electrical = ['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr']
kitchen_qual = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
functional = ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ']
fire_place_qual = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
garage_finish = ['NA', 'Unf', 'RFn', 'Fin']
garage_qual = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
garage_cond = ['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
paved_drive = ['N', 'P', 'Y']
# Levels for the two ordinal columns that were dropped earlier (kept for reference):
#pool_qc = ['NA', 'Fa', 'TA', 'Gd', 'Ex'] 
#fence = ['NA', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']

# One entry per column of ordinal_features, in the same order as that list.
ordinal_categories_list = [lot_shape , utilities, land_slope, overall_qual, overall_cond, exter_qual, exter_cond, bsmt_qual, 
                          bsmt_cond, bsmt_exposure, bsmt_fin_type1, bsmt_fin_type2, heating_qual, electrical, kitchen_qual,
                          functional, fire_place_qual, garage_finish, garage_qual, garage_cond, paved_drive]  

In [111]:
def impute_missing_val_df(df, nominal_features, ordinal_features, numeric_features):
    '''Return a copy of ``df`` with missing values filled in, column by column.

    Columns named in ``nominal_features`` or ``ordinal_features`` are filled with
    their most frequent value (the smallest one on ties). Every other column that
    contains missing values is filled with its mean. Columns without missing
    values are left untouched.

    No missingness threshold is applied here; very sparse columns must be
    dropped by the caller beforehand.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is NOT modified; the result is a new frame.
    nominal_features, ordinal_features : list of str
        Names of the categorical columns (imputed with the mode).
    numeric_features : list of str
        Accepted for interface compatibility. Any column not listed as
        categorical is treated as numeric, whether or not it appears here.

    Returns
    -------
    pandas.DataFrame
        Imputed copy of ``df`` with the same shape, index and column order.
    '''
    categorical = set(nominal_features) | set(ordinal_features)
    # Work on a copy so the caller's frame is not silently altered.
    imputed = df.copy()
    for col in imputed.columns:
        column = imputed[col]
        if not column.isnull().any():
            continue
        if col in categorical:
            modes = column.mode(dropna=True)
            # An all-missing column has no mode; leave it as it is.
            if not modes.empty:
                imputed[col] = column.fillna(modes.iloc[0])
        else:
            imputed[col] = column.fillna(column.mean())
    return imputed

In [112]:
# Fill the remaining gaps: mode for categorical columns, mean for the rest.
df_imputed = impute_missing_val_df(
    df=df_new,
    nominal_features=nominal_features,
    ordinal_features=ordinal_features,
    numeric_features=numeric_features,
)

In [113]:
# Imputation only fills values, so the dimensions should match the pre-imputation frame.
df_imputed.shape

(2914, 76)

In [114]:
# One-hot encode the nominal features into a dense array (one column per level).
# The dense-output switch is named `sparse_output` in newer scikit-learn releases
# and `sparse` in older ones; the old name was eventually removed. Try the new
# name first and fall back, so the cell runs on either.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
T = ohe.fit_transform(df_imputed[nominal_features])

In [115]:
# Column labels for the one-hot matrix, of the form "<feature>_<level>".
# `get_feature_names` was removed from newer scikit-learn releases in favour of
# `get_feature_names_out`; use whichever this installation provides.
_ohe_feature_namer = getattr(ohe, "get_feature_names_out", None) or ohe.get_feature_names
nominal_feature_names = _ohe_feature_namer(nominal_features)

In [175]:
nominal_feature_names  # one-hot column labels, one per level of each nominal feature

array(['MSSubClass_20', 'MSSubClass_30', 'MSSubClass_40', 'MSSubClass_45',
       'MSSubClass_50', 'MSSubClass_60', 'MSSubClass_70', 'MSSubClass_75',
       'MSSubClass_80', 'MSSubClass_85', 'MSSubClass_90',
       'MSSubClass_120', 'MSSubClass_150', 'MSSubClass_160',
       'MSSubClass_180', 'MSSubClass_190', 'MSZoning_C (all)',
       'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM',
       'Street_Grvl', 'Street_Pave', 'LandContour_Bnk', 'LandContour_HLS',
       'LandContour_Low', 'LandContour_Lvl', 'LotConfig_Corner',
       'LotConfig_CulDSac', 'LotConfig_FR2', 'LotConfig_FR3',
       'LotConfig_Inside', 'Neighborhood_Blmngtn', 'Neighborhood_Blueste',
       'Neighborhood_BrDale', 'Neighborhood_BrkSide',
       'Neighborhood_ClearCr', 'Neighborhood_CollgCr',
       'Neighborhood_Crawfor', 'Neighborhood_Edwards',
       'Neighborhood_Gilbert', 'Neighborhood_IDOTRR',
       'Neighborhood_MeadowV', 'Neighborhood_Mitchel',
       'Neighborhood_NAmes', 'Neighborhood_NPkVill'

In [117]:
# Integer-encode the ordinal features using the explicit level orderings above:
# learn the mapping first, then apply it to the same columns.
ordinal_frame = df_imputed[ordinal_features]
ore = OrdinalEncoder(categories=ordinal_categories_list).fit(ordinal_frame)
Z = ore.transform(ordinal_frame)

In [176]:
ore.categories_  # level ordering used for each ordinal feature, in column order

[array(['IR3', 'IR2', 'IR1', 'Reg'], dtype=object),
 array(['ELO', 'NoSeWa', 'NoSewr', 'AllPub'], dtype=object),
 array(['Sev', 'Mod', 'Gtl'], dtype=object),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]),
 array(['Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['NA', 'No', 'Mn', 'Av', 'Gd'], dtype=object),
 array(['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'], dtype=object),
 array(['NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'], dtype=object),
 array(['Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'], dtype=object),
 array(['Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype=object),
 array(['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
       dtype=object),
 array(['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'], dtype

In [119]:
list=[pd.DataFrame(T,columns=nominal_feature_names).reset_index(drop=True), 
                pd.DataFrame(Z,columns=ordinal_features).reset_index(drop=True), df_imputed[numeric_features].reset_index(drop=True)]

In [120]:
df_encoded = pd.concat(list, axis=1)

In [121]:
# The row count should match df_imputed; the column count grows because of one-hot encoding.
df_encoded.shape

(2914, 225)

In [122]:
# Preview the first rows of the fully encoded frame.
df_encoded.head()

Unnamed: 0,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,GarageYrBlt,YrSold,MoSold,SalePrice
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,61,0,0,0,0,0,2003.0,2008,2,208500.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1976.0,2007,5,181500.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,42,0,0,0,0,0,2001.0,2008,9,223500.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,35,272,0,0,0,0,1998.0,2006,2,140000.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,84,0,0,0,0,0,2000.0,2008,12,250000.0


In [123]:
# Separate predictors from the target. The target is picked by name rather than
# by position, so the split stays correct if the column order ever changes.
X = df_encoded.drop(columns=['SalePrice'])
y = df_encoded['SalePrice']

In [124]:
# Predictor matrix: one column fewer than df_encoded (the target was removed).
X.shape

(2914, 224)

In [125]:
# Target vector: one value per row of X.
y.shape

(2914,)

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=0)

### 1. Correlation Feature Selection, SelectKBest 

In [150]:
# Univariate selector: score each feature against the target with f_regression
# and keep the 30 highest-scoring ones.
fs = SelectKBest(score_func=f_regression, k = 30)

In [151]:
# Fit on every row of X; no train/test split is made in this notebook.
# NOTE(review): X also contains the rows that came from the test file -- confirm
# that their SalePrice values are real before relying on this ranking.
f_bestK = fs.fit(X, y)

In [165]:
# Names of the 30 features retained by the f_regression selector.
# (Per-feature scores are available in f_bestK.scores_, aligned with X.columns.)
f_regression_selected = X.columns[f_bestK.get_support()]
print("Selected Features: %s" % f_regression_selected)

Selected Features: Index(['MSSubClass_60', 'Neighborhood_NridgHt', 'MasVnrType_None',
       'Foundation_PConc', 'GarageType_Detchd', 'SaleType_New',
       'SaleCondition_Partial', 'OverallQual', 'ExterQual', 'BsmtQual',
       'BsmtExposure', 'HeatingQC', 'KitchenQual', 'GarageFinish',
       'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
       'FullBath', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea',
       'GarageYrBlt'],
      dtype='object')


In [None]:
#features_train = fit.transform(X_train) # reduce the feature matrix to the important ones

In [None]:
#features_test = fit.transform(X_test)

### 2. Mutual Information Statistics Feature Selection, SelectKBest

In [154]:
# Univariate selector based on estimated mutual information with the target;
# keeps the 30 highest-scoring features.
# mutual_info_regression uses random number generation internally, so its scores
# (and hence the selected set) can change between runs unless random_state is
# pinned. The seed matches the one used for the LightGBM model below.
mi = SelectKBest(score_func=partial(mutual_info_regression, random_state=1), k=30)

In [155]:
# Fit the mutual-information selector on every row of X.
mi_bestK = mi.fit(X, y)

In [166]:
# Names of the 30 features retained by the mutual-information selector.
# (Per-feature scores are available in mi_bestK.scores_, aligned with X.columns.)
mutual_info_selected = X.columns[mi_bestK.get_support()]
print("Selected Features: %s" % mutual_info_selected)

Selected Features: Index(['MSSubClass_60', 'Foundation_PConc', 'GarageType_Detchd',
       'SaleType_New', 'OverallQual', 'ExterQual', 'BsmtQual', 'BsmtFinType1',
       'HeatingQC', 'KitchenQual', 'GarageFinish', 'LotFrontage', 'LotArea',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'FullBath', 'BedroomAbvGr',
       'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'GarageYrBlt'],
      dtype='object')


### 3a. Recursive Feature Elimination with LightGBM

In [157]:
# Base estimator handed to RFE below; random_state is fixed so repeated runs use the same seed.
model = LGBMRegressor(random_state=1)

In [158]:
# Recursive feature elimination down to 30 features, driven by the LightGBM model.
# The target count is passed by keyword: newer scikit-learn releases reject it as
# a positional argument, and the XGBoost RFE cell already uses the keyword form.
rfe = RFE(estimator=model, n_features_to_select=30)

In [159]:
# Run the elimination on every row of X; this refits the LightGBM model repeatedly.
rfe_lgb = rfe.fit(X, y)

In [161]:
# Names of the 30 features that survived LightGBM-driven elimination.
# (rfe_lgb.ranking_ holds the elimination rank of every column of X.)
lgb_selected = X.columns[rfe_lgb.support_]
print("Selected Features: %s" % lgb_selected)

Selected Features: Index(['OverallQual', 'OverallCond', 'ExterQual', 'BsmtQual', 'BsmtExposure',
       'BsmtFinType1', 'KitchenQual', 'Functional', 'GarageFinish',
       'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'GrLivArea', 'BedroomAbvGr', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'GarageYrBlt', 'YrSold',
       'MoSold'],
      dtype='object')


### 3b. Recursive Feature Elimination (RFE) with XGBoost

In [167]:
# Same recursive elimination as above, with a default XGBoost regressor as the
# estimator. Note that this rebinds `rfe`, replacing the LightGBM-based selector.
rfe = RFE(estimator=XGBRegressor(), n_features_to_select=30)
rfe_xgb = rfe.fit(X, y)
# (rfe_xgb.ranking_ holds the elimination rank of every column of X.)
xgb_selected = X.columns[rfe_xgb.get_support()]
print("Selected Features: %s" % xgb_selected)

Selected Features: Index(['MSZoning_C (all)', 'Neighborhood_StoneBr', 'Exterior1st_BrkComm',
       'GarageType_CarPort', 'OverallQual', 'KitchenQual', 'Functional',
       'FireplaceQu', 'GarageFinish', 'LotFrontage', 'LotArea', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BedroomAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'ScreenPorch',
       'GarageYrBlt', 'YrSold', 'MoSold'],
      dtype='object')


### 4. Sequential Feature Selection - Backward Selection

In [172]:
# Backward sequential selection: start from all features and repeatedly remove
# one until 30 remain, scoring candidates with a 3-nearest-neighbour regressor.
# SequentialFeatureSelector and KNeighborsRegressor are imported in the import
# cell at the top of the notebook.
# NOTE(review): X is not standardized anywhere in this notebook and KNN distances
# are scale-sensitive -- consider scaling before trusting this ranking.
knn = KNeighborsRegressor(n_neighbors=3)
sfs = SequentialFeatureSelector(knn, direction='backward', n_features_to_select=30)
back_bestK = sfs.fit(X, y)
print("Features selected by backward selection: "
      f"{X.columns[back_bestK.get_support()].tolist()}")

Features selected by backward selection: ['OverallQual', 'GarageCond', 'PavedDrive', 'LotFrontage', 'LotArea', 'YearBuilt', 'MasVnrArea', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'GarageYrBlt', 'YrSold', 'MoSold']


### 5. Sequential Feature Selection - Forward Selection

In [174]:
# Forward sequential selection: start from no features and repeatedly add one
# until 30 are selected, scoring candidates with a 3-nearest-neighbour regressor.
# SequentialFeatureSelector and KNeighborsRegressor are imported in the import
# cell at the top of the notebook.
# NOTE(review): X is not standardized anywhere in this notebook and KNN distances
# are scale-sensitive -- consider scaling before trusting this ranking.
knn = KNeighborsRegressor(n_neighbors=3)
forward_bestK = SequentialFeatureSelector(knn, direction='forward', n_features_to_select=30)
forward_bestK.fit(X, y)
print("Features selected by forward selection: "
      f"{X.columns[forward_bestK.get_support()].tolist()}")

Features selected by forward selection: ['MSSubClass_85', 'MSSubClass_150', 'MSSubClass_160', 'MSSubClass_180', 'Street_Grvl', 'Street_Pave', 'LotConfig_FR2', 'Neighborhood_Blmngtn', 'Neighborhood_Blueste', 'Neighborhood_Crawfor', 'Neighborhood_NPkVill', 'Condition1_RRNe', 'Condition2_RRAe', 'Condition2_RRAn', 'Condition2_RRNn', 'HouseStyle_1.5Unf', 'HouseStyle_2.5Fin', 'RoofMatl_Membran', 'RoofMatl_Metal', 'RoofMatl_Roll', 'RoofMatl_WdShake', 'Exterior1st_CBlock', 'Exterior1st_ImStucc', 'Exterior1st_Stone', 'Exterior2nd_CBlock', 'Exterior2nd_Other', 'Exterior2nd_Stone', 'Exterior2nd_Stucco', 'Heating_Floor', 'SaleType_ConLw']


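Although each method selects a different set of important features, some features (mostly numeric columns, e.g., LotArea, YearBuilt, 1stFlrSF, 2ndFlrSF and GarageYrBlt) are selected by every method except forward selection, which picked only one-hot encoded nominal columns.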