In [114]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [115]:
df = pd.read_feather('./dataset/cleaned_train.feather')
df.head()

Unnamed: 0,LotFrontage,LotArea,LotShape,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,BsmtQual,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,65.0,33.911281,0.0,7,5,2003,2003,19.953123,2.0,3.0,...,0,0,0,1,0,0,0,0,1,0
1,80.0,35.13049,0.0,6,8,1976,1976,0.0,1.0,3.0,...,0,0,0,1,0,0,0,0,1,0
2,68.0,36.700764,1.0,7,5,2001,2002,18.223314,2.0,3.0,...,0,0,0,1,0,0,0,0,1,0
3,60.0,35.079833,1.0,7,5,1915,1970,0.0,1.0,2.0,...,0,0,0,1,1,0,0,0,0,0
4,84.0,39.1663,1.0,8,5,2000,2000,26.177215,2.0,3.0,...,0,0,0,1,0,0,0,0,1,0


In [116]:
df[df.isin([np.nan, -np.nan, np.inf, -np.inf]).any(1)]

Unnamed: 0,LotFrontage,LotArea,LotShape,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,BsmtQual,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial


In [117]:
df_target = df.SalePrice
df = df.loc[:, df.columns != 'SalePrice']

## Constant features

In [118]:
from sklearn.feature_selection import VarianceThreshold

In [119]:
sel = VarianceThreshold(threshold=0.01)
sel.fit(df)

VarianceThreshold(threshold=0.01)

In [120]:
print(
    len([
        x for x in df.columns
        if x not in df.columns[sel.get_support()]
    ]))

[x for x in df.columns if x not in df.columns[sel.get_support()]]

71


['KitchenAbvGr',
 'MSSubClass_5.434086901752666',
 'MSSubClass_5.67279460942094',
 'MSSubClass_8.9630886370266',
 'MSZoning_C (all)',
 'Street_Grvl',
 'Street_Pave',
 'LotConfig_FR3',
 'LandSlope_Sev',
 'Neighborhood_Blueste',
 'Neighborhood_NPkVill',
 'Neighborhood_Veenker',
 'Condition1_PosA',
 'Condition1_RRAe',
 'Condition1_RRNe',
 'Condition1_RRNn',
 'Condition2_Artery',
 'Condition2_Feedr',
 'Condition2_PosA',
 'Condition2_PosN',
 'Condition2_RRAe',
 'Condition2_RRAn',
 'Condition2_RRNn',
 'HouseStyle_1.5Unf',
 'HouseStyle_2.5Fin',
 'HouseStyle_2.5Unf',
 'RoofStyle_Flat',
 'RoofStyle_Gambrel',
 'RoofStyle_Mansard',
 'RoofStyle_Shed',
 'RoofMatl_ClyTile',
 'RoofMatl_Membran',
 'RoofMatl_Metal',
 'RoofMatl_Roll',
 'RoofMatl_Tar&Grv',
 'RoofMatl_WdShake',
 'RoofMatl_WdShngl',
 'Exterior1st_AsphShn',
 'Exterior1st_BrkComm',
 'Exterior1st_CBlock',
 'Exterior1st_ImStucc',
 'Exterior1st_Stone',
 'Exterior2nd_AsphShn',
 'Exterior2nd_Brk Cmn',
 'Exterior2nd_CBlock',
 'Exterior2nd_ImStucc'

## Univariate features 

In [121]:
from sklearn.feature_selection import SelectKBest, f_regression

In [122]:
fs = SelectKBest(score_func=f_regression, k=10)
df_fs = fs.fit_transform(df, df_target)
df_fs.shape

(1460, 10)

In [123]:
fs.get_feature_names_out()

array(['OverallQual', 'ExterQual', 'TotalBsmtSF', 'GrLivArea',
       'KitchenQual', 'GarageCars', 'GarageArea', 'TotalFlrSFAbvGrd',
       'TotalSF', 'TotalBath'], dtype=object)

## Forward selection

In [124]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [125]:
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df_target,
    test_size=0.3,
    random_state=0
)

In [133]:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:  # we are interested in absolute coefficient value
                col_name = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(col_name)
    return col_corr

corr_features = correlation(X_train, 0.75)

df_corr = df.copy()
df_corr.drop(labels=corr_features, axis=1, inplace=True)

In [135]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

sfs1 = SFS(RandomForestRegressor(), 
           k_features=10, 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='r2',
           cv=3)

sfs1 = sfs1.fit(np.array(df_corr), df_target)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s

STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

In [None]:
sfs1.k_feature_idx_

In [None]:
df.columns[list(sfs1.k_feature_idx_)]

## Cross-Validation

In [109]:
from sklearn.model_selection import KFold, cross_val_score

In [None]:
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

https://www.kaggle.com/code/prashant111/extensive-analysis-eda-fe-modelling/notebook

https://scikit-learn.org/stable/auto_examples/feature_selection/plot_rfe_with_cross_validation.html#sphx-glr-auto-examples-feature-selection-plot-rfe-with-cross-validation-py

## LASSO Regression

In [136]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, RobustScaler

In [137]:
scaler = StandardScaler()
scaler.fit(df.fillna(0))

StandardScaler()

In [152]:
sel_ = SelectFromModel(Lasso(alpha=0.0005))
sel_.fit(df, df_target)

SelectFromModel(estimator=Lasso(alpha=0.0005))

In [153]:
# make a list with the selected features and print the outputs
selected_feat = df.columns[(sel_.get_support())]

print('total features: {}'.format((df.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
    np.sum(sel_.estimator_.coef_ == 0)))

total features: 267
selected features: 114
features with coefficients shrank to zero: 153


In [154]:
selected_feat.tolist()

['LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'HeatingQC',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'Functional',
 'Fireplaces',
 'FireplaceQu',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'Fence',
 'MiscVal',
 'TotalFlrSFAbvGrd',
 'TotalSF',
 'Spaciousness',
 'TotalBath',
 'TotalPorchSF',
 'MSSubClass_4.1466870313687',
 'MSSubClass_4.875767801446074',
 'MSSubClass_6.622579749822654',
 'MSSubClass_7.202598230398585',
 'MSSubClass_7.903776119514091',
 'MSSubClass_8.646522023750673',
 'MSZoning_C (all)',
 'MSZoning_FV',
 'MSZoning_RM',
 'Alley_Pave',
 'LandC

- Quel alpha ?
- Dois je scaler les données ? RobustScaler parait plus approprié