![alt text](ames_iowa_downtown.avif "Ames downtown city")

The aim of the following project is to predict house sale prices in the town of Ames, Iowa, USA. Ames is a typical small provincial American town with a population of around 66 thousand (according to the 2020 census). This example might be generalized to other small towns in the United States, especially in Iowa. However, the main reason this notebook was created is to present a typical Data Science workflow for a regression-type predictive problem.

# Presets

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import plotly.io as pio

from sklearn import decomposition
from statsmodels.stats.outliers_influence import variance_inflation_factor
from prince import MCA
from sklearn import metrics

# Models
from sklearn.ensemble import (
    RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor,
    HistGradientBoostingRegressor, StackingRegressor, VotingRegressor
)
from sklearn.linear_model import (
    LinearRegression, Lars, Lasso, LassoLars, Ridge, ARDRegression,
    ElasticNet, BayesianRidge, GammaRegressor, HuberRegressor, RANSACRegressor,
    PassiveAggressiveRegressor, TheilSenRegressor, RidgeCV
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from xgboost import XGBRFRegressor, XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [None]:
from FunctionsEDA import *
from FunctionsFeatureSelection import *
from FunctionsDataPreparation import *

# Seed NumPy's global random generator with a fixed value.
np.random.seed(42)
# Use the "browser" renderer as plotly's default for figures shown below.
pio.renderers.default = "browser"
# Raise pandas' column display limit to 500 columns.
pd.set_option('display.max_columns', 500)

In [None]:
data = pd.read_csv('train.csv')

# Basic statistics

In [None]:
show_data(data)

- The rows-to-columns ratio is fairly small, especially given that many variables are of type 'object'. Dimensionality reduction is therefore definitely required
- MiscFeature, PoolQC, FireplaceQu, Alley, Fence are almost totally absent
- There are significant outliers almost in every variable

# Qualitative (business) analysis

With the statistics shown above, we will try to analyze and predict the influence of each variable on the sale price. Each variable is assigned a predefined influence level (low, medium, high) and the corresponding effect sign (stimulant, destimulant, mixed).

In [None]:
# Expert-judgement vectors handed to quantitative_eda together with `data`.
# Both lists hold 81 single-letter codes and are positional, so the order of
# the entries must not be changed.
# NOTE(review): presumably one entry per column of `data`, in column order — confirm
# against quantitative_eda.
#
# stim_vec — expected effect sign on the sale price.
# NOTE(review): presumably 's' = stimulant, 'd' = destimulant, 'm' = mixed and
# 'n' = not applicable (first and last entry) — confirm.
stim_vec = [
    'n', 'm', 'm', 's', 's', 's', 's', 'd', 'd',
    's', 'd', 'd', 'm', 'm', 'm', 'm', 'm', 'd', 'd',
    's', 'd', 'm', 'm', 'm', 'm', 'm', 'm', 'm',
    'm', 'd', 'd', 's', 'd', 'd', 'm',  'm', 'd', 's',
    'm', 'd', 'm', 'm',
    's', 's', 'd', 's', 's', 's', 's', 's', 's', 's', 's', 'd', 's', 'd', 's', 'd',
    's', 'd', 'm', 's', 's', 'd', 'd',
    's', 's', 's', 's', 's',
    's', 'd', 'd', 's', 's', 's', 'm', 'm', 's', 's', 'n'
]
# imp_vec — expected strength of the influence.
# NOTE(review): presumably 'l' = low, 'm' = medium, 'h' = high and
# 'n' = not applicable — confirm.
imp_vec = [
    'l', 'l', 'h', 'l', 'h', 'l', 'm', 'm', 'l',
    'h', 'm', 'l', 'm', 'h', 'h', 'h', 'l', 'm', 'm',
    'h', 'h', 'l', 'l', 'l', 'l', 'l', 'l', 'l',
    'l', 'l', 'l', 'm', 'm', 'm', 'l', 'l', 'l', 'm',
    'l', 'm', 'l', 'l',
    'l', 'l', 'l', 'l', 'l', 'l', 'l', 'l', 'l', 'm', 'l', 'l', 'm', 'l', 'l', 'l',
    'm', 'l', 'l', 'l', 'l', 'l', 'l', 
    'l', 'l', 'l', 'l', 'l',
    'm', 'l', 'm', 'm', 'l', 'l', 'm', 'm', 'l', 'l', 'n'
]

quantitative_eda(data, stim_vec, imp_vec)

# Feature engineering

In [None]:
X = data.copy()

In [None]:
# MSSubClass holds class codes that are regrouped as labels later on, so store it as text.
X['MSSubClass'] = X['MSSubClass'].astype(str)

# 0/1 presence flags: 1 when the source column is non-zero, 0 otherwise.
# NOTE(review): IsBsmFin is 1 when *unfinished* basement area is present, which
# reads as the opposite of its name — confirm the intended meaning.
nonzero_flags = {
    'PorchType_WoodDeck': 'WoodDeckSF',
    'PorchType_OpenPorchSF': 'OpenPorchSF',
    'PorchType_Enclosed': 'EnclosedPorch',
    'PorchType_Screen': 'ScreenPorch',
    'PorchType_3SsnPorch': '3SsnPorch',
    'IsBsmFin': 'BsmtUnfSF',
    'IsFireplace': 'Fireplaces',
    'IsMiscVal': 'MiscVal',
    'IsLowQual': 'LowQualFinSF',
}
for flag_name, source_col in nonzero_flags.items():
    X[flag_name] = np.where(X[source_col] != 0, 1, 0)

# Average living area per above-ground room.
X['PerRoomSF'] = X['GrLivArea'] / X['TotRmsAbvGrd']

# Total area of all porch/deck types.
deck_and_open = X['WoodDeckSF'] + X['OpenPorchSF']
X['PorchSF'] = deck_and_open + X['EnclosedPorch'] + X['3SsnPorch'] + X['ScreenPorch']

# Bath count where every half bath counts as 0.5.
full_baths = X['BsmtFullBath'] + X['FullBath']
half_baths = X['BsmtHalfBath'] + X['HalfBath']
X['TotalBaths'] = full_baths + 0.5 * half_baths

# Ratio features: new column -> (numerator, denominator). A zero denominator
# yields NaN/inf here; those values are dealt with in a later cell.
ratio_features = {
    'FloorRatio': ('2ndFlrSF', '1stFlrSF'),
    'BsmFinUnfRatio': ('BsmtUnfSF', 'TotalBsmtSF'),
    'BsmLowQualRatio': ('LowQualFinSF', 'TotalBsmtSF'),
    'BsmFin2BsmFin1Ratio': ('BsmtFinSF2', 'BsmtFinSF1'),
    'GarageLivRatio': ('GarageArea', 'GrLivArea'),
    'KitchenRatio': ('KitchenAbvGr', 'TotRmsAbvGrd'),
    'BathRatio': ('TotalBaths', 'TotRmsAbvGrd'),
    'BedroomRatio': ('BedroomAbvGr', 'TotRmsAbvGrd'),
}
for ratio_name, (numerator, denominator) in ratio_features.items():
    X[ratio_name] = X[numerator] / X[denominator]

# 1 when the remodel year differs from the construction year.
X['IsRennovated'] = np.where(X['YearBuilt'] != X['YearRemodAdd'], 1, 0)

In [None]:
show_data(X)

Fill missing values with zero; fill missing values in the garage construction year (`GarageYrBlt`) with the minimum year found in the data.

In [None]:
# Columns whose missing values are replaced by 0.
cols_to_fill = [
    'LotFrontage', 'MasVnrArea', 
    'BsmFin2BsmFin1Ratio', 'BsmLowQualRatio', 'BsmFinUnfRatio', 'BsmFinUnfRatio'
]
# dict.fromkeys collapses the repeated column name, so each column is filled once.
X = X.fillna(value=dict.fromkeys(cols_to_fill, 0))

# A positive numerator over a zero denominator gives +/-inf; map those to 1.
# NOTE(review): BsmFin2BsmFin1Ratio can also become inf (BsmtFinSF1 == 0 while
# BsmtFinSF2 > 0) and is not handled here — confirm whether it should be.
X['BsmLowQualRatio'] = X['BsmLowQualRatio'].replace({np.inf: 1, -np.inf: 1})

# Missing garage year: fall back to the earliest garage year in the data.
earliest_garage_year = X['GarageYrBlt'].min()
X['GarageYrBlt'] = X['GarageYrBlt'].fillna(earliest_garage_year)

# EDA

## Numeric

In [None]:
plots_numeric = show_plots_single(X, 'SalePrice', f_size=7, p_height=3000)

In [None]:
plots_numeric['num_single'].show()

In [None]:
plots_numeric['num_split'].show()
plots_numeric['num_split_rid'].show()
plots_numeric['num_vs_y_split'].show()

In [None]:
plots_numeric['num_split_hist'].show()

In [None]:
corr_heatmap(X)

- drop Id
- YearBuilt, GarageYrBlt are alike
- Porch types are rather rare variables
- more than 4 baths - outliers
- most of the vars are not normally distributed - check out transformed ones

In [None]:
# Cap the rare large counts: at most 4 baths and at most 2 fireplaces.
X['TotalBaths'] = X['TotalBaths'].clip(upper=4)
X['Fireplaces'] = X['Fireplaces'].clip(upper=2)
# Id is a row identifier only, so remove it from the feature frame.
X = X.drop(columns=['Id'])

In [None]:
distribs_lst=(
    get_common_distributions()+
    ['powerlognorm', 'johnsonsu', 'johnsonsb', 'lognorm', 'laplace',
     'gumbel_l', 'gumbel_r', 'genhyperbolic', 'gennorm', 'logistic', 'wrapcauchy']
)

In [None]:
data_eda = fit_distributions(X, f_size=6, refit=True, distribs_lst=distribs_lst)
data_eda = prepare_data(data_eda, 'SalePrice_trnsf', what='standardize')

In [None]:
plots_numeric = show_plots_single(data_eda, 'SalePrice_trnsf', f_size=7, p_height=3000)

In [None]:
plots_numeric['num_split'].show()
plots_numeric['num_split_rid'].show()
plots_numeric['num_vs_y_split'].show()

In [None]:
plots_numeric['num_vs_y_single'].show()

In [None]:
corr_heatmap(data_eda)

In [None]:
# Compare, per variable, the correlation with the target before and after the
# distribution-based transformation.
#
# The correlation is computed on numeric/boolean columns only: X still holds
# text columns, and since pandas 2.0 DataFrame.corr() raises on such frames
# instead of silently dropping non-numeric columns.
corr_orig = (
    X.select_dtypes(include=['number', 'bool'])
    .corr()['SalePrice']
    .to_frame()
    .reset_index()
)
corr_trnsf = (
    data_eda.select_dtypes(include=['number', 'bool'])
    .corr()['SalePrice_trnsf']
    .to_frame()
    .reset_index()
)
# Strip the suffix (literal match, not a regex) so both tables share the same keys.
corr_trnsf['index'] = corr_trnsf['index'].str.replace('_trnsf', '', regex=False)
corr_all = corr_trnsf.merge(corr_orig, on='index', how='left')
# 1 when the transformed variable is more strongly correlated (in absolute
# value) with the transformed target than the original pair was.
corr_all['IsImprovement'] = np.where(
    corr_all['SalePrice_trnsf'].abs() > corr_all['SalePrice'].abs(),
    1,
    0)

In [None]:
print('Improvement on', corr_all['IsImprovement'].sum(), 'out of', corr_all.shape[0], 'columns')
corr_all.loc[corr_all['IsImprovement']==0, :]

In [None]:
#sns.pairplot(data, diag_kind="kde")

## Categorical

In [None]:
def prepare_data_cat(X):
    """Regroup the categorical house attributes into coarse classes and 0/1 flags.

    Rare levels are merged into broader groups (``C0``..``C4``,
    ``Good``/``Norm``/``Bad``), missing values of "feature absent" columns are
    replaced by an explicit label, and binary indicator columns (``Is...``) are
    derived from the raw codes.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding the raw categorical columns read below (``MSZoning``,
        ``Neighborhood``, ``Condition1``, ...). ``MSSubClass`` must already be a
        string column because its mapping keys are strings.

    Returns
    -------
    pandas.DataFrame
        The same object as ``X``; columns are overwritten / appended in place,
        so the caller's frame is modified as well.

    Notes
    -----
    Not idempotent: ``Condition1`` / ``Condition2`` are overwritten with 0/1
    flags, so a second call on the same frame turns both of them (and
    ``IsEnvGood``) into all zeros.
    """
    # Level -> group mappings shared by pairs of sibling columns.
    condition_groups = {
        'PosA': 'C1', 'PosN': 'C1', 'RRAn': 'C1', 'RRNe': 'C1', 'RRNn': 'C1', 'Norm': 'C1',
        'Feedr': 'C2', 'Artery': 'C2', 'RRAe': 'C2',
    }
    exterior_groups = {
        'VinylSd': 'C0',
        'BrkFace': 'C1', 'BrkComm': 'C1', 'Plywood': 'C1', 'Brk Cmn': 'C1',
        'CmentBd': 'C2', 'Stone': 'C2', 'CemntBd': 'C2', 'Other': 'C2',
        'AsphShn': 'C3', 'AsbShng': 'C3', 'ImStucc': 'C3', 'CBlock': 'C3',
        'Wd Sdng': 'C4', 'Wd Shng': 'C4', 'WdShing': 'C4', 'Stucco': 'C4',
        'HdBoard': 'C4', 'MetalSd': 'C4',
    }
    bsmt_grade_groups = {'Ex': 'Good', 'Gd': 'Good', 'TA': 'Good', 'Fa': 'Bad', 'Po': 'Bad'}
    # One mapping for both basement finish columns. The second column used to be
    # mapped with 'LWQ', which never matched the 'LwQ' code handled for the first.
    # NOTE(review): 'CemntBd' looks copied over from the exterior mapping — confirm.
    bsmt_finish_groups = {
        'Unf': 'C0',
        'GLQ': 'C1',
        'CemntBd': 'C2',
        'Rec': 'C3', 'LwQ': 'C3', 'ALQ': 'C3', 'BLQ': 'C3', 'Absent': 'C3',
    }
    quality_groups = {'Ex': 'Good', 'Gd': 'Good', 'TA': 'Norm', 'Fa': 'Bad', 'Po': 'Bad'}

    # --- Zoning and neighbourhood -------------------------------------------
    X['MSZoning'] = X['MSZoning'].replace({'RH': 'RM', 'FV': 'RL', 'C (all)': 'RM'})
    X['IsLowDensityArea'] = np.where(X['MSZoning'] == 'RL', 1, 0)

    X['Neighborhood'] = X['Neighborhood'].replace(
        {'Edwards': 'C0', 'OldTown': 'C0', 'Sawyer': 'C0', 'BrkSide': 'C0',
         'Blueste': 'C0', 'NPkVill': 'C0', 'Mitchel': 'C0', 'SWISU': 'C0', 'NAmes': 'C0',
         'MeadowV': 'C1', 'BrDale': 'C1', 'IDOTRR': 'C1',
         'NridgHt': 'C2', 'StoneBr': 'C2', 'NoRidge': 'C2',
         'SawyerW': 'C3', 'Blmngtn': 'C3', 'Gilbert': 'C3', 'NWAmes': 'C3',
         'Somerst': 'C4', 'Timber': 'C4', 'Veenker': 'C4',
         'Crawfor': 'C4', 'CollgCr': 'C4', 'ClearCr': 'C4',
         }
    )

    # --- Surroundings: both condition columns become 0/1 ("C1" group = 1) ----
    X['Condition1'] = X['Condition1'].replace(condition_groups)
    X['Condition2'] = X['Condition2'].replace(condition_groups)
    X['Condition1'] = np.where(X['Condition1'] == 'C1', 1, 0)
    X['Condition2'] = np.where(X['Condition2'] == 'C1', 1, 0)
    X['IsEnvGood'] = np.where(X['Condition2'] + X['Condition1'] == 2, 1, 0)

    # --- Lot ------------------------------------------------------------------
    # A missing alley is labelled 'Pave', so only non-'Pave' alleys are flagged.
    X['Alley'] = X['Alley'].fillna('Pave')
    X['IsAlleyGravel'] = np.where(X['Alley'] == 'Pave', 0, 1)
    X['IsShapeReg'] = np.where(X['LotShape'] == 'Reg', 1, 0)
    X['LandContour'] = X['LandContour'].replace(
        {'Lvl': 'Norm', 'Bnk': 'Norm', 'Low': 'Ir', 'HLS': 'Ir'}
    )
    X['IsLandFlat'] = np.where(X['LandContour'] == 'Ir', 0, 1)

    X['LotConfig'] = X['LotConfig'].replace(
        {'Inside': 'Norm', 'FR2': 'Norm', 'Corner': 'Norm', 'CulDSac': 'Ir', 'FR3': 'Ir'}
    )
    X['IsNarrow'] = np.where(X['LotConfig'] == 'Ir', 0, 1)
    X['IsSlope'] = np.where(X['LandSlope'].isin(['Mod', 'Sev']), 1, 0)
    X['IsPaved'] = np.where(X['PavedDrive'].isin(['N', 'P']), 0, 1)

    # --- Building type and style ----------------------------------------------
    X['MSSubClass'] = X['MSSubClass'].replace(
        {'20': 'C0', '70': 'C0', '75': 'C0', '80': 'C0',
         '60': 'C1', '120': 'C1',
         '90': 'C2', '85': 'C2', '40': 'C2', '160': 'C2', '50': 'C2', '190': 'C2',
         '45': 'C3', '30': 'C3', '180': 'C3',
         }
    )
    X['IsOneFamily'] = np.where(X['BldgType'] == '1Fam', 1, 0)
    # Every listed style maps to '1' or '0'; an unlisted style would make the
    # integer cast fail, which surfaces unexpected levels early.
    X['IsOneFloor'] = X['HouseStyle'].replace(
        {'SFoyer': '1', 'SLvl': '1', '1Story': '1',
         '1.5Fin': '1', '1.5Unf': '1',
         '2Story': '0', '2.5Fin': '0', '2.5Unf': '0',
         }
    ).astype(np.int64)
    X['IsRoofGable'] = np.where(X['RoofStyle'] == 'Gable', 1, 0)
    X['IsRoofStandard'] = np.where(X['RoofMatl'] == 'CompShg', 1, 0)

    # --- Exterior ---------------------------------------------------------------
    X['Exterior_1'] = X['Exterior1st'].replace(exterior_groups)
    X['Exterior_2'] = X['Exterior2nd'].replace(exterior_groups)
    X['MasVnrType'] = X['MasVnrType'].fillna('None')
    X['MasVnrType'] = X['MasVnrType'].replace({'BrkFace': 'Brick', 'BrkCmn': 'Brick'})
    # The raw quality code for "good" is 'Gd' (as in every other quality mapping
    # of this function); comparing against 'Good' only ever matched 'Ex'.
    X['IsExtQualGood'] = np.where(X['ExterQual'].isin(['Ex', 'Gd']), 1, 0)
    X['IsExtCondGood'] = np.where(X['ExterCond'].isin(['Ex', 'Gd']), 1, 0)

    # --- Basement ---------------------------------------------------------------
    X['IsBsmConcrete'] = np.where(X['Foundation'] == 'PConc', 1, 0)

    X['BsmtCond'] = X['BsmtCond'].fillna('Absent')
    X['BsmtQual'] = X['BsmtQual'].fillna('Absent')
    X['BsmtQual'] = X['BsmtQual'].replace(bsmt_grade_groups)
    X['BsmtCond'] = X['BsmtCond'].replace(bsmt_grade_groups)
    # After the regrouping above only 'Good', 'Bad' and 'Absent' remain.
    X['IsBsmQualGood'] = np.where(X['BsmtQual'] == 'Good', 1, 0)
    X['IsBsmCondGood'] = np.where(X['BsmtCond'] == 'Good', 1, 0)

    X['BsmtExposure'] = X['BsmtExposure'].fillna('Absent')
    X['BsmExposure'] = X['BsmtExposure'].replace(
        {'Mn': 'Norm', 'Av': 'Norm', 'Absent': 'No'}
    )

    X['BsmtFinType1'] = X['BsmtFinType1'].fillna('Absent')
    X['BsmFinType_1'] = X['BsmtFinType1'].replace(bsmt_finish_groups)
    X['BsmtFinType2'] = X['BsmtFinType2'].fillna('Absent')
    X['BsmFinType_2'] = X['BsmtFinType2'].replace(bsmt_finish_groups)

    # --- Garage -----------------------------------------------------------------
    X['GarageType'] = X['GarageType'].fillna('Absent')
    X['GarageType'] = X['GarageType'].replace(
        {'CarPort': 'Detchd', '2Types': 'Detchd', 'Basment': 'Detchd'}
    )
    X['GarageFinish'] = X['GarageFinish'].fillna('Absent')
    # NOTE(review): houses without a garage ('Absent') are flagged 1 here —
    # confirm this is intended.
    X['IsGarageFinished'] = np.where(X['GarageFinish'] == 'Unf', 0, 1)

    # --- Utilities and interior quality -------------------------------------------
    X['HeatingQC'] = X['HeatingQC'].replace(quality_groups)
    X['IsHeatGood'] = np.where(X['HeatingQC'] == 'Good', 1, 0)

    X['KitchenQual'] = X['KitchenQual'].replace(quality_groups)
    X['IsKitchenGood'] = np.where(X['KitchenQual'] == 'Good', 1, 0)

    X['IsCentralAir'] = np.where(X['CentralAir'] == 'Y', 1, 0)
    X['IsStandardElect'] = np.where(X['Electrical'] == 'SBrkr', 1, 0)

    # A missing fireplace is grouped with the 'Bad' quality class.
    X['FireplaceQu'] = X['FireplaceQu'].fillna('Absent')
    X['FireplaceQual'] = X['FireplaceQu'].replace({**quality_groups, 'Absent': 'Bad'})

    # --- Miscellaneous ------------------------------------------------------------
    X['Fence'] = X['Fence'].fillna('Absent')
    X['IsFenceGoodOrAbsent'] = X['Fence'].replace(
        {'GdPrv': '1', 'Absent': '1', 'MnPrv': '0', 'MnWw': '0', 'GdWo': '0'}
    ).astype(np.int64)

    X['MiscFeature'] = X['MiscFeature'].fillna('Absent')
    X['IsShedPresent'] = np.where(X['MiscFeature'] == 'Shed', 1, 0)

    # --- Sale -----------------------------------------------------------------------
    X['IsNewHouse'] = np.where(X['SaleType'] == 'New', 1, 0)

    X['SaleCond'] = X['SaleCondition'].replace(
        {'Alloca': 'C0', 'Normal': 'C0',
         'Partial': 'C1',
         'Abnorml': 'C2', 'Family': 'C2', 'AdjLand': 'C2'}
    )
    return X

In [None]:
data_eda_cat = prepare_data_cat(X)

## Data preparation after EDA

In [None]:
# Re-fit the distribution-based transformations on the current X.
data_eda_final = fit_distributions(X, f_size=6, refit=True, distribs_lst=distribs_lst)

# Correlation with the target before/after the transformation. Only
# numeric/boolean columns are used: X still holds text columns, and since
# pandas 2.0 DataFrame.corr() raises on such frames instead of silently
# dropping non-numeric columns.
corr_orig = (
    X.select_dtypes(include=['number', 'bool'])
    .corr()['SalePrice']
    .to_frame()
    .reset_index()
)
corr_trnsf = (
    data_eda_final.select_dtypes(include=['number', 'bool'])
    .corr()['SalePrice_trnsf']
    .to_frame()
    .reset_index()
)
# Strip the suffix (literal match, not a regex) so both tables share the same keys.
corr_trnsf['index'] = corr_trnsf['index'].str.replace('_trnsf', '', regex=False)
corr_all = corr_trnsf.merge(corr_orig, on='index', how='left')
corr_all['IsImprovement'] = np.where(
    corr_all['SalePrice_trnsf'].abs() > corr_all['SalePrice'].abs(),
    1,
    0)
# Variables whose transformed version did not improve the correlation: keep
# the original column and drop the matching "<name>_trnsf" column.
vars_to_keep = corr_all.loc[corr_all['IsImprovement'] == 0, 'index'].to_list()
vars_to_drop = [i + '_trnsf' for i in vars_to_keep]

In [None]:
# Hard-coded list of transformed columns to drop. This assignment replaces the
# `vars_to_drop` list derived from `vars_to_keep` in the previous cell.
# NOTE(review): `vars_to_keep` stays computed from the data, so the two lists can
# drift apart if the fitted transformations change — confirm this is intended.
vars_to_drop = [
    'MasVnrArea_trnsf', 'BsmtFinSF1_trnsf', 'BsmtUnfSF_trnsf', 'TotalBsmtSF_trnsf',
    '1stFlrSF_trnsf', '2ndFlrSF_trnsf', 'BsmtFullBath_trnsf', 'BsmtHalfBath_trnsf',
    'TotRmsAbvGrd_trnsf', 'ScreenPorch_trnsf', 'PoolArea_trnsf', 'YrSold_trnsf',
    'PorchSF_trnsf', 'BedroomRatio_trnsf'
]

In [None]:
data_eda_final[vars_to_keep] = X.loc[:, vars_to_keep]

In [None]:
data_eda_final = data_eda_final.drop(columns=vars_to_drop)
data_eda_final = prepare_data_cat(data_eda_final)

In [None]:
data_eda_final = prepare_data(data_eda_final, 'SalePrice_trnsf')

In [None]:
corr_heatmap(data_eda_final)

# Centroids selection

In [None]:
# NOTE(review): bare name with no definition in this notebook — it raises NameError
# unless `S` is exported by one of the star-imported Functions* modules. Looks like
# a stray keystroke; confirm and remove.
S

In [None]:
df = data_eda_final.copy()
dependent_var = 'SalePrice_trnsf'
#df = df.select_dtypes(exclude='object')
#df = df.loc[:,
       #['LotFrontage_trnsf', 'LotArea_trnsf', 'OverallQual_trnsf', 'OverallCond_trnsf',
        #'YearBuilt_trnsf', 'YearRemodAdd_trnsf', 'SalePrice_trnsf']
      #]

In [None]:
hellwig_selection(df, 'SalePrice_trnsf')

In [None]:
#y = X['SalePrice']
#X.drop(columns=['SalePrice'], inplace=True)

# PCA, MCA, FCA

In [None]:
cols_to_pca = [
    'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF',
    'GrLivArea', 'GarageArea', 'PorchSF', 'FloorRatio', 'PerRoomSF'
]
x_to_pca = X[cols_to_pca]

In [None]:
# Fit a 5-component PCA on the selected numeric features and print the share of
# variance captured by each component, followed by their total.
# NOTE(review): no scaling step is applied to x_to_pca in this section, so
# columns with large magnitudes may dominate the components — confirm.
pca = decomposition.PCA(n_components=5)
pca.fit(x_to_pca)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())
#tmp = pca.transform(x_to_pca)

In [None]:
corr_heatmap(x_to_pca)

In [None]:
# Variance inflation factor of every PCA candidate column, as a two-column table.
vif_values = [
    variance_inflation_factor(x_to_pca.values, col_idx)
    for col_idx, _ in enumerate(x_to_pca.columns)
]
vif_data = pd.DataFrame({"feature": x_to_pca.columns, "VIF": vif_values})
print(vif_data)

In [None]:
# Rebuild the modelling frame from the raw data by running the preparation
# steps in order; each step receives the frame returned by the previous one.
X = (
    data.copy()
    .pipe(prepare_data_num)
    .pipe(prepare_data_cat)
    .pipe(make_cat_dummy)
    .pipe(drop_columns_selected)
    .pipe(filter_outliers)
)
# Remaining gaps in these two columns are replaced by 0.
for gap_col in ('LotFrontage', 'MasVnrArea'):
    X[gap_col] = X[gap_col].fillna(0)
X = standardize_num(X)

# Split off the target: pop returns the column and removes it from X in place.
y = X.pop('SalePrice')

In [None]:
cols_to_pca = [
    'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF',
    'GrLivArea', 'GarageArea', 'PorchSF', 'FloorRatio', 'PerRoomSF', 'YearBuilt',
    'Fireplaces', 'OverallQual'
]

x_to_mca = X.drop(columns=cols_to_pca)

In [None]:
#corr_heatmap(x_to_mca)

In [None]:
print("Number of numeric cols:", len(cols_to_pca))
print("Number of 0-1 cols:", len(x_to_mca.columns))
print("Original rows-cols ratio:", X.shape[0]/(X.shape[1]))
print("Desired rows-cols ratio:", X.shape[0]/(X.shape[1]-33))
print("Desired number of 0-1 cols:", X.shape[1]-33-len(cols_to_pca))

- the goal is to reduce the number of columns at least 3-fold; preferably 4-fold

## Correlation

In [None]:
tmp = x_to_mca.copy()
tmp['y'] = y
#corr_heatmap(tmp)

In [None]:
del tmp

- just from the correlation perspective, top 12 variables are:
  - IsKitchenGood (0.606)
  - IsBsmConcrete (0.53)
  - Neighborhood_C2 (0.519)
  - BsmFinType_1_C1 (0.441)
  - IsGarageFinished (0.436)
  - FireplaceQual_Good (0.400)
  - GarageType_Detchd (-0.396)
  - IsHeatGood (0.379)
  - IsLowDensityArea (0.351)
  - Exterior_1_C4 (-0.349)
  - BsmFinType_1_C3 (-0.333)
  - MasVnrType_Stone (0.319) (IsNewHouse (0.322) is better but will be correlated with YearBuilt)

In [None]:
#corr_heatmap(x_to_mca[
    #['IsKitchenGood', 'IsBsmConcrete', 'Neighborhood_C2',
     #'BsmFinType_1_C1', 'IsGarageFinished', 'FireplaceQual_Good',
     #'GarageType_Detchd', 'IsHeatGood', 'IsLowDensityArea',
     #'Exterior_1_C4', 'BsmFinType_1_C3', 'MasVnrType_Stone'
    #]
#])

## MCA

In [None]:
x_to_mca = x_to_mca.apply(lambda x:  x.astype(str))
mca = MCA(n_components = 11)
mca = mca.fit(x_to_mca)
sum(mca.explained_inertia_)

- the percentage of variance (inertia) explained is too small -> keep the original variables or use centroids

# Feature selection

In [None]:
def prep_data_full(data):
    """Run the complete preparation pipeline and split features from the target.

    The input frame is copied first, then passed through the numeric and
    categorical preparation, dummy encoding, column selection and outlier
    filtering steps. Missing ``LotFrontage`` / ``MasVnrArea`` values are set to
    0 and the numeric columns are standardized by ``standardize_num``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw data; it is copied, so the caller's frame is not reassigned here.

    Returns
    -------
    tuple
        ``(X, y)`` where ``y`` is the ``SalePrice`` column and ``X`` is the
        prepared frame without it.
    """
    stages = (
        prepare_data_num,
        prepare_data_cat,
        make_cat_dummy,
        drop_columns_selected,
        filter_outliers,
    )
    features = data.copy()
    for stage in stages:
        features = stage(features)

    for gap_col in ('LotFrontage', 'MasVnrArea'):
        features[gap_col] = features[gap_col].fillna(0)
    features = standardize_num(features)

    # pop returns the target column and removes it from the feature frame.
    target = features.pop('SalePrice')
    return features, target

In [None]:
X, y = prep_data_full(data)

In [None]:
tmp = feature_selection(X, y)

In [None]:
tmp.apply(lambda x: x.sort_values().values)

- 'GarageArea', 'GrLivArea', 'BsmtFinSF1', 'LotArea', 'MasVnrArea', 'OverallQual', 'PerRoomSF', 'PorchSF', 'YearBuilt', 'Neighborhood_C2', 'IsKitchenGood'

In [None]:
corr_heatmap(X.loc[:, ['OverallQual',
 'YearBuilt',
 'GrLivArea',
 'Fireplaces',
 'GarageArea',
 'PerRoomSF',
 'IsBsmConcrete',
 'IsGarageFinished',
 'IsKitchenGood',
 'Neighborhood_C2',
 'BsmFinType_1_C1']])

In [None]:
features_centroids = [
    'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF',
    'GrLivArea', 'GarageArea', 'PorchSF', 'FloorRatio', 'PerRoomSF', 
    'OverallQual', 'YearBuilt', 'Fireplaces', 'IsKitchenGood',
    'IsBsmConcrete', 'Neighborhood_C2', 'BsmFinType_1_C1', 'IsGarageFinished',
    'FireplaceQual_Good', 'GarageType_Detchd', 'IsHeatGood', 'IsLowDensityArea',
    'Exterior_1_C4', 'BsmFinType_1_C3', 'MasVnrType_Stone'
]

features_important = [
    'GarageArea', 'GrLivArea', 'BsmtFinSF1', 'LotArea', 'MasVnrArea',
    'OverallQual', 'PerRoomSF', 'PorchSF', 'YearBuilt', 'Neighborhood_C2',
    'IsKitchenGood'
]

In [None]:
show_best_feature_set(
    features_original=X,
    features_centroids=X.loc[:, features_centroids],
    features_selected=X.loc[:, features_important],
    y = y,
    p_type='regression'
)

- use centroids as final feature set

# Model selection

## Grids

In [None]:
# Hyper-parameter search spaces for the linear-model family, one dict per model
# (parameter name -> search dimension).
# NOTE(review): Integer / Continuous are not imported in this notebook directly;
# presumably they come in through one of the star imports — confirm.

# Lars
model_grid_ga_lars = {
    'n_nonzero_coefs': Integer(25, 1000),
}

# Lasso
model_grid_ga_lasso = {
    'alpha': Continuous(0.001, 5.0, distribution='uniform'),
    'max_iter': Integer(100, 1000),
}

# Ridge
model_grid_ga_ridge = {
    'alpha': Continuous(0.001, 5.0, distribution='uniform'),
    'max_iter': Integer(100, 1000),
}

# LassoLars
model_grid_ga_larslasso = {
    'max_iter': Integer(100, 1000),
}

# ElasticNet
model_grid_ga_elastic_net = {
    'alpha': Continuous(0.001, 5.0, distribution='uniform'),
    'max_iter': Integer(100, 1000),
    'l1_ratio': Continuous(0, 1, distribution='uniform'),
}

# ARDRegression
# NOTE(review): recent scikit-learn releases renamed `n_iter` to `max_iter` for
# this estimator — confirm against the installed version.
model_grid_ga_ard_reg = {
    'n_iter': Integer(100, 1000),
}

# BayesianRidge (same `n_iter` remark as above)
model_grid_ga_ridge_bayes = {
    'n_iter': Integer(100, 1000),
}

# GammaRegressor
model_grid_ga_gamma_reg = {
    'alpha': Continuous(0.001, 5.0, distribution='uniform'),
    'max_iter': Integer(100, 1000),
}

# HuberRegressor
model_grid_ga_huber_reg = {
    'alpha': Continuous(0.001, 5.0, distribution='uniform'),
    'max_iter': Integer(100, 1000),
    'epsilon': Continuous(1, 3, distribution='uniform'),
}

# PassiveAggressiveRegressor
model_grid_ga_passive_aggressive = {
    'C': Continuous(0.1, 1000, distribution='uniform'),
    'max_iter': Integer(100, 1000),
}

# TheilSenRegressor
model_grid_ga_theilsen = {
    'max_iter': Integer(100, 1000),
}

In [None]:
# Hyper-parameter search spaces for the non-linear models, one dict per model
# (parameter name -> search dimension).

# k-nearest neighbours
model_grid_ga_knn = {
    'n_neighbors': Integer(3, 20),
    'leaf_size': Integer(20, 50)
}

# Support vector regression; kernel and epsilon are not searched.
model_grid_ga_svm = {
    'C': Continuous(0.1, 1000, distribution='uniform'),
    'gamma': Continuous(0.0001, 1, distribution='uniform'),
    # kernel ‘linear’, ‘poly’, ‘rbf’
    # epsilon=0.1
}

# Random forest
model_grid_ga_rf = {
    'max_depth': Integer(10, 80),
    'max_features': Integer(1, 7),
    'min_samples_leaf': Integer(1, 7),
    'min_samples_split': Integer(2, 10),
    'n_estimators': Integer(25, 500)#,
}

# Extra trees (same space as the random forest)
model_grid_ga_extra_trees = {
    'max_depth': Integer(10, 80),
    'max_features': Integer(1, 7),
    'min_samples_leaf': Integer(1, 7),
    'min_samples_split': Integer(2, 10),
    'n_estimators': Integer(25, 500)#,
}

# AdaBoost
model_grid_ga_adaboost = {
    "learning_rate": Continuous(0.01, 0.50, distribution='uniform'),
    "n_estimators": Integer(25, 500)
}

# NOTE(review): despite the name, these parameter names (max_iter,
# max_leaf_nodes, l2_regularization) look like the ones of scikit-learn's
# HistGradientBoostingRegressor rather than XGBoost — confirm the target model.
model_grid_ga_xgboost_hist = {
    'learning_rate': Continuous(0.01, 0.50, distribution='uniform'),
    'max_iter': Integer(100, 1000),
    'max_leaf_nodes': Integer(21, 51),
    'min_samples_leaf': Integer(10, 40),
    'l2_regularization': Integer(0, 3),
}

# XGBoost
model_grid_ga_xgboost = {
    "subsample": Continuous(0.75, 1, distribution='uniform'),
    "colsample_bytree": Continuous(0.75, 1, distribution='uniform'),
    "max_depth": Integer(2, 16),
    "min_child_weight": Integer(2, 15),
    "learning_rate": Continuous(0.01, 0.50, distribution='uniform'),
    "n_estimators": Integer(25, 500)
}

# LightGBM
# NOTE(review): n_estimators and num_iterations appear to be aliases of the same
# LightGBM setting, so only one of the two ranges would take effect — confirm
# and keep a single entry.
model_grid_ga_lgbm = {
    "learning_rate": Continuous(0.01, 0.50, distribution='uniform'),
    "n_estimators": Integer(25, 500),
    "num_iterations": Integer(100, 1000),
    "lambda_l2": Integer(0, 3),
    "bagging_fraction": Continuous(0.8, 1, distribution='uniform'),
    "min_data_in_leaf": Integer(10, 40),
    "num_leaves": Integer(21, 51),
}

# Search space for CatBoostRegressor.
#
# CatBoost treats several parameter names as synonyms and refuses to train when
# more than one name of a group is set, so exactly one name per group is kept:
#   - iterations (synonyms: n_estimators, num_boost_round, num_trees)
#   - max_depth (synonym: depth)
#   - min_data_in_leaf (synonym: min_child_samples)
# The tree depth is capped at 16, the largest value CatBoost accepts.
# NOTE(review): min_data_in_leaf and num_leaves are, per the CatBoost docs, only
# honoured with a non-default grow_policy — confirm how the estimator is built.
model_grid_ga_catboost = {
    'iterations': Integer(100, 1000),
    'learning_rate': Continuous(0.01, 0.50, distribution='uniform'),
    'l2_leaf_reg': Integer(0, 3),
    'max_depth': Integer(10, 16),
    'min_data_in_leaf': Integer(10, 40),
    'num_leaves': Integer(21, 51),
}

# Search space for the neural-network regressor; the learning-rate schedule
# listed in the trailing comment is not part of the search.
# NOTE(review): hidden_layer_sizes is searched as a single integer, i.e.
# presumably one hidden layer of that width — confirm.
model_grid_ga_nnet = {
    'learning_rate_init': Continuous(0.01, 0.50, distribution='uniform'),
    'max_iter': Integer(200, 2000),
    'hidden_layer_sizes': Integer(100, 1000)
    # learning_rate{‘constant’, ‘invscaling’, ‘adaptive’}
}

## Models on full data

In [None]:
print(y.describe())
print('\nAccuracy on rf model:', 1-23026/y.median())

In [None]:
# LinearRegression: as it is
# RANSACRegressor: as it is

# Submission