# Data Preprocessing 

In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from numpy import random
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score

from scipy.stats import skew, norm
from feature_engine.transformation import YeoJohnsonTransformer
from category_encoders.binary import BinaryEncoder

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [69]:
import xgboost as xgb
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from mlxtend.regressor import StackingRegressor
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from mlxtend.regressor import StackingCVRegressor

NEW: To set up feature engineering functions

In [146]:
from sklearn.feature_selection import mutual_info_regression
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

NEW: To set up the printing parameters

In [143]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

In [386]:
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')
train = pd.read_csv('train.csv')

In [387]:
train.shape, test.shape

((1460, 81), (1459, 80))

Based on an initial analysis the train and test datasets have similar characteristics, so it will be easier to combine them for the data preprocessing work. 

In [388]:
train_test = pd.concat([train, test], ignore_index=True)

In [389]:
train_test.shape

(2919, 81)

To show the amount of 'NaN' values in each column of the dataset. 

In [390]:
pd.isnull(train_test).sum()[pd.isnull(train_test).sum() > 0]

MSZoning           4
LotFrontage      486
Alley           2721
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        24
MasVnrArea        23
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinSF1         1
BsmtFinType2      80
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
Electrical         1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu     1420
GarageType       157
GarageYrBlt      159
GarageFinish     159
GarageCars         1
GarageArea         1
GarageQual       159
GarageCond       159
PoolQC          2909
Fence           2348
MiscFeature     2814
SaleType           1
SalePrice       1459
dtype: int64

It looks like Alley, FireplaceQu, PoolQC, Fence and MiscFeature have significant numbers of missing data. At first those columns were eliminated. However, the results were better if these columns were retained and the missing data were coded as 'None', assuming these features didn't exist for their related houses. 

There are a number of rows that have less than 5 rows with missing data. Since some of these are categorical and some are continuous data, their missing data will be replaced with the most frequent value. 

In [391]:
drop_high_nan=['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
train_test[drop_high_nan] = train_test[drop_high_nan].fillna('None')

small_nan_cols = ['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 
                  'TotalBsmtSF', 'Electrical', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 'GarageCars', 
                  'GarageArea','SaleType', 'SaleCondition']
small_impute = SimpleImputer(strategy='most_frequent')
train_test[small_nan_cols] = pd.DataFrame(small_impute.fit_transform(train_test[small_nan_cols]),columns=small_nan_cols)

The following columns seem to have one value significantly larger than the rest, and it would probably be best to use the mode, or most common, value to feel each NaN value: MasVnrType, MasVnrArea, BsmtCond, BsmtExposure, BsmtFinType2, GarageType, GarageFinish, GarageQual, and GarageCond. That represents 9 out of the 13 columns. 

BsmtQual has two values larger than the rest: Gd and TA. But it only has 2.8% NaNs, so simply using the mode might be good enough.  

GarageYrBlt has 59 NaNs out 2919 rows which is only 2%. It has a dispersed set of values, so it might be easiest just to have any NaNs have the same value as YearBuilt. 

BsmtFinType1 has only 2.7% value of NaNs, and most two of its largest values are GLQ and Unf. It might be easiest to use the mode here. 

LotFrontage has 486 NaNs out of 2919 rows which is a pretty high 16.7%. It has a dispersed range of values, but looking at its characteristics from the describe function above, it seems to have a pretty even distribution with a mean of 10,168 and a median of 9,453. So using the mean to fill in the NaNs seems reasonable.

In [392]:
mode_cols = ['MasVnrType', 'MasVnrArea', 'BsmtCond', 'BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtFinType1']
mode_impute = SimpleImputer(strategy='most_frequent')
train_test[mode_cols] = pd.DataFrame(mode_impute.fit_transform(train_test[mode_cols]),columns=mode_cols)

garage_cols = ['GarageType', 'GarageFinish', 'GarageQual','GarageCond', 'GarageYrBlt']
train_test[garage_cols] = train_test[garage_cols].fillna('None')

train_test['LotFrontage'].fillna((train_test['LotFrontage'].mean()), inplace=True)

A final check to ensure that the only missing values are the expected SalePrice data from the test dataset. 

In [393]:
pd.isnull(train_test).sum()[pd.isnull(train_test).sum() > 0]

SalePrice    1459
dtype: int64

The BsmtQual and BsmtFinType1 have ordinal scaled categorical values with an inherent order. So the conversion from categorical to numerical was done manually. 

In [394]:
train_test['BsmtQual'].unique()

array(['Gd', 'TA', 'Ex', 'Fa'], dtype=object)

In [395]:
train_test['BsmtFinType1'].unique()

array(['GLQ', 'ALQ', 'Unf', 'Rec', 'BLQ', 'LwQ'], dtype=object)

In [396]:
train_test.BsmtQual = train_test.BsmtQual.replace({"Ex": 110, "Gd": 95, "TA": 85, "Fa": 75, "Po": 60, "NA": 0})
train_test.BsmtFinType1 = train_test.BsmtFinType1.replace({"GLQ": 6, "ALQ": 5, "BLQ": 4, "Rec": 3, "LwQ": 2, "Unf": 1,
                                                         "NA": 0})

To look at how many features have a skew above 0.6, since high skew can be an issue in regression analysis. For some reason, the results were better with a skew limit of 0.6 than the more standard value of 0.5. Skewness measures how symmetrical a distribution of data is. Data with a skew of 0 is perfectly symmetrical.

In [397]:
number_cols = train_test.select_dtypes(include=np.number).columns.tolist()

In [398]:
skew_features = train_test[number_cols].apply(lambda x: skew(x)).sort_values(ascending=False)
high_skew = skew_features[skew_features > 0.6]
skew_index = high_skew.index
skew_index

Index(['MiscVal', 'PoolArea', 'LotArea', 'LowQualFinSF', '3SsnPorch',
       'KitchenAbvGr', 'EnclosedPorch', 'ScreenPorch', 'OpenPorchSF',
       'WoodDeckSF', 'LotFrontage', '1stFlrSF', 'MSSubClass', 'GrLivArea',
       '2ndFlrSF', 'BsmtQual', 'TotRmsAbvGrd', 'Fireplaces', 'HalfBath'],
      dtype='object')

In [399]:
print("There are {} numerical features with Skew > 0.6 :".format(high_skew.shape[0]))
skewness = pd.DataFrame({'Skew' :high_skew})
skewness

There are 19 numerical features with Skew > 0.6 :


Unnamed: 0,Skew
MiscVal,21.947195
PoolArea,16.898328
LotArea,12.822431
LowQualFinSF,12.088761
3SsnPorch,11.376065
KitchenAbvGr,4.302254
EnclosedPorch,4.003891
ScreenPorch,3.946694
OpenPorchSF,2.535114
WoodDeckSF,1.842433


To normalize the features with skew above 0.6 the Yeo-Johnson Transformation is used since data with negative values or values of zero can be included. This transformation is a way to transform a continuous variable so that the output looks more normally distributed. 

In [400]:
yjt = YeoJohnsonTransformer()
yjt.fit(train_test[skew_index])
train_test[skew_index] = yjt.transform(train_test[skew_index])

Since most models cannot work with categorical data, it will be necessary to identify all the columns that have non-numeric object values and then convert them to numeric values. The best approach for this conversion seems to be the straightforward python factorize function. 

In [401]:
obj_cols = list(train_test.select_dtypes(['object']).columns)

In [402]:
for column in obj_cols:
     train_test[column] = pd.factorize(train_test[column], sort=True)[0]

To create some new columns that might compound the effects of some of the existing columns which have higher impacts the score. eli5 (https://eli5.readthedocs.io/en/latest/overview.html) was used with an xgbregressor model with its default parameters on an earlier workbook version to measure the impact of each column on the overall score. 

In [403]:
train_test['QualCondSum'] = train_test['OverallQual'] + train_test['OverallCond'] + train_test['KitchenQual']
train_test['RemodTime'] = train_test['YearRemodAdd'] - train_test['YearBuilt']
train_test['BsmtFinTypeSF1'] = train_test['BsmtFinType1'] * train_test['BsmtFinSF1']
train_test['TotalFlrSF'] = train_test['1stFlrSF'] + train_test['2ndFlrSF']
train_test['TotalFinSF'] = train_test['GrLivArea'] + train_test['BsmtFinSF1']
train_test['GarageCarArea'] = train_test['GarageArea'] * train_test['GarageCars']
train_test['TotalSF'] = train_test['1stFlrSF'] + train_test['2ndFlrSF'] + train_test['TotalBsmtSF']

In [404]:
Features = ['TotalBsmtSF', 'CentralAir', '2ndFlrSF', 'Fireplaces', 'GarageArea', 'WoodDeckSF', 'PoolArea', 'MiscVal']
train_test['Features'] = train_test[Features].gt(0).sum(axis=1)

train_test['MedNhbdArea'] = (train_test.groupby('Neighborhood')['GrLivArea'].transform('median'))  

To use K-Means to create a new feature. First to identify the features to measure against, then to scale them down, and then run the K-means operation.

In [405]:
space = ['LotArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea']

train_test_scaled = train_test.loc[:, space]
train_test_scaled = (train_test_scaled - train_test_scaled.mean(axis=0)) / train_test_scaled.std(axis=0)

kmeans = KMeans(n_clusters=10, random_state=0)
train_test['Cluster'] = kmeans.fit_predict(train_test_scaled)

To create a K-means cluster distance feature from the space features above to create 10 new Centroid features. 

In [406]:
train_test_cd = kmeans.fit_transform(train_test_scaled)

train_test_cd = pd.DataFrame(train_test_cd, columns=[f"Centroid_{i}" for i in range(train_test_cd.shape[1])])
train_test = train_test.join(train_test_cd)

In [407]:
train_test_cd.head()

Unnamed: 0,Centroid_0,Centroid_1,Centroid_2,Centroid_3,Centroid_4,Centroid_5,Centroid_6,Centroid_7,Centroid_8,Centroid_9
0,2.805477,4.056948,2.918934,1.371359,3.092016,3.354219,2.737796,4.613459,2.630429,0.727621
1,4.203703,2.48199,1.163126,2.948924,0.781183,2.571052,3.407897,3.729459,1.440709,2.752441
2,3.536584,3.683415,3.077031,1.904208,3.04146,3.746463,2.21721,4.01612,2.690726,0.576227
3,3.320031,3.894243,3.161313,1.941166,3.178007,3.675633,2.500895,4.34658,2.627652,0.844579
4,4.968461,3.005393,4.072529,3.502604,3.319298,5.06012,1.077796,3.402015,3.517645,1.913878


To set up the Principle Component Analysis Function

In [408]:
def apply_pca(X, standardize=True):
    # Standardize
    if standardize:
        X = (X - X.mean(axis=0)) / X.std(axis=0)
    # Create principal components
    pca = PCA()
    X_pca = pca.fit_transform(X)
    # Convert to dataframe
    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    X_pca = pd.DataFrame(X_pca, columns=component_names)
    # Create loadings
    loadings = pd.DataFrame(
        pca.components_.T,  # transpose the matrix of loadings
        columns=component_names,  # so the columns are the principal components
        index=X.columns,  # and the rows are the original features
    )
    return pca, X_pca, loadings

In [409]:
pca_features = ['OverallQual', 'Neighborhood', 'GrLivArea', 'GarageArea', 'KitchenQual']

pca, X_pca, loadings = apply_pca(train_test[pca_features])
print(loadings)

                   PC1       PC2       PC3       PC4       PC5
OverallQual   0.537033  0.130036  0.056504 -0.106121 -0.824762
Neighborhood  0.229456 -0.968511 -0.017691 -0.094679  0.007677
GrLivArea     0.477156  0.184109 -0.468866 -0.608047  0.385836
GarageArea    0.483240  0.045932 -0.339726  0.780826  0.198156
KitchenQual  -0.444687 -0.095239 -0.813170  0.019067 -0.362731


In [410]:
pca_8_features = ['OverallQual', 'Neighborhood', 'GrLivArea', 'GarageArea', 'YearBuilt', 'KitchenQual', 
                'BsmtQual', '1stFlrSF']

pca, X_pca, loadings = apply_pca(train_test[pca_8_features])
print(loadings)

                   PC1       PC2       PC3       PC4       PC5       PC6  \
OverallQual   0.434180 -0.072700  0.008969  0.129446  0.324256 -0.015019   
Neighborhood  0.150541  0.590865 -0.779603 -0.074241  0.088942 -0.070357   
GrLivArea     0.350983  0.335006  0.412273  0.088996  0.617178 -0.026382   
GarageArea    0.383047  0.115635  0.089148 -0.308257 -0.245647  0.811864   
YearBuilt     0.365493 -0.467402 -0.223303 -0.355340 -0.054683 -0.136110   
KitchenQual  -0.337976  0.078467  0.109071 -0.833750  0.346380 -0.087019   
BsmtQual      0.402110 -0.376713 -0.160681 -0.104154  0.111008 -0.248124   
1stFlrSF      0.330765  0.391737  0.355870 -0.206242 -0.557140 -0.497334   

                   PC7       PC8  
OverallQual  -0.782897  0.266590  
Neighborhood  0.036300 -0.027311  
GrLivArea     0.335537 -0.304143  
GarageArea    0.072139  0.110738  
YearBuilt    -0.062751 -0.668106  
KitchenQual  -0.128761  0.168668  
BsmtQual      0.489908  0.588074  
1stFlrSF     -0.086245  0.052660  


In [366]:
#train_test = train_test.join(X_pca)

In [367]:
train_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,Centroid_8,Centroid_9,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8
0,1,2.664197,3,15.967598,17.222845,1,1,3,3,0,...,2.630429,0.727621,0.960533,-1.606602,0.594178,0.173491,0.647924,0.512502,0.025688,-0.367022
1,2,2.192747,3,18.053783,17.639401,1,1,3,3,0,...,1.440709,2.752441,0.26531,0.93685,-1.572727,-1.002519,0.049585,-0.699815,0.14159,0.442192
2,3,2.664197,3,16.400996,18.166915,1,1,0,3,0,...,2.690726,0.576227,1.196566,-1.400557,0.776728,0.050736,0.519023,0.698819,0.081171,-0.311566
3,4,2.72244,3,15.224624,17.622217,1,1,0,3,0,...,2.627652,0.844579,-0.289799,0.577144,1.516,1.091104,0.338788,1.506363,-0.444203,0.840333
4,5,2.664197,3,18.579309,18.976034,1,1,0,3,0,...,3.517645,1.913878,2.598619,0.157085,0.080754,-0.380072,0.653756,1.077821,-0.180042,-0.186823


In [411]:
train_test['Neighbor_or_YearBlt'] = train_test['Neighborhood'] * train_test['YearBuilt']
train_test['GrLivArea_or_1stFlrSF'] = train_test['GrLivArea'] * train_test['1stFlrSF']
train_test['YearBlt_or_BsmtQual'] = train_test['YearBuilt'] * train_test['BsmtQual']
train_test['BrLivArea_or_GarageArea'] = train_test['GrLivArea'] * train_test['GarageArea']


To create a column with the log of the SalePrice to match the evaluation metric in the contest. 

In [412]:
train_test['LogSalePrice'] = train_test['SalePrice'].apply(np.log)

#  Setting Up and Running the Models

To separate the train_test dataset back into the train and test datasets and identify the independent and dependent columns. Because the dataset is so small, the cross fold validation process seemed to have much less overfitting than creating the separate validation and training sets. 

In [413]:
train = train_test[train_test['SalePrice'].notnull()].copy()
test = train_test[train_test['SalePrice'].isnull()].drop(['SalePrice','LogSalePrice'],axis=1)
X = train.drop(['SalePrice','LogSalePrice'],axis=1)
y = train.LogSalePrice

In [414]:
X,y = shuffle(X,y, random_state=42)
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

In [415]:
X.shape

(1460, 104)

To set up matplot parameters

In [416]:
Xy = X.copy()
Xy['Cluster'] = Xy.Cluster.astype("category")
Xy["LogSalePrice"] = y
sns.relplot(
    x="value", y='LogSalePrice', hue='Cluster', col="variable",
    height=4, aspect=1, facet_kws={'sharex': False}, col_wrap=3,
    data=Xy.melt(
        value_vars=space, id_vars=['LogSalePrice', 'Cluster'],
    ),
);

To set up the functions for Mutual Information scores.

In [417]:
discrete_features = X.dtypes == int

def mi_scores(X, y, discrete_features):
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

In [418]:
type(X)

pandas.core.frame.DataFrame

In [419]:
mi_scores = mi_scores(X, y, discrete_features)
mi_scores[0:35]

Centroid_5                 0.603281
TotalFlrSF                 0.571691
OverallQual                0.567742
Cluster                    0.547245
GrLivArea_or_1stFlrSF      0.546692
Centroid_6                 0.510210
Neighborhood               0.499822
Neighbor_or_YearBlt        0.486574
GrLivArea                  0.478270
MedNhbdArea                0.472585
GarageCarArea              0.397313
Centroid_3                 0.377931
BrLivArea_or_GarageArea    0.377908
GarageArea                 0.371306
YearBlt_or_BsmtQual        0.365790
YearBuilt                  0.364661
TotalBsmtSF                0.364305
GarageCars                 0.361493
TotalSF                    0.356510
Centroid_2                 0.349897
Centroid_1                 0.348842
Centroid_0                 0.336061
Centroid_7                 0.328447
ExterQual                  0.325330
KitchenQual                0.323876
Centroid_8                 0.311006
BsmtQual                   0.307389
GarageYrBlt                0

To set up the function for Principle Component Analysis

To set up the cross validation folds.

In [420]:
kf = KFold(n_splits=12, random_state=42, shuffle=True)

In a separate workbook, Optuna (https://optuna.org/) was used to find the optimal parameters for a wide selection of regression models. It was interesting that the results using Optuna on this dataset were not significantly better than using the default parameters for most of the models. 

In [421]:
cat_model = CatBoostRegressor(colsample_bylevel= 0.08309602563537534, learning_rate= 0.08286145675756133, depth= 4, 
                              l2_leaf_reg= 14.555249413444315, subsample= 0.9097411584295835, 
                              bagging_temperature= 3.177590955252409, model_size_reg= 0.3808343022980778, 
                              boosting_type= 'Plain', verbose=False, random_state=42)

To define the scoring metric. 

In [422]:
def rmse_cv(model,X,y):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf))
    return rmse

In [423]:
print(rmse_cv(cat_model,X,y).mean())

0.11700880869696308


Base score for CatBoost = 0.12337

With features Features and MedNhbdArea = 0.12058

With feature Cluster = 0.12061

With 10 new Centroid features = 0.11888

With the PCA features = 0.11701 Kaggle = 0.12557

With PCA features and PCA components = 0.1173 Kaggle = 0.12695

In [424]:
submit = test[['Id']]
submit = submit.reset_index(drop=True)

In [425]:
cat_model.fit(X,y)

<catboost.core.CatBoostRegressor at 0x2b6046f48e0>

In [426]:
submit_predict = cat_model.predict(test)
submit_predict = np.exp(submit_predict)

In [427]:
submit['SalePrice'] = submit_predict

In [428]:
submit.to_csv('submit_cat_more_features.csv', index=False)