# Preparing data

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
data_train=pd.read_csv('../data/house-prices-advanced-regression-techniques/train.csv')
print('train size={}'.format(data_train.shape))

data_test=pd.read_csv('../data/house-prices-advanced-regression-techniques/test.csv')
print('train test={}'.format(data_test.shape))

train size=(1460, 81)
train test=(1459, 80)


## Drop data

* columns with > 0.90 missing values <br>['PoolQC', 'MiscFeature', 'Alley']

* most of the values present (see the 75 quantile below) in these variables are 0, thus these features can be cleared during the data preprocessing step.<br>
['BsmtFinSF2','LowQualFinSF','BsmtHalfBath','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal']

* imbalance classes big_class_ratio>=0.90<br>
['Street','Utilities','LandSlope','Condition2','RoofMatl','BsmtCond','Heating','CentralAir','Electrical','Functional','GarageQual','GarageCond','PavedDrive','MiscFeature']

* Highly correlated features r2>0.80 <br>
 'YearBuilt','GarageYrBlt'
 'GarageCars','GarageArea'
 'TotRmsAbvGrd','GrLivArea'
 'TotalBsmtSF','1stFlrSF'

In [44]:
empty_columns=['PoolQC','MiscFeature','Alley','Fence']
lot_of_zeros=['BsmtFinSF2','LowQualFinSF','BsmtHalfBath','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal']
imbalance_classes=['Street','Utilities','LandSlope','Condition2','RoofMatl','BsmtCond','Heating','CentralAir','Electrical','Functional','GarageQual','GarageCond','PavedDrive','MiscFeature']
correlated_features=['GarageYrBlt','GarageArea','GrLivArea','1stFlrSF']

drop_columns=empty_columns+lot_of_zeros+imbalance_classes+correlated_features

data_train_cl=data_train.drop(drop_columns,axis=1)
print(len(drop_columns))


# NA for Basement section means no basement in the house
values = {"BsmtQual": 'NoBsmt', "BsmtCond": 'NoBsmt',"BsmtFinType1":'NoBsmt',"BsmtFinType2":'NoBsmt',"BsmtExposure":'NoBsmt'}
data_train_cl.fillna(value=values,inplace=True)

# NA in 'FireplaceQu' means no FirePlace in the house (missing ratio 0.486468)
data_train_cl.fillna(value={"FireplaceQu":"NoFirePlace"}, inplace=True)

# NA for Garage section means no Garage in the house (0.054471)
values = {"GarageType": 'NoGarage', "GarageFinish":"NoGarage","GarageQual":"NoGarage","GarageCond":"NoGarage"}
data_train_cl.fillna(value=values,inplace=True)

# replace 'LotFrontage' NA with median value (0.166495)
data_train_cl.fillna(value={'LotFrontage':np.nanmedian(data_train_cl['LotFrontage'])},inplace=True)

# left features with missing ratio (0.005)
data_train_cl.dropna(inplace=True)

30


## Feature Engineering

In [50]:
data_train_cl.corr()['SalePrice'].sort_values(ascending=False)

SalePrice       1.000000
OverallQual     0.789997
GarageCars      0.639686
TotalBsmtSF     0.612971
FullBath        0.562491
TotRmsAbvGrd    0.536311
YearBuilt       0.522896
YearRemodAdd    0.507158
MasVnrArea      0.477493
Fireplaces      0.468930
BsmtFinSF1      0.383977
LotFrontage     0.333184
WoodDeckSF      0.324650
2ndFlrSF        0.322710
OpenPorchSF     0.311268
HalfBath        0.282040
LotArea         0.264674
BsmtFullBath    0.225027
BsmtUnfSF       0.215740
BedroomAbvGr    0.171934
MoSold          0.045136
Id             -0.025343
YrSold         -0.026180
OverallCond    -0.076294
MSSubClass     -0.082813
KitchenAbvGr   -0.137419
Name: SalePrice, dtype: float64

In [57]:
data_train_cl['HouseAge']=data_train_cl['YrSold']-data_train_cl['YearBuilt']
data_train_cl['Time_From_Remodel']=data_train_cl['YrSold']-data_train_cl['YearRemodAdd']

In [58]:
data_train_cl.corr()['SalePrice'].sort_values(ascending=False)

SalePrice            1.000000
OverallQual          0.789997
GarageCars           0.639686
TotalBsmtSF          0.612971
FullBath             0.562491
TotRmsAbvGrd         0.536311
YearBuilt            0.522896
YearRemodAdd         0.507158
MasVnrArea           0.477493
Fireplaces           0.468930
BsmtFinSF1           0.383977
LotFrontage          0.333184
WoodDeckSF           0.324650
2ndFlrSF             0.322710
OpenPorchSF          0.311268
HalfBath             0.282040
LotArea              0.264674
BsmtFullBath         0.225027
BsmtUnfSF            0.215740
BedroomAbvGr         0.171934
MoSold               0.045136
Id                  -0.025343
YrSold              -0.026180
OverallCond         -0.076294
MSSubClass          -0.082813
KitchenAbvGr        -0.137419
Time_From_Remodel   -0.508947
HouseAge            -0.523210
Name: SalePrice, dtype: float64

In [59]:
data_train_cl['Time_From_Remodel'].min()

-1