## Preprocessing and Feature Engineering

In [79]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

In [80]:
# read the data
data = pd.read_csv('../data/data2.csv')

---

#### Contrast coding for categorical features

In [81]:
# names of categorical features
cat_cols = [
    'house_style', 'heating', 'central_air', 'electrical', 'paved_drive', 'garage_finish', 'fence', 
    'bsmtfin_type_1',
    # added features that were among the numeric features, but they need to be converted to categorical features
    'bsmt_cond', 'fireplace_qu', 'garage_qual', 'garage_cond', 'pool_area']

In [82]:
data['house_style'].value_counts()

story_1               1050
story_2                595
story_1_half_fin       218
split_level             91
split_foyer             49
story_2_half_unfin      13
story_1_half_unfin      12
story_2_half_fin         5
Name: house_style, dtype: int64

In [83]:
data['house_style'].astype('category').cat.reorder_categories(['split_foyer','split_level','story_1',
                                                               'story_1_half_unfin','story_1_half_fin',
                                                               'story_2','story_2_half_unfin','story_2_half_fin'],
                                                             inplace=True)

In [84]:
encoder = ce.BackwardDifferenceEncoder()
ce_house_style = encoder.fit_transform(data['house_style']).drop(columns='intercept')
ce_house_style.rename(columns = {
    'house_style_0' : 'house_style-split_level_VS_split_foyer',
    'house_style_1' : 'house_style-story_1_VS_split_level',
    'house_style_2' : 'house_style-story_1_half_unfin_VS_story_1',
    'house_style_3' : 'house_style-story_1_half_fin_VS_story_1_half_unfin',
    'house_style_4' : 'house_style-story_2_VS_story_1_half_fin',
    'house_style_5' : 'house_style-story_2_half_unfin_VS_story_2',
    'house_style_6' : 'house_style-story_2_half_fin_VS_story_2_half_unfin'
}, inplace=True)

  elif pd.api.types.is_categorical(cols):


In [7]:
# referent category is the best category: 'story_2_half_fin'
# dum_hous_style = pd.get_dummies(data['house_style'], prefix='story_2_half_fin_VS').drop(columns='story_2_half_fin_VS_story_2_half_fin')

In [85]:
data['heating'].value_counts()

GasA    2000
GasW      20
Wall       6
Grav       5
OthW       2
Name: heating, dtype: int64

In [86]:
data['heating'].astype('category').cat.reorder_categories(['GasA','GasW','Grav','Wall','OthW'], inplace=True)

In [87]:
encoder = ce.BackwardDifferenceEncoder()
ce_heating = encoder.fit_transform(data['heating']).drop(columns='intercept')
ce_heating.rename(columns = {
    'heating_0' : 'heating-GasW_VS_GasA',
    'heating_1' : 'heating-Grav_VS_GasW',
    'heating_2' : 'heating-Wall_VS_Grav',
    'heating_3' : 'heating-OthW_VS_Wall' }, inplace=True)

  elif pd.api.types.is_categorical(cols):


In [11]:
# referent category is the no-gas category: 'OthW'
# dum_heating = pd.get_dummies(data['heating'], prefix='OthW_VS').drop(columns='OthW_VS_OthW')

In [88]:
data['central_air'].value_counts()

Y    1893
N     140
Name: central_air, dtype: int64

In [89]:
data['central_air'].astype('category').cat.reorder_categories(['N','Y'], inplace=True)

In [90]:
encoder = ce.BackwardDifferenceEncoder()
ce_central_air = encoder.fit_transform(data['central_air']).drop(columns='intercept')
ce_central_air.rename(columns = {
    'central_air_0' : 'central_air_VS_no_central_air'}, inplace=True)

  elif pd.api.types.is_categorical(cols):


In [15]:
# 1 is yes, 0 is no central A/C
# dum_central_air = (data['central_air']=='Y').astype(int)

In [91]:
data['electrical'].value_counts()

SBrkr    1854
FuseA     137
FuseF      35
FuseP       7
Name: electrical, dtype: int64

In [92]:
data['electrical'].astype('category').cat.reorder_categories(['SBrkr','FuseA','FuseF','FuseP'], inplace=True)

In [93]:
encoder = ce.BackwardDifferenceEncoder()
ce_electrical = encoder.fit_transform(data['electrical']).drop(columns='intercept')
ce_electrical.rename(columns = {
    'electrical_0' : 'electrical-FuseA_VS_SBrkr',
    'electrical_1' : 'electrical-FuseF_VS_FuseA',
    'electrical_2' : 'electrical-FuseP_VS_FuseF'}, inplace=True)

  elif pd.api.types.is_categorical(cols):


In [19]:
# referent category will be the standard: 'SBrkr'
# dum_electrical = pd.get_dummies(data['electrical'], prefix='SBrkr_VS').drop(columns='SBrkr_VS_SBrkr')

In [94]:
data['paved_drive'].value_counts()

Y    1846
N     148
P      39
Name: paved_drive, dtype: int64

In [95]:
data['paved_drive'].astype('category').cat.reorder_categories(['N','P','Y'], inplace=True)

In [96]:
encoder = ce.BackwardDifferenceEncoder()
ce_paved_drive = encoder.fit_transform(data['paved_drive']).drop(columns='intercept')
ce_paved_drive.rename(columns = {
    'paved_drive_0' : 'paved_drive-P_VS_N',
    'paved_drive_1' : 'paved_drive-Y_VS_P'}, inplace=True)

  elif pd.api.types.is_categorical(cols):


In [23]:
# referent category is no-paved: 'N'
# dum_paved_drive = pd.get_dummies(data['paved_drive'], prefix='N_VS').drop(columns='N_VS_N')

In [97]:
data['garage_finish'].value_counts()

Unf          843
RFn          578
Fin          501
no_garage    111
Name: garage_finish, dtype: int64

In [98]:
data['garage_finish'].astype('category').cat.reorder_categories(['no_garage','Unf','RFn','Fin'], inplace=True)

In [99]:
encoder = ce.BackwardDifferenceEncoder()
ce_garage_finish = encoder.fit_transform(data['garage_finish']).drop(columns='intercept')
ce_garage_finish.rename(columns = {
    'garage_finish_0' : 'garage_finish-Unf_VS_no_garage',
    'garage_finish_1' : 'garage_finish-RFn_VS_Unf',
    'garage_finish_2' : 'garage_finish-Fin_VS_RFn'}, inplace=True)

  elif pd.api.types.is_categorical(cols):


In [27]:
# referent category is no garage
# dum_garage_finish = pd.get_dummies(data['garage_finish'], prefix='no_garage_VS').drop(columns='no_garage_VS_no_garage')

In [100]:
data['fence'].value_counts()

no_fence    1637
MnPrv        224
GdPrv         82
GdWo          80
MnWw          10
Name: fence, dtype: int64

In [101]:
data['fence'].astype('category').cat.reorder_categories(['no_fence','MnWw','GdWo','MnPrv','GdPrv'], inplace=True)

In [102]:
encoder = ce.BackwardDifferenceEncoder()
ce_fence = encoder.fit_transform(data['fence']).drop(columns='intercept')
ce_fence.rename(columns = {
    'fence_0' : 'fence-MnWw_VS_no_fence',
    'fence_1' : 'fence-GdWo_VS_MnWw',
    'fence_2' : 'fence-MnPrv_VS_GdWo',
    'fence_3' : 'fence-GdPrv_VS_MnPrv'}, inplace=True)

  elif pd.api.types.is_categorical(cols):


In [31]:
# referent category is no fence
# dum_fence = pd.get_dummies(data['fence'], prefix='no_fence_VS').drop(columns='no_fence_VS_no_fence')

In [103]:
data['bsmtfin_type_1'].value_counts()

GLQ            604
Unf            601
ALQ            292
BLQ            200
Rec            183
LwQ            100
no_basement     53
Name: bsmtfin_type_1, dtype: int64

In [104]:
data['bsmtfin_type_1'].astype('category').cat.reorder_categories(['no_basement','Unf','LwQ','Rec',
                                                                 'BLQ','ALQ','GLQ'], inplace=True)

In [105]:
encoder = ce.BackwardDifferenceEncoder()
ce_bsmtfin_type_1 = encoder.fit_transform(data['bsmtfin_type_1']).drop(columns='intercept')
ce_bsmtfin_type_1.rename(columns = {
    'bsmtfin_type_1_0' : 'bsmtfin_type_1-Unf_VS_no_basement',
    'bsmtfin_type_1_1' : 'bsmtfin_type_1-LwQ_VS_Unf',
    'bsmtfin_type_1_2' : 'bsmtfin_type_1-Rec_VS_LwQ',
    'bsmtfin_type_1_3' : 'bsmtfin_type_1-BLQ_VS_Rec',
    'bsmtfin_type_1_4' : 'bsmtfin_type_1-ALQ_VS_BLQ',
    'bsmtfin_type_1_5' : 'bsmtfin_type_1-GLQ_VS_ALQ'}, inplace=True)

  elif pd.api.types.is_categorical(cols):


In [35]:
# referent category is no basement
# dum_bsmtfin_type_1 = pd.get_dummies(data['bsmtfin_type_1'], prefix='no_basement_VS').drop(columns='no_basement_VS_no_basement')

In [106]:
data['bsmt_cond'].value_counts()

3.0    1824
4.0      85
2.0      64
1.0       4
5.0       3
Name: bsmt_cond, dtype: int64

In [107]:
data['bsmt_cond'] = ['5' if a==5 else '4' if a==4 else '3' if a==3 else '2' if a==2 else '1' if a==1 else 'no_basement' for a in data['bsmt_cond']]

In [108]:
data['bsmt_cond'].value_counts()

3              1824
4                85
2                64
no_basement      53
1                 4
5                 3
Name: bsmt_cond, dtype: int64

In [109]:
data['bsmt_cond'].astype('category').cat.reorder_categories(['no_basement','1','2','3',
                                                                 '4','5'], inplace=True)

In [110]:
encoder = ce.BackwardDifferenceEncoder()
ce_bsmt_cond = encoder.fit_transform(data['bsmt_cond']).drop(columns='intercept')
ce_bsmt_cond.rename(columns = {
    'bsmt_cond_0' : 'bsmt_cond-1_VS_no_basement',
    'bsmt_cond_1' : 'bsmt_cond-2_VS_1',
    'bsmt_cond_2' : 'bsmt_cond-3_VS_2',
    'bsmt_cond_3' : 'bsmt_cond-4_VS_3',
    'bsmt_cond_4' : 'bsmt_cond-5_VS_4'}, inplace=True)

  elif pd.api.types.is_categorical(cols):


In [41]:
# dum_bsmt_cond = pd.get_dummies(data['bsmt_cond'], prefix='no_basement_VS').drop(columns='no_basement_VS_no_basement')

In [111]:
data['fireplace_qu'].value_counts()

4.0    518
3.0    403
2.0     58
5.0     31
1.0     31
Name: fireplace_qu, dtype: int64

In [112]:
data['fireplace_qu'] = ['5' if a==5 else '4' if a==4 else '3' if a==3 else '2' if a==2 else '1' if a==1 else 'no_fireplace' for a in data['fireplace_qu']]

In [113]:
data['fireplace_qu'].value_counts()

no_fireplace    992
4               518
3               403
2                58
5                31
1                31
Name: fireplace_qu, dtype: int64

In [114]:
data['fireplace_qu'].astype('category').cat.reorder_categories(['no_fireplace','1','2','3',
                                                                 '4','5'], inplace=True)

In [115]:
encoder = ce.BackwardDifferenceEncoder()
ce_fireplace_qu = encoder.fit_transform(data['fireplace_qu']).drop(columns='intercept')
ce_fireplace_qu.rename(columns = {
    'fireplace_qu_0' : 'fireplace_qu-1_VS_no_fireplace',
    'fireplace_qu_1' : 'fireplace_qu-2_VS_1',
    'fireplace_qu_2' : 'fireplace_qu-3_VS_2',
    'fireplace_qu_3' : 'fireplace_qu-4_VS_3',
    'fireplace_qu_4' : 'fireplace_qu-5_VS_4'}, inplace=True)

  elif pd.api.types.is_categorical(cols):


In [47]:
# dum_fireplace_qu = pd.get_dummies(data['fireplace_qu'], prefix='no_fireplace_VS').drop(columns='no_fireplace_VS_no_fireplace')

In [116]:
data['garage_qual'].value_counts() 

3.0    1820
2.0      80
4.0      18
5.0       3
1.0       1
Name: garage_qual, dtype: int64

In [117]:
data['garage_qual'] = ['5' if a==5 else '4' if a==4 else '3' if a==3 else '2' if a==2 else '1' if a==1 else 'no_garage' for a in data['garage_qual']]

In [118]:
data['garage_qual'].value_counts() 

3            1820
no_garage     111
2              80
4              18
5               3
1               1
Name: garage_qual, dtype: int64

In [119]:
data['garage_qual'].astype('category').cat.reorder_categories(['no_garage','1','2','3',
                                                                 '4','5'], inplace=True)

In [120]:
encoder = ce.BackwardDifferenceEncoder()
ce_garage_qual = encoder.fit_transform(data['garage_qual']).drop(columns='intercept')
ce_garage_qual.rename(columns = {
    'garage_qual_0' : 'garage_qual-1_VS_no_garage',
    'garage_qual_1' : 'garage_qual-2_VS_1',
    'garage_qual_2' : 'garage_qual-3_VS_2',
    'garage_qual_3' : 'garage_qual-4_VS_3',
    'garage_qual_4' : 'garage_qual-5_VS_4'}, inplace=True)

  elif pd.api.types.is_categorical(cols):


In [53]:
# dum_garage_qual = pd.get_dummies(data['garage_qual'], prefix='no_garage_VS').drop(columns='no_garage_VS_no_garage')

In [121]:
data['garage_cond'].value_counts()

3.0    1855
2.0      47
4.0      12
1.0       6
5.0       2
Name: garage_cond, dtype: int64

In [122]:
data['garage_cond'] = ['5' if a==5 else '4' if a==4 else '3' if a==3 else '2' if a==2 else '1' if a==1 else 'no_garage' for a in data['garage_cond']]

In [123]:
data['garage_cond'].value_counts()

3            1855
no_garage     111
2              47
4              12
1               6
5               2
Name: garage_cond, dtype: int64

In [124]:
data['garage_cond'].astype('category').cat.reorder_categories(['no_garage','1','2','3',
                                                                 '4','5'], inplace=True)

In [125]:
encoder = ce.BackwardDifferenceEncoder()
ce_garage_cond = encoder.fit_transform(data['garage_cond']).drop(columns='intercept')
ce_garage_cond.rename(columns = {
    'garage_cond_0' : 'garage_cond-1_VS_no_garage',
    'garage_cond_1' : 'garage_cond-2_VS_1',
    'garage_cond_2' : 'garage_cond-3_VS_2',
    'garage_cond_3' : 'garage_cond-4_VS_3',
    'garage_cond_4' : 'garage_cond-5_VS_4'}, inplace=True)

  elif pd.api.types.is_categorical(cols):


In [59]:
# dum_garage_cond = pd.get_dummies(data['garage_cond'], prefix='no_garage_VS').drop(columns='no_garage_VS_no_garage')

In [126]:
data['pool_area'].value_counts()

0    2026
1       7
Name: pool_area, dtype: int64

In [127]:
data['pool_area'] = ['no_pool' if a==0 else 'pool' for a in data['pool_area']]

In [128]:
encoder = ce.BackwardDifferenceEncoder()
ce_pool_area = encoder.fit_transform(data['pool_area']).drop(columns='intercept')
ce_pool_area.rename(columns = {
    'pool_area_0' : 'pool_VS_no_pool'}, inplace=True)

  elif pd.api.types.is_categorical(cols):


In [146]:
# putting all the categorical features together
data_cat = pd.concat([ce_house_style, 
                      ce_heating, ce_central_air, ce_electrical, ce_paved_drive,
                      ce_garage_finish, ce_fence, ce_bsmtfin_type_1,
                     ce_bsmt_cond, ce_fireplace_qu, ce_garage_qual, ce_garage_cond, ce_pool_area], axis=1)

In [148]:
# make sure we don't have null values
data_cat.isnull().sum()

house_style-split_level_VS_split_foyer                0
house_style-story_1_VS_split_level                    0
house_style-story_1_half_unfin_VS_story_1             0
house_style-story_1_half_fin_VS_story_1_half_unfin    0
house_style-story_2_VS_story_1_half_fin               0
house_style-story_2_half_unfin_VS_story_2             0
house_style-story_2_half_fin_VS_story_2_half_unfin    0
heating-GasW_VS_GasA                                  0
heating-Grav_VS_GasW                                  0
heating-Wall_VS_Grav                                  0
heating-OthW_VS_Wall                                  0
central_air_VS_no_central_air                         0
electrical-FuseA_VS_SBrkr                             0
electrical-FuseF_VS_FuseA                             0
electrical-FuseP_VS_FuseF                             0
paved_drive-P_VS_N                                    0
paved_drive-Y_VS_P                                    0
garage_finish-Unf_VS_no_garage                  

In [149]:
data_cat.shape

(2033, 51)

---

#### Scaling numeric features

In [133]:
# numeric features to be scaled, except for the saleprice
num_cols = [
    'overall_qual', 'overall_cond', 'year_remod/add', 'exter_qual', 'exter_cond', 'kitchen_qual', 'full_bath',
    'half_bath', 'bedroom_abvgr', 'kitchen_abvgr', 'totrms_abvgrd', 'fireplaces', 'wood_deck_sf',
    'heating_qc', 'bsmtfin_sf_1', 'total_bsmt_sf', 'bsmt_full_bath', 'bsmt_half_bath']

In [134]:
data_num = data[num_cols]

In [135]:
# check again for null values
data_num.isnull().sum()

overall_qual      0
overall_cond      0
year_remod/add    0
exter_qual        0
exter_cond        0
kitchen_qual      0
full_bath         0
half_bath         0
bedroom_abvgr     0
kitchen_abvgr     0
totrms_abvgrd     0
fireplaces        0
wood_deck_sf      0
heating_qc        0
bsmtfin_sf_1      0
total_bsmt_sf     0
bsmt_full_bath    0
bsmt_half_bath    0
dtype: int64

In [136]:
sc = StandardScaler()

In [137]:
data_num_Z = sc.fit_transform(data_num)

In [138]:
data_num_Z = pd.DataFrame(data_num_Z, columns=data_num.columns)

---

#### Putting all the data together

In [152]:
data_num_Z.shape

(2033, 18)

In [153]:
data_cat.shape

(2033, 51)

In [154]:
data = pd.concat([data_cat, data_num_Z, data['saleprice']], axis=1)

In [155]:
data.head()

Unnamed: 0,house_style-split_level_VS_split_foyer,house_style-story_1_VS_split_level,house_style-story_1_half_unfin_VS_story_1,house_style-story_1_half_fin_VS_story_1_half_unfin,house_style-story_2_VS_story_1_half_fin,house_style-story_2_half_unfin_VS_story_2,house_style-story_2_half_fin_VS_story_2_half_unfin,heating-GasW_VS_GasA,heating-Grav_VS_GasW,heating-Wall_VS_Grav,...,kitchen_abvgr,totrms_abvgrd,fireplaces,wood_deck_sf,heating_qc,bsmtfin_sf_1,total_bsmt_sf,bsmt_full_bath,bsmt_half_bath,saleprice
0,-0.875,-0.75,-0.625,-0.5,-0.375,-0.25,-0.125,-0.8,-0.6,-0.4,...,-0.199992,-0.278128,-0.931714,-0.750407,0.871767,0.222219,-0.76957,-0.823172,-0.250759,130500
1,-0.875,-0.75,-0.625,-0.5,-0.375,-0.25,-0.125,-0.8,-0.6,-0.4,...,-0.199992,1.030746,0.657359,-0.750407,0.871767,0.459688,-0.328274,1.14104,-0.250759,220000
2,0.125,-0.75,-0.625,-0.5,-0.375,-0.25,-0.125,-0.8,-0.6,-0.4,...,-0.199992,-0.932564,-0.931714,-0.750407,-1.202315,0.674323,0.00974,1.14104,-0.250759,109000
3,-0.875,-0.75,-0.625,-0.5,-0.375,-0.25,-0.125,-0.8,-0.6,-0.4,...,-0.199992,0.376309,-0.931714,0.065353,-0.165274,-0.994809,-1.570005,-0.823172,-0.250759,174000
4,0.125,0.25,-0.625,-0.5,-0.375,-0.25,-0.125,-0.8,-0.6,-0.4,...,-0.199992,-0.278128,-0.931714,-0.750407,-1.202315,-0.994809,-0.884588,-0.823172,-0.250759,138500


In [156]:
data.shape

(2033, 70)

---

### **Save the data**

In [157]:
data.to_csv('../data/data3.csv', index=False)

---