In [355]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.gridspec as gridspec

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

In [356]:
# Lift pandas' column display limit so wide frames are shown in full.
pd.set_option('display.max_columns', None)

### Import Data

In [357]:
# Model train/test splits: discard the stray CSV index column and SalePrice,
# then key the rows by house Id.
train = (
    pd.read_csv('../data/Model Train.csv')
    .drop(columns=['Unnamed: 0', 'SalePrice'])
    .set_index('Id')
)
test = (
    pd.read_csv('../data/Model Test.csv')
    .drop(columns=['Unnamed: 0', 'SalePrice'])
    .set_index('Id')
)
# Kaggle test file: only re-indexed by Id, no columns dropped.
Ktest = pd.read_csv('../data/Kaggle_test.csv').set_index('Id')

# Feature Engineering

In [358]:
def pp(dataframe):
    """Return a feature-engineered copy of a raw housing frame.

    The input is copied first, so the caller's frame is left untouched.

    Transformations applied:
      * MSSubClass is collapsed to 'PUD' / 'NotPUD'.
      * Missing values in Alley, MasVnrType, GarageType, Fence and
        MiscFeature are filled with an explicit "absent" label.
      * Quality/condition/finish grades are replaced by integer scores
        (a missing grade becomes 0 where the column is filled first).
      * Area-like columns are log-transformed; those that can be zero
        use log(x + 1).
      * AgeBuilt, AgeRemodAdd, BsmtBath, Bath and PorchSF are derived and
        the columns they are built from (except YrSold) are dropped,
        as is GarageYrBlt.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw frame containing the columns referenced below.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.
    """
    df = dataframe.copy()

    # Shared grade -> score lookups.
    qual_map = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1}
    qual_map_absent = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'No':0}
    fin_type_map = {'GLQ':6,'ALQ':5,'BLQ':4,'Rec':3,'LwQ':2,'Unf':1,'No':0}

    # Ting
    df['MSSubClass'] = df['MSSubClass'].apply(
        lambda x: 'PUD' if x in [120,150,160,180] else 'NotPUD')
    df['Alley'] = df['Alley'].fillna('NoAlley')
    df['AgeBuilt'] = df['YrSold'] - df['YearBuilt']
    df = df.drop('YearBuilt',axis=1)
    df['AgeRemodAdd'] = df['YrSold'] - df['YearRemodAdd']
    df = df.drop('YearRemodAdd',axis=1)
    df['LotArea'] = np.log(df['LotArea'])
    # Missing LotFrontage stays NaN here.
    df['LotFrontage'] = np.log(df['LotFrontage'])

    # Lanqing
    df['MasVnrType'] = df['MasVnrType'].fillna('None')
    df['MasVnrArea'] = np.log(df['MasVnrArea'].fillna(0) + 1)
    for col in ['ExterQual','ExterCond']:
        df[col] = df[col].replace(qual_map)
    for col in ['BsmtQual','BsmtCond']:
        df[col] = df[col].fillna('No').replace(qual_map_absent)
    # 'No' is a real exposure grade here, so a missing value is labelled 'NoB'.
    df['BsmtExposure'] = df['BsmtExposure'].fillna('NoB').replace(
                                    {'Gd':4,'Av':3,'Mn':2,'No':1,'NoB':0})
    for col in ['BsmtFinType1','BsmtFinType2']:
        df[col] = df[col].fillna('No').replace(fin_type_map)
    # Each basement area is logged exactly once. The original logged BsmtUnfSF
    # twice and never touched TotalBsmtSF.
    for col in ['BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF']:
        df[col] = np.log(df[col]+1)

    # Marina
    df['HeatingQC'] = df['HeatingQC'].replace(qual_map)
    df['1stFlrSF'] = np.log(df['1stFlrSF'])
    df['2ndFlrSF'] = np.log(df['2ndFlrSF']+1)
    df['LowQualFinSF'] = np.log(df['LowQualFinSF']+1)
    df['GrLivArea'] = np.log(df['GrLivArea'])
    # A half bath counts as 0.5 of a bath.
    df['BsmtBath'] = df['BsmtFullBath'] + df['BsmtHalfBath']*0.5
    df = df.drop(['BsmtFullBath','BsmtHalfBath'],axis=1)
    df['Bath'] = df['FullBath'] + df['HalfBath']*0.5
    df = df.drop(['FullBath','HalfBath'],axis=1)
    df['KitchenQual'] = df['KitchenQual'].replace(qual_map)
    # Functional is scored as a deduction: 0 for 'Typ' up to 7 for 'Sal'.
    df['Functional'] = df['Functional'].replace({'Typ':0,'Min1':1,'Min2':2,'Mod':3,
                                                'Maj1':4,'Maj2':5,'Sev':6,'Sal':7})
    df['FireplaceQu'] = df['FireplaceQu'].fillna('No').replace(qual_map_absent)

    # Alex
    df['GarageType'] = df['GarageType'].fillna('NoGarage')
    # GarageYrBlt is dropped rather than turned into a garage age: it is hard to impute.
    df = df.drop('GarageYrBlt',axis=1)
    df['GarageFinish'] = df['GarageFinish'].fillna('No').replace(
                                           {'No':0,'Unf':1,'RFn':2,'Fin':3})
    df['GarageArea'] = np.log(df['GarageArea']+1)
    for col in ['GarageQual','GarageCond']:
        df[col] = df[col].fillna('No').replace(qual_map_absent)
    df['WoodDeckSF'] = np.log(df['WoodDeckSF']+1)
    # The four porch areas are summed into one feature before the log.
    df['PorchSF'] = np.log(df['OpenPorchSF']+df['EnclosedPorch']+
                           df['3SsnPorch']+df['ScreenPorch']+1)
    df = df.drop(['OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch'],axis=1)
    df['PoolArea'] = np.log(df['PoolArea']+1)
    df['PoolQC'] = df['PoolQC'].fillna('No').replace(qual_map_absent)
    df['Fence'] = df['Fence'].fillna('NoFence')
    df['MiscFeature'] = df['MiscFeature'].fillna('None')
    df['MiscVal'] = np.log(df['MiscVal']+1)

    return df

In [359]:
# Run the same feature engineering over all three splits.
train, test, Ktest = (pp(split) for split in (train, test, Ktest))

# Features

In [360]:
# Nominal categoricals: mode-imputed, then one-hot or label encoded.
cat_nom_cols = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape',
    'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
    'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
    'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
    'Heating', 'CentralAir', 'Electrical', 'GarageType', 'PavedDrive',
    'Fence', 'MiscFeature', 'MoSold', 'YrSold', 'SaleType',
    'SaleCondition',
]

# Ordered / count-like features: median-imputed together with num_cols.
cat_ord_cols = [
    'OverallQual', 'OverallCond', 'AgeBuilt', 'AgeRemodAdd',
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
    'BsmtBath', 'Bath', 'BedroomAbvGr', 'KitchenAbvGr',
    'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',
    'FireplaceQu', 'GarageFinish', 'GarageCars', 'GarageQual',
    'GarageCond', 'PoolQC',
]

# Continuous area / value features.
num_cols = [
    'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'GarageArea', 'WoodDeckSF', 'PorchSF', 'PoolArea', 'MiscVal',
]

# Column whose gaps are filled by the KNN imputer.
knn_col = ['LotFrontage']

In [361]:
# Count the missing values in the derived PorchSF column of the training frame.
train['PorchSF'].isna().sum()

0

# Impute Missing Values

In [362]:
# Mode imputation for nominal columns; the fill values are learned from the training split only.
cat_imp = SimpleImputer(strategy='most_frequent').fit(train[cat_nom_cols])

train_cat_imp, test_cat_imp, Ktest_cat_imp = (
    cat_imp.transform(split[cat_nom_cols]) for split in (train, test, Ktest)
)

In [363]:
# Median imputation for ordinal + numeric columns; the medians come from the training split only.
num_imp = SimpleImputer(strategy='median').fit(train[cat_ord_cols + num_cols])

train_num_imp, test_num_imp, Ktest_num_imp = (
    num_imp.transform(split[cat_ord_cols + num_cols]) for split in (train, test, Ktest)
)

# One Hot Encoding & Label Encoding

In [364]:
# Stack the imputed nominal columns of all three splits so that every category
# seen anywhere is known to the encoders.
all_cat = pd.DataFrame(np.concatenate((train_cat_imp,test_cat_imp,Ktest_cat_imp),
                                      axis=0),columns=cat_nom_cols)
# Most frequent value of each column; used as the dropped one-hot level.
drop_col = all_cat.apply(lambda col: col.value_counts().index[0])
# One list of observed categories per column, in cat_nom_cols order.
# Built with a comprehension because DataFrame.apply with a list-returning
# function yields a DataFrame (not a Series of lists) whenever all lists
# have the same length, which would make list(...) return column names.
sub_cat = [list(all_cat[col].unique()) for col in cat_nom_cols]

In [365]:
# Dense one-hot encoder over the fixed category lists, dropping each column's
# most frequent level.
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`.
    cat_ohe = OneHotEncoder(categories=sub_cat, drop=drop_col, sparse_output=False)
except TypeError:
    # Older releases only accept the original `sparse` keyword.
    cat_ohe = OneHotEncoder(categories=sub_cat, drop=drop_col, sparse=False)
cat_ohe = cat_ohe.fit(train_cat_imp)

# `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
# when available and fall back to the old name otherwise.
if hasattr(cat_ohe, 'get_feature_names_out'):
    new_cols = cat_ohe.get_feature_names_out(cat_nom_cols)
else:
    new_cols = cat_ohe.get_feature_names(cat_nom_cols)

In [366]:
# One-hot encode the imputed nominal columns of each split.
train_cat_ohe, test_cat_ohe, Ktest_cat_ohe = (
    cat_ohe.transform(block) for block in (train_cat_imp, test_cat_imp, Ktest_cat_imp)
)

In [367]:
# Integer (label) encoder that uses the same category lists as the one-hot encoder.
cat_label = OrdinalEncoder(categories=sub_cat).fit(train_cat_imp)

In [368]:
# Label-encode the imputed nominal columns of each split.
train_cat_label, test_cat_label, Ktest_cat_label = (
    cat_label.transform(block) for block in (train_cat_imp, test_cat_imp, Ktest_cat_imp)
)

# Scale

In [369]:
# Standardisation parameters are learned from the imputed training matrix only.
scl = StandardScaler().fit(train_num_imp)

In [370]:
# Standardise the ordinal + numeric matrix of each split.
train_num_scl, test_num_scl, Ktest_num_scl = (
    scl.transform(block) for block in (train_num_imp, test_num_imp, Ktest_num_imp)
)

# Impute LotFrontage Using KNN

In [371]:
# Separate scaler for LotFrontage so the imputed values can be mapped back later.
lotFront_scl = StandardScaler().fit(train[knn_col])

In [372]:
# Scale LotFrontage in every split with the training-fitted scaler.
lotFront_scl_train, lotFront_scl_test, lotFront_scl_Ktest = (
    lotFront_scl.transform(split[knn_col]) for split in (train, test, Ktest)
)

In [373]:
# KNN input: one-hot block, scaled numeric block, then scaled LotFrontage as
# the last column.
knn_train = np.hstack((train_cat_ohe, train_num_scl, lotFront_scl_train))
knn_test = np.hstack((test_cat_ohe, test_num_scl, lotFront_scl_test))
knn_Ktest = np.hstack((Ktest_cat_ohe, Ktest_num_scl, lotFront_scl_Ktest))

In [374]:
from math import sqrt

# Neighbour count: square root of the number of training rows, rounded.
n_neighbors = round(sqrt(len(train)))
knn_imp = KNNImputer(n_neighbors=n_neighbors).fit(knn_train)

In [375]:
# Impute every split and keep only the last column (scaled LotFrontage).
lotFront_scl_train_imp, lotFront_scl_test_imp, lotFront_scl_Ktest_imp = (
    knn_imp.transform(matrix)[:, -1] for matrix in (knn_train, knn_test, knn_Ktest)
)

In [376]:
# Map the imputed, scaled LotFrontage back to its pre-scaling values.
# The imputed vectors are 1-D, while inverse_transform validates its input as a
# 2-D (n_samples, n_features) array in recent scikit-learn, so each vector is
# passed as a single column and flattened back to 1-D afterwards.
lotFront_train_imp = lotFront_scl.inverse_transform(
    lotFront_scl_train_imp.reshape(-1, 1)).ravel()
lotFront_test_imp = lotFront_scl.inverse_transform(
    lotFront_scl_test_imp.reshape(-1, 1)).ravel()
lotFront_Ktest_imp = lotFront_scl.inverse_transform(
    lotFront_scl_Ktest_imp.reshape(-1, 1)).ravel()

# Output Data for Tree Models: Label Encoding, No Scaling

In [377]:
# Tree-model layout: imputed ordinal + numeric block, label-encoded nominal
# block, then the imputed (unscaled) LotFrontage column.
tree_cols = cat_ord_cols + num_cols + cat_nom_cols + knn_col

train_tree = pd.DataFrame(
    np.hstack((train_num_imp, train_cat_label, lotFront_train_imp.reshape(-1, 1))),
    columns=tree_cols,
)
test_tree = pd.DataFrame(
    np.hstack((test_num_imp, test_cat_label, lotFront_test_imp.reshape(-1, 1))),
    columns=tree_cols,
)
Ktest_tree = pd.DataFrame(
    np.hstack((Ktest_num_imp, Ktest_cat_label, lotFront_Ktest_imp.reshape(-1, 1))),
    columns=tree_cols,
)

In [378]:
# Put the house Id back as the first column of each tree frame.
train_tree = pd.concat([train.reset_index()['Id'], train_tree], axis='columns')
test_tree = pd.concat([test.reset_index()['Id'], test_tree], axis='columns')
Ktest_tree = pd.concat([Ktest.reset_index()['Id'], Ktest_tree], axis='columns')

In [379]:
# Write the tree-model frames to disk without the positional index.
for _frame, _path in (
    (train_tree, '../data/train_tree.csv'),
    (test_tree, '../data/test_tree.csv'),
    (Ktest_tree, '../data/Ktest_tree.csv'),
):
    _frame.to_csv(_path, index=False)

# Output Data for Other Models: One Hot Encoding, Scaling

In [380]:
# Layout for non-tree models: scaled ordinal + numeric block, one-hot nominal
# block, then the imputed (still scaled) LotFrontage column.
# The arrays must be concatenated in the same order as the names in other_cols.
# The original stacked the one-hot block first, which attached every label to
# the wrong column without raising, because the total width was unchanged.
other_cols = cat_ord_cols+num_cols+list(new_cols)+knn_col
train_other = pd.DataFrame(np.concatenate((train_num_scl,train_cat_ohe,
                                          lotFront_scl_train_imp.reshape(-1,1)),axis=1),
                          columns=other_cols)
test_other = pd.DataFrame(np.concatenate((test_num_scl,test_cat_ohe,
                                          lotFront_scl_test_imp.reshape(-1,1)),axis=1),
                          columns=other_cols)
Ktest_other = pd.DataFrame(np.concatenate((Ktest_num_scl,Ktest_cat_ohe,
                                          lotFront_scl_Ktest_imp.reshape(-1,1)),axis=1),
                          columns=other_cols)

In [381]:
# Put the house Id back as the first column of each frame.
train_other = pd.concat([train.reset_index()['Id'], train_other], axis='columns')
test_other = pd.concat([test.reset_index()['Id'], test_other], axis='columns')
Ktest_other = pd.concat([Ktest.reset_index()['Id'], Ktest_other], axis='columns')

In [382]:
# Write the one-hot / scaled frames to disk without the positional index.
for _frame, _path in (
    (train_other, '../data/train_other.csv'),
    (test_other, '../data/test_other.csv'),
    (Ktest_other, '../data/Ktest_other.csv'),
):
    _frame.to_csv(_path, index=False)

In [383]:
# Show the level dropped from each nominal column by the one-hot encoder
# (its most frequent value across the train, test and Kaggle splits combined).
drop_col

MSSubClass        NotPUD
MSZoning              RL
Street              Pave
Alley            NoAlley
LotShape             Reg
LandContour          Lvl
Utilities         AllPub
LotConfig         Inside
LandSlope            Gtl
Neighborhood       NAmes
Condition1          Norm
Condition2          Norm
BldgType            1Fam
HouseStyle        1Story
RoofStyle          Gable
RoofMatl         CompShg
Exterior1st      VinylSd
Exterior2nd      VinylSd
MasVnrType          None
Foundation         PConc
Heating             GasA
CentralAir             Y
Electrical         SBrkr
GarageType        Attchd
PavedDrive             Y
Fence            NoFence
MiscFeature         None
MoSold                 6
YrSold              2007
SaleType              WD
SaleCondition     Normal
dtype: object