In [109]:
import numpy as np
import random
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
from matplotlib import pyplot as plt
import seaborn as sns 
%matplotlib inline 
color = sns.color_palette()
sns.set_style('darkgrid')
from sklearn.model_selection import GridSearchCV,train_test_split
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler

In [110]:
train_df = pd.read_csv('train.csv', index_col=0)
predict_df = pd.read_csv('test.csv', index_col=0)
y_org = train_df['SalePrice']
y = np.log1p(train_df.pop('SalePrice'))

In [111]:
# 3. Data cleaning

## 3.1 Remove outliers 

data_all = [train_df,y,y_org]
outlier1 = (train_df['LotFrontage']>250) & (y_org<300000)
outlier2 = (train_df['BsmtFinSF1']>3000) & (y_org<200000)
outlier3 = (train_df['1stFlrSF']>4000) & (y_org<200000)
outlier4 = (train_df['GrLivArea']>4000) & (y_org<210000)
outlier4 = (train_df['LotArea']>100000)
outlier = outlier1|outlier2|outlier3|outlier4
for dataframe in data_all:
    dataframe.drop(outlier[outlier].index,inplace=True,errors='ignore')

In [112]:
## 3.2 Combine train data and predict data

all_df = pd.concat((train_df,predict_df),axis=0)
all_df.drop(columns='Utilities',inplace=True)

train_df.shape

predict_df.shape

all_df.shape

(2913, 78)

In [113]:
## 3.3 Filling Nans which is not missing by random

nan_all_df = all_df.isna().sum().sort_values(ascending=False)
nan_all_df[nan_all_df>0]

PoolQC          2904
MiscFeature     2810
Alley           2715
Fence           2342
FireplaceQu     1420
LotFrontage      483
GarageCond       159
GarageYrBlt      159
GarageQual       159
GarageFinish     159
GarageType       157
BsmtCond          82
BsmtExposure      82
BsmtQual          81
BsmtFinType2      80
BsmtFinType1      79
MasVnrType        24
MasVnrArea        23
MSZoning           4
BsmtHalfBath       2
Functional         2
BsmtFullBath       2
BsmtUnfSF          1
BsmtFinSF2         1
BsmtFinSF1         1
TotalBsmtSF        1
Exterior2nd        1
SaleType           1
Exterior1st        1
Electrical         1
KitchenQual        1
GarageCars         1
GarageArea         1
dtype: int64

In [114]:
all_df['hasPool'] = 1
all_df['hasMiscFeature'] = 1
all_df['hasAlley'] = 1
all_df['hasFence'] = 1
all_df['hasFireplace'] = 1
all_df['hasGarage'] = 1
all_df['hasbsmt'] = 1
all_df['hasPool'].loc[all_df.PoolQC.isna()] = 0
all_df['hasMiscFeature'].loc[all_df.MiscFeature.isna()] = 0
all_df['hasAlley'].loc[all_df.Alley.isna()] = 0
all_df['hasFence'].loc[all_df.Fence.isna()] = 0
all_df['hasFireplace'].loc[all_df.FireplaceQu.isna()] = 0
all_df['hasGarage'].loc[all_df.GarageCond.isna()] = 0
all_df['hasbsmt'].loc[all_df.BsmtExposure.isna()] = 0

In [115]:
all_df.PoolQC.fillna('NoPool',inplace=True)
all_df.MiscFeature.fillna('NoMiscFeature',inplace=True)
all_df.Alley.fillna('NoAlley',inplace=True)
all_df.Fence.fillna('NoFence',inplace=True)
all_df.FireplaceQu.fillna('NoFireplace',inplace=True)
all_df.GarageCond.fillna('NoGarage',inplace=True)
all_df.GarageQual.fillna('NoGarage',inplace=True)
all_df.GarageFinish.fillna('NoGarage',inplace=True)
all_df.GarageType.fillna('NoGarage',inplace=True)
all_df.BsmtExposure.fillna('Nobsmt',inplace=True)
all_df.BsmtCond.fillna('Nobsmt',inplace=True)
all_df.BsmtQual.fillna('Nobsmt',inplace=True)
all_df.BsmtFinType2.fillna('Nobsmt',inplace=True)
all_df.BsmtFinType1.fillna('Nobsmt',inplace=True)

In [116]:
## 3.4 Map string variables to numerical variables

ExterQual_rep = {'Gd':3, 'TA':2, 'Ex':4, 'Fa':1}
all_df.replace({"ExterQual": ExterQual_rep},inplace=True)

ExterCond_rep = {'Gd':3, 'TA':2, 'Ex':4, 'Fa':1,'Po':0}
all_df.replace({"ExterCond": ExterCond_rep},inplace=True)

BsmtQual_rep = {'Gd':3, 'TA':2, 'Ex':4, 'Fa':1, 'Nobsmt':0}
all_df.replace({"BsmtQual": BsmtQual_rep},inplace=True)

BsmtCond_rep = {'TA':3, 'Gd':4, 'Nobsmt':0, 'Fa':2, 'Po':1}
all_df.replace({"BsmtCond": BsmtCond_rep},inplace=True)

BsmtExposure_rep = {'No':1, 'Gd':4, 'Mn':2, 'Av':3, 'Nobsmt':0}
all_df.replace({"BsmtExposure": BsmtExposure_rep},inplace=True)

HeatingQC_rep = {'Ex':4, 'Gd':3, 'TA':2, 'Fa':1, 'Po':0}
all_df.replace({"HeatingQC": HeatingQC_rep},inplace=True)

KitchenQual_rep = {'Gd':2, 'TA':1, 'Ex':3, 'Fa':0, 1492:0}
all_df.replace({"KitchenQual": KitchenQual_rep},inplace=True)

FireplaceQu_rep = {'NoFireplace':0, 'TA':3, 'Gd':4, 'Fa':2, 'Ex':5, 'Po':1}
all_df.replace({"FireplaceQu": FireplaceQu_rep},inplace=True)

GarageQual_rep = {'TA':3, 'Fa':2, 'Gd':4, 'NoGarage':0, 'Ex':5, 'Po':1}
all_df.replace({"GarageQual": GarageQual_rep},inplace=True)

GarageCond_rep = {'TA':3, 'Fa':2, 'NoGarage':0, 'Gd':4, 'Po':1, 'Ex':5}
all_df.replace({"GarageCond": GarageCond_rep},inplace=True)

PoolQC_rep = {'NoPool':0, 'Ex':3, 'Fa':1, 'Gd':2}
all_df.replace({"PoolQC": PoolQC_rep},inplace=True)

#Group by neighborhood and fill in missing value by the median LotFrontage of all the neighborhood
all_df["LotFrontage"] = all_df.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    all_df[col] = all_df[col].fillna(0)
    
all_df["MasVnrType"] = all_df["MasVnrType"].fillna("None")
all_df["MasVnrArea"] = all_df["MasVnrArea"].fillna(0)
all_df['MSZoning'] = all_df['MSZoning'].fillna(all_df['MSZoning'].mode()[0])
for col in ('GarageYrBlt','GarageArea', 'GarageCars'):
    all_df[col] = all_df[col].fillna(0)
    
all_df["Functional"] = all_df["Functional"].fillna("Typ")

all_df['Electrical'] = all_df['Electrical'].fillna(all_df['Electrical'].mode()[0])

all_df['KitchenQual'] = all_df['KitchenQual'].fillna(all_df['KitchenQual'].mode()[0])

all_df['Exterior1st'] = all_df['Exterior1st'].fillna(all_df['Exterior1st'].mode()[0])
all_df['Exterior2nd'] = all_df['Exterior2nd'].fillna(all_df['Exterior2nd'].mode()[0])

all_df['SaleType'] = all_df['SaleType'].fillna(all_df['SaleType'].mode()[0])

#all_df['GarageYrBlt'] = all_df['GarageYrBlt'].apply(lambda x: np.random.choice(train_df['GarageYrBlt'].loc[(y>11.3) & (y<11.7)].dropna()) if (np.isnan(x)) else x)

In [117]:
#encode binary variables
from sklearn.preprocessing import LabelEncoder
cols = ('BsmtFinType1','BsmtFinType2', 'Functional', 'Fence',  'GarageFinish', 'LandSlope', 'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass')
# process columns, apply LabelEncoder to categorical features
for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(all_df[c].values)) 
    all_df[c] = lbl.transform(list(all_df[c].values))


In [118]:
# 4 Feature engineering 

## 4.1 Creading new features

all_df['TotalSF'] = all_df['TotalBsmtSF'] + all_df['1stFlrSF'] + all_df['2ndFlrSF']
all_df['TotalBathroom'] = all_df['BsmtFullBath'] + all_df['BsmtHalfBath'] + all_df['FullBath'] + all_df['HalfBath']

In [119]:
## 4.2 remove the skewness in the distribution

cat_feats = ['MSSubClass','MSZoning', 'LandContour', 'LotConfig','OverallCond', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating',
       'Electrical', 'GarageType', 'MiscFeature', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition']
num_feats = [x for x in list(all_df) if x not in cat_feats]

In [120]:
conti_feats = ['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea','GarageYrBlt','GarageArea','WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'TotalSF']

In [121]:
# Check the skew of all numerical features
skewed_feats = all_df[num_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(10)

skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = conti_feats;#skewness.index
lam = 0.15
for feat in skewed_features:
#     plt.figure()
#     plt.subplot(1,2,1)
#     sns.distplot(all_df[feat])
#     plt.subplot(1,2,2)
#     sns.distplot(boxcox1p(all_df[feat], lam))
    all_df[feat] = boxcox1p(all_df[feat], lam)


Skew in numerical features: 

There are 63 skewed numerical features to Box Cox transform


In [122]:
#all_df[num_feats] = MinMaxScaler().fit_transform(all_df[num_feats])

In [123]:
# for feat in num_feats:
#     plt.figure()
#     plt.scatter(all_df[feat].loc[train_df.index],y)
#     plt.title(feat)
#     plt.show

In [124]:
## 4.3 Creat one-hot coding for class variables 

all_df[cat_feats] = all_df[cat_feats].astype(str)
all_dummy_df = pd.get_dummies(all_df)
all_dummy_df.head()

all_dummy_df.shape

(2913, 266)

In [125]:
train = all_dummy_df.loc[train_df.index]
predict = all_dummy_df.loc[predict_df.index]

In [126]:
import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV,train_test_split
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns
from sklearn.feature_selection import SelectKBest,f_regression,mutual_info_regression
from sklearn.neighbors import KNeighborsRegressor
import matplotlib.pyplot as plt
%matplotlib inline

In [127]:
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, train.values, y.values, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

class grid():
    def __init__(self,model):
        self.model = model
    
    def grid_get(self,param_grid):
        kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
        grid_search = GridSearchCV(self.model,param_grid,cv=kf, scoring="neg_mean_squared_error", n_jobs=-1,verbose=2)
        grid_search.fit(train.values, y.values)
        print(grid_search.best_params_, np.sqrt(-grid_search.best_score_))
        grid_search.cv_results_['mean_test_score'] = np.sqrt(-grid_search.cv_results_['mean_test_score'])
        print(pd.DataFrame(grid_search.cv_results_)[['params','mean_test_score','std_test_score']])


In [128]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0007, random_state=1))
score= rmsle_cv(lasso)
print("Lasso score: {:.5f} ({:.5f})\n".format(score.mean(), score.std()))

Lasso score: 0.11635 (0.01100)

