In [1]:
# import modules
import numpy as np
import pandas as pd

from scipy import stats
from scipy.stats import norm, skew
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
# load dataset
train = pd.read_csv("house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("house-prices-advanced-regression-techniques/test.csv")

In [3]:
# extract labels and index
length = len(train)
train_id = train["Id"]
test_id = test["Id"]
train_label = train["SalePrice"]

full = pd.concat([train, test], axis = 0).reset_index().drop(columns=["Id", "index", "SalePrice"])

In [4]:
# observe ategorical data distribustion
cat_cols = [col for col in full.select_dtypes(include="object").columns]
def cols():
    for col in cat_cols:
        yield col, full[col].value_counts(dropna=False)
cols = cols()

for col in cols:
    print(col)

('Alley', NaN     2721
Grvl     120
Pave      78
Name: Alley, dtype: int64)
('BldgType', 1Fam      2425
TwnhsE     227
Duplex     109
Twnhs       96
2fmCon      62
Name: BldgType, dtype: int64)
('BsmtCond', TA     2606
Gd      122
Fa      104
NaN      82
Po        5
Name: BsmtCond, dtype: int64)
('BsmtExposure', No     1904
Av      418
Gd      276
Mn      239
NaN      82
Name: BsmtExposure, dtype: int64)
('BsmtFinType1', Unf    851
GLQ    849
ALQ    429
Rec    288
BLQ    269
LwQ    154
NaN     79
Name: BsmtFinType1, dtype: int64)
('BsmtFinType2', Unf    2493
Rec     105
LwQ      87
NaN      80
BLQ      68
ALQ      52
GLQ      34
Name: BsmtFinType2, dtype: int64)
('BsmtQual', TA     1283
Gd     1209
Ex      258
Fa       88
NaN      81
Name: BsmtQual, dtype: int64)
('CentralAir', Y    2723
N     196
Name: CentralAir, dtype: int64)
('Condition1', Norm      2511
Feedr      164
Artery      92
RRAn        50
PosN        39
RRAe        28
PosA        20
RRNn         9
RRNe         6
Name: Con

In [5]:
# feature processing
def transform(x):
    x = x.replace({"Alley":{np.nan:0, "Grvl":1, "Pave":2},
                   "BsmtCond":{"Ex":5,"Gd":4,"TA":3,"Fa":2,"Po":1,np.nan:0},
                   "BsmtExposure":{"Gd":4,"Av":3,"Mn":2,"No":1,np.nan:0},
                   "BsmtFinType1":{"GLQ":6,"ALQ":5,"BLQ":4,"Rec":3,"LwQ":2,"Unf":1, np.nan:0},
                   "BsmtFinType2":{"GLQ":6,"ALQ":5,"BLQ":4,"Rec":3,"LwQ":2,"Unf":1, np.nan:0},
                   "BsmtQual":{"Ex":5,"Gd":4,"TA":3,"Fa":2,"Po":1,np.nan:0},
                   "CentralAir":{"Y":1,"N":0},
                   "Electrical":{"SBrkr":4,"FuseA":3,"FuseF":2,"FuseP":1,"Mix":0,np.nan:0},
                   "ExterCond":{"Ex":4,"Gd":3,"TA":2,"Fa":1,"Po":0},
                   "Exterior1st":{np.nan:x.Exterior1st.mode()[0]},
                   "Exterior2nd":{np.nan:x.Exterior2nd.mode()[0]},
                   "ExterQual":{"Ex":4,"Gd":3,"TA":2,"Fa":1,"Po":0},
                   "Fence":{np.nan:"None"},
                   "FireplaceQu":{"Ex":5,"Gd":4,"TA":3,"Fa":2,"Po":1,np.nan:0},
                   "Functional":{"Typ":6,"Min1":5,"Min2":5,"Mod":4,"Maj1":3,"Maj2":3,"Sev":2,"Sal":1,np.nan:0},
                   "GarageCond":{"Ex":5,"Gd":4,"TA":3,"Fa":2,"Po":1,np.nan:0},
                   "GarageFinish":{"Fin":3,"RFn":2,"Unf":1,np.nan:0},
                   "GarageQual":{"Ex":5,"Gd":4,"TA":3,"Fa":2,"Po":1,np.nan:0},
                   "GarageType":{np.nan:"None"},
                   "HeatingQC":{"Ex":4,"Gd":3,"TA":2,"Fa":1,"Po":0},
                   "KitchenQual":{"Ex":5,"Gd":4,"TA":3,"Fa":2,"Po":1,np.nan:0},
                   "MSZoning":{np.nan:x.MSZoning.mode()[0]},
                   "MasVnrType":{np.nan:x.MasVnrType.mode()[0]},
                   "MiscFeature":{np.nan:"None"},
                   "PavedDrive":{"Y":2,"P":1,"N":0},
                   "PoolQC":{"Ex":4,"Gd":3,"TA":2,"Fa":1,np.nan:0},
                   "Street":{"Grvl":0, 'Pave':1},
                   "Utilities":{"AllPub":4,"NoSewr":3,"NoSeWa":2,"ELO":1,np.nan:0},
                   "SaleType":{np.nan:x.SaleType.mode()[0]}
                  })
    cat = ["MSSubClass", "BldgType", "Condition1", "Condition2", "Exterior1st", 
           "Exterior2nd", "Fence", "Foundation", "GarageType", "Heating", 
           "HouseStyle", "LandContour", "LandSlope", "LotConfig", "LotShape", 
           "MSZoning", "MasVnrType", "MiscFeature", "Neighborhood", "RoofMatl", 
           "RoofStyle", "SaleCondition", "SaleType", "YearBuilt", "YearRemodAdd",
           "MoSold", "YrSold"]
    x[cat] = x[cat].astype("object")
    num = [col for col in x.columns if col not in cat]
    x[num] = x[num].fillna(0)
    x[num].astype("int", inplace=True)
    
    #ohc = OneHotEncoder(sparse=False)
    #cat_ready = pd.DataFrame(ohc.fit_transform(x[cat]))
    dummies = pd.get_dummies(x, drop_first=True)
    
    mms = MinMaxScaler()
    ready = pd.DataFrame(mms.fit_transform(dummies))
    
    #data_ready = pd.concat([cat_ready, num_ready], axis=1)
    return ready

full = transform(full)
test = trandform()

In [25]:
# prepare dataset for ML 
train_pro = full[:length]
test = full[length:]

X_train, y_train, X_test, y_test = train_test_split(train_pro, train_label, test_size=0.2, random_state=1988)

In [7]:
# RMSE for evaluation
def rmse(true, test):
    rmse = np.sqrt(mean_squared_error(true, test))
    return rmse

In [8]:
# classic linear regression
lr = LinearRegression()
lr.fit(X_train, X_test)
pred_lr = lr.predict(y_train)
print(rmse(y_test, pred_lr))

5433138070716057.0


In [9]:
param_grid = {"alpha":np.linspace(100, 10000, 10)}
# lasso
ls = Lasso()
grid_ls = GridSearchCV(ls, param_grid=param_grid, cv=10)
grid_ls.fit(X_train, X_test)
pred_ls = grid_ls.predict(y_train)
print(grid_ls.best_params_)
print(grid_ls.best_score_)
print(rmse(y_test, pred_ls))

{'alpha': 100.0}
0.7685344776388543
24329.956783261598


In [10]:
# ridge
rg = Ridge()
grid_rg = GridSearchCV(rg, param_grid=param_grid, cv=10)
grid_rg.fit(X_train, X_test)
pred_rg = grid_rg.predict(y_train)
print(grid_rg.best_params_)
print(grid_rg.best_score_)
print(rmse(y_test, pred_rg))

{'alpha': 100.0}
0.7306199542527712
37740.45774073547


In [11]:
# tuning XGBoost round 1
# https://www.jianshu.com/p/9abdc030307d
param1 = {'max_depth': list(range(3,10)), 'min_child_weight': list((1,2,3,4,5,6))}
xgb1 = XGBRegressor(eta=0.1, num_boost_round=50, colsample_bytree=0.5, subsample=0.5, seed=1988)
grid_xgb1 = GridSearchCV(xgb1, param_grid=param1, cv=10)
grid_xgb1.fit(X_train, X_test)
pred_xgb1 = grid_xgb1.predict(y_train)
print(grid_xgb1.best_params_)
print(rmse(y_test, pred_xgb1))











{'max_depth': 6, 'min_child_weight': 3}
25698.37363683646


In [15]:
# tuning XGBoost round 2
param2 = {'gamma':np.linspace(0, 0.5, 10)}
xgb2 = XGBRegressor(eta=0.1, num_boost_round=50, colsample_bytree=0.5, subsample=0.5, max_depth=6, min_child_weight=3)
grid_xgb2 = GridSearchCV(xgb2, param_grid=param2, cv=10)
grid_xgb2.fit(X_train, X_test)
pred_xgb2 = grid_xgb2.predict(y_train)
print(grid_xgb2.best_params_)
print(rmse(y_test, pred_xgb2))



{'gamma': 0.0}
25041.53470537917


In [16]:
# tuning XGBoost round 3
param3 = {'eta':[0.5,0.4,0.3,0.2,0.1,0.075,0.05,0.04,0.03], "num_boost_round":np.linspace(0, 200, 20)}
xgb3 = XGBRegressor(colsample_bytree=0.5, subsample=0.5, max_depth=6, 
                    min_child_weight=3, gamma=0)
grid_xgb3 = GridSearchCV(xgb3, param_grid=param3, cv=10)
grid_xgb3.fit(X_train, X_test)
pred_xgb3 = grid_xgb3.predict(y_train)
print(grid_xgb3.best_params_)
print(rmse(y_test, pred_xgb3))
# no improvement, XGBoost done



















































{'eta': 0.5, 'num_boost_round': 0.0}
25041.53470537917


In [19]:
# random forest
param_rf = {"n_estimators":range(1, 10), "max_depth":range(1, 10)}
rf = RandomForestRegressor()
grid_rf = GridSearchCV(rf, param_grid=param_rf, cv=10)
grid_rf.fit(X_train, X_test)
pred_rf = grid_rf.predict(y_train)
print(grid_rf.best_params_)
print(rmse(y_test, pred_rf))

{'max_depth': 9, 'n_estimators': 9}
28123.724292001414


In [26]:
pred = grid_ls.predict(test) * 0.7 + grid_xgb3.predict(test) * 0.2 + grid_rf.predict(test) * 0.1

In [27]:
score = pd.DataFrame()
score["Id"] = test_id
score["SalePrice"] = pred
score.to_csv('submission.csv',index=False)