In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Lasso, Ridge, HuberRegressor, LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor
import catboost as cb
from mlxtend.regressor import StackingCVRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


In [2]:
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
print("Null columns in train dataset")
for col in df.columns:
    null = df[col].isnull().sum()
    if null != 0:
        print(df[col].dtypes,col, null)

Null columns in train dataset
float64 LotFrontage 259
object Alley 1369
object MasVnrType 8
float64 MasVnrArea 8
object BsmtQual 37
object BsmtCond 37
object BsmtExposure 38
object BsmtFinType1 37
object BsmtFinType2 38
object Electrical 1
object FireplaceQu 690
object GarageType 81
float64 GarageYrBlt 81
object GarageFinish 81
object GarageQual 81
object GarageCond 81
object PoolQC 1453
object Fence 1179
object MiscFeature 1406


In [4]:
print("Null columns in test dataset")
for col in test.columns:
    null = test[col].isnull().sum()
    if null != 0:
        print(test[col].dtypes, col, null)

Null columns in test dataset
object MSZoning 4
float64 LotFrontage 227
object Alley 1352
object Utilities 2
object Exterior1st 1
object Exterior2nd 1
object MasVnrType 16
float64 MasVnrArea 15
object BsmtQual 44
object BsmtCond 45
object BsmtExposure 44
object BsmtFinType1 42
float64 BsmtFinSF1 1
object BsmtFinType2 42
float64 BsmtFinSF2 1
float64 BsmtUnfSF 1
float64 TotalBsmtSF 1
float64 BsmtFullBath 2
float64 BsmtHalfBath 2
object KitchenQual 1
object Functional 2
object FireplaceQu 730
object GarageType 76
float64 GarageYrBlt 78
object GarageFinish 78
float64 GarageCars 1
float64 GarageArea 1
object GarageQual 78
object GarageCond 78
object PoolQC 1456
object Fence 1169
object MiscFeature 1408
object SaleType 1


In [5]:
y = df['SalePrice']
df = df.drop(['SalePrice'],axis=1)
df = df.set_index('Id')
test = test.set_index('Id')

In [6]:
a = pd.concat([df,test],axis=0)
df = a.reset_index(drop=True)

In [7]:
# Get list of categorical variables in holiday dataset
s = (df.dtypes == 'object')
object_cols = list(s[s].index)
print("\n Categorical variables:")
print(object_cols)
for col in df[object_cols]:
        print("\n"+col)
        print('Number of categories: '+str(len(df[col].unique())))
        print(df[col].unique())
            


 Categorical variables:
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']

MSZoning
Number of categories: 6
['RL' 'RM' 'C (all)' 'FV' 'RH' nan]

Street
Number of categories: 2
['Pave' 'Grvl']

Alley
Number of categories: 3
[nan 'Grvl' 'Pave']

LotShape
Number of categories: 4
['Reg' 'IR1' 'IR2' 'IR3']

LandContour
Number of categories: 4
['Lvl' 'Bnk' 'Low' 'HLS']

Utilities
Number of categories: 3
['AllPub' 'NoSeWa' nan]

LotConfig
Number of categories: 5
['Inside'

In [8]:
low_cardinality_cols=[]
med_cardinality_cols=[]
for col in df[object_cols]:
    if len(df[col].unique())<=10:
        low_cardinality_cols.append(col)
    elif len(df[col].unique())>10:
        med_cardinality_cols.append(col)

In [9]:
for col in df[df.columns]:
    if col not in object_cols:
        scaler = MinMaxScaler()
        df[col] = scaler.fit_transform(df[[col]])

In [10]:
# One Hot Encoding for Low Cardinality Columns
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols = pd.DataFrame(OH_encoder.fit_transform(df[low_cardinality_cols]))
OH_cols.index = df.index
OH_cols.columns = OH_encoder.get_feature_names(low_cardinality_cols)
df = df.drop(low_cardinality_cols, axis=1)
df = pd.concat([df, OH_cols], axis=1)

# Ordinal Encoding for Medium Cardinality Columns
label_encoder = OrdinalEncoder()
df[med_cardinality_cols] = label_encoder.fit_transform(df[med_cardinality_cols])

In [11]:
for col in df.columns:
    null = df[col].isnull().sum()
    if null != 0:
        df = df[(df[col].notna()) | (df.index >= 1460)]
        print(df[col].dtypes, col, null)

float64 LotFrontage 486
float64 Exterior1st 1
float64 Exterior2nd 1
float64 MasVnrArea 21
float64 BsmtFinSF1 1
float64 BsmtFinSF2 1
float64 BsmtUnfSF 1
float64 TotalBsmtSF 1
float64 BsmtFullBath 2
float64 BsmtHalfBath 2
float64 GarageYrBlt 152
float64 GarageCars 1
float64 GarageArea 1


In [12]:
df = df.fillna(0)

In [66]:
df_X = df[df.index < 1460]
test = df[df.index >= 1460]
full = pd.merge(left = df_X, right = y , left_index= True, right_index = True)
train, dev = train_test_split(full, test_size=0.01)

train_y = np.log(train['SalePrice'])
train_X = train.drop(['SalePrice'],axis=1)

dev_y = np.log(dev['SalePrice'])
dev_X = dev.drop(['SalePrice'],axis=1)

In [67]:
# rfmodel = RandomForestRegressor(n_jobs=-1)
# rfparams = {
#     'max_depth' : [26],
#     'n_estimators' : [1000],
#     'min_samples_leaf':[4],
#     'max_features': ['auto'],
#     'oob_score' : [True],
#     'min_samples_split' : [4],
#     'ccp_alpha' : [0.00005],
#     'max_samples' : [0.65]
# }
# rfgrid = GridSearchCV(estimator = rfmodel,verbose=4, param_grid = rfparams, cv = 10, scoring = 'neg_root_mean_squared_error')
# rfgrid.fit(train_X,train_y)
# print(rfgrid.best_params_)
# print(rfgrid.best_score_)
# print(rfgrid.score(dev_X,dev_y))
# print(rfgrid.score(train_X,train_y))

In [68]:
rfmodel = RandomForestRegressor(ccp_alpha= 0.00005, max_depth=26, max_features= 'auto', max_samples=0.6, min_samples_leaf= 4, min_samples_split= 4, n_estimators= 5000, oob_score= True)
rfmodel.fit(train_X,train_y)

RandomForestRegressor(ccp_alpha=5e-05, max_depth=26, max_samples=0.6,
                      min_samples_leaf=4, min_samples_split=4,
                      n_estimators=5000, oob_score=True)

In [69]:
# xgbmodel = XGBRegressor(n_jobs=-1)
# xgbparams = {
#     'max_depth' : [3],
#     'n_estimators' : [10000],
#     'learning_rate' : [0.01],
#     'colsample_bytree' : [0.6],
#     'subsample':[0.65],
#     'alpha' : [3],
#     'lambda' : [3],
# }
# xgbgrid = GridSearchCV(estimator = xgbmodel,verbose=3, param_grid = xgbparams, cv = 5, scoring = 'neg_root_mean_squared_error')
# xgbgrid.fit(train_X,train_y)
# print(xgbgrid.best_params_)
# print(xgbgrid.best_score_)
# print(xgbgrid.score(dev_X,dev_y))
# print(xgbgrid.score(train_X,train_y))

In [70]:
xgbmodel = XGBRegressor(alpha= 3, colsample_bytree=0.6, reg_lambda=3, learning_rate= 0.01, max_depth=3, n_estimators=10000, subsample=0.65)
xgbmodel.fit(train_X,train_y)

XGBRegressor(alpha=3, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=10000, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=3,
             reg_lambda=3, scale_pos_weight=1, subsample=0.65,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [71]:
# ridgemodel = Ridge()
# ridgeparams = {
#     'alpha' : [3]
# }
# ridgegrid = GridSearchCV(estimator = ridgemodel,verbose=3, param_grid = ridgeparams, cv = 10, scoring = 'neg_root_mean_squared_error')
# ridgegrid.fit(train_X,train_y)
# print(ridgegrid.best_params_)
# print(ridgegrid.best_score_)
# print(ridgegrid.score(dev_X,dev_y))
# print(ridgegrid.score(train_X,train_y))

In [72]:
ridgemodel = Ridge(alpha=3)
ridgemodel.fit(train_X,train_y)

Ridge(alpha=3)

In [73]:
# svrmodel = SVR()
# svrparams = {
#     'C' : [35],
#     'epsilon' : [0.008],
#     'gamma' : [0.0008]
# }
# svrgrid = GridSearchCV(estimator = svrmodel,verbose=3, param_grid = svrparams, cv = 10, scoring = 'neg_root_mean_squared_error')
# svrgrid.fit(train_X,train_y)
# print(svrgrid.best_params_)
# print(svrgrid.best_score_)
# print(svrgrid.score(dev_X,dev_y))
# print(svrgrid.score(train_X,train_y))

In [74]:
svrmodel = SVR(C=35,epsilon=0.008,gamma=0.0008)
svrmodel.fit(train_X,train_y)

SVR(C=35, epsilon=0.008, gamma=0.0008)

In [75]:
# hubermodel = HuberRegressor()
# huberparams = {
#     'epsilon' : [5],
#     'alpha' : [20],
#     'max_iter' : [600],
#     'fit_intercept' : [True]
# }
# hubergrid = GridSearchCV(estimator = hubermodel,verbose=3, param_grid = huberparams, cv = 10, scoring = 'neg_root_mean_squared_error')
# hubergrid.fit(train_X,train_y)
# print(hubergrid.best_params_)
# print(hubergrid.best_score_)
# print(hubergrid.score(dev_X,dev_y))
# print(hubergrid.score(train_X,train_y))

In [76]:
hubermodel = HuberRegressor(alpha=20,epsilon=5,fit_intercept=True,max_iter=600)
hubermodel.fit(train_X,train_y)

HuberRegressor(alpha=20, epsilon=5, max_iter=600)

In [77]:
# lightgbm = LGBMRegressor(objective='regression')
# lightgbmparams = {
#     'learning_rate' :[0.0005], 
#     'n_estimators' : [50000],
#     'reg_alpha' : [2],
#     'reg_lambda' : [2],
#     'max_depth' : [3],
#     'min_split_gain' : [0.005],
#     'subsample' : [0.7],
#     'colsample_bytree' : [0.7],
#     'extra_trees' : [True]
# }
# lightgbmgrid = GridSearchCV(estimator = lightgbm,verbose=3, param_grid = lightgbmparams, cv = 10, scoring = 'neg_root_mean_squared_error')
# lightgbmgrid.fit(train_X,train_y)
# print(lightgbmgrid.best_params_)
# print(lightgbmgrid.best_score_)
# print(lightgbmgrid.score(dev_X,dev_y))
# print(lightgbmgrid.score(train_X,train_y))

In [78]:
lightgbm = LGBMRegressor(objective='regression',learning_rate=0.0005, 
    n_estimators=50000,
    reg_alpha=2,
    reg_lambda=2,
    max_depth=3,
    min_split_gain=0.005,
    subsample=0.7,
    colsample_bytree=0.7,
    extra_trees=True)
lightgbm.fit(train_X, train_y)

LGBMRegressor(colsample_bytree=0.7, extra_trees=True, learning_rate=0.0005,
              max_depth=3, min_split_gain=0.005, n_estimators=50000,
              objective='regression', reg_alpha=2, reg_lambda=2, subsample=0.7)

In [79]:
# cbmodel = cb.CatBoostRegressor(objective='regression',
#                                loss_function='RMSE')
# cbparams = {
# 'colsample_bylevel':[0.5], 
# 'depth':[2], 
# 'l2_leaf_reg':[50], 
# 'learning_rate':[0.005], 
# 'n_estimators':[15000], 
# 'subsample':[0.5]
# }

# cbgrid = GridSearchCV(estimator = cbmodel,verbose=3, param_grid = cbparams, cv = 10, scoring = 'neg_root_mean_squared_error')
# cbgrid.fit(train_X,train_y)
# print(cbgrid.best_params_)
# print(cbgrid.best_score_)
# print(cbgrid.score(dev_X,dev_y))
# print(cbgrid.score(train_X,train_y))

In [80]:
cbmodel = cb.CatBoostRegressor(loss_function='RMSE',colsample_bylevel=0.5, depth=2, l2_leaf_reg=50, learning_rate=0.005, n_estimators=15000, subsample=0.5,verbose=False)
cbmodel.fit(train_X, train_y)

<catboost.core.CatBoostRegressor at 0x7ff6c24bc520>

In [81]:
stackmodel = StackingCVRegressor(regressors=(ridgemodel, xgbmodel, rfmodel, hubermodel, cbmodel),
                                meta_regressor=cbmodel,
                                use_features_in_secondary=True)
stackmodel.fit(np.array(train_X), np.array(train_y))
stackpred = stackmodel.predict(np.array(dev_X))
stackfit = stackmodel.predict(np.array(train_X))

print(mean_squared_error(dev_y,stackpred, squared=False))
print(mean_squared_error(train_y,stackfit, squared=False))

0.07420638527306428
0.08137649853418003


In [82]:
for i in [rfmodel,xgbmodel,stackmodel,ridgemodel,lightgbm,hubermodel,cbmodel]:
    print(i)
    print(mean_squared_error(dev_y,i.predict(dev_X), squared=False))
    print(mean_squared_error(train_y,i.predict(train_X), squared=False))
    print("\n")

RandomForestRegressor(ccp_alpha=5e-05, max_depth=26, max_samples=0.6,
                      min_samples_leaf=4, min_samples_split=4,
                      n_estimators=5000, oob_score=True)
0.08389818448909904
0.10339154809503268


XGBRegressor(alpha=3, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=10000, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=3,
             reg_lambda=3, scale_pos_weight=1, subsample=0.65,
             tree_method='exact', validate_parameters=1, verbosity=None)
0.08146796483873352
0.08839096795045753


StackingCVRegressor(meta_regressor=<catboost.core.CatBoostRegressor object at 0x7ff6c24b

In [83]:
pred = (rfmodel.predict(dev_X) + xgbmodel.predict(dev_X) +  stackmodel.predict(dev_X) + ridgemodel.predict(dev_X) + lightgbm.predict(dev_X) + hubermodel.predict(dev_X) + cbmodel.predict(dev_X)) / 7
print(mean_squared_error(dev_y,pred, squared=False))

0.07346686837889324


In [84]:
fit = (rfmodel.predict(train_X) + xgbmodel.predict(train_X) + lightgbm.predict(train_X) +   stackmodel.predict(train_X) + ridgemodel.predict(train_X) + hubermodel.predict(train_X) + cbmodel.predict(train_X)) / 7
print(mean_squared_error(train_y,fit, squared=False))

0.09435213353346968


In [85]:
final_prediction = (np.exp(ridgemodel.predict(test))+ 2 * np.exp(xgbmodel.predict(test)) \
 + np.exp(rfmodel.predict(test)) + 2 * np.exp(stackmodel.predict(test)) \
 + np.exp(hubermodel.predict(test)) + 2 * np.exp(lightgbm.predict(test)) + 2 * np.exp(cbmodel.predict(test))) / 11 

In [86]:
submission = pd.DataFrame(final_prediction, index = test.index)

In [87]:
submission.index = submission.index + 1
submission.reset_index(drop=False, inplace = True)
submission = submission.rename(columns={0 : 'SalePrice', 'index' : 'Id'})
submission.to_csv('submission_v6.csv', index=False)