https://www.kaggle.com/onestar/kernel-xgboost-stacking
https://www.kaggle.com/satishgunjal/ensemble-learning-bagging-boosting-stacking

In [1]:
import os
import pandas as pd
import numpy as np
import warnings

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.kernel_ridge import KernelRidge

from sklearn.ensemble import BaggingRegressor

from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import StackingRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error

# Global settings

warnings.filterwarnings("ignore") # Suppress all warnings for the rest of the notebook
n_jobs = -1 # This parameter controls the parallel processing. -1 means using all processors.
random_state = 42 # Seed passed to the split and to the estimators. A fixed int gives the same results every time this code is run.

# Load Data

In [2]:
# The data file is addressed relative to the notebook's working directory so
# the notebook is not tied to one machine's directory layout.
DATA_FILE = 'HouseVarCoFinal.csv'

df = pd.read_csv(DATA_FILE)
print(f'Shape of data= {df.shape}')

Shape of data= (121820, 49)


In [4]:
# List every column name of the raw frame.
df.columns

Index(['Address', 'Area', 'St', '交易年月日', 'year', '交易標的', '交易筆棟數', '建物型態',
       '建物現況格局.廳', '建物現況格局.房', '建物現況格局.衛', '建物現況格局.隔間', '有無管理組織', '總價元', '總坪數',
       '單價元坪', '車位數', 'floor', 'EightCount', 'ParkCount', 'FuneralCount',
       'GasCount', 'CrimeCount', 'PoliceCount', 'busCount', 'subwayCount',
       'govCount', 'clinicCount', 'hospitalCount', 'pharmacyCount',
       'fireareaCount', 'firewayCount', 'martCount', 'mallCount',
       'cinemaCount', '土地面積', '總人口數', '男性人數', '女性人數', '人口密度', '每戶人數', '每戶成年人數',
       '所得收入總計', '可支配所得', '消費支出', '儲蓄', '所得總額', 'Lontitude', 'Latitude'],
      dtype='object')

In [5]:
# Summarise dtype and non-null count for each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121820 entries, 0 to 121819
Data columns (total 49 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Address        121820 non-null  object 
 1   Area           121820 non-null  object 
 2   St             121820 non-null  object 
 3   交易年月日          121820 non-null  int64  
 4   year           121820 non-null  int64  
 5   交易標的           121820 non-null  object 
 6   交易筆棟數          121820 non-null  object 
 7   建物型態           121820 non-null  object 
 8   建物現況格局.廳       121820 non-null  int64  
 9   建物現況格局.房       121820 non-null  int64  
 10  建物現況格局.衛       121820 non-null  int64  
 11  建物現況格局.隔間      121820 non-null  object 
 12  有無管理組織         121820 non-null  object 
 13  總價元            121820 non-null  float64
 14  總坪數            121820 non-null  float64
 15  單價元坪           121820 non-null  float64
 16  車位數            121820 non-null  int64  
 17  floor          121820 non-nul

In [6]:
# NOTE(review): this cell repeats an earlier `df.columns` cell verbatim;
# consider removing one of the two.
df.columns

Index(['Address', 'Area', 'St', '交易年月日', 'year', '交易標的', '交易筆棟數', '建物型態',
       '建物現況格局.廳', '建物現況格局.房', '建物現況格局.衛', '建物現況格局.隔間', '有無管理組織', '總價元', '總坪數',
       '單價元坪', '車位數', 'floor', 'EightCount', 'ParkCount', 'FuneralCount',
       'GasCount', 'CrimeCount', 'PoliceCount', 'busCount', 'subwayCount',
       'govCount', 'clinicCount', 'hospitalCount', 'pharmacyCount',
       'fireareaCount', 'firewayCount', 'martCount', 'mallCount',
       'cinemaCount', '土地面積', '總人口數', '男性人數', '女性人數', '人口密度', '每戶人數', '每戶成年人數',
       '所得收入總計', '可支配所得', '消費支出', '儲蓄', '所得總額', 'Lontitude', 'Latitude'],
      dtype='object')

In [7]:
# NOTE(review): this cell repeats an earlier `df.info()` cell verbatim;
# consider removing one of the two.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121820 entries, 0 to 121819
Data columns (total 49 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Address        121820 non-null  object 
 1   Area           121820 non-null  object 
 2   St             121820 non-null  object 
 3   交易年月日          121820 non-null  int64  
 4   year           121820 non-null  int64  
 5   交易標的           121820 non-null  object 
 6   交易筆棟數          121820 non-null  object 
 7   建物型態           121820 non-null  object 
 8   建物現況格局.廳       121820 non-null  int64  
 9   建物現況格局.房       121820 non-null  int64  
 10  建物現況格局.衛       121820 non-null  int64  
 11  建物現況格局.隔間      121820 non-null  object 
 12  有無管理組織         121820 non-null  object 
 13  總價元            121820 non-null  float64
 14  總坪數            121820 non-null  float64
 15  單價元坪           121820 non-null  float64
 16  車位數            121820 non-null  int64  
 17  floor          121820 non-nul

In [8]:
pd.set_option('display.max_columns', 100)  # Show up to 100 columns when displaying wide frames

# Columns removed from both derived frames. Kept in one list so the two frames
# below are guaranteed to drop exactly the same set.
DROPPED_COLUMNS = [
    "EightCount", "FuneralCount", "PoliceCount",
    "總坪數", "總價元", "建物現況格局.隔間", "有無管理組織",
    "交易筆棟數", "交易標的", "交易年月日",
    "subwayCount", "busCount", "CrimeCount",
    "clinicCount", "pharmacyCount", "fireareaCount",
    "mallCount", "cinemaCount", "總人口數",
    "男性人數", "女性人數", "土地面積",
    "每戶成年人數",
    "所得收入總計", "可支配所得", "消費支出",
    "人口密度", "儲蓄", "Lontitude", "Latitude",
]

# Columns that are additionally removed from the modelling frame.
EXTRA_DROPPED_FOR_MODEL = ["Area", "Address", "St", "year", "建物型態"]

# dfDrop keeps the location / year / building-type columns.
dfDrop = df.drop(columns=DROPPED_COLUMNS)

# dfDrop2 is dfDrop without those descriptive columns; it is the frame the
# target and the features are taken from further down.
dfDrop2 = dfDrop.drop(columns=EXTRA_DROPPED_FOR_MODEL)
dfDrop2

Unnamed: 0,建物現況格局.廳,建物現況格局.房,建物現況格局.衛,單價元坪,車位數,floor,ParkCount,GasCount,govCount,hospitalCount,firewayCount,martCount,每戶人數,所得總額
0,2,5,3,2.465803e+05,0,7,8,7,33,4,18,11,2.89,1720988
1,2,2,2,7.136584e+05,0,7,8,7,33,4,18,11,2.89,1720988
2,0,0,0,2.620434e+05,0,7,8,7,33,4,18,11,2.89,1720988
3,1,1,1,7.439029e+05,0,7,8,7,33,4,18,11,2.89,1720988
4,0,0,0,1.343113e+06,0,5,8,7,33,4,18,11,2.89,1720988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121815,1,1,1,3.899083e+05,0,5,2,4,27,4,35,8,3.07,1572599
121816,1,2,1,2.661667e+05,0,5,2,4,27,4,35,8,3.07,1572599
121817,1,3,2,5.367694e+05,0,5,2,4,27,4,35,8,3.07,1572599
121818,1,2,1,2.437000e+05,0,12,2,4,27,4,35,8,3.07,1572599


In [9]:
from dfply import *

In [10]:
# Target column 單價元坪, kept as a single-column DataFrame so downstream
# shapes stay (n_samples, 1). Plain pandas selection is used instead of the
# dfply `>> select(X...)` DSL, so this cell no longer relies on names pulled in
# by a wildcard import.
yDf = dfDrop2[['單價元坪']]

# Features: every remaining column of the modelling frame.
xDf = dfDrop2.drop(columns=['單價元坪'])
xDf.head()

Unnamed: 0,建物現況格局.廳,建物現況格局.房,建物現況格局.衛,車位數,floor,ParkCount,GasCount,govCount,hospitalCount,firewayCount,martCount,每戶人數,所得總額
0,2,5,3,0,7,8,7,33,4,18,11,2.89,1720988
1,2,2,2,0,7,8,7,33,4,18,11,2.89,1720988
2,0,0,0,0,7,8,7,33,4,18,11,2.89,1720988
3,1,1,1,0,7,8,7,33,4,18,11,2.89,1720988
4,0,0,0,0,5,8,7,33,4,18,11,2.89,1720988


# Train and Test Data

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(xDf, yDf, test_size=0.2, random_state=random_state)
X_train, X_test, y_train, y_test = split_parts

print(f'Training set: X_train shape= {X_train.shape}, y_train shape= {y_train.shape}')
print(f'Holdout set: X_test shape= {X_test.shape}, y_test shape= {y_test.shape}')

Training set: X_train shape= (97456, 13), y_train shape= (97456, 1)
Holdout set: X_test shape= (24364, 13), y_test shape= (24364, 1)


# Regression Model

In [12]:
models_scores = []  # Accumulates [model name, hold-out RMSE] pairs across the notebook


def rmse(model):
    """Fit ``model`` on the training split and score it on the hold-out split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is fitted in place on the notebook-level ``X_train`` / ``y_train``.

    Returns
    -------
    float
        Root mean squared error of the predictions for ``X_test`` against
        ``y_test``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Take the square root explicitly instead of passing `squared=False`:
    # that keyword is deprecated and removed in newer scikit-learn releases,
    # while np.sqrt(MSE) works with every version.
    return np.sqrt(mean_squared_error(y_test, y_pred))

## Linear Regression

In [13]:
# Baseline: ordinary least squares on the raw features, wrapped in a
# single-step pipeline.
linear_regression = make_pipeline(LinearRegression())

score = rmse(linear_regression)
print(f'LinearRegression Score= {score}')
models_scores.append(['LinearRegression', score])

LinearRegression Score= 1350950.3907151243


## Lasso Regression

In [14]:
# L1-regularised linear model; features go through RobustScaler first.
# NOTE(review): the recorded score is practically identical to the plain
# LinearRegression score, which suggests alpha=0.0005 is negligible at this
# target scale - consider tuning it.
lasso_steps = (RobustScaler(), Lasso(alpha=0.0005, random_state=random_state))
lasso = make_pipeline(*lasso_steps)

score = rmse(lasso)
print(f'Lasso Score= {score}')
models_scores.append(['Lasso', score])

Lasso Score= 1350950.39070096


## ElasticNet Regression

In [15]:
# Mixed L1/L2 penalty (90% L1) on robust-scaled features.
elastic_net_steps = (
    RobustScaler(),
    ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=random_state),
)
elastic_net = make_pipeline(*elastic_net_steps)

score = rmse(elastic_net)
print(f'ElasticNet Score= {score}')
models_scores.append(['ElasticNet', score])

ElasticNet Score= 1350950.3013892474


## KernelRidge Regression

In [16]:
# kernel_ridge= KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
# score = rmse(kernel_ridge)
# models_scores.append(['KernelRidge', score])
# print(f'KernelRidge Score= {score}')

## Rank scores

In [17]:
# Leaderboard so far: column 1 holds the RMSE, lowest (best) first.
pd.DataFrame(models_scores).sort_values(by=1)

Unnamed: 0,0,1
2,ElasticNet,1350950.0
1,Lasso,1350950.0
0,LinearRegression,1350950.0


# Ensemble Modeling 

In [18]:
def bagging_predictions(estimator):
    """Bag ``estimator`` and return the ensemble's predictions on the hold-out set.

    A ``BaggingRegressor`` of 10 bootstrap replicas of ``estimator`` is fitted
    on the notebook-level ``X_train`` / ``y_train``. Its RMSE against
    ``y_test`` is printed as a side effect.

    Parameters
    ----------
    estimator : the base estimator from which the ensemble is grown.

    Returns
    -------
    br_y_pred : predictions of the bagged ensemble for ``X_test``.
    """
    # The base estimator is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in newer scikit-learn releases, and the
    # positional form works with both spellings.
    regr = BaggingRegressor(estimator,
                            n_estimators=10,
                            max_samples=1.0,
                            bootstrap=True,  # Samples are drawn with replacement
                            n_jobs=n_jobs,
                            random_state=random_state).fit(X_train, y_train)

    br_y_pred = regr.predict(X_test)

    # Explicit square root instead of the deprecated `squared=False` keyword.
    rmse_val = np.sqrt(mean_squared_error(y_test, br_y_pred))

    # The fitted attribute was renamed as well; read whichever one exists.
    fitted_base = regr.estimator_ if hasattr(regr, "estimator_") else regr.base_estimator_
    print(f'RMSE for base estimator {fitted_base} = {rmse_val}\n')
    return br_y_pred


# One column of hold-out predictions per bagged base model.
predictions = np.column_stack((bagging_predictions(linear_regression),
                               bagging_predictions(lasso),
                               bagging_predictions(elastic_net)))
print(f"Bagged predictions shape: {predictions.shape}")

# Final bagging prediction: plain average over the bagged models.
y_pred = np.mean(predictions, axis=1)
print("Aggregated predictions (y_pred) shape", y_pred.shape)

# Explicit square root instead of the `squared=False` keyword, which is
# deprecated and removed in newer scikit-learn releases.
rmse_val = np.sqrt(mean_squared_error(y_test, y_pred))
models_scores.append(['Bagging', rmse_val])

print(f'\nBagging RMSE= {rmse_val}')

RMSE for base estimator Pipeline(steps=[('linearregression', LinearRegression())]) = 1350944.6626265005

RMSE for base estimator Pipeline(steps=[('robustscaler', RobustScaler()),
                ('lasso', Lasso(alpha=0.0005, random_state=42))]) = 1350944.6626075224

RMSE for base estimator Pipeline(steps=[('robustscaler', RobustScaler()),
                ('elasticnet',
                 ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=42))]) = 1350944.5759043652

Bagged predictions shape: (24364, 3)
Aggregated predictions (y_pred) shape (24364,)

Bagging RMSE= 1350944.63368362


## Rank scores

In [19]:
# Leaderboard including the bagging result: lowest RMSE (column 1) first.
pd.DataFrame(models_scores).sort_values(by=1)

Unnamed: 0,0,1
3,Bagging,1350945.0
2,ElasticNet,1350950.0
1,Lasso,1350950.0
0,LinearRegression,1350950.0


# Boosting 

## GradientBoostingRegressor

In [21]:
%%time
# Hyper-parameters collected in one place so they are easy to scan and tweak.
gbr_params = dict(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=random_state,
)
gradient_boosting_regressor = GradientBoostingRegressor(**gbr_params)

score = rmse(gradient_boosting_regressor)
print(f'GradientBoostingRegressor Score= {score}')
models_scores.append(['GradientBoostingRegressor', score])

GradientBoostingRegressor Score= 1341671.4603304362
CPU times: user 2min 4s, sys: 181 ms, total: 2min 5s
Wall time: 2min 5s


## XGBRegressor 

In [22]:
%%time
# Hyper-parameters collected in one place so they are easy to scan and tweak.
# NOTE(review): `nthread` looks like the older spelling of `n_jobs` in the
# xgboost scikit-learn wrapper - confirm against the installed version.
xgb_params = dict(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
    verbosity=0,
    nthread=-1,
    random_state=random_state,
)
xgb_regressor = xgb.XGBRegressor(**xgb_params)

score = rmse(xgb_regressor)
print(f'XGBRegressor Score= {score}')
models_scores.append(['XGBRegressor', score])

XGBRegressor Score= 1340396.208137092
CPU times: user 51.8 s, sys: 1.34 s, total: 53.2 s
Wall time: 27.1 s


## LGBMRegressor 

In [None]:
# lgbm_regressor= lgb.LGBMRegressor(objective='regression',num_leaves=5,
#                               learning_rate=0.05, n_estimators=720,
#                               max_bin = 55, bagging_fraction = 0.8,
#                               bagging_freq = 5, feature_fraction = 0.2319,
#                               feature_fraction_seed=9, bagging_seed=9,
#                               min_data_in_leaf =6, min_sum_hessian_in_leaf = 11,random_state = random_state)

# score = rmse(lgbm_regressor)
# models_scores.append(['LGBMRegressor', score])
# print(f'LGBMRegressor Score= {score}')

## Stacking

In [23]:
%%time
# Level-0 learners whose out-of-fold predictions feed the meta-model.
estimators = [('linear_regression', linear_regression),
              ('gradient_boosting_regressor', gradient_boosting_regressor),
              ('xgb_regressor', xgb_regressor)]

# The lasso pipeline is the meta-model. passthrough=True hands it the original
# features in addition to the level-0 predictions; cv=5 sets the folds used to
# produce those predictions.
stack = StackingRegressor(estimators=estimators, final_estimator=lasso, cv=5, n_jobs=n_jobs, passthrough=True)

stack.fit(X_train, y_train)

pred = stack.predict(X_test)

# Explicit square root instead of the `squared=False` keyword, which is
# deprecated and removed in newer scikit-learn releases.
rmse_val = np.sqrt(mean_squared_error(y_test, pred))
models_scores.append(['Stacking', rmse_val])
print(f'rmse= {rmse_val}')

rmse= 1339957.6616030487
CPU times: user 3.37 s, sys: 454 ms, total: 3.83 s
Wall time: 9min 12s


## Rank scores

In [24]:
# Final leaderboard over every model evaluated above: lowest RMSE (column 1) first.
pd.DataFrame(models_scores).sort_values(by=1)

Unnamed: 0,0,1
6,Stacking,1339958.0
5,XGBRegressor,1340396.0
4,GradientBoostingRegressor,1341671.0
3,Bagging,1350945.0
2,ElasticNet,1350950.0
1,Lasso,1350950.0
0,LinearRegression,1350950.0


# Predict

In [26]:
%%time
# Predict for every row of the feature frame. xDf also contains the rows that
# `stack` was fitted on (X_train was split off from it), so most of these
# predictions are in-sample.
yPred = stack.predict(xDf)

CPU times: user 4.73 s, sys: 55.9 ms, total: 4.78 s
Wall time: 3.82 s


In [27]:
%%time
# Wrap the predictions in a one-column frame. The index is taken from xDf, the
# frame the predictions were computed for, so a later index-aligned concat does
# not depend on both frames happening to share a default RangeIndex.
StackPred = pd.DataFrame(yPred, columns=['pred'], index=xDf.index)
StackPred

CPU times: user 350 µs, sys: 5 µs, total: 355 µs
Wall time: 360 µs


Unnamed: 0,pred
0,608990.363867
1,611683.203384
2,673593.911057
3,607951.446442
4,699300.484182
...,...
121815,390455.396095
121816,361367.770272
121817,291676.034978
121818,460680.348054


In [31]:
# Put the predictions next to the original records, column-wise.
result = pd.concat((df, StackPred), axis='columns')
result.head(2)

Unnamed: 0,Address,Area,St,交易年月日,year,交易標的,交易筆棟數,建物型態,建物現況格局.廳,建物現況格局.房,建物現況格局.衛,建物現況格局.隔間,有無管理組織,總價元,總坪數,單價元坪,車位數,floor,EightCount,ParkCount,FuneralCount,GasCount,CrimeCount,PoliceCount,busCount,subwayCount,govCount,clinicCount,hospitalCount,pharmacyCount,fireareaCount,firewayCount,martCount,mallCount,cinemaCount,土地面積,總人口數,男性人數,女性人數,人口密度,每戶人數,每戶成年人數,所得收入總計,可支配所得,消費支出,儲蓄,所得總額,Lontitude,Latitude,pred
0,臺北市中山區一江街1~30號,中山區,一江街,1041208,2015,房地(土地+建物),土地1建物1車位0,華廈,2,5,3,有,無,16800000.0,68.132075,246580.260178,0,7,77,8,132,7,904,13,280,8,33,486,4,98,80,18,11,6,5,13.6821,224707,104138,120569,16423,2.89,2.51,1614178,1339702,1071429,268273,1720988,121.531568,25.050704,608990.363867
1,臺北市中山區一江街1~30號,中山區,一江街,1080827,2019,房地(土地+建物),土地2建物1車位0,華廈,2,2,2,有,無,23000000.0,32.22835,713658.438145,0,7,77,8,132,7,904,13,280,8,33,486,4,98,80,18,11,6,5,13.6821,224707,104138,120569,16423,2.89,2.51,1614178,1339702,1071429,268273,1720988,121.531568,25.050704,611683.203384


In [32]:
# Actual value minus model prediction, rounded to two decimals.
result["漲跌"] = (result["單價元坪"] - result["pred"]).round(2)

In [33]:
# Peek at the first two rows to confirm the 漲跌 column was added.
result.head(2)

Unnamed: 0,Address,Area,St,交易年月日,year,交易標的,交易筆棟數,建物型態,建物現況格局.廳,建物現況格局.房,建物現況格局.衛,建物現況格局.隔間,有無管理組織,總價元,總坪數,單價元坪,車位數,floor,EightCount,ParkCount,FuneralCount,GasCount,CrimeCount,PoliceCount,busCount,subwayCount,govCount,clinicCount,hospitalCount,pharmacyCount,fireareaCount,firewayCount,martCount,mallCount,cinemaCount,土地面積,總人口數,男性人數,女性人數,人口密度,每戶人數,每戶成年人數,所得收入總計,可支配所得,消費支出,儲蓄,所得總額,Lontitude,Latitude,pred,漲跌
0,臺北市中山區一江街1~30號,中山區,一江街,1041208,2015,房地(土地+建物),土地1建物1車位0,華廈,2,5,3,有,無,16800000.0,68.132075,246580.260178,0,7,77,8,132,7,904,13,280,8,33,486,4,98,80,18,11,6,5,13.6821,224707,104138,120569,16423,2.89,2.51,1614178,1339702,1071429,268273,1720988,121.531568,25.050704,608990.363867,-362410.1
1,臺北市中山區一江街1~30號,中山區,一江街,1080827,2019,房地(土地+建物),土地2建物1車位0,華廈,2,2,2,有,無,23000000.0,32.22835,713658.438145,0,7,77,8,132,7,904,13,280,8,33,486,4,98,80,18,11,6,5,13.6821,224707,104138,120569,16423,2.89,2.51,1614178,1339702,1071429,268273,1720988,121.531568,25.050704,611683.203384,101975.23


In [34]:
# Per-Area averages of the prediction and of the actual-minus-predicted gap,
# largest average gap first. The two numeric columns are selected BEFORE
# aggregating: calling .mean() on the whole frame would also try to average
# the text columns, which newer pandas versions reject with an error.
updown = (
    result.groupby('Area')[['pred', '漲跌']]
          .mean()
          .reset_index()
          .sort_values(by='漲跌', ascending=False)
)

In [35]:
# Show the per-Area summary, sorted by average gap in descending order.
updown

Unnamed: 0,Area,pred,漲跌
9,文山區,449704.429186,20895.984896
10,松山區,693564.938705,7296.074466
6,士林區,569977.2298,4566.556926
11,萬華區,474432.175375,827.162935
5,南港區,529471.858953,658.301754
4,北投區,465882.553933,284.841135
1,中正區,745294.663001,-296.308046
0,中山區,657107.296231,-463.500519
2,信義區,710266.573859,-2604.330203
3,內湖區,525903.193766,-2831.328514
