In [1]:
import os
import pandas as pd
import numpy as np
import warnings

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.kernel_ridge import KernelRidge

from sklearn.ensemble import BaggingRegressor

from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import StackingRegressor

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error

# Global settings

warnings.filterwarnings("ignore") # To ignore warnings
n_jobs = -1 # This parameter conrols the parallel processing. -1 means using all processors.
random_state = 42 # This parameter controls the randomness of the data. Using some int value to get same results everytime this code is run.

# Load Data

In [2]:
df = pd.read_csv('HouseVar.csv')
print(f'Shape of data= {df.shape}')

Shape of data= (133919, 48)


In [3]:
df.columns

Index(['Unnamed: 0', 'Area', 'Address', 'St', '交易年月日', 'year', '交易標的', '交易筆棟數',
       '建物型態', '建物現況格局.廳', '建物現況格局.房', '建物現況格局.衛', '建物現況格局.隔間', '有無管理組織',
       '總價元', '總坪數', '單價元坪', '車位數', 'floor', 'EightCount', 'ParkCount',
       'FuneralCount', 'GasCount', 'CrimeCount', 'PoliceCount', 'busCount',
       'subwayCount', 'govCount', 'clinicCount', 'hospitalCount',
       'pharmacyCount', 'fireareaCount', 'firewayCount', 'martCount',
       'mallCount', 'cinemaCount', '土地面積', '總人口數', '男性人數', '女性人數', '人口密度',
       '每戶人數', '每戶成年人數', '所得收入總計', '可支配所得', '消費支出', '儲蓄', '所得總額'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133919 entries, 0 to 133918
Data columns (total 48 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Unnamed: 0     133919 non-null  int64  
 1   Area           133919 non-null  object 
 2   Address        133919 non-null  object 
 3   St             133919 non-null  object 
 4   交易年月日          133919 non-null  int64  
 5   year           133919 non-null  int64  
 6   交易標的           133919 non-null  object 
 7   交易筆棟數          133919 non-null  object 
 8   建物型態           133919 non-null  object 
 9   建物現況格局.廳       133919 non-null  int64  
 10  建物現況格局.房       133919 non-null  int64  
 11  建物現況格局.衛       133919 non-null  int64  
 12  建物現況格局.隔間      133919 non-null  object 
 13  有無管理組織         133919 non-null  object 
 14  總價元            133919 non-null  float64
 15  總坪數            133919 non-null  float64
 16  單價元坪           133919 non-null  float64
 17  車位數            133919 non-nul

In [5]:
pd.set_option('display.max_columns', 100) 
dfDrop = df.drop(["Unnamed: 0", "EightCount", "FuneralCount", "PoliceCount",
                  "總坪數", "總價元","建物現況格局.隔間", "有無管理組織",
                  "交易筆棟數", "交易標的","交易年月日",
                  "subwayCount", "busCount", "CrimeCount",
                  "clinicCount", "pharmacyCount", "fireareaCount",
                  "mallCount", "cinemaCount", "總人口數",
                  "男性人數", "女性人數", "土地面積",
                  "每戶成年人數",
                  "所得收入總計", "可支配所得", "消費支出",
                  "人口密度", "儲蓄"], axis=1)

dfDrop2 = df.drop(["Unnamed: 0", "Area","Address","St","year","建物型態",
                   "EightCount", "FuneralCount", "PoliceCount",
                  "總坪數", "總價元","建物現況格局.隔間", "有無管理組織",
                  "交易筆棟數", "交易標的","交易年月日",
                  "subwayCount", "busCount", "CrimeCount",
                  "clinicCount", "pharmacyCount", "fireareaCount",
                  "mallCount", "cinemaCount", "總人口數",
                  "男性人數", "女性人數", "土地面積",
                  "每戶成年人數",
                  "所得收入總計", "可支配所得", "消費支出",
                  "人口密度", "儲蓄"], axis=1)

Unnamed: 0,建物現況格局.廳,建物現況格局.房,建物現況格局.衛,單價元坪,車位數,floor,ParkCount,GasCount,govCount,hospitalCount,firewayCount,martCount,每戶人數,所得總額
0,0,2,0,732295.253165,1,12,8,7,33,4,18,11,2.89,1720988
1,1,1,1,637604.152485,0,14,8,7,33,4,18,11,2.89,1720988
2,1,1,1,552066.930000,0,12,8,7,33,4,18,11,2.89,1720988
3,1,1,1,632612.670157,1,7,8,7,33,4,18,11,2.89,1720988
4,1,2,1,664642.533672,1,14,8,7,33,4,18,11,2.89,1720988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133914,2,2,1,476655.612245,2,18,2,4,27,4,35,8,3.07,1572599
133915,2,3,2,439986.147276,0,12,2,4,27,4,35,8,3.07,1572599
133916,1,2,1,494355.601392,0,12,2,4,27,4,35,8,3.07,1572599
133917,2,2,2,560674.599178,1,13,2,4,27,4,35,8,3.07,1572599


In [6]:
from dfply import *

In [7]:
yDf = dfDrop2 >> select(X.單價元坪)
xDf = dfDrop2.drop(['單價元坪'], axis = 1)
xDf.head()

Unnamed: 0,建物現況格局.廳,建物現況格局.房,建物現況格局.衛,車位數,floor,ParkCount,GasCount,govCount,hospitalCount,firewayCount,martCount,每戶人數,所得總額
0,0,2,0,1,12,8,7,33,4,18,11,2.89,1720988
1,1,1,1,0,14,8,7,33,4,18,11,2.89,1720988
2,1,1,1,0,12,8,7,33,4,18,11,2.89,1720988
3,1,1,1,1,7,8,7,33,4,18,11,2.89,1720988
4,1,2,1,1,14,8,7,33,4,18,11,2.89,1720988


# Train and Test Data

In [8]:
X_train, X_test, y_train, y_test = train_test_split(xDf, yDf, test_size= 0.2, random_state = random_state)

print(f'Training set: X_train shape= {X_train.shape}, y_train shape= {y_train.shape}')
print(f'Holdout set: X_test shape= {X_test.shape}, y_test shape= {y_test.shape}')

Training set: X_train shape= (107135, 13), y_train shape= (107135, 1)
Holdout set: X_test shape= (26784, 13), y_test shape= (26784, 1)


# Regression Model

In [9]:
models_scores = [] # To store model scores

def rmse(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    return mean_squared_error(y_test, y_pred, squared= False) # squared= False > returns Root Mean Square Error    

## Linear Regression

In [10]:
linear_regression = make_pipeline(LinearRegression())
score = rmse(linear_regression)

models_scores.append(['LinearRegression', score])
print(f'LinearRegression Score= {score}')

LinearRegression Score= 281363.07323917025


## Lasso Regression

In [11]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state= random_state))

score = rmse(lasso)
models_scores.append(['Lasso', score])
print(f'Lasso Score= {score}')

Lasso Score= 281363.0732988141


## ElasticNet Regression

In [12]:
elastic_net = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio= .9, random_state= random_state))

score = rmse(elastic_net)
models_scores.append(['ElasticNet', score])
print(f'ElasticNet Score= {score}')

ElasticNet Score= 281363.37428007607


## KernelRidge Regression

In [13]:
kernel_ridge= KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
score = rmse(kernel_ridge)
models_scores.append(['KernelRidge', score])
print(f'KernelRidge Score= {score}')

## Rank scores

In [14]:
pd.DataFrame(models_scores).sort_values(by=[1], ascending=True)

Unnamed: 0,0,1
0,LinearRegression,281363.073239
1,Lasso,281363.073299
2,ElasticNet,281363.37428


# Ensemble Modeling 

In [15]:
def bagging_predictions(estimator):
    """
    I/P
    estimator: The base estimator from which the ensemble is grown.
    O/P
    br_y_pred: Predictions on test data for the base estimator.
    
    """
    regr = BaggingRegressor(base_estimator=estimator,
                            n_estimators=10,
                            max_samples=1.0,
                            bootstrap=True,
                            n_jobs= n_jobs,
                            random_state=random_state).fit(X_train, y_train)

    br_y_pred = regr.predict(X_test)

    rmse_val = mean_squared_error(y_test, br_y_pred, squared= False)   

    print(f'RMSE for base estimator {regr.base_estimator_} = {rmse_val}\n')
    return br_y_pred


predictions = np.column_stack((bagging_predictions(linear_regression),
                              bagging_predictions(lasso),
                              bagging_predictions(elastic_net)))

print(f"Bagged predictions shape: {predictions.shape}")
       
y_pred = np.mean(predictions, axis=1)
print("Aggregated predictions (y_pred) shape", y_pred.shape)

rmse_val = mean_squared_error(y_test, y_pred, squared= False)    
models_scores.append(['Bagging', rmse_val])

print(f'\nBagging RMSE= {rmse_val}')

RMSE for base estimator Pipeline(steps=[('linearregression', LinearRegression())]) = 281352.83739983174

RMSE for base estimator Pipeline(steps=[('robustscaler', RobustScaler()),
                ('lasso', Lasso(alpha=0.0005, random_state=42))]) = 281352.83745369845

RMSE for base estimator Pipeline(steps=[('robustscaler', RobustScaler()),
                ('elasticnet',
                 ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=42))]) = 281353.1213665054

Bagged predictions shape: (26784, 3)
Aggregated predictions (y_pred) shape (26784,)

Bagging RMSE= 281352.9319347711


## Rank scores

In [16]:
pd.DataFrame(models_scores).sort_values(by=[1], ascending=True)

Unnamed: 0,0,1
3,Bagging,281352.931935
0,LinearRegression,281363.073239
1,Lasso,281363.073299
2,ElasticNet,281363.37428


# Boosting 

## GradientBoostingRegressor ¶

In [17]:
gradient_boosting_regressor= GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state = random_state)

score = rmse(gradient_boosting_regressor)
models_scores.append(['GradientBoostingRegressor', score])
print(f'GradientBoostingRegressor Score= {score}')

GradientBoostingRegressor Score= 251219.46664597013


## XGBRegressor 

In [18]:
xgb_regressor= xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213,verbosity=0, nthread = -1, random_state = random_state)
score = rmse(xgb_regressor)
models_scores.append(['XGBRegressor', score])
print(f'XGBRegressor Score= {score}')

XGBRegressor Score= 276630.9192789049


## LGBMRegressor 

In [19]:
lgbm_regressor= lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11,random_state = random_state)

score = rmse(lgbm_regressor)
models_scores.append(['LGBMRegressor', score])
print(f'LGBMRegressor Score= {score}')

## Stacking

In [20]:
estimators = [ ('linear_regression', linear_regression), ('gradient_boosting_regressor', gradient_boosting_regressor),
              ('xgb_regressor', xgb_regressor) ]

stack = StackingRegressor(estimators=estimators, final_estimator= lasso, cv= 5, n_jobs= n_jobs, passthrough = True)

stack.fit(X_train, y_train)

pred = stack.predict(X_test)

rmse_val = mean_squared_error(y_test, pred, squared= False) # squared= False > returns Root Mean Square Error    
models_scores.append(['Stacking', rmse_val])
print(f'rmse= {rmse_val}')

rmse= 249921.54417812105


## Rank scores

In [21]:
# Ranking the scores of each model
pd.DataFrame(models_scores).sort_values(by=[1], ascending=True)                                                                         

Unnamed: 0,0,1
6,Stacking,249921.544178
4,GradientBoostingRegressor,251219.466646
5,XGBRegressor,276630.919279
3,Bagging,281352.931935
0,LinearRegression,281363.073239
1,Lasso,281363.073299
2,ElasticNet,281363.37428


# Predict

In [22]:
yPred = stack.predict(xDf)

In [23]:
StackPred = pd.DataFrame(yPred, columns = ['pred'])
StackPred

Unnamed: 0,pred
0,606928.923866
1,677952.719661
2,573015.155225
3,693421.654176
4,696761.438366
...,...
133914,519208.019055
133915,412855.711812
133916,466227.598744
133917,470705.747619


In [24]:
result = pd.concat([df, StackPred], axis=1).drop(["Unnamed: 0"], axis=1)
result.head(2)

Unnamed: 0,Area,Address,St,交易年月日,year,交易標的,交易筆棟數,建物型態,建物現況格局.廳,建物現況格局.房,建物現況格局.衛,建物現況格局.隔間,有無管理組織,總價元,總坪數,單價元坪,車位數,floor,EightCount,ParkCount,FuneralCount,GasCount,CrimeCount,PoliceCount,busCount,subwayCount,govCount,clinicCount,hospitalCount,pharmacyCount,fireareaCount,firewayCount,martCount,mallCount,cinemaCount,土地面積,總人口數,男性人數,女性人數,人口密度,每戶人數,每戶成年人數,所得收入總計,可支配所得,消費支出,儲蓄,所得總額,pred
0,中山區,臺北市中山區建國北路二段61~90號,建國北路,1030430,2014,房地(土地+建物)+車位,土地2建物1車位1,辦公商業大樓,0,2,0,有,有,31500000.0,43.0155,732295.253165,1,12,77,8,132,7,904,13,280,8,33,486,4,98,80,18,11,6,5,13.6821,224707,104138,120569,16423,2.89,2.51,1614178,1339702,1071429,268273,1720988,606928.923866
1,中山區,臺北市中山區林森北路301~330號,林森北路,1030702,2014,房地(土地+建物),土地1建物1車位0,套房,1,1,1,有,有,8500000.0,13.331175,637604.152485,0,14,77,8,132,7,904,13,280,8,33,486,4,98,80,18,11,6,5,13.6821,224707,104138,120569,16423,2.89,2.51,1614178,1339702,1071429,268273,1720988,677952.719661


In [32]:
result["漲跌"] = round(result["pred"]-result["單價元坪"], 2)

In [33]:
result.head(2)

Unnamed: 0,Area,Address,St,交易年月日,year,交易標的,交易筆棟數,建物型態,建物現況格局.廳,建物現況格局.房,建物現況格局.衛,建物現況格局.隔間,有無管理組織,總價元,總坪數,單價元坪,車位數,floor,EightCount,ParkCount,FuneralCount,GasCount,CrimeCount,PoliceCount,busCount,subwayCount,govCount,clinicCount,hospitalCount,pharmacyCount,fireareaCount,firewayCount,martCount,mallCount,cinemaCount,土地面積,總人口數,男性人數,女性人數,人口密度,每戶人數,每戶成年人數,所得收入總計,可支配所得,消費支出,儲蓄,所得總額,pred,漲跌
0,中山區,臺北市中山區建國北路二段61~90號,建國北路,1030430,2014,房地(土地+建物)+車位,土地2建物1車位1,辦公商業大樓,0,2,0,有,有,31500000.0,43.0155,732295.253165,1,12,77,8,132,7,904,13,280,8,33,486,4,98,80,18,11,6,5,13.6821,224707,104138,120569,16423,2.89,2.51,1614178,1339702,1071429,268273,1720988,606928.923866,-125366.33
1,中山區,臺北市中山區林森北路301~330號,林森北路,1030702,2014,房地(土地+建物),土地1建物1車位0,套房,1,1,1,有,有,8500000.0,13.331175,637604.152485,0,14,77,8,132,7,904,13,280,8,33,486,4,98,80,18,11,6,5,13.6821,224707,104138,120569,16423,2.89,2.51,1614178,1339702,1071429,268273,1720988,677952.719661,40348.57


## By Area

In [36]:
updown = result.groupby(['Area'], as_index=True).mean()[['單價元坪','漲跌','pred']].reset_index()
updown.sort_values(by=['漲跌'], inplace=True, ascending=False)

In [38]:
updown = updown.rename({'單價元坪':'平均單價元/坪 (市價)', '漲跌':'預測平均漲跌','pred':'預測平均單價元/坪'}, axis=1)

In [42]:
deal = result.groupby(['Area'], as_index=True).count()['St'].reset_index()

In [45]:
AreaPred = pd.merge(deal, updown, on='Area')
AreaPred = AreaPred.rename({'St':'交易量'}, axis=1)
AreaPred.sort_values(by=['預測平均漲跌'], inplace=True, ascending=False)

In [46]:
AreaPred

Unnamed: 0,Area,交易量,平均單價元/坪 (市價),預測平均漲跌,預測平均單價元/坪
7,大同區,6031,559284.101951,24455.429335,583739.531278
8,大安區,12358,851835.304997,6757.361352,858592.666311
2,信義區,9703,712115.570443,6736.056747,718851.627221
3,內湖區,15129,525574.434169,5544.783628,531119.21781
11,萬華區,10315,490464.793799,5350.239034,495815.032794
6,士林區,10498,584015.257458,721.716619,584736.974044
0,中山區,20027,667169.675668,177.244383,667346.920067
4,北投區,13703,463433.138088,-2008.493582,461424.644462
1,中正區,7352,757656.14927,-2107.562198,755548.58707
5,南港區,5981,538310.363055,-8912.619622,529397.743411


## By St

In [48]:
updownSt = result.groupby(['St'], as_index=True).mean()[['單價元坪','漲跌','pred']].reset_index()
updownSt.sort_values(by=['漲跌'], inplace=True, ascending=False)

In [49]:
updownSt = updownSt.rename({'單價元坪':'平均單價元/坪 (市價)', '漲跌':'預測平均漲跌','pred':'預測平均單價元/坪'}, axis=1)

In [53]:
dealSt = result.groupby(['St'], as_index=True).count()['Area'].reset_index()

In [56]:
AreaPredSt = pd.merge(dealSt, updownSt, on='St')
AreaPredSt = AreaPredSt.rename({'Area':'交易量'}, axis=1)
AreaPredSt.sort_values(by=['交易量'], inplace=True, ascending=False)

In [57]:
AreaPredSt

Unnamed: 0,St,交易量,平均單價元/坪 (市價),預測平均漲跌,預測平均單價元/坪
19,中山北路,3559,6.808447e+05,-42482.869306,6.383618e+05
353,民權東路,3205,5.944901e+05,-2387.755526,5.921023e+05
214,忠孝東路,2875,8.482974e+05,-74113.482588,7.741839e+05
74,南京東路,2701,7.145658e+05,-28824.519067,6.857413e+05
57,內湖路,2517,5.441278e+05,-12749.882686,5.313779e+05
...,...,...,...,...,...
72,北平西路,1,4.949771e+05,231635.100000,7.266122e+05
564,金山北路,1,7.407103e+05,137732.680000,8.784430e+05
64,凱旋路,1,1.000372e+06,234122.000000,1.234494e+06
60,公舍官路,1,3.899487e+05,-18996.210000,3.709525e+05
