In [30]:
import pandas as pd
import numpy as np
import scipy.stats as sps
import seaborn as sns
import time
import datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [5]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV

In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

Using TensorFlow backend.


In [7]:
# alternative methods of loading keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

In [3]:
hdb_df = pd.read_csv("../datasets/final/combined.csv")

In [50]:
hdb_df.columns

Index(['storey_range', 'floor_area_sqm', 'remaining_lease', 'resale_price',
       'bedok', 'bishan', 'bukit_batok', 'bukit_merah', 'bukit_panjang',
       'bukit_timah', 'central_area', 'choa_chu_kang', 'clementi', 'geylang',
       'hougang', 'jurong_east', 'jurong_west', 'kallang_whampoa',
       'marine_parade', 'pasir_ris', 'punggol', 'queenstown', 'sembawang',
       'sengkang', 'serangoon', 'tampines', 'toa_payoh', 'woodlands', 'yishun',
       'model_adjoined_flat', 'model_apartment', 'model_dbss',
       'model_improved', 'model_improved_maisonette', 'model_maisonette',
       'model_model_a', 'model_model_a2', 'model_model_a_maisonette',
       'model_multi_generation', 'model_new_generation',
       'model_premium_apartment', 'model_premium_apartment_loft',
       'model_premium_maisonette', 'model_simplified', 'model_standard',
       'model_terrace', 'model_type_s1', 'model_type_s2', 'type_2_room',
       'type_3_room', 'type_4_room', 'type_5_room', 'type_executive',
     

In [15]:
X = hdb_df.drop(columns=['resale_price'])
y = hdb_df['resale_price']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, random_state=42)

In [53]:
X_train.shape

(64324, 80)

In [12]:
X_valid.shape

(16081, 80)

In [29]:
hdb_df.describe()

Unnamed: 0,storey_range,floor_area_sqm,remaining_lease,resale_price,bedok,bishan,bukit_batok,bukit_merah,bukit_panjang,bukit_timah,...,dist_hawker,dist_park,dist_after_death,dist_attraction,dist_pool,dist_gym,dist_tennis,dist_stadium,dist_sports_hall,dist_library
count,80405.0,80405.0,80405.0,80405.0,80405.0,80405.0,80405.0,80405.0,80405.0,80405.0,...,80405.0,80405.0,80405.0,80405.0,80405.0,80405.0,80405.0,80405.0,80405.0,80405.0
mean,7.430794,97.581557,74.015546,439948.8,0.060058,0.019551,0.039077,0.038816,0.035296,0.002637,...,1153.316828,15569.09,19390.36,17434.59,16151.73,16377.17,18193.68,16657.06,16387.95,15961.72
std,5.602465,24.242844,11.518088,145735.3,0.237597,0.138452,0.19378,0.193157,0.184529,0.051281,...,1816.462424,412375.0,412222.5,412246.6,412357.8,412350.0,412289.4,412340.5,412349.7,412407.3
min,1.0,31.0,47.0,160000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,33.11,61.04,15.07,12.26,12.26,30.22,30.22,12.26,20.37
25%,4.0,76.0,66.0,338000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,359.75,481.24,2196.71,1473.33,815.34,940.43,1608.11,1000.47,950.25,735.84
50%,7.0,96.0,73.0,409000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,763.22,718.7,3461.76,2402.94,1229.43,1427.18,3109.95,1645.84,1435.15,1100.52
75%,10.0,112.0,83.0,505000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1576.65,999.81,6762.18,3518.61,1798.95,2083.29,5102.27,2629.33,2098.44,1611.45
max,49.0,280.0,97.0,1185000.0,1.0,1.0,1.0,1.0,1.0,1.0,...,417372.39,11529700.0,11528900.0,11527910.0,11529790.0,11529790.0,11529790.0,11529790.0,11529790.0,11530990.0


In [16]:
ss = StandardScaler()
ss.fit(X_train)
X_train_ss = ss.transform(X_train)
X_valid_ss = ss.transform(X_valid)
#X_test = ss.transform(X_test)

## Baseline Linear Regression

In [32]:
reg = LinearRegression().fit(X_train_ss, y_train)

In [33]:
y_pred = reg.predict(X_valid_ss)

In [35]:
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print("RMSE: %f" % (rmse))

RMSE: 51512.321212


## XGBoost

In [18]:
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [19]:
xg_reg.fit(X_train_ss,y_train)

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=10, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=10, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [36]:
y_pred = xg_reg.predict(X_valid_ss)

In [37]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: %f" % (rmse))

RMSE: 186567.992546


In [None]:
data_dmatrix = xgb.DMatrix(data=X,label=y)

In [41]:
params = {"objective":"reg:squarederror",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)

print((cv_results["test-rmse-mean"]).tail(1))

49    46851.627604
Name: test-rmse-mean, dtype: float64


In [40]:
params = {"objective":"reg:squarederror",'colsample_bytree': 0.3,'learning_rate': 0.05,
                'max_depth': 5, 'alpha': 10}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)

print((cv_results["test-rmse-mean"]).tail(1))

49    73742.419271
Name: test-rmse-mean, dtype: float64


In [42]:
params = {"objective":"reg:squarederror",'colsample_bytree': 0.3,'learning_rate': 0.2,
                'max_depth': 5, 'alpha': 10}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)

print((cv_results["test-rmse-mean"]).tail(1))

49    37419.154948
Name: test-rmse-mean, dtype: float64


In [44]:
## Hyper Parameter Optimization
regressor=xgboost.XGBRegressor()
n_estimators = [100, 500, 1000]
max_depth = [2, 3, 5]
booster=['gbtree','gblinear']
learning_rate=[0.05,0.1,0.20]
base_score=[0.25,0.5,0.75]

# Define the grid of hyperparameters to search
hyperparameter_grid = {
    'n_estimators': n_estimators,
    'max_depth':max_depth,
    'learning_rate':learning_rate,
    'booster':booster,
    'base_score':base_score
    }

# Set up the random search with 4-fold cross validation
random_cv = RandomizedSearchCV(estimator=regressor,
            param_distributions=hyperparameter_grid,
            cv=5, n_iter=30,
            scoring = 'neg_mean_absolute_error',n_jobs = 4,
            verbose = 5, 
            return_train_score = True,
            random_state=42)

In [None]:
start_time = time.perf_counter()

random_cv.fit(X_train,y_train)

end_time = time.perf_counter()
print(f"Boosting took {end_time - start_time:0.4f} seconds")

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  4.1min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 130.5min
[Parallel(n_jobs=4)]: Done 150 out of 150 | elapsed: 418.6min finished


## Keras

In [5]:
# define base model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(13, input_dim=13, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [8]:
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=5, verbose=0)))

pipeline = Pipeline(estimators)

kfold = KFold(n_splits=10)

results = cross_val_score(pipeline, X, Y, cv=kfold)

print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Traceback (most recent call last):
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\wrappers\scikit_learn.py", line 166, in fit
    history = self.model.fit(x, y, **fit_args)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py", line 66, in _method_wrapper
    return method(self, *args, **kwargs)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py", line 848, in fit
    tmp_logs = train_function(iterator)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\t

Traceback (most recent call last):
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\wrappers\scikit_learn.py", line 166, in fit
    history = self.model.fit(x, y, **fit_args)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py", line 66, in _method_wrapper
    return method(self, *args, **kwargs)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py", line 848, in fit
    tmp_logs = train_function(iterator)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\t

Traceback (most recent call last):
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\wrappers\scikit_learn.py", line 166, in fit
    history = self.model.fit(x, y, **fit_args)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py", line 66, in _method_wrapper
    return method(self, *args, **kwargs)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py", line 848, in fit
    tmp_logs = train_function(iterator)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\t

Traceback (most recent call last):
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\wrappers\scikit_learn.py", line 166, in fit
    history = self.model.fit(x, y, **fit_args)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py", line 66, in _method_wrapper
    return method(self, *args, **kwargs)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py", line 848, in fit
    tmp_logs = train_function(iterator)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\t

Traceback (most recent call last):
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\wrappers\scikit_learn.py", line 166, in fit
    history = self.model.fit(x, y, **fit_args)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py", line 66, in _method_wrapper
    return method(self, *args, **kwargs)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py", line 848, in fit
    tmp_logs = train_function(iterator)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\t

Standardized: nan (nan) MSE


Traceback (most recent call last):
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\wrappers\scikit_learn.py", line 166, in fit
    history = self.model.fit(x, y, **fit_args)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py", line 66, in _method_wrapper
    return method(self, *args, **kwargs)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py", line 848, in fit
    tmp_logs = train_function(iterator)
  File "C:\Users\wtbha\AppData\Roaming\Python\Python37\site-packages\t

In [12]:
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=larger_model, epochs=50, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10)
results = cross_val_score(pipeline, X, Y, cv=kfold)
print("Larger: %.2f (%.2f) MSE" % (results.mean(), results.std()))

NameError: name 'larger_model' is not defined

In [14]:
kfold = KFold(n_splits=10)
results = cross_val_score(estimator, X, Y, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

NameError: name 'estimator' is not defined