# Hyperparameter tuning - machine learning models

In [None]:
import math
import time
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from utils import train_test_split, X_Y_split, scaler

In [None]:
# Load the dataset CSV; the path is relative to the current working directory.
csv_path = 'data_droped_nov18_dummy_final.csv'
data = pd.read_csv(csv_path)

In [None]:
# Index the frame by timestamp and make sure the index is a real datetime
# index, so calendar attributes can be read from it later.
data = data.set_index('datetime')
data.index = pd.to_datetime(data.index)

# Leftover columns that are removed explicitly (raises if one is missing).
leftover_columns = ['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'diff']

# Columns to keep, in the order they should appear in the frame.
column_order = [
    'lots_available', 'total_lot', 'carpark_number', 'x_coord', 'y_coord',
    'car_park_decks', 'gantry_height', 'BASEMENT CAR PARK', 'COVERED CAR PARK',
    'MECHANISED AND SURFACE CAR PARK', 'MULTI-STOREY CAR PARK',
    'SURFACE CAR PARK', '7AM-10.30PM', '7AM-7PM', 'NO', 'WHOLE DAY', 'NO.1',
    'SUN & PH FR 1PM-10.30PM', 'SUN & PH FR 7AM-10.30PM', 'NO.2',
    'YES', 'N', 'Y',
]

# reindex keeps exactly the listed columns in the listed order; a listed
# column that does not exist in the frame is created and filled with NaN.
data = data.drop(columns=leftover_columns).reindex(columns=column_order)

In [None]:
# Derive two calendar features from the datetime index:
# the weekday number (pandas uses Monday=0) and the hour of the day.
data = data.assign(
    day_of_week=data.index.weekday,
    hour_of_day=data.index.hour,
)

In [None]:
# Columns used for modelling, in the order the frame will be arranged.
# 'lots_available' is listed first.
features = [
    'lots_available',
    'day_of_week',
    'hour_of_day',
    'total_lot',
    'carpark_number',
    'x_coord',
    'y_coord',
    'car_park_decks',
    'gantry_height',
    'MULTI-STOREY CAR PARK',
    'WHOLE DAY',
    'NO.1',
    'SUN & PH FR 7AM-10.30PM',
]

In [None]:
# Keep only the modelling columns, in the order given by `features`.
# Selecting with a list already returns the columns in that order (and raises
# KeyError if one is missing), so the extra reindex(columns=features) was a
# no-op. The explicit copy makes `data` an independent frame, so assignments
# made to it afterwards do not target a selection of the previous frame.
data = data[features].copy()

In [None]:
# Overwrite the row(s) at this timestamp with NaN, then drop every row that
# contains a missing value. This removes that timestamp together with any
# other incomplete rows.
# NOTE(review): presumably this timestamp is a known-bad reading — confirm.
excluded_timestamp = '2016-02-19 11:15:00'
data.loc[excluded_timestamp, :] = np.nan
data = data.dropna()

In [None]:
# Split the frame into a training part and a test part with the project helper
# from utils (this is not scikit-learn's train_test_split).
# NOTE(review): test_step_size=673 presumably is the number of trailing time
# steps held out for testing — confirm against utils.train_test_split.
holdout_split = train_test_split(data, test_step_size=673)
Train, Test = holdout_split

In [None]:
# Min-max scale every column using statistics learned from the training split
# only, then apply the same transform to the test split (no test-set leakage).
#
# MinMaxScaler scales each column independently, so a single scaler fitted on
# the whole frame yields the same values as fitting one scaler per column.
# Keeping the fitted scaler as `feature_scaler` (instead of overwriting it on
# every loop iteration) allows predictions to be mapped back to original units
# later, and the name no longer shadows `scaler` imported from utils.
feature_scaler = MinMaxScaler()
Train = pd.DataFrame(
    feature_scaler.fit_transform(Train),
    index=Train.index,
    columns=Train.columns,
)
Test = pd.DataFrame(
    feature_scaler.transform(Test),
    index=Test.index,
    columns=Test.columns,
)

In [None]:
# Turn each split into model inputs and targets with the project helper.
# Both splits use identical options: a window of 1 and column 0 as the label.
# NOTE(review): column 0 is presumably 'lots_available' (first entry of
# `features`) — confirm against utils.X_Y_split.
xy_split_options = {'window_size': 1, 'label_col_no': 0}
X_train, Y_train = X_Y_split(Train, **xy_split_options)
X_test, Y_test = X_Y_split(Test, **xy_split_options)

## Bagging regressor

### Define baseline

In [None]:
# Untuned reference model: a BaggingRegressor with library-default settings.
# Bagging resamples the training data at random, so a fixed random_state is
# passed to make the baseline metrics repeatable between notebook runs.
bagging_regressor_baseline = BaggingRegressor(random_state=42)
bagging_regressor_baseline.fit(X_train,Y_train)

In [None]:
# Predict the test-split targets with the untuned bagging model.
yp_bagg_base = bagging_regressor_baseline.predict(X_test)

In [None]:
# Score the baseline bagging predictions against the test-split targets.
mse = mean_squared_error(y_true=Y_test, y_pred=yp_bagg_base)
mae = mean_absolute_error(y_true=Y_test, y_pred=yp_bagg_base)
rmse = math.sqrt(mse)  # reuse the MSE instead of computing it a second time
r2 = r2_score(y_true=Y_test, y_pred=yp_bagg_base)

# Label each value so the printed output is unambiguous (rounded to 5 dp).
print('MSE:', round(mse, 5))
print('MAE:', round(mae, 5))
print('RMSE:', round(rmse, 5))
print('R2:', round(r2, 5))

### Hyperparameter tuning

In [None]:
# Candidate values for the bagging search (ranges taken from literature reviews):
# number of base estimators and the fraction of samples drawn for each one.
params = dict(
    n_estimators=(100, 200, 500),
    max_samples=(0.7, 0.8, 0.9, 1.0),
)

In [None]:
# Randomised hyperparameter search for the bagging regressor.
# Both the estimator and the search are seeded: without a random_state the
# sampled candidates and the bootstrap resampling differ between runs, so the
# reported "best" parameters would not be reproducible.
model = BaggingRegressor(random_state=42)

finder = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    scoring='r2',
    refit=True,  # refit the best candidate on the full training data
    cv=TimeSeriesSplit(n_splits=5),  # swap in another splitter here to compare CV strategies
    return_train_score=True,
    random_state=42,  # n_iter is left at the scikit-learn default (10 candidates)
    )

finder.fit(X_train, Y_train)

# Best parameter combination and its mean cross-validated R2 (rounded to 4 dp).
best_params = finder.best_params_
best_score = round(finder.best_score_,4)

In [None]:
# Show the winning parameter set and its cross-validated score, one per line.
print(best_params, best_score, sep='\n')

In [None]:
# Bagging regressor with hard-coded tuned hyperparameters.
# NOTE(review): presumably transcribed from an earlier search run — confirm
# they still match the printed best parameters.
# random_state fixes the bootstrap resampling so the reported test metrics
# are repeatable between notebook runs.
bagging_regressor_best = BaggingRegressor(max_samples=0.7, n_estimators=500, random_state=42)

In [None]:
# Train the tuned bagging model on the training split.
bagging_regressor_best.fit(X_train,Y_train)

In [None]:
# Predict the test-split targets with the tuned bagging model.
yp = bagging_regressor_best.predict(X_test)

In [None]:
# Score the tuned bagging predictions against the test-split targets.
mse = mean_squared_error(y_true=Y_test, y_pred=yp)
mae = mean_absolute_error(y_true=Y_test, y_pred=yp)
rmse = math.sqrt(mse)  # reuse the MSE instead of computing it a second time
r2 = r2_score(y_true=Y_test, y_pred=yp)

# Labelled and rounded to 5 dp, the same precision the bagging baseline cell
# uses, so the two sets of numbers can be compared directly.
print('MSE:', round(mse, 5))
print('MAE:', round(mae, 5))
print('RMSE:', round(rmse, 5))
print('R2:', round(r2, 5))

## XGBoost regressor

### Define baseline

In [None]:
# Untuned reference model: an XGBRegressor with library-default settings,
# trained on the training split.
xgb_baseline =XGBRegressor()
xgb_baseline.fit(X_train,Y_train)

In [None]:
# Predict the test-split targets with the untuned XGBoost model.
yp_xgb_base = xgb_baseline.predict(X_test)

In [None]:
# Score the baseline XGBoost predictions against the test-split targets.
mse = mean_squared_error(y_true=Y_test, y_pred=yp_xgb_base)
mae = mean_absolute_error(y_true=Y_test, y_pred=yp_xgb_base)
rmse = math.sqrt(mse)  # reuse the MSE instead of computing it a second time
r2 = r2_score(y_true=Y_test, y_pred=yp_xgb_base)

# Label each value so the printed output is unambiguous (rounded to 3 dp).
print('MSE:', round(mse, 3))
print('MAE:', round(mae, 3))
print('RMSE:', round(rmse, 3))
print('R2:', round(r2, 3))

### Hyperparameter tuning

In [None]:
# Candidate values for the XGBoost search (ranges taken from literature reviews).
params = dict(
    n_estimators=(50, 100, 150),
    max_depth=(3, 6, 9),
    learning_rate=(0.01, 0.1, 0.3, 0.5),
    gamma=(5, 7, 10),
)

In [None]:
# Randomised hyperparameter search for the XGBoost regressor.
# The grid has 3 * 3 * 4 * 3 = 108 combinations and the search samples only
# the scikit-learn default of 10, so it is seeded: without a random_state a
# different subset is tried on every run and the "best" parameters change.
model = XGBRegressor()

finder = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    scoring='r2',
    refit=True,  # refit the best candidate on the full training data
    cv=TimeSeriesSplit(n_splits=5),  # swap in another splitter here to compare CV strategies
    return_train_score=True,
    random_state=42,
    )

# Time the whole search and report the elapsed wall-clock seconds.
start_time = time.time()
finder.fit(X_train, Y_train)
print("--- %s seconds ---" %(time.time()- start_time))

# Best parameter combination and its mean cross-validated R2 (rounded to 4 dp).
best_params = finder.best_params_
best_score = round(finder.best_score_,4)

In [None]:
# Show the winning parameter set and its cross-validated score, one per line.
print(best_params, best_score, sep='\n')

In [None]:
# XGBoost regressor with hard-coded tuned hyperparameters.
# NOTE(review): the original note reads "best parameter based on 1x to predict
# next step" — presumably the best values found when one previous step
# (window_size=1) is used to predict the next one; confirm.
xgb_best_params = {
    'learning_rate': 0.3,
    'max_depth': 3,
    'n_estimators': 50,
    'gamma': 5,
}
xgb_best = XGBRegressor(**xgb_best_params)

In [None]:
# Train the tuned XGBoost model on the training split.
xgb_best.fit(X_train, Y_train)

In [None]:
# Predict the test-split targets with the tuned XGBoost model.
yp1 = xgb_best.predict(X_test)

In [None]:
# Score the tuned XGBoost predictions against the test-split targets.
mse = mean_squared_error(y_true=Y_test, y_pred=yp1)
mae = mean_absolute_error(y_true=Y_test, y_pred=yp1)
rmse = math.sqrt(mse)  # reuse the MSE instead of computing it a second time
r2 = r2_score(y_true=Y_test, y_pred=yp1)

# Labelled and rounded to 3 dp, the same precision the XGBoost baseline cell
# uses, so the two sets of numbers can be compared directly.
print('MSE:', round(mse, 3))
print('MAE:', round(mae, 3))
print('RMSE:', round(rmse, 3))
print('R2:', round(r2, 3))