In [None]:
# import modules
import numpy as np
import pandas as pd
import glob
import datetime
import matplotlib.pyplot as plt
import seaborn as sb

In [None]:
# Lift pandas' display limits: each of these options is set to None so that
# console width, column count and row count are not capped when frames are shown.
for display_option in ('display.width', 'display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error

In [None]:
# Read the analysis table into `df`.
# Active input: PM_SB_All_Seg.csv. An alternative input in the same folder is
# kept here for reference (it was commented out in the original cell):
#   G:\NCHRP\PTSU_VAS\SPF_Data\Analysis\pm_peak_nb_4_hrs.csv
SEGMENT_CSV = r'G:\NCHRP\PTSU_VAS\SPF_Data\Analysis\PM_SB_All_Seg.csv'
df = pd.read_csv(SEGMENT_CSV, low_memory=True)

In [None]:
# Build 0/1 indicator columns from the left and right shoulder widths.
# A row is flagged 1 when its width is strictly greater than the cut-off and
# 0 otherwise; the result is stored as int64, like the original 0-initialised
# column.
RIGHT_SHOULDER_CUTOFF = 10
LEFT_SHOULDER_CUTOFF = 5

is_wide_right = df['RightShoulderWidth'] > RIGHT_SHOULDER_CUTOFF
df['RightShoulderWidthCat'] = is_wide_right.astype('int64')

is_wide_left = df['LeftShoulderWidth'] > LEFT_SHOULDER_CUTOFF
df['LeftShoulderWidthCat'] = is_wide_left.astype('int64')

In [None]:
#Preliminary
# Columns kept for modelling: the candidate predictors followed by the
# response column 'TotalCrashes' (last entry).
# `.copy()` makes df_sel an independent frame. Later cells assign columns on
# df_sel; without the copy pandas treats those assignments as writes to a
# selection of `df` and emits SettingWithCopyWarning.
df_sel = df[['AvgVolume', 'AvgSpeed', 'StdSpeed',
       'CoefOfVarSpeed', 'AvgOccupancy', 'StdOccupancy', 'CoefOfVarOccupancy',
       'SegmentLength', 'ThruLanes', 'SegmentType', 'RightShoulderWidthCat', 'LeftShoulderWidthCat',
       'Turnout', 'Diff_Avg_Volume_Down', 'Diff_Avg_Volume_Up',
       'Diff_Avg_Volume', 'Diff_Avg_Occupancy_Down', 'Diff_Avg_Occupancy_Up',
       'Diff_Avg_Occupancy', 'Diff_Std_Occupancy_Down',
       'Diff_Std_Occupancy_Up', 'Diff_Std_Occupancy',
       'Diff_CoefOfVar_Occupancy_Down', 'Diff_CoefOfVar_Occupancy_Up',
       'Diff_CoefOfVar_Occupancy', 'Diff_Avg_Speed_Down', 'Diff_Avg_Speed_Up',
       'Diff_Avg_Speed', 'Diff_Std_Speed_Down', 'Diff_Std_Speed_Up',
       'Diff_Std_Speed', 'Diff_CoefOfVar_Speed_Down',
       'Diff_CoefOfVar_Speed_Up', 'Diff_CoefOfVar_Speed',
       'TotalCrashes']].copy()

In [None]:
# Cast the discrete predictors to pandas 'category' dtype; the XGBoost models
# in this notebook are built with enable_categorical=True.
# A single astype() call with a column->dtype mapping returns a new frame, so
# no column is assigned on a selection of `df` (which would raise
# SettingWithCopyWarning), and re-running the cell gives the same result.
categorical_cols = ['ThruLanes', 'SegmentType', 'Turnout',
                    'LeftShoulderWidthCat', 'RightShoulderWidthCat']
df_sel = df_sel.astype({col: 'category' for col in categorical_cols})

In [None]:
# Separate predictors from the response. Every row of df_sel is used here;
# this cell does not hold any rows out.
# y_train is kept as a one-column DataFrame (note the list selector).
target_col = 'TotalCrashes'
X_train = df_sel.drop(target_col, axis=1)
y_train = df_sel.loc[:, [target_col]]

In [None]:
# Exhaustive hyper-parameter search for the Poisson XGBoost model.
#
# No early stopping / eval_set is used inside the search:
#   * `early_stopping_rounds` is no longer accepted by fit() in xgboost >= 2.0;
#   * the previous eval_set was the full (X_train, y_train), which contains the
#     rows GridSearchCV holds out in each fold, so early stopping was tuned on
#     the validation data of every split;
#   * the number of trees is already searched through `n_estimators`.
# With no eval_set there is nothing for `eval_metric` to be evaluated on, so it
# is not set on this estimator.
model_xgb = xgb.XGBRegressor(tree_method="hist", enable_categorical=True)

param_grid = {"max_depth":    [2, 4, 6, 8, 10],   #default: 6
              "n_estimators": [100, 300, 600],
              "learning_rate": [0.001, 0.01, 0.1, 0.8],  #default: 0.3
              "objective": ['count:poisson'],
              "gamma": [0.001, 0.01, 0.1],
              "subsample": [0.5, 0.75, 1.0],
              "colsample_bytree": [0.5, 0.75, 1.0],
              "colsample_bylevel": [0.5, 0.75, 1.0]
             }

# Try every combination of the above values with 5-fold cross-validation.
# Candidates are ranked by mean absolute error (negated, because GridSearchCV
# maximises its score) so that model selection uses the same metric the
# notebook reports; without `scoring` the regressor's default R^2 is used.
GS_xgb = GridSearchCV(model_xgb, param_grid, cv=5, scoring="neg_mean_absolute_error")
GS_xgb.fit(X_train, y_train)

print("The best hyperparameters are ", GS_xgb.best_params_)

In [None]:
#XGBoost
#tree_method : exact, approx, hist
# Fit the Poisson-objective model with fixed hyper-parameters.
# `early_stopping_rounds` is given to the constructor: passing it to fit() was
# deprecated in xgboost 1.6 and removed in 2.0. `learning_rate` is the
# scikit-learn-API name for `eta` and matches the key used in the grid search.
model_xgb = xgb.XGBRegressor(tree_method="hist",
                             enable_categorical=True,
                             objective='count:poisson',
                             n_estimators=100,
                             learning_rate=0.1,
                             gamma=0.1,
                             subsample=0.75,
                             colsample_bytree=0.75,
                             colsample_bylevel=1.0,
                             max_depth=8,
                             eval_metric=mean_absolute_error,
                             early_stopping_rounds=10
                            )

# NOTE: the evaluation set is the training data itself, so early stopping
# monitors in-sample MAE and the error printed below is an in-sample figure.
model_xgb.fit(X_train, y_train, eval_set=[(X_train, y_train)])
y_pred = model_xgb.predict(X_train)
print("Mean Absolute Error:", mean_absolute_error(y_train, y_pred))

In [None]:
# Draw the fitted model's importance chart on an explicitly created axes.
# (Pass max_num_features=30 to plot_importance to limit how many features
# are drawn.)
fig, ax = plt.subplots(1, 1)
plot_importance(model_xgb, ax=ax)
plt.show()

In [None]:
# Store the model predictions as a new column on the full input frame.
# y_pred was produced from X_train, which is derived from df without dropping
# rows, so the values are assigned to df's rows by position.
df['Pred_XGB'] = y_pred

In [None]:
# Root-mean-squared error of the predictions against y_train (the same data
# the model was fitted on). The square root is taken explicitly because the
# `squared=False` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_train, y_pred))