In [None]:
#Importing packages
import pandas as pd                 
import numpy as np                   
from matplotlib import pyplot as plt
from mapie.regression import MapieRegressor
from mapie.metrics import regression_coverage_score
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import time
import pickle

# Load the raw traffic records; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer='Traffic_Volume.csv')
# Summarise column names, dtypes, and non-null counts.
df.info()

In [None]:
#Used Gemini here to extract month, weekday, and hour
# Parse the timestamp column a single time and reuse the result for all three
# derived features instead of re-parsing it for each one.
timestamps = pd.to_datetime(df['date_time'])
df['month'] = timestamps.dt.strftime('%B')     # full month name
df['weekday'] = timestamps.dt.strftime('%A')   # full weekday name
# The hour is stored as a string so it can be one-hot encoded like the other
# categorical columns rather than treated as a number.
df['hour'] = timestamps.dt.hour.astype(str)

In [None]:
# Separate the target from the predictors: pop() returns the column and
# removes it from the frame in place.
y = df.pop('traffic_volume')
# Remove the raw timestamp column from the frame in place as well.
del df['date_time']
df.info()

In [None]:
# One-hot encode the categorical predictors; drop_first=True removes the first
# level of each column so k categories become k-1 indicator columns.
categorical_cols = ['holiday', 'weather_main', 'month', 'weekday', 'hour']
df_encoded = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)
X = df_encoded

# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit an XGBoost regressor, constructed with no explicit hyperparameters,
# on the training split.
xg_reg = XGBRegressor()
xg_reg.fit(X=train_X, y=train_y)

In [None]:
# Point predictions for the test split, reused by the diagnostic plots.
y_pred = xg_reg.predict(X=test_X)

In [None]:
#Saving model
# The context manager closes the file even if pickle.dump raises, so the
# handle is never leaked and a partially written file is not left open.
with open('XGBoost_pickle.pkl', 'wb') as xg_pickle:
    pickle.dump(xg_reg, xg_pickle)

#### Histogram of Residuals/Errors 

In [None]:

# Residual = actual value minus predicted value for each test row.
all_residuals = test_y - y_pred

# Draw the histogram on an explicit figure/axes pair and save it as SVG.
fig, ax = plt.subplots(figsize=(6, 4), dpi=150)
ax.hist(all_residuals, bins=25, color='lime', edgecolor='black')
ax.set_xlabel('Residuals', fontsize=14)
ax.set_ylabel('# of Test Datapoints', fontsize=14)
ax.set_title('Distribution of Residuals', fontsize=16)
ax.tick_params(axis='both', labelsize=10)
fig.savefig("residuals.svg");

#### Scatter Plot of Predicted vs. Actual Values

In [None]:
# Range of the actual values, computed once with the vectorised Series methods
# (which also skip NaN) rather than iterating the Series with the Python
# built-ins min()/max() four times.
actual_min, actual_max = test_y.min(), test_y.max()

plt.figure(figsize = (6, 4), dpi = 150)
plt.scatter(test_y, y_pred, color = 'blue', alpha = 0.6, edgecolor = 'black', s = 40)
# Dashed y = x reference line: points on it are perfect predictions.
plt.plot([actual_min, actual_max], [actual_min, actual_max], color = 'red', linestyle = '--', lw = 2)
plt.xlabel('Actual Values', fontsize = 10)
plt.ylabel('Predicted Values', fontsize = 10)
plt.title('Predicted vs Actual Values', fontsize = 12)
plt.xticks(fontsize = 10)
plt.yticks(fontsize = 10)
plt.savefig("scatter_plot.svg");

#### Feature Importance Plot

In [None]:
# Pair every training column with the importance score of the fitted model.
importance = xg_reg.feature_importances_
feature_imp = pd.DataFrame(
    list(zip(train_X.columns, importance)),
    columns=['Feature', 'Importance'],
)

# Rank features from most to least important and renumber the rows.
feature_imp = feature_imp.sort_values('Importance', ascending=False)
feature_imp = feature_imp.reset_index(drop=True)

# Horizontal bar chart; the two colours alternate from bar to bar.
fig, ax = plt.subplots(figsize=(8, 8))
ax.barh(feature_imp['Feature'], feature_imp['Importance'], color=['red', 'lime'])
ax.set_xlabel("Importance", fontsize=12)
ax.set_ylabel("Input Feature", fontsize=12)
ax.set_title('Which features are the most important for predicting traffic volume?', fontsize=12)
ax.tick_params(axis='y', labelsize=6)   # small font for the feature-name tick labels
ax.tick_params(axis='x', labelsize=10)
fig.tight_layout()
fig.savefig("feature_imp.svg");

In [None]:
# Training Mapie model
# Wrap a fresh XGBoost regressor in MAPIE so predictions can later be returned
# with conformal prediction intervals.
model = XGBRegressor()
mapie = MapieRegressor(estimator = model, # Prediction model to use
                       n_jobs = -1,       # presumably "use all available cores" (joblib convention)
                       random_state = 42) # fixed seed for repeatable results

# Time the fit with perf_counter(): it is monotonic and high-resolution, so the
# measured duration cannot be distorted by system clock adjustments the way
# time.time() can.
start = time.perf_counter()
mapie.fit(train_X, train_y)
stop = time.perf_counter()
print(f"Training time: {stop - start}s")

In [None]:
#Saving model
# The context manager closes the file even if pickle.dump raises, so the
# handle is never leaked and a partially written file is not left open.
with open('mapie_pickle.pkl', 'wb') as mapie_pickle:
    pickle.dump(mapie, mapie_pickle)