In [46]:
import pandas as pd                  # Pandas
import numpy as np                   # Numpy
from matplotlib import pyplot as plt # Matplotlib

# Packages to implement required models
# import sklearn
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor

# Package to implement Grid Search Cross Validation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold 

# Package for generating confusion matrix
from sklearn.metrics import r2_score, mean_squared_error # take sqrt of the mean squared error for rmse ex: >> rmse = sqrt(mean_squared_error(y_actual, y_pred))

# Package to record time
import time

# Package for Data pretty printer
from pprint import pprint

%matplotlib inline

# Display inline plots as vector-based (svg)
%config InlineBackend.figure_formats = ['svg']

In [47]:
# load dataset into dataframe

traffic_df = pd.read_csv('Traffic_Volume.csv')
traffic_df.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918


In [48]:
# drop null-values

# traffic_df.dropna(inplace = True)
traffic_df.replace({np.nan:None}, inplace = True)
traffic_df.info()

# note categorical variables: holiday, weather_main, and weather_description... drop weather_description as superfluous
traffic_df.drop('weather_description', axis = 1, inplace = True)
traffic_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48204 entries, 0 to 48203
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   holiday              61 non-null     object 
 1   temp                 48204 non-null  float64
 2   rain_1h              48204 non-null  float64
 3   snow_1h              48204 non-null  float64
 4   clouds_all           48204 non-null  int64  
 5   weather_main         48204 non-null  object 
 6   weather_description  48204 non-null  object 
 7   date_time            48204 non-null  object 
 8   traffic_volume       48204 non-null  int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 3.3+ MB


Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,2012-10-02 13:00:00,4918


In [49]:
# date_time to be decomposed into separate columns for month, day, and time... drop year
# traffic_df['date'] = 0
# traffic_df['time'] = 0
# traffic_df['date'] = pd.to_datetime(traffic_df['date_time']).dt.date
# traffic_df['time'] = pd.to_datetime(traffic_df['date_time']).dt.time
traffic_df['date_time']= pd.to_datetime(traffic_df['date_time'],format ='mixed')
# traffic_df['year']= traffic_df['date_time'].dt.year
traffic_df['month']= traffic_df['date_time'].dt.month
traffic_df['day']= traffic_df['date_time'].dt.day
traffic_df['time'] = pd.to_datetime(traffic_df['date_time']).dt.hour
traffic_df.drop('date_time', axis = 1, inplace = True)

# traffic_df = traffic_df.assign(
#     date=traffic_df['date_time'].dt.date, 
#     time=traffic_df['date_time'].dt.time
# )
traffic_df

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,traffic_volume,month,day,time
0,,288.28,0.0,0.0,40,Clouds,5545,10,2,9
1,,289.36,0.0,0.0,75,Clouds,4516,10,2,10
2,,289.58,0.0,0.0,90,Clouds,4767,10,2,11
3,,290.13,0.0,0.0,90,Clouds,5026,10,2,12
4,,291.14,0.0,0.0,75,Clouds,4918,10,2,13
...,...,...,...,...,...,...,...,...,...,...
48199,,283.45,0.0,0.0,75,Clouds,3543,9,30,19
48200,,282.76,0.0,0.0,90,Clouds,2781,9,30,20
48201,,282.73,0.0,0.0,90,Thunderstorm,2159,9,30,21
48202,,282.09,0.0,0.0,90,Clouds,1450,9,30,22


In [50]:
# check df.value_counts() for holiday and weather_main
print('holidays:')
print(traffic_df['holiday'].value_counts())
print('   ')
print('weather types:')
print(traffic_df['weather_main'].value_counts())

holidays:
holiday
Labor Day                    7
Thanksgiving Day             6
Christmas Day                6
New Years Day                6
Martin Luther King Jr Day    6
Columbus Day                 5
Veterans Day                 5
Washingtons Birthday         5
Memorial Day                 5
Independence Day             5
State Fair                   5
Name: count, dtype: int64
   
weather types:
weather_main
Clouds          15164
Clear           13391
Mist             5950
Rain             5672
Snow             2876
Drizzle          1821
Haze             1360
Thunderstorm     1034
Fog               912
Smoke              20
Squall              4
Name: count, dtype: int64


In [51]:
traffic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48204 entries, 0 to 48203
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   holiday         61 non-null     object 
 1   temp            48204 non-null  float64
 2   rain_1h         48204 non-null  float64
 3   snow_1h         48204 non-null  float64
 4   clouds_all      48204 non-null  int64  
 5   weather_main    48204 non-null  object 
 6   traffic_volume  48204 non-null  int64  
 7   month           48204 non-null  int32  
 8   day             48204 non-null  int32  
 9   time            48204 non-null  int32  
dtypes: float64(3), int32(3), int64(2), object(2)
memory usage: 3.1+ MB


In [52]:
# extract target and input features

output = traffic_df['traffic_volume']

features = traffic_df.drop('traffic_volume', axis = 1)

# one-hot-encoding for cat vars
features = pd.get_dummies(features)
features

Unnamed: 0,temp,rain_1h,snow_1h,clouds_all,month,day,time,holiday_Christmas Day,holiday_Columbus Day,holiday_Independence Day,...,weather_main_Clouds,weather_main_Drizzle,weather_main_Fog,weather_main_Haze,weather_main_Mist,weather_main_Rain,weather_main_Smoke,weather_main_Snow,weather_main_Squall,weather_main_Thunderstorm
0,288.28,0.0,0.0,40,10,2,9,False,False,False,...,True,False,False,False,False,False,False,False,False,False
1,289.36,0.0,0.0,75,10,2,10,False,False,False,...,True,False,False,False,False,False,False,False,False,False
2,289.58,0.0,0.0,90,10,2,11,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,290.13,0.0,0.0,90,10,2,12,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,291.14,0.0,0.0,75,10,2,13,False,False,False,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48199,283.45,0.0,0.0,75,9,30,19,False,False,False,...,True,False,False,False,False,False,False,False,False,False
48200,282.76,0.0,0.0,90,9,30,20,False,False,False,...,True,False,False,False,False,False,False,False,False,False
48201,282.73,0.0,0.0,90,9,30,21,False,False,False,...,False,False,False,False,False,False,False,False,False,True
48202,282.09,0.0,0.0,90,9,30,22,False,False,False,...,True,False,False,False,False,False,False,False,False,False


In [53]:
# partition into test and train sets
train_X, test_X, train_y, test_y = train_test_split(features, output, test_size = 0.2, random_state = 1)

### Here we train a DT model

In [54]:
# Define your model
regressor = DecisionTreeRegressor(random_state = 42)

In [55]:
# Start with an initial guess for parameters
hyper_params = {
    'max_depth': [10, 20, 30, 40],
    'min_samples_split': [20, 40, 60],
    'min_samples_leaf': [10, 20, 30, 40, 100]
}

In [56]:
# Creating folds
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 100)

In [57]:
# Start with an initial guess for parameters

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(2, 20, num = 10)]

# Minimum number of samples required to split a node
min_samples_split = [5, 10, 20]

# Minimum number of samples required at each leaf node
min_samples_leaf = [5, 10, 20]

# Create the random grid
random_grid = {'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

pprint(random_grid)

{'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
 'min_samples_leaf': [5, 10, 20],
 'min_samples_split': [5, 10, 20]}


In [43]:
# Creating folds
folds = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 100)

In [58]:
# Call RandomizedSearchCV()
model_cv = RandomizedSearchCV(estimator = regressor, 
                              param_distributions = random_grid,
                              n_iter = 100,
                              scoring = 'r2', 
                              cv = folds, 
                              verbose = 2,
                              random_state = 42,
                              n_jobs = -1) # Will utilize all available CPUs

In [59]:
# Fit the model
start = time.time()            # Start Time
model_cv.fit(train_X, train_y)  
stop = time.time()             # End Time
print(f"Training time: {stop - start}s")



Fitting 5 folds for each of 90 candidates, totalling 450 fits
Training time: 23.200263023376465s


In [60]:
print('Initial score: ', model_cv.best_score_)
print('Initial parameters: ', model_cv.best_params_)

Initial score:  0.7825706582168775
Initial parameters:  {'min_samples_split': 5, 'min_samples_leaf': 5, 'max_depth': 10}


In [73]:
# Alter parameters

# Maximum number of levels in tree
max_depth = [8, 9, 10, 11, 12]

# Minimum number of samples required to split a node
min_samples_split = [2, 3, 4]

# Minimum number of samples required at each leaf node
min_samples_leaf = [4, 5, 6]

# Create the random grid
random_grid = {'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

pprint(random_grid)

{'max_depth': [8, 9, 10, 11, 12],
 'min_samples_leaf': [4, 5, 6],
 'min_samples_split': [2, 3, 4]}


In [74]:
# Call GridSearchCV()
model_cv = GridSearchCV(estimator = regressor, 
                              param_grid = random_grid,
                              scoring = 'r2', 
                              cv = folds, 
                              verbose = 2,
                              n_jobs = -1) # Will utilize all available CPUs

In [75]:
# Fit the model
start = time.time()            # Start Time
model_cv.fit(train_X, train_y)  
stop = time.time()             # End Time
print(f"Training time: {stop - start}s")

Fitting 5 folds for each of 45 candidates, totalling 225 fits




Training time: 10.893773555755615s


In [76]:
print('Improved score: ', model_cv.best_score_)
print('Improved parameters: ', model_cv.best_params_)

Improved score:  0.782845826684766
Improved parameters:  {'max_depth': 11, 'min_samples_leaf': 5, 'min_samples_split': 2}


In [77]:
# Predictions on test set
y_pred = model_cv.predict(test_X)

In [89]:
DT_r2score = r2_score(test_y,y_pred)
print('DT r2 score:', DT_r2score)
DT_rmsescore = np.sqrt(mean_squared_error(test_y, y_pred))
print('DT rmse score:', DT_rmsescore)

DT r2 score: 0.7788060503849884
DT rmse score: 931.7365798544645


In [80]:
# pickling the model
import pickle
# saving the trained DT model
# Creating the file where we want to write the model
dt_pickle = open('decisiontree_trafficpred.pickle', 'wb') 

# Write DT model to the file
pickle.dump(model_cv, dt_pickle)

# Close the file
dt_pickle.close()

## Here we train random forest

In [81]:
# instantiate the model
regressor = RandomForestRegressor(random_state = 0)

In [82]:
# Start with an initial guess for parameters
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 500, num = 10)]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(2, 20, num = 10)]

# Minimum number of samples required to split a node
min_samples_split = [5, 10, 20]

# Minimum number of samples required at each leaf node
min_samples_leaf = [5, 10, 20]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

pprint(random_grid)

{'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
 'min_samples_leaf': [5, 10, 20],
 'min_samples_split': [5, 10, 20],
 'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]}


In [83]:
# Call RandomizedSearchCV()
model_cv = RandomizedSearchCV(estimator = regressor, 
                              param_distributions = random_grid,
                              n_iter = 100,
                              scoring = 'r2', 
                              cv = folds, 
                              verbose = 2,
                              random_state = 42,
                              n_jobs = -1) # Will utilize all available CPUs 

In [86]:
# Fit the model
start = time.time()            # Start Time
model_cv.fit(train_X, train_y)  
stop = time.time()             # End Time
print(f"Training time: {stop - start}s")

Fitting 5 folds for each of 100 candidates, totalling 500 fits




Training time: 5334.393263578415s


In [87]:
# saving the trained RF model
# Creating the file where we want to write the model
rf_pickle = open('randomforest_trafficpred.pickle', 'wb') 

# Write DT model to the file
pickle.dump(model_cv, rf_pickle)

# Close the file
rf_pickle.close()

In [90]:
# predictions on test set
y_pred = model_cv.predict(test_X)

In [91]:
RF_r2score = r2_score(test_y,y_pred)
print('RF r2 score:', RF_r2score)
RF_rmsescore = np.sqrt(mean_squared_error(test_y, y_pred))
print('RF rmse score:', RF_rmsescore)

RF r2 score: 0.8154913804108594
RF rmse score: 850.9711845010081


## Here we train AdaBoost

In [98]:
# initialize regressor
regressor = AdaBoostRegressor()
search_grid={'n_estimators':[500],
             'learning_rate':[.001],
             'random_state':[1]}

model_cv = GridSearchCV(estimator=regressor,
                    param_grid=search_grid,
                    scoring='r2',
                    n_jobs=-1,
                    cv=folds)

In [99]:
model_cv.fit(train_X, train_y)
model_cv.best_params_



{'learning_rate': 0.001, 'n_estimators': 500, 'random_state': 1}

In [100]:
y_pred = model_cv.predict(test_X)
ada_r2score = r2_score(test_y, y_pred)
ada_rmsescore = np.sqrt(mean_squared_error(test_y, y_pred))
print('Ada r2: ', ada_r2score)
print('Ada rmse: ', ada_rmsescore)

Ada r2:  0.7490841020394913
Ada rmse:  992.3631156242598


In [101]:
# saving the trained AdaBoost model
# Creating the file where we want to write the model
ada_pickle = open('adaboost_trafficpred.pickle', 'wb') 

# Write DT model to the file
pickle.dump(model_cv, ada_pickle)

# Close the file
ada_pickle.close()