# 1. Introduction

This notebook performs 3-fold cross validation of the monthly gradient boosting regression models (August through July) and reports the RMSE of each fold for every month.

# 2. Setup

Import required modules and packages.

In [1]:
# import time so that various tasks can be timed
import time

# import math for mathematical calculations
import math

# import pandas and numpy for data analysis
import pandas as pd
import numpy as np

# import from sklearn for model training
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor

# import pickle so that models can be saved to file
import pickle

# 3. August Cross Validation

## 3.1 Import the Cleaned Data

Import the cleaned data:

In [2]:
# Location of the cleaned August data set (HDF format)
data_path = '/data_analytics/data/august_bus_data_cleaned.hdf'
# Read it into the working dataframe used by the following cells
df = pd.read_hdf(data_path)

## 3.2 Encode the Categorical Features

The categorical features we will use to train the model are:

- hour
- weekday
- peak

weekday & peak are already binary features. We will encode hour.

In [3]:
# One-hot encode the hour column into hour_<n> indicator columns
df_dummies = pd.get_dummies(df['hour'], prefix='hour')
# Drop any indicator columns left over from an earlier run of this cell before
# appending, so re-running the cell does not add a second copy of each
# hour_<n> column (which the feature selection would then pick up twice)
df = pd.concat(
    [df.drop(columns=df_dummies.columns, errors='ignore'), df_dummies],
    axis=1,
)

## 3.3 Prepare the Features

In [4]:
# Descriptive features: the base columns followed by the one-hot hour indicators.
# hour_2 and hour_3 are not selected (presumably absent from the data - TODO confirm).
# NOTE(review): 'school_hol' is not selected here although the October-April
# sections include it - confirm this is intentional.
X = df[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'rain', 'temp']
    + ['hour_%d' % h for h in range(24) if h not in (2, 3)]
]
# Target feature
y = df['time_diff']

## 3.4 Perform Cross Validation

In [5]:
# Pipeline approach based on https://stackoverflow.com/questions/44446501/how-to-standardize-data-with-sklearns-cross-val-score
# The scaler is part of the pipeline, so it is re-fitted on the training
# portion of each fold instead of on the full data set.
scaler = StandardScaler()
# random_state is fixed so that repeated runs produce the same fold scores
gtb = GradientBoostingRegressor(max_depth=5, n_estimators=125, random_state=42)
pipeline = Pipeline([('transformer', scaler), ('estimator', gtb)])

# 3-fold cross validation; the scorer returns the negated MSE of each fold
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=3)
# Flip the sign back and take the square root to get one RMSE per fold
rmse = [math.sqrt(-score) for score in scores]
print("\nCross Validation RMSE:", rmse)


Cross Validation RMSE: [37.24188742999849, 37.30456476296007, 37.0961403936959]


# 4. September Cross Validation

## 4.1 Import the Cleaned Data

Import the cleaned data:

In [6]:
# Location of the cleaned September data set (HDF format)
data_path = '/data_analytics/data/september_bus_data_cleaned.hdf'
# Read it into the working dataframe used by the following cells
df = pd.read_hdf(data_path)

## 4.2 Encode the Categorical Features

The categorical features we will use to train the model are:

- hour
- weekday
- peak

weekday & peak are already binary features. We will encode hour.

In [7]:
# One-hot encode the hour column into hour_<n> indicator columns
df_dummies = pd.get_dummies(df['hour'], prefix='hour')
# Drop any indicator columns left over from an earlier run of this cell before
# appending, so re-running the cell does not add a second copy of each
# hour_<n> column (which the feature selection would then pick up twice)
df = pd.concat(
    [df.drop(columns=df_dummies.columns, errors='ignore'), df_dummies],
    axis=1,
)

## 4.3 Prepare the Features

In [8]:
# Descriptive features: the base columns followed by the one-hot hour indicators.
# hour_2 and hour_3 are not selected (presumably absent from the data - TODO confirm).
# NOTE(review): 'school_hol' is not selected here although the October-April
# sections include it - confirm this is intentional.
X = df[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'rain', 'temp']
    + ['hour_%d' % h for h in range(24) if h not in (2, 3)]
]
# Target feature
y = df['time_diff']

## 4.4 Perform Cross Validation

In [9]:
# Pipeline approach based on https://stackoverflow.com/questions/44446501/how-to-standardize-data-with-sklearns-cross-val-score
# The scaler is part of the pipeline, so it is re-fitted on the training
# portion of each fold instead of on the full data set.
scaler = StandardScaler()
# random_state is fixed so that repeated runs produce the same fold scores
gtb = GradientBoostingRegressor(max_depth=5, n_estimators=125, random_state=42)
pipeline = Pipeline([('transformer', scaler), ('estimator', gtb)])

# 3-fold cross validation; the scorer returns the negated MSE of each fold
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=3)
# Flip the sign back and take the square root to get one RMSE per fold
rmse = [math.sqrt(-score) for score in scores]
print("\nCross Validation RMSE:", rmse)


Cross Validation RMSE: [35.718245215854324, 36.328116655257524, 36.33918589277743]


# 5. October Cross Validation

## 5.1 Import the Cleaned Data

Import the cleaned data:

In [10]:
# Location of the cleaned October data set (HDF format)
data_path = '/data_analytics/data/october_bus_data_cleaned.hdf'
# Read it into the working dataframe used by the following cells
df = pd.read_hdf(data_path)

## 5.2 Encode the Categorical Features

The categorical features we will use to train the model are:

- hour
- weekday
- peak

weekday & peak are already binary features. We will encode hour.

In [11]:
# One-hot encode the hour column into hour_<n> indicator columns
df_dummies = pd.get_dummies(df['hour'], prefix='hour')
# Drop any indicator columns left over from an earlier run of this cell before
# appending, so re-running the cell does not add a second copy of each
# hour_<n> column (which the feature selection would then pick up twice)
df = pd.concat(
    [df.drop(columns=df_dummies.columns, errors='ignore'), df_dummies],
    axis=1,
)

## 5.3 Prepare the Features

In [12]:
# Descriptive features: the base columns (including school_hol) followed by
# the one-hot hour indicators.
# hour_2 and hour_3 are not selected (presumably absent from the data - TODO confirm).
X = df[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'school_hol', 'rain', 'temp']
    + ['hour_%d' % h for h in range(24) if h not in (2, 3)]
]
# Target feature
y = df['time_diff']

## 5.4 Perform Cross Validation

In [13]:
# Pipeline approach based on https://stackoverflow.com/questions/44446501/how-to-standardize-data-with-sklearns-cross-val-score
# The scaler is part of the pipeline, so it is re-fitted on the training
# portion of each fold instead of on the full data set.
scaler = StandardScaler()
# random_state is fixed so that repeated runs produce the same fold scores
gtb = GradientBoostingRegressor(max_depth=5, n_estimators=125, random_state=42)
pipeline = Pipeline([('transformer', scaler), ('estimator', gtb)])

# 3-fold cross validation; the scorer returns the negated MSE of each fold
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=3)
# Flip the sign back and take the square root to get one RMSE per fold
rmse = [math.sqrt(-score) for score in scores]
print("\nCross Validation RMSE:", rmse)


Cross Validation RMSE: [37.049191325513185, 36.66962245894936, 37.48971842633407]


# 6. November Cross Validation

## 6.1 Import the Cleaned Data

Import the cleaned data:

In [14]:
# Location of the cleaned November data set (HDF format)
data_path = '/data_analytics/data/november_bus_data_cleaned.hdf'
# Read it into the working dataframe used by the following cells
df = pd.read_hdf(data_path)

## 6.2 Encode the Categorical Features

The categorical features we will use to train the model are:

- hour
- weekday
- peak

weekday & peak are already binary features. We will encode hour.

In [15]:
# One-hot encode the hour column into hour_<n> indicator columns
df_dummies = pd.get_dummies(df['hour'], prefix='hour')
# Drop any indicator columns left over from an earlier run of this cell before
# appending, so re-running the cell does not add a second copy of each
# hour_<n> column (which the feature selection would then pick up twice)
df = pd.concat(
    [df.drop(columns=df_dummies.columns, errors='ignore'), df_dummies],
    axis=1,
)

## 6.3 Prepare the Features

In [16]:
# Descriptive features: the base columns (including school_hol) followed by
# the one-hot hour indicators.
# hour_2 and hour_3 are not selected (presumably absent from the data - TODO confirm).
X = df[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'school_hol', 'rain', 'temp']
    + ['hour_%d' % h for h in range(24) if h not in (2, 3)]
]
# Target feature
y = df['time_diff']

## 6.4 Perform Cross Validation

In [17]:
# Pipeline approach based on https://stackoverflow.com/questions/44446501/how-to-standardize-data-with-sklearns-cross-val-score
# The scaler is part of the pipeline, so it is re-fitted on the training
# portion of each fold instead of on the full data set.
scaler = StandardScaler()
# random_state is fixed so that repeated runs produce the same fold scores
gtb = GradientBoostingRegressor(max_depth=5, n_estimators=125, random_state=42)
pipeline = Pipeline([('transformer', scaler), ('estimator', gtb)])

# 3-fold cross validation; the scorer returns the negated MSE of each fold
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=3)
# Flip the sign back and take the square root to get one RMSE per fold
rmse = [math.sqrt(-score) for score in scores]
print("\nCross Validation RMSE:", rmse)


Cross Validation RMSE: [38.65456327223573, 38.27572719419434, 38.92495576433557]


# 7. December Cross Validation

## 7.1 Import the Cleaned Data

Import the cleaned data:

In [18]:
# Location of the cleaned December data set (HDF format)
data_path = '/data_analytics/data/december_bus_data_cleaned.hdf'
# Read it into the working dataframe used by the following cells
df = pd.read_hdf(data_path)

## 7.2 Encode the Categorical Features

The categorical features we will use to train the model are:

- hour
- weekday
- peak

weekday & peak are already binary features. We will encode hour.

In [19]:
# One-hot encode the hour column into hour_<n> indicator columns
df_dummies = pd.get_dummies(df['hour'], prefix='hour')
# Drop any indicator columns left over from an earlier run of this cell before
# appending, so re-running the cell does not add a second copy of each
# hour_<n> column (which the feature selection would then pick up twice)
df = pd.concat(
    [df.drop(columns=df_dummies.columns, errors='ignore'), df_dummies],
    axis=1,
)

## 7.3 Prepare the Features

In [20]:
# Descriptive features: the base columns (including school_hol) followed by
# the one-hot hour indicators.
# hour_2 and hour_3 are not selected (presumably absent from the data - TODO confirm).
X = df[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'school_hol', 'rain', 'temp']
    + ['hour_%d' % h for h in range(24) if h not in (2, 3)]
]
# Target feature
y = df['time_diff']

## 7.4 Perform Cross Validation

In [21]:
# Pipeline approach based on https://stackoverflow.com/questions/44446501/how-to-standardize-data-with-sklearns-cross-val-score
# The scaler is part of the pipeline, so it is re-fitted on the training
# portion of each fold instead of on the full data set.
scaler = StandardScaler()
# random_state is fixed so that repeated runs produce the same fold scores
gtb = GradientBoostingRegressor(max_depth=5, n_estimators=125, random_state=42)
pipeline = Pipeline([('transformer', scaler), ('estimator', gtb)])

# 3-fold cross validation; the scorer returns the negated MSE of each fold
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=3)
# Flip the sign back and take the square root to get one RMSE per fold
rmse = [math.sqrt(-score) for score in scores]
print("\nCross Validation RMSE:", rmse)


Cross Validation RMSE: [38.60321270351514, 38.158649739917216, 39.110517353038176]


# 8. January Cross Validation

## 8.1 Import the Cleaned Data

Import the cleaned data:

In [22]:
# Location of the cleaned January data set (HDF format)
data_path = '/data_analytics/data/january_bus_data_cleaned.hdf'
# Read it into the working dataframe used by the following cells
df = pd.read_hdf(data_path)

## 8.2 Encode the Categorical Features

The categorical features we will use to train the model are:

- hour
- weekday
- peak

weekday & peak are already binary features. We will encode hour.

In [23]:
# One-hot encode the hour column into hour_<n> indicator columns
df_dummies = pd.get_dummies(df['hour'], prefix='hour')
# Drop any indicator columns left over from an earlier run of this cell before
# appending, so re-running the cell does not add a second copy of each
# hour_<n> column (which the feature selection would then pick up twice)
df = pd.concat(
    [df.drop(columns=df_dummies.columns, errors='ignore'), df_dummies],
    axis=1,
)

## 8.3 Prepare the Features

In [24]:
# Descriptive features: the base columns (including school_hol) followed by
# the one-hot hour indicators.
# hour_2 and hour_3 are not selected (presumably absent from the data - TODO confirm).
X = df[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'school_hol', 'rain', 'temp']
    + ['hour_%d' % h for h in range(24) if h not in (2, 3)]
]
# Target feature
y = df['time_diff']

## 8.4 Perform Cross Validation

In [25]:
# Pipeline approach based on https://stackoverflow.com/questions/44446501/how-to-standardize-data-with-sklearns-cross-val-score
# The scaler is part of the pipeline, so it is re-fitted on the training
# portion of each fold instead of on the full data set.
scaler = StandardScaler()
# random_state is fixed so that repeated runs produce the same fold scores
gtb = GradientBoostingRegressor(max_depth=5, n_estimators=125, random_state=42)
pipeline = Pipeline([('transformer', scaler), ('estimator', gtb)])

# 3-fold cross validation; the scorer returns the negated MSE of each fold
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=3)
# Flip the sign back and take the square root to get one RMSE per fold
rmse = [math.sqrt(-score) for score in scores]
print("\nCross Validation RMSE:", rmse)


Cross Validation RMSE: [38.26150878231374, 36.26141625923627, 37.0530874155413]


# 9. February Cross Validation

## 9.1 Import the Cleaned Data

Import the cleaned data:

In [26]:
# Location of the cleaned February data set (HDF format)
data_path = '/data_analytics/data/february_bus_data_cleaned.hdf'
# Read it into the working dataframe used by the following cells
df = pd.read_hdf(data_path)

## 9.2 Encode the Categorical Features

The categorical features we will use to train the model are:

- hour
- weekday
- peak

weekday & peak are already binary features. We will encode hour.

In [27]:
# One-hot encode the hour column into hour_<n> indicator columns
df_dummies = pd.get_dummies(df['hour'], prefix='hour')
# Drop any indicator columns left over from an earlier run of this cell before
# appending, so re-running the cell does not add a second copy of each
# hour_<n> column (which the feature selection would then pick up twice)
df = pd.concat(
    [df.drop(columns=df_dummies.columns, errors='ignore'), df_dummies],
    axis=1,
)

## 9.3 Prepare the Features

In [28]:
# Descriptive features: the base columns (including school_hol) followed by
# the one-hot hour indicators.
# hour_2 and hour_3 are not selected (presumably absent from the data - TODO confirm).
X = df[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'school_hol', 'rain', 'temp']
    + ['hour_%d' % h for h in range(24) if h not in (2, 3)]
]
# Target feature
y = df['time_diff']

## 9.4 Perform Cross Validation

In [29]:
# Pipeline approach based on https://stackoverflow.com/questions/44446501/how-to-standardize-data-with-sklearns-cross-val-score
# The scaler is part of the pipeline, so it is re-fitted on the training
# portion of each fold instead of on the full data set.
scaler = StandardScaler()
# random_state is fixed so that repeated runs produce the same fold scores
gtb = GradientBoostingRegressor(max_depth=5, n_estimators=125, random_state=42)
pipeline = Pipeline([('transformer', scaler), ('estimator', gtb)])

# 3-fold cross validation; the scorer returns the negated MSE of each fold
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=3)
# Flip the sign back and take the square root to get one RMSE per fold
rmse = [math.sqrt(-score) for score in scores]
print("\nCross Validation RMSE:", rmse)


Cross Validation RMSE: [36.96773012986183, 37.01166482499384, 36.5499841435223]


# 10. March Cross Validation

## 10.1 Import the Cleaned Data

Import the cleaned data:

In [30]:
# Location of the cleaned March data set (HDF format)
data_path = '/data_analytics/data/march_bus_data_cleaned.hdf'
# Read it into the working dataframe used by the following cells
df = pd.read_hdf(data_path)

## 10.2 Encode the Categorical Features

The categorical features we will use to train the model are:

- hour
- weekday
- peak

weekday & peak are already binary features. We will encode hour.

In [31]:
# One-hot encode the hour column into hour_<n> indicator columns
df_dummies = pd.get_dummies(df['hour'], prefix='hour')
# Drop any indicator columns left over from an earlier run of this cell before
# appending, so re-running the cell does not add a second copy of each
# hour_<n> column (which the feature selection would then pick up twice)
df = pd.concat(
    [df.drop(columns=df_dummies.columns, errors='ignore'), df_dummies],
    axis=1,
)

## 10.3 Prepare the Features

In [32]:
# Descriptive features: the base columns (including school_hol) followed by
# the one-hot hour indicators.
# hour_2 and hour_3 are not selected (presumably absent from the data - TODO confirm).
X = df[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'school_hol', 'rain', 'temp']
    + ['hour_%d' % h for h in range(24) if h not in (2, 3)]
]
# Target feature
y = df['time_diff']

## 10.4 Perform Cross Validation

In [33]:
# Pipeline approach based on https://stackoverflow.com/questions/44446501/how-to-standardize-data-with-sklearns-cross-val-score
# The scaler is part of the pipeline, so it is re-fitted on the training
# portion of each fold instead of on the full data set.
scaler = StandardScaler()
# random_state is fixed so that repeated runs produce the same fold scores
gtb = GradientBoostingRegressor(max_depth=5, n_estimators=125, random_state=42)
pipeline = Pipeline([('transformer', scaler), ('estimator', gtb)])

# 3-fold cross validation; the scorer returns the negated MSE of each fold
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=3)
# Flip the sign back and take the square root to get one RMSE per fold
rmse = [math.sqrt(-score) for score in scores]
print("\nCross Validation RMSE:", rmse)


Cross Validation RMSE: [38.38497226913683, 39.165926791583516, 39.36675914835057]


# 11. April Cross Validation

## 11.1 Import the Cleaned Data

Import the cleaned data:

In [34]:
# Location of the cleaned April data set (HDF format)
data_path = '/data_analytics/data/april_bus_data_cleaned.hdf'
# Read it into the working dataframe used by the following cells
df = pd.read_hdf(data_path)

## 11.2 Encode the Categorical Features

The categorical features we will use to train the model are:

- hour
- weekday
- peak

weekday & peak are already binary features. We will encode hour.

In [35]:
# One-hot encode the hour column into hour_<n> indicator columns
df_dummies = pd.get_dummies(df['hour'], prefix='hour')
# Drop any indicator columns left over from an earlier run of this cell before
# appending, so re-running the cell does not add a second copy of each
# hour_<n> column (which the feature selection would then pick up twice)
df = pd.concat(
    [df.drop(columns=df_dummies.columns, errors='ignore'), df_dummies],
    axis=1,
)

## 11.3 Prepare the Features

In [36]:
# Descriptive features: the base columns (including school_hol) followed by
# the one-hot hour indicators.
# hour_2 and hour_3 are not selected (presumably absent from the data - TODO confirm).
X = df[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'school_hol', 'rain', 'temp']
    + ['hour_%d' % h for h in range(24) if h not in (2, 3)]
]
# Target feature
y = df['time_diff']

## 11.4 Perform Cross Validation

In [37]:
# Pipeline approach based on https://stackoverflow.com/questions/44446501/how-to-standardize-data-with-sklearns-cross-val-score
# The scaler is part of the pipeline, so it is re-fitted on the training
# portion of each fold instead of on the full data set.
scaler = StandardScaler()
# random_state is fixed so that repeated runs produce the same fold scores
gtb = GradientBoostingRegressor(max_depth=5, n_estimators=125, random_state=42)
pipeline = Pipeline([('transformer', scaler), ('estimator', gtb)])

# 3-fold cross validation; the scorer returns the negated MSE of each fold
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=3)
# Flip the sign back and take the square root to get one RMSE per fold
rmse = [math.sqrt(-score) for score in scores]
print("\nCross Validation RMSE:", rmse)


Cross Validation RMSE: [37.55171376830806, 37.37205052756081, 37.406729724125675]


# 12. May Cross Validation

## 12.1 Import the Cleaned Data

Import the cleaned data:

In [38]:
# Location of the cleaned May data set (HDF format)
data_path = '/data_analytics/data/may_bus_data_cleaned.hdf'
# Read it into the working dataframe used by the following cells
df = pd.read_hdf(data_path)

## 12.2 Encode the Categorical Features

The categorical features we will use to train the model are:

- hour
- weekday
- peak

weekday & peak are already binary features. We will encode hour.

In [39]:
# One-hot encode the hour column into hour_<n> indicator columns
df_dummies = pd.get_dummies(df['hour'], prefix='hour')
# Drop any indicator columns left over from an earlier run of this cell before
# appending, so re-running the cell does not add a second copy of each
# hour_<n> column (which the feature selection would then pick up twice)
df = pd.concat(
    [df.drop(columns=df_dummies.columns, errors='ignore'), df_dummies],
    axis=1,
)

## 12.3 Prepare the Features

In [40]:
# Descriptive features: the base columns followed by the one-hot hour indicators.
# hour_2 and hour_3 are not selected (presumably absent from the data - TODO confirm).
# NOTE(review): 'school_hol' is not selected here although the October-April
# sections include it - confirm this is intentional.
X = df[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'rain', 'temp']
    + ['hour_%d' % h for h in range(24) if h not in (2, 3)]
]
# Target feature
y = df['time_diff']

## 12.4 Perform Cross Validation

In [41]:
# Pipeline approach based on https://stackoverflow.com/questions/44446501/how-to-standardize-data-with-sklearns-cross-val-score
# The scaler is part of the pipeline, so it is re-fitted on the training
# portion of each fold instead of on the full data set.
scaler = StandardScaler()
# random_state is fixed so that repeated runs produce the same fold scores
gtb = GradientBoostingRegressor(max_depth=5, n_estimators=125, random_state=42)
pipeline = Pipeline([('transformer', scaler), ('estimator', gtb)])

# 3-fold cross validation; the scorer returns the negated MSE of each fold
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=3)
# Flip the sign back and take the square root to get one RMSE per fold
rmse = [math.sqrt(-score) for score in scores]
print("\nCross Validation RMSE:", rmse)


Cross Validation RMSE: [37.36075688157742, 36.4515214256466, 37.64666091142253]


# 13. June Cross Validation

## 13.1 Import the Cleaned Data

Import the cleaned data:

In [42]:
# Location of the cleaned June data set (HDF format)
data_path = '/data_analytics/data/june_bus_data_cleaned.hdf'
# Read it into the working dataframe used by the following cells
df = pd.read_hdf(data_path)

## 13.2 Encode the Categorical Features

The categorical features we will use to train the model are:

- hour
- weekday
- peak

weekday & peak are already binary features. We will encode hour.

In [43]:
# One-hot encode the hour column into hour_<n> indicator columns
df_dummies = pd.get_dummies(df['hour'], prefix='hour')
# Drop any indicator columns left over from an earlier run of this cell before
# appending, so re-running the cell does not add a second copy of each
# hour_<n> column (which the feature selection would then pick up twice)
df = pd.concat(
    [df.drop(columns=df_dummies.columns, errors='ignore'), df_dummies],
    axis=1,
)

## 13.3 Prepare the Features

In [44]:
# Descriptive features: the base columns followed by the one-hot hour indicators.
# hour_2 and hour_3 are not selected (presumably absent from the data - TODO confirm).
# NOTE(review): 'school_hol' is not selected here although the October-April
# sections include it - confirm this is intentional.
X = df[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'rain', 'temp']
    + ['hour_%d' % h for h in range(24) if h not in (2, 3)]
]
# Target feature
y = df['time_diff']

## 13.4 Perform Cross Validation

In [45]:
# Pipeline approach based on https://stackoverflow.com/questions/44446501/how-to-standardize-data-with-sklearns-cross-val-score
# The scaler is part of the pipeline, so it is re-fitted on the training
# portion of each fold instead of on the full data set.
scaler = StandardScaler()
# random_state is fixed so that repeated runs produce the same fold scores
gtb = GradientBoostingRegressor(max_depth=5, n_estimators=125, random_state=42)
pipeline = Pipeline([('transformer', scaler), ('estimator', gtb)])

# 3-fold cross validation; the scorer returns the negated MSE of each fold
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=3)
# Flip the sign back and take the square root to get one RMSE per fold
rmse = [math.sqrt(-score) for score in scores]
print("\nCross Validation RMSE:", rmse)


Cross Validation RMSE: [36.94910566229633, 38.33545696621176, 37.979720827151276]


# 14. July Cross Validation

## 14.1 Import the Cleaned Data

Import the cleaned data:

In [46]:
# Location of the cleaned July data set (HDF format)
data_path = '/data_analytics/data/july_bus_data_cleaned.hdf'
# Read it into the working dataframe used by the following cells
df = pd.read_hdf(data_path)

## 14.2 Encode the Categorical Features

The categorical features we will use to train the model are:

- hour
- weekday
- peak

weekday & peak are already binary features. We will encode hour.

In [47]:
# One-hot encode the hour column into hour_<n> indicator columns
df_dummies = pd.get_dummies(df['hour'], prefix='hour')
# Drop any indicator columns left over from an earlier run of this cell before
# appending, so re-running the cell does not add a second copy of each
# hour_<n> column (which the feature selection would then pick up twice)
df = pd.concat(
    [df.drop(columns=df_dummies.columns, errors='ignore'), df_dummies],
    axis=1,
)

## 14.3 Prepare the Features

In [48]:
# Descriptive features: the base columns followed by the one-hot hour indicators.
# hour_2 and hour_3 are not selected (presumably absent from the data - TODO confirm).
# NOTE(review): 'school_hol' is not selected here although the October-April
# sections include it - confirm this is intentional.
X = df[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'rain', 'temp']
    + ['hour_%d' % h for h in range(24) if h not in (2, 3)]
]
# Target feature
y = df['time_diff']

## 14.4 Perform Cross Validation

In [49]:
# Pipeline approach based on https://stackoverflow.com/questions/44446501/how-to-standardize-data-with-sklearns-cross-val-score
# The scaler is part of the pipeline, so it is re-fitted on the training
# portion of each fold instead of on the full data set.
scaler = StandardScaler()
# random_state is fixed so that repeated runs produce the same fold scores
gtb = GradientBoostingRegressor(max_depth=5, n_estimators=125, random_state=42)
pipeline = Pipeline([('transformer', scaler), ('estimator', gtb)])

# 3-fold cross validation; the scorer returns the negated MSE of each fold
scores = cross_val_score(pipeline, X, y, scoring='neg_mean_squared_error', cv=3)
# Flip the sign back and take the square root to get one RMSE per fold
rmse = [math.sqrt(-score) for score in scores]
print("\nCross Validation RMSE:", rmse)


Cross Validation RMSE: [37.74627178730055, 38.42287886705406, 37.965746207415194]
