# 1. Introduction

The purpose of this notebook is to train a machine learning model on the cleaned August Data, and to improve the accuracy of the model as much as possible via feature selection.

# 2. Setup & Data Load

Import required modules and packages.

In [1]:
# import time so that run time of various tasks can be tracked
import time

# import math for mathematical functions
import math

# import pandas and numpy for data analysis
import pandas as pd
import numpy as np

# import from sklearn for machine learning
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor

# import pickle so that models can be saved to file
import pickle

Set the max number of columns & rows to display.

In [2]:
# raise pandas' display limits: show up to 250 columns and 5700 rows
pd.options.display.max_columns = 250
pd.options.display.max_rows = 5700

## 2.1 Data Import

Import the cleaned data:

In [3]:
# NOTE(review): absolute, machine-specific path - consider making it configurable
df = pd.read_hdf(path_or_buf='/data_analytics/data/august_bus_data_cleaned.hdf')

## 2.2 Merge bank_holiday & weekday Features

We will set the value of weekday to 0, where bank_holiday is 1:

In [5]:
# rows flagged as bank holidays are treated as non-weekdays
df.loc[df['bank_holiday'].eq(1), 'weekday'] = 0

# 3. Initial Feature Selection

A number of features have been kept in the data only for analysis, sorting, etc., and will not be used as inputs when training models. These are as follows:

- tripid
- lineid
- month
- day
- segment

As such, the features we will consider when training models are as follows:

- actualtime_arr_stop_first
- hour
- rain
- temp
- rhum
- msl
- day_of_week
- weekday
- bank_holiday
- rain_binary
- peak
- segment_mean
- segment_median
- segment_std

Our target feature is:

- time_diff

# 4. Encoding Categorical Features

Our categorical features are:

- hour
- day_of_week
- weekday
- bank_holiday
- rain_binary
- peak

weekday, bank_holiday, rain_binary & peak are already binary features. We will encode hour and day_of_week.

In [7]:
# One-hot encode day_of_week into day_of_week_<value> indicator columns.
df_dummies = pd.get_dummies(df['day_of_week'], prefix='day_of_week')
# Drop any indicator columns left over from a previous run of this cell before
# concatenating, so that re-running the cell does not append duplicate columns
# (duplicate column names would later be selected twice into X_train / X_test).
df = pd.concat([df.drop(df_dummies.columns, axis=1, errors='ignore'), df_dummies], axis=1)

In [8]:
# One-hot encode hour into hour_<value> indicator columns.
df_dummies = pd.get_dummies(df['hour'], prefix='hour')
# Drop any indicator columns left over from a previous run of this cell before
# concatenating, so that re-running the cell does not append duplicate columns
# (duplicate column names would later be selected twice into X_train / X_test).
df = pd.concat([df.drop(df_dummies.columns, axis=1, errors='ignore'), df_dummies], axis=1)

# 5. Split Test & Training Data

We will use out-of-time sampling to split our test and training data, i.e. the model is trained on the earlier portion of the data and tested on the later portion.

First we ensure that the data is sorted by date and time:

In [9]:
# order the rows by day, then by actualtime_arr_stop_first within each day
df = df.sort_values(['day', 'actualtime_arr_stop_first'], ascending=True)

Data is then split between training and test data:

In [10]:
# unshuffled split of the sorted frame: the leading rows form the training
# set and the trailing 30% of rows form the test set
df_train, df_test = train_test_split(
    df,
    shuffle=False,
    test_size=0.3,
)

In [11]:
# df has been split into df_train and df_test above, so the original frame is
# deleted to free up memory. After this cell runs, re-running any earlier cell
# that references df will raise a NameError until the data is loaded again.
del df

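# 6. Prepare Features

Each subsection below defines an alternative feature set and overwrites `X_train`, `X_test`, `y_train`, `y_test`, `scaler`, `X_train_scaled` and `X_test_scaled`. Run only the subsection whose feature set is to be evaluated before moving on to model training.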

## 6.1 Initial Model - All Features

For the initial model we will use the following features:

- actualtime_arr_stop_first
- hour
- rain
- temp
- rhum
- msl
- day_of_week
- weekday
- bank_holiday
- rain_binary
- peak
- segment_mean
- segment_median
- segment_std

In [12]:
# Training split: every candidate descriptive feature, plus the target
X_train = df_train[
    ['actualtime_arr_stop_first', 'rain', 'temp', 'rhum', 'msl',
     'weekday', 'bank_holiday', 'rain_binary', 'peak',
     'segment_mean', 'segment_median', 'segment_std']
    + ['day_of_week_%d' % d for d in range(7)]
    # hour_2 and hour_3 are not part of the selected dummy columns
    + ['hour_%d' % h for h in [0, 1] + list(range(4, 24))]
]
y_train = df_train['time_diff']

In [13]:
# Test split: every candidate descriptive feature, plus the target
X_test = df_test[
    ['actualtime_arr_stop_first', 'rain', 'temp', 'rhum', 'msl',
     'weekday', 'bank_holiday', 'rain_binary', 'peak',
     'segment_mean', 'segment_median', 'segment_std']
    + ['day_of_week_%d' % d for d in range(7)]
    # hour_2 and hour_3 are not part of the selected dummy columns
    + ['hour_%d' % h for h in [0, 1] + list(range(4, 24))]
]
y_test = df_test['time_diff']

In [14]:
# standardise the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6.2 Basic Model - actualtime_arr_stop_first, day_of_week, segment_mean

In [None]:
# Training split: arrival time, segment mean and the day_of_week dummies
X_train = df_train[
    ['actualtime_arr_stop_first', 'segment_mean']
    + ['day_of_week_%d' % d for d in range(7)]
]
y_train = df_train['time_diff']

In [None]:
# Test split: arrival time, segment mean and the day_of_week dummies
X_test = df_test[
    ['actualtime_arr_stop_first', 'segment_mean']
    + ['day_of_week_%d' % d for d in range(7)]
]
y_test = df_test['time_diff']

In [None]:
# standardise the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6.3 Basic Model - actualtime_arr_stop_first, weekday, segment_mean

In [None]:
# Training split: arrival time, segment mean and the weekday flag
X_train = df_train[[
    'actualtime_arr_stop_first',
    'segment_mean',
    'weekday',
]]
y_train = df_train['time_diff']

In [None]:
# Test split: arrival time, segment mean and the weekday flag
X_test = df_test[[
    'actualtime_arr_stop_first',
    'segment_mean',
    'weekday',
]]
y_test = df_test['time_diff']

In [None]:
# standardise the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6.4 day_of_week & weekday

In [None]:
# Training split: note this uses the raw day_of_week column (not its one-hot
# dummies) alongside the weekday flag
X_train = df_train[[
    'actualtime_arr_stop_first',
    'segment_mean',
    'day_of_week',
    'weekday',
]]
y_train = df_train['time_diff']

In [None]:
# Test split: note this uses the raw day_of_week column (not its one-hot
# dummies) alongside the weekday flag
X_test = df_test[[
    'actualtime_arr_stop_first',
    'segment_mean',
    'day_of_week',
    'weekday',
]]
y_test = df_test['time_diff']

In [None]:
# standardise the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6.5 weekday & bank_holiday

In [None]:
# Training split: arrival time, segment mean, bank_holiday and weekday flags
X_train = df_train[[
    'actualtime_arr_stop_first',
    'segment_mean',
    'bank_holiday',
    'weekday',
]]
y_train = df_train['time_diff']

In [None]:
# Test split: arrival time, segment mean, bank_holiday and weekday flags
X_test = df_test[[
    'actualtime_arr_stop_first',
    'segment_mean',
    'bank_holiday',
    'weekday',
]]
y_test = df_test['time_diff']

In [None]:
# standardise the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6.6 segment_median instead of segment_mean

In [None]:
# Training split: segment_median takes the place of segment_mean
X_train = df_train[[
    'actualtime_arr_stop_first',
    'segment_median',
    'weekday',
]]
y_train = df_train['time_diff']

In [None]:
# Test split: segment_median takes the place of segment_mean
X_test = df_test[[
    'actualtime_arr_stop_first',
    'segment_median',
    'weekday',
]]
y_test = df_test['time_diff']

In [None]:
# standardise the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6.7 segment_mean & segment_std

In [None]:
# Training split: arrival time, segment mean, weekday flag and segment std
X_train = df_train[[
    'actualtime_arr_stop_first',
    'segment_mean',
    'weekday',
    'segment_std',
]]
y_train = df_train['time_diff']

In [None]:
# Test split: arrival time, segment mean, weekday flag and segment std
X_test = df_test[[
    'actualtime_arr_stop_first',
    'segment_mean',
    'weekday',
    'segment_std',
]]
y_test = df_test['time_diff']

In [None]:
# standardise the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6.8 Add hour

In [None]:
# Training split: the 6.7 feature set extended with the hour dummies
X_train = df_train[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std']
    # hour_2 and hour_3 are not part of the selected dummy columns
    + ['hour_%d' % h for h in [0, 1] + list(range(4, 24))]
]
y_train = df_train['time_diff']

In [None]:
# Test split: the 6.7 feature set extended with the hour dummies
X_test = df_test[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std']
    # hour_2 and hour_3 are not part of the selected dummy columns
    + ['hour_%d' % h for h in [0, 1] + list(range(4, 24))]
]
y_test = df_test['time_diff']

In [None]:
# standardise the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6.9 Without actualtime_arr_stop_first

In [None]:
# Training split: segment statistics, weekday flag and hour dummies, with
# actualtime_arr_stop_first left out
X_train = df_train[
    ['segment_mean', 'weekday', 'segment_std']
    # hour_2 and hour_3 are not part of the selected dummy columns
    + ['hour_%d' % h for h in [0, 1] + list(range(4, 24))]
]
y_train = df_train['time_diff']

In [None]:
# Test split: segment statistics, weekday flag and hour dummies, with
# actualtime_arr_stop_first left out
X_test = df_test[
    ['segment_mean', 'weekday', 'segment_std']
    # hour_2 and hour_3 are not part of the selected dummy columns
    + ['hour_%d' % h for h in [0, 1] + list(range(4, 24))]
]
y_test = df_test['time_diff']

In [None]:
# standardise the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6.10 Add peak

In [None]:
# Training split: the 6.8 feature set with the peak flag added
X_train = df_train[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak']
    # hour_2 and hour_3 are not part of the selected dummy columns
    + ['hour_%d' % h for h in [0, 1] + list(range(4, 24))]
]
y_train = df_train['time_diff']

In [None]:
# Test split: the 6.8 feature set with the peak flag added
X_test = df_test[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak']
    # hour_2 and hour_3 are not part of the selected dummy columns
    + ['hour_%d' % h for h in [0, 1] + list(range(4, 24))]
]
y_test = df_test['time_diff']

In [None]:
# standardise the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6.11 Add continuous weather data

In [None]:
# Training split: the 6.10 feature set plus rain, temp, rhum and msl
X_train = df_train[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'rain', 'temp', 'rhum', 'msl']
    # hour_2 and hour_3 are not part of the selected dummy columns
    + ['hour_%d' % h for h in [0, 1] + list(range(4, 24))]
]
y_train = df_train['time_diff']

In [None]:
# Test split: the 6.10 feature set plus rain, temp, rhum and msl
X_test = df_test[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'rain', 'temp', 'rhum', 'msl']
    # hour_2 and hour_3 are not part of the selected dummy columns
    + ['hour_%d' % h for h in [0, 1] + list(range(4, 24))]
]
y_test = df_test['time_diff']

In [None]:
# standardise the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6.12 rain & temp

In [15]:
# Training split: the 6.10 feature set plus rain and temp only
X_train = df_train[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'rain', 'temp']
    # hour_2 and hour_3 are not part of the selected dummy columns
    + ['hour_%d' % h for h in [0, 1] + list(range(4, 24))]
]
y_train = df_train['time_diff']

In [16]:
# Test split: the 6.10 feature set plus rain and temp only
X_test = df_test[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'rain', 'temp']
    # hour_2 and hour_3 are not part of the selected dummy columns
    + ['hour_%d' % h for h in [0, 1] + list(range(4, 24))]
]
y_test = df_test['time_diff']

In [17]:
# standardise the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6.13 just temp

In [None]:
# Training split: the 6.10 feature set plus temp as the only weather feature
X_train = df_train[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'temp']
    # hour_2 and hour_3 are not part of the selected dummy columns
    + ['hour_%d' % h for h in [0, 1] + list(range(4, 24))]
]
y_train = df_train['time_diff']

In [None]:
# Test split: the 6.10 feature set plus temp as the only weather feature
X_test = df_test[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'temp']
    # hour_2 and hour_3 are not part of the selected dummy columns
    + ['hour_%d' % h for h in [0, 1] + list(range(4, 24))]
]
y_test = df_test['time_diff']

In [None]:
# standardise the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 6.14 temp & rain_binary

In [None]:
# Training split: the 6.10 feature set plus temp and the rain_binary flag
X_train = df_train[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'temp', 'rain_binary']
    # hour_2 and hour_3 are not part of the selected dummy columns
    + ['hour_%d' % h for h in [0, 1] + list(range(4, 24))]
]
y_train = df_train['time_diff']

In [None]:
# Test split: the 6.10 feature set plus temp and the rain_binary flag
X_test = df_test[
    ['actualtime_arr_stop_first', 'segment_mean', 'weekday', 'segment_std',
     'peak', 'temp', 'rain_binary']
    # hour_2 and hour_3 are not part of the selected dummy columns
    + ['hour_%d' % h for h in [0, 1] + list(range(4, 24))]
]
y_test = df_test['time_diff']

In [None]:
# standardise the features: the scaler is fitted on the training split only
# and the same fitted transformation is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 7. Linear Regression

## 7.1 Train the Model

Train a model using linear regression from scikit-learn:

In [18]:
# fit a linear regression on the unscaled training features and print the
# elapsed wall-clock time in seconds
start = time.time()
linreg = LinearRegression()
linreg.fit(X_train, y_train)
end = time.time()
print(end - start)

19.64493465423584


## 7.2 Test on the Test Data

In [19]:
# predict the target for the test features and print the elapsed
# wall-clock time in seconds
start = time.time()
linreg_predicted = linreg.predict(X_test)
end = time.time()
print(end - start)

4.059881925582886


In [20]:
# report MAE, RMSE and R^2 of the linear regression predictions on the test
# set, separated by blank lines
print("Mean Absolute Error: ",
      metrics.mean_absolute_error(y_true=y_test, y_pred=linreg_predicted),
      end="\n\n")
print("Root Mean Squared Error: ",
      math.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=linreg_predicted)),
      end="\n\n")
print("R Squared:", metrics.r2_score(y_true=y_test, y_pred=linreg_predicted))

Mean Absolute Error:  19.202115059669264

Root Mean Squared Error:  38.49484577408298

R Squared: 0.6184812747331497


# 8. Gradient Tree Boosting

## 8.1 Train the Model

In [21]:
# specify the GTB parameters: library defaults, with a fixed random_state so
# that repeated runs (e.g. Restart Kernel -> Run All) build the same model
# and report the same metrics
gtb = GradientBoostingRegressor(random_state=42)

In [22]:
# fit the gradient boosting model on the scaled training features and print
# the elapsed wall-clock time in seconds
start = time.time()
gtb_model = gtb.fit(X=X_train_scaled, y=y_train)
end = time.time()
print(end - start)

741.9387912750244


## 8.2 Test the Model

In [23]:
# predict the target for the scaled test features and print the elapsed
# wall-clock time in seconds
start = time.time()
gtb_predicted = gtb_model.predict(X_test_scaled)
end = time.time()
print(end - start)

4.743624925613403


In [24]:
# report MAE, RMSE and R^2 of the gradient boosting predictions on the test
# set, separated by blank lines
print("Mean Absolute Error: ",
      metrics.mean_absolute_error(y_true=y_test, y_pred=gtb_predicted),
      end="\n\n")
print("Root Mean Squared Error: ",
      math.sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=gtb_predicted)),
      end="\n\n")
print("R Squared:", metrics.r2_score(y_true=y_test, y_pred=gtb_predicted))

Mean Absolute Error:  18.823026614201847

Root Mean Squared Error:  37.67861044568791

R Squared: 0.634489004333413


# 9. Improve the GTB Model