### The purpose of this notebook is to check a number of different types of models on a subset of the Dublin Bus data for August.

### The main purpose here is not to test the accuracy of the models, but rather to check how much data we can process with different types of models before we encounter memory issues. We will also compare the models in terms of training time and the size of the pickled models.

<br>

# 1. Setup & Data Load

Import required modules and packages.

In [None]:
# import time so that run time of various tasks can be tracked
import time

# import math for mathematical functions
import math

# import pandas and numpy for data analysis
import pandas as pd
import numpy as np

# import from sklearn for machine learning
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR

# import pickle so that models can be saved to file
import pickle

Set the max number of columns & rows to display.

In [None]:
# Raise the pandas display limits so wide and long frames are not truncated.
pd.options.display.max_columns = 250
pd.options.display.max_rows = 5700

Import the prepared data:

In [None]:
# Read the prepared August data set from its HDF file into a DataFrame.
df = pd.read_hdf(path_or_buf='/data_analytics/data/all_routes_aug_prepared.hdf')

# 2. Split Test & Training Data

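We will use out-of-time sampling to split our test and training data: after sorting chronologically, the earlier rows are used for training and the later rows for testing.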

First we ensure that the data is sorted by date and time:

In [None]:
# Order rows by `dayofservice`, breaking ties on `actualtime_arr_stop_first`,
# so that the unshuffled splits below are chronological.
df = df.sort_values(['dayofservice', 'actualtime_arr_stop_first'])

The data is first cut down to the earliest 7.4% of rows (roughly 715,000), so that about 500,000 rows remain for training after the 70/30 train/test split:

In [None]:
# Keep the first 7.4% of the ordered rows in df_temp1 (the rest goes to df_temp2);
# shuffle=False preserves the existing row order.
df_temp1, df_temp2 = train_test_split(
    df,
    shuffle=False,
    test_size=0.926,
)

In [None]:
# Display the (rows, columns) of the reduced subset to confirm its size.
df_temp1.shape

Data is then split between training and test data:

In [None]:
# 70/30 split of the reduced subset; without shuffling, the first 70% of rows
# become the training set and the remaining 30% the test set.
df_train, df_test = train_test_split(
    df_temp1,
    shuffle=False,
    test_size=0.3,
)

In [None]:
# Display the (rows, columns) of the training set.
df_train.shape

# 3. Prepare Features

In [None]:
# Descriptive features (model inputs) and target feature for the training data.
# NOTE(review): no hour_2.0 / hour_3.0 columns are selected -- presumably they
# do not exist in the prepared data; confirm.
X_train = df_train[[
    'actualtime_arr_stop_first', 'segment_means',
    'rain', 'temp', 'rhum', 'msl',
    'weekday', 'bank_holiday',
    'day_of_week_0', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3',
    'day_of_week_4', 'day_of_week_5', 'day_of_week_6',
    'hour_0.0', 'hour_1.0', 'hour_4.0', 'hour_5.0', 'hour_6.0', 'hour_7.0',
    'hour_8.0', 'hour_9.0', 'hour_10.0', 'hour_11.0', 'hour_12.0', 'hour_13.0',
    'hour_14.0', 'hour_15.0', 'hour_16.0', 'hour_17.0', 'hour_18.0', 'hour_19.0',
    'hour_20.0', 'hour_21.0', 'hour_22.0', 'hour_23.0',
]]
y_train = df_train['time_diff']

In [None]:
# Descriptive features (model inputs) and target feature for the test data.
# The column list must match the one used to build the training features.
X_test = df_test[[
    'actualtime_arr_stop_first', 'segment_means',
    'rain', 'temp', 'rhum', 'msl',
    'weekday', 'bank_holiday',
    'day_of_week_0', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3',
    'day_of_week_4', 'day_of_week_5', 'day_of_week_6',
    'hour_0.0', 'hour_1.0', 'hour_4.0', 'hour_5.0', 'hour_6.0', 'hour_7.0',
    'hour_8.0', 'hour_9.0', 'hour_10.0', 'hour_11.0', 'hour_12.0', 'hour_13.0',
    'hour_14.0', 'hour_15.0', 'hour_16.0', 'hour_17.0', 'hour_18.0', 'hour_19.0',
    'hour_20.0', 'hour_21.0', 'hour_22.0', 'hour_23.0',
]]
y_test = df_test['time_diff']

In [None]:
# Unbind the intermediate frames that are no longer used so that their memory
# can be reclaimed before the models are trained.
del df_temp1, df_temp2, df_train, df_test, df

# 4. Linear Regression

## 4.1 Train the Model

Train a model using linear regression from scikit-learn:

In [None]:
# Time the fit with a monotonic, high-resolution clock: time.time() follows the
# system wall clock and can jump if that clock is adjusted during the run.
start = time.perf_counter()
linreg = linear_model.LinearRegression().fit(X_train, y_train)
end = time.perf_counter()
# elapsed training time in seconds
print(end - start)

## 4.2 Test on the Test Data

In [None]:
# Make predictions for the test data with the fitted linear regression model.
# perf_counter() is monotonic and high-resolution, which matters for the short
# prediction interval; time.time() follows the adjustable wall clock.
start = time.perf_counter()
linreg_predicted = linreg.predict(X_test)
end = time.perf_counter()
# elapsed prediction time in seconds
print(end - start)

In [None]:
# Report MAE, RMSE and R^2 of the linear regression predictions on the test set,
# with a blank line after each of the first two metrics.
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_test, linreg_predicted), end="\n\n")
print("Root Mean Squared Error: ", math.sqrt(metrics.mean_squared_error(y_test, linreg_predicted)), end="\n\n")
print("R Squared:", metrics.r2_score(y_test, linreg_predicted))

## 4.3 Pickle the Model

Save the model to check the size:

In [None]:
filename = '/data_analytics/linreg_model_size_check.sav'
# Use a context manager so the file is flushed and closed as soon as the dump
# finishes; the on-disk size is then final when it is inspected.
with open(filename, 'wb') as model_file:
    pickle.dump(linreg, model_file)

File size: 1.1K

# 5. Random Forest

## 5.1 Train the Model

In [None]:
# Random forest configuration: 100 trees, depth capped at 6, out-of-bag scoring
# enabled, and a fixed seed so runs are repeatable.
rfr = RandomForestRegressor(
    n_estimators=100,
    max_depth=6,
    oob_score=True,
    random_state=1,
)

In [None]:
# Fit the random forest on the training data.
# Timed with perf_counter(), a monotonic high-resolution clock; time.time()
# follows the system wall clock and can jump if that clock is adjusted.
start = time.perf_counter()
random_forest = rfr.fit(X_train, y_train)
end = time.perf_counter()
# elapsed training time in seconds
print(end - start)

## 5.2 Test the Model

In [None]:
# Make predictions for the test data with the fitted random forest.
# perf_counter() is monotonic and high-resolution, which matters for the short
# prediction interval; time.time() follows the adjustable wall clock.
start = time.perf_counter()
rf_predicted = random_forest.predict(X_test)
end = time.perf_counter()
# elapsed prediction time in seconds
print(end - start)

In [None]:
# Report MAE, RMSE and R^2 of the random forest predictions on the test set,
# with a blank line after each of the first two metrics.
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_test, rf_predicted), end="\n\n")
print("Root Mean Squared Error: ", math.sqrt(metrics.mean_squared_error(y_test, rf_predicted)), end="\n\n")
print("R Squared:", metrics.r2_score(y_test, rf_predicted))

## 5.3 Pickle the Model

Save the model to check the size:

In [None]:
filename = '/data_analytics/rf_model_size_check.sav'
# Use a context manager so the file is flushed and closed as soon as the dump
# finishes; the on-disk size is then final when it is inspected.
with open(filename, 'wb') as model_file:
    pickle.dump(random_forest, model_file)

File size: 4.7M

# 6. Neural Networks

## 6.1 Train the Model

In [None]:
# Neural network configuration: one hidden layer of 10 ReLU units trained with
# Adam. Every hyperparameter is spelled out explicitly, one per line.
nn = MLPRegressor(
    # architecture
    hidden_layer_sizes=(10,),
    activation='relu',
    # optimiser and regularisation
    solver='adam',
    alpha=0.001,
    batch_size='auto',
    learning_rate='constant',
    learning_rate_init=0.01,
    power_t=0.5,
    momentum=0.9,
    nesterovs_momentum=True,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    # training loop and stopping
    max_iter=1000,
    tol=0.0001,
    shuffle=True,
    early_stopping=False,
    validation_fraction=0.1,
    warm_start=False,
    # reproducibility and logging
    random_state=9,
    verbose=False,
)

In [None]:
# Fit the neural network on the training data.
# Timed with perf_counter(), a monotonic high-resolution clock; time.time()
# follows the system wall clock and can jump if that clock is adjusted.
start = time.perf_counter()
neural_net = nn.fit(X_train, y_train)
end = time.perf_counter()
# elapsed training time in seconds
print(end - start)

## 6.2 Test the Model

In [None]:
# Make predictions for the test data with the fitted neural network.
# perf_counter() is monotonic and high-resolution, which matters for the short
# prediction interval; time.time() follows the adjustable wall clock.
start = time.perf_counter()
nn_predicted = neural_net.predict(X_test)
end = time.perf_counter()
# elapsed prediction time in seconds
print(end - start)

In [None]:
# Report MAE, RMSE and R^2 of the neural network predictions on the test set.
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_test, nn_predicted))
print()
print("Root Mean Squared Error: ", math.sqrt(metrics.mean_squared_error(y_test, nn_predicted)))
print()
# R^2 must be scored on the neural network's own predictions (nn_predicted);
# it was previously computed from the random forest's rf_predicted.
print("R Squared:", metrics.r2_score(y_test, nn_predicted))

## 6.3 Pickle the Model

Save the model to check the size:

In [None]:
filename = '/data_analytics/nn_model_size_check.sav'
# Use a context manager so the file is flushed and closed as soon as the dump
# finishes; the on-disk size is then final when it is inspected.
with open(filename, 'wb') as model_file:
    pickle.dump(neural_net, model_file)

File size: 14K

# 7. SVM

## 7.1 Train the Model

In [None]:
# Linear support vector regression configuration, with every hyperparameter
# spelled out explicitly, one per line.
svm = LinearSVR(
    # loss and regularisation
    loss='epsilon_insensitive',
    epsilon=0.0,
    C=1.0,
    dual=True,
    # intercept handling
    fit_intercept=True,
    intercept_scaling=1.0,
    # solver limits
    max_iter=1000,
    tol=1e-05,
    # reproducibility and logging
    random_state=0,
    verbose=0,
)

In [None]:
# Fit the linear SVR on the training data.
# Timed with perf_counter(), a monotonic high-resolution clock; time.time()
# follows the system wall clock and can jump if that clock is adjusted.
start = time.perf_counter()
svm_model = svm.fit(X_train, y_train)
end = time.perf_counter()
# elapsed training time in seconds
print(end - start)

## 7.2 Test the Model

In [None]:
# Make predictions for the test data with the fitted linear SVR.
# perf_counter() is monotonic and high-resolution, which matters for the short
# prediction interval; time.time() follows the adjustable wall clock.
start = time.perf_counter()
svm_predicted = svm_model.predict(X_test)
end = time.perf_counter()
# elapsed prediction time in seconds
print(end - start)

In [None]:
# Report MAE, RMSE and R^2 of the linear SVR predictions on the test set.
print("Mean Absolute Error: ", metrics.mean_absolute_error(y_test, svm_predicted))
print()
print("Root Mean Squared Error: ", math.sqrt(metrics.mean_squared_error(y_test, svm_predicted)))
print()
# R^2 must be scored on the SVR's own predictions (svm_predicted); it was
# previously computed from the random forest's rf_predicted.
print("R Squared:", metrics.r2_score(y_test, svm_predicted))

## 7.3 Pickle the Model

Save the model to check the size:

In [None]:
filename = '/data_analytics/svm_model_size_check.sav'
# Use a context manager so the file is flushed and closed as soon as the dump
# finishes; the on-disk size is then final when it is inspected.
with open(filename, 'wb') as model_file:
    pickle.dump(svm_model, model_file)