In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
pd.options.mode.chained_assignment = None 
from matplotlib import pyplot
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import pickle
import csv
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
pd.set_option('display.max_columns', None)

In [20]:
def Final_Clean(df, direction, weather_path='~/data/Clean_Weather_2018.csv'):
    """Turn one route's stop-level records into a stop-to-stop modelling table.

    Parameters
    ----------
    df : pandas.DataFrame
        Stop-level records for a single route. The columns read here are
        DIRECTION, DAYOFSERVICE, ROUTEID, LINEID, TRIPID, PROGRNUMBER,
        STOPPOINTID, ACTUALTIME_ARR, DWELL_TIME, ARRIVAL_TIME_ACTUAL,
        DEPARTURE_TIME_ACTUAL, PLANNEDTIME_ARR, PLANNEDTIME_DEP, VEHICLEID,
        MONTH and DAY. The caller's frame is not modified.
    direction : scalar
        Only rows whose DIRECTION equals this value are kept.
    weather_path : str, optional
        CSV of hourly weather with `date`, `hour` and `weather_main` columns
        (any further columns are carried through the merge). Defaults to the
        path the notebook has always used.

    Returns
    -------
    pandas.DataFrame
        One row per pair of consecutive stops within a trip, with
        JOURNEY_DURATION, PEAK_HOUR, PUBLIC_HOLIDAY, NEXT_STOPPOINTID and the
        merged weather columns. Rows with a non-positive duration, or a
        duration more than three standard deviations from the mean, are
        removed.
    """
    # Work on an explicit copy so the column assignments below never write
    # into a slice of the caller's frame.
    df = df.loc[df['DIRECTION'] == direction].copy()

    #convert to appropriate data types
    df['DAYOFSERVICE'] = df['DAYOFSERVICE'].astype('datetime64[ns]')
    df['ROUTEID'] = df['ROUTEID'].astype('category')
    df['LINEID'] = df['LINEID'].astype('category')

    #adding an hour column so that dataframe can be merged with weather dataframe
    df['HOUR'] = pd.to_datetime(df['DEPARTURE_TIME_ACTUAL']).dt.hour

    #Ordering the rows by day of service, trip ID and then progrnumber
    #(sequential position of the stop point in the trip). A multi-key sort
    #gives the same ordering as grouping by (day, trip) and sorting each group,
    #without one Python call per trip. Rows with a missing day or trip ID are
    #dropped first, as a groupby on those keys would drop them.
    sorted_df = (df.dropna(subset=['DAYOFSERVICE', 'TRIPID'])
                   .sort_values(['DAYOFSERVICE', 'TRIPID', 'PROGRNUMBER'], kind='mergesort')
                   .reset_index(drop=True))

    #Creating a feature for the duration of the journey to the next stop:
    #arrival at the following row minus arrival at this row minus the dwell time.
    #The last row of each trip is paired with the first row of the next trip here;
    #those rows are removed by the END_OF_TRIP filter further down.
    sorted_df['JOURNEY_DURATION'] = sorted_df['ACTUALTIME_ARR'].shift(-1) - sorted_df['ACTUALTIME_ARR'] - sorted_df['DWELL_TIME']

    #creating the peak hour feature (strictly between 07:00-10:00 or 16:00-19:00);
    #the comparison is a plain string comparison on ARRIVAL_TIME_ACTUAL
    sorted_df['PEAK_HOUR'] = [1 if ('07:00:00' < x < '10:00:00') or ('16:00:00' < x < '19:00:00') else 0
                              for x in sorted_df['ARRIVAL_TIME_ACTUAL']]

    #creating a public holiday indicator feature
    dates = pd.to_datetime(['2018-01-01','2018-03-17','2018-03-19','2018-03-30','2018-04-01','2018-04-02','2018-05-07',
     '2018-06-04','2018-08-06','2018-11-29','2018-12-24','2018-12-25','2018-12-26', '2018-12-31'])

    sorted_df['PUBLIC_HOLIDAY'] = sorted_df['DAYOFSERVICE'].isin(dates).astype('int64')

    #deleting columns that won't be used
    sorted_df = sorted_df.drop(columns=['ARRIVAL_TIME_ACTUAL', 'DEPARTURE_TIME_ACTUAL',
                                        'PLANNEDTIME_DEP', 'PLANNEDTIME_ARR'])

    #we want to get rid of all rows where the bus has reached the final destination of the trip
    #the journey duration of these rows is not representative of the actual journey time as
    #this is the time spent in the bay before the next trip

    #we create a column END_OF_TRIP that will be 1 for all in transit journeys
    #and negative for the final destination as it is the subtraction of the progress number of the current section
    #from the progress number of the next section of the trip
    sorted_df['NEXT_STOPPOINTID'] = sorted_df['STOPPOINTID'].shift(-1)
    sorted_df['NEXT_PROGNUMBER'] = sorted_df['PROGRNUMBER'].shift(-1)
    sorted_df['END_OF_TRIP'] = sorted_df['NEXT_PROGNUMBER'] - sorted_df['PROGRNUMBER']

    #the final row's journey duration, NEXT_STOPPOINTID, NEXT_PROGNUMBER and END_OF_TRIP are null;
    #note this drops every row that has a null in any column
    sorted_df = sorted_df.dropna(axis=0)

    #merging weather data
    weather_df = pd.read_csv(weather_path)
    weather_df['date'] = weather_df['date'].astype('datetime64[ns]')
    weather_df['weather_main'] = weather_df['weather_main'].astype('category')

    final_df = pd.merge(sorted_df, weather_df, how='left', left_on=['DAYOFSERVICE','HOUR'], right_on=['date','hour'])
    final_df = final_df.drop(columns=['VEHICLEID', 'date', 'hour'])

    for column in ('MONTH', 'DAY', 'HOUR', 'STOPPOINTID'):
        final_df[column] = final_df[column].astype('category')
    #shift() turned the stop IDs into floats, so go through int before making them categories
    final_df['NEXT_STOPPOINTID'] = final_df['NEXT_STOPPOINTID'].astype('int').astype('category')
    final_df['JOURNEY_DURATION'] = final_df['JOURNEY_DURATION'].astype('int')

    #removing the row at the end of each trip where journey duration is not required.
    final_df = final_df.loc[final_df['END_OF_TRIP'] == 1]

    #remove zero and negative journey times
    final_df = final_df[(final_df['JOURNEY_DURATION'] > 0)]

    #remove outliers that are 3 standard deviations above or below the mean.
    final_df = final_df[np.abs(final_df.JOURNEY_DURATION - final_df.JOURNEY_DURATION.mean()) <= (3 * final_df.JOURNEY_DURATION.std())]

    return final_df

# Parameter tuning for specific route 

In [3]:
# Load the stop-level records for route 120, the single route used for parameter tuning.
df_120 = pd.read_csv("~/data/Cleaned_By_Route_Header/route_120.csv")

In [4]:
# Build the modelling table for direction 1 of route 120.
final_df = Final_Clean(df_120, 1)

In [7]:
# Predictor columns taken from the cleaned route table.
feature_columns = ['DAY','HOUR','MONTH','STOPPOINTID','NEXT_STOPPOINTID',
                   'PEAK_HOUR', 'wind_speed', 'temp', 'humidity', 'weather_main']

# One-hot encode the categorical predictors; the target is the journey duration.
X = pd.get_dummies(pd.DataFrame(final_df[feature_columns]))
y = final_df.JOURNEY_DURATION

# 70/30 train-test split with a fixed seed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [8]:
# Candidate values for each random-forest hyper-parameter.
# NOTE(review): 'auto' is only accepted for max_features by older scikit-learn
# releases - confirm against the installed version before re-running.
n_estimators = [16, 32, 64]                                      # trees in the forest
max_features = ['auto', 'sqrt']                                  # features considered at each split
max_depth = np.linspace(10, 100, num = 10).astype(int).tolist()  # maximum levels per tree
min_samples_split = [2, 5, 10]                                   # samples needed to split a node
min_samples_leaf = [1, 2, 4]                                     # samples needed at a leaf
bootstrap = [True, False]                                        # sample rows with replacement or not

# Search space handed to the randomised search.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
print(random_grid)

{'n_estimators': [16, 32, 64], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [9]:
# Base estimator whose hyper-parameters are tuned.
rf = RandomForestRegressor()

# Randomised search: 50 parameter combinations sampled from random_grid,
# each scored with 3-fold cross-validation, using all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=50,
    cv=3,
    verbose=2,
    random_state=2,
    n_jobs=-1,
)

# Run the search on the training split.
rf_random.fit(X_train, Y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [16, 32, 64]},
                   random_state=2, verbose=2)

In [10]:
# Best parameter combination found by the randomised search.
rf_random.best_params_

{'n_estimators': 32,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 70,
 'bootstrap': True}

In [34]:
def evaluate(model, test_features, test_labels, units='degrees'):
    """Print and return a MAPE-based accuracy score for a fitted regressor.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(features)``.
    test_features : array-like
        Features passed straight to ``model.predict``.
    test_labels : array-like
        True target values. The percentage error divides by them, so they
        must be non-zero.
    units : str, optional
        Unit label printed after the average error. The default 'degrees' is
        kept only so existing calls print exactly what they always did; pass
        the real unit of the target when calling.

    Returns
    -------
    float
        ``100 - MAPE``, where MAPE is the mean absolute percentage error in percent.
    """
    predictions = model.predict(test_features)
    errors = np.abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} {}.'.format(np.mean(errors), units))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

In [35]:
# Score the best estimator from the randomised search.
# NOTE(review): evaluate() is given the training split (X_train, Y_train) here, so the
# figures below are in-sample; X_test / Y_test would give a held-out score.
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_train, Y_train)

Model Performance
Average Error: 11.0942 degrees.
Accuracy = 79.79%.


In [36]:
# Narrower, exhaustive grid chosen after the randomised search.
param_grid = dict(
    n_estimators=[128, 150, 250, 500],
    min_samples_split=[9, 10, 11],
    min_samples_leaf=[3, 4, 5],
    max_features=['sqrt'],
    max_depth=[80, 90, 100, 110, 120],
    bootstrap=[False],
)

# Fresh base estimator for the grid search.
rf = RandomForestRegressor()

# Every combination in param_grid is scored with 3-fold cross-validation on all cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(X_train, Y_train)

Fitting 3 folds for each of 180 candidates, totalling 540 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [False],
                         'max_depth': [80, 90, 100, 110, 120],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [9, 10, 11],
                         'n_estimators': [128, 150, 250, 500]},
             verbose=2)

In [37]:
# Best parameter combination found by the grid search.
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 90,
 'max_features': 'sqrt',
 'min_samples_leaf': 3,
 'min_samples_split': 10,
 'n_estimators': 500}

Although 500 estimators would improve our models, it results in very large pickle files, so a smaller number may have to be used. We will keep all of the other best parameters as they are.

In [38]:
# Score the best estimator from the grid search.
# NOTE(review): evaluate() is given the training split (X_train, Y_train) here, so the
# figures below are in-sample; X_test / Y_test would give a held-out score.
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_train, Y_train)

Model Performance
Average Error: 10.6010 degrees.
Accuracy = 80.69%.


In [21]:
# NOTE(review): the output recorded under this cell lists only max_depth, max_features and
# n_estimators, so it appears to come from a different grid than the one defined above -
# re-run or remove this cell.
grid_search.best_params_

{'max_depth': 90, 'max_features': 'sqrt', 'n_estimators': 64}

# Running these parameters on all routes

In [19]:
def Random_Forest(df, route=None, direction=None, params=None,
                  output_dir="/home/tomah/data/Model_Testing/RandomForestPickles",
                  results_csv=None):
    """Train, pickle and score a random-forest regressor for one route and direction.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned modelling table containing the predictor columns listed below
        and the JOURNEY_DURATION target.
    route, direction : optional
        Identifiers used in the pickle file names. When omitted they are read
        from the notebook globals ``Route`` and ``direction``, which is how
        existing ``Random_Forest(final_df)`` calls supply them.
    params : dict, optional
        Keyword arguments for ``RandomForestRegressor``. Defaults to the tuned
        parameters with 128 estimators.
    output_dir : str, optional
        Directory that receives the two pickle files.
    results_csv : str, optional
        When given, one row of metrics is appended to this CSV file.

    Returns
    -------
    dict
        The evaluation metrics, keyed by the column names used for the
        results CSV header.
    """
    # Backward compatibility: fall back on the loop variables of the calling cell.
    if route is None:
        route = globals()['Route']
    if direction is None:
        direction = globals()['direction']
    if params is None:
        params = {'bootstrap': False,
                  'max_depth': 90,
                  'max_features': 'sqrt',
                  'min_samples_leaf': 3,
                  'min_samples_split': 10,
                  'n_estimators': 128}

    X = pd.get_dummies(df[['DAY','HOUR','MONTH','STOPPOINTID','NEXT_STOPPOINTID',
                           'PEAK_HOUR', 'wind_speed', 'temp', 'humidity', 'weather_main']])
    y = df.JOURNEY_DURATION

    # The one-hot column layout is pickled next to the model.
    with open("{}/RandomForest_route_{}_d{}_headers.pkl".format(output_dir, route, direction), 'wb') as handle:
        pickle.dump(X.columns, handle, pickle.HIGHEST_PROTOCOL)

    #test-train split
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    #the model is fitted on the 70% training split only
    rfc = RandomForestRegressor(**params).fit(X_train, Y_train)

    with open("{}/RandomForest_Model_route_{}_d{}.pkl".format(output_dir, route, direction), 'wb') as handle:
        pickle.dump(rfc, handle, pickle.HIGHEST_PROTOCOL)

    y_pred_rf_train = rfc.predict(X_train)
    y_pred_rf = rfc.predict(X_test)

    #each metric is computed once and reused for printing, the CSV row and the return value
    mean_squared = metrics.mean_squared_error(Y_test, y_pred_rf)
    results = {
        'Mean_Absolute_Error_Train': metrics.mean_absolute_error(Y_train, y_pred_rf_train),
        'Mean_Absolute_Error_Test': metrics.mean_absolute_error(Y_test, y_pred_rf),
        'Mean_Percentage_Error': np.mean((Y_test - y_pred_rf)/Y_test),
        'Mean_Absolute_Percentage_Error': metrics.mean_absolute_percentage_error(Y_test, y_pred_rf),
        'Mean_Squared_Error': mean_squared,
        'Root_Mean_Squared_Error': np.sqrt(mean_squared),
        'R_Square': r2_score(Y_test, y_pred_rf),
    }

    print('Mean Absolute Error (train):', results['Mean_Absolute_Error_Train'])
    print('Mean Absolute Error (test):', results['Mean_Absolute_Error_Test'])
    print('Mean Percentage Error:', results['Mean_Percentage_Error'])
    print('Mean Absolute Percentage Error:', results['Mean_Absolute_Percentage_Error'])
    print('Mean Squared Error:', results['Mean_Squared_Error'])
    print('Root Mean Squared Error:', results['Root_Mean_Squared_Error'])
    print('R-square value:', results['R_Square'])

    if results_csv is not None:
        with open(results_csv, 'a', newline='') as file:
            csv.writer(file).writerow([route, direction] + list(results.values()))

    return results

In [4]:
# The delimiter is passed by keyword: pandas 2.0 made every read_csv argument
# after the file path keyword-only, so a positional ";" raises a TypeError there.
Trips_df = pd.read_csv("~/data/rt_trips_DB_2018.txt", sep=";")
# Every distinct line ID, in order of first appearance; drives the per-route loop.
Line_List = Trips_df.LINEID.unique().tolist()

In [25]:
# Column names for the per-route results file.
results_header = ["Route","Direction", "Mean_Absolute_Error_Train","Mean_Absolute_Error_Test",
                  "Mean_Percentage_Error", "Mean_Absolute_Percentage_Error",
                  "Mean_Squared_Error", "Root_Mean_Squared_Error", "R_Square"]

# Opened in append mode, so re-running this cell adds another header row.
with open('Random_Forest_Results_2.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(results_header)

In [18]:
# Train one model per (route, direction).
# `Route` and `direction` must stay module-level names: Random_Forest reads them
# as globals when it is called with only the data frame.
# NOTE(review): the [60:] slice skips the first 60 lines - presumably to resume an
# interrupted run; confirm before re-running from scratch.
for Route in Line_List[60:]:

    df = pd.read_csv("~/data/Cleaned_By_Route_Header/route_{}.csv".format(Route))
    print("//--------------------------------------//")

    # Iterate over the directions actually present in the file instead of
    # assuming the labels 1 and 2, so no route is silently skipped.
    for direction in sorted(df.DIRECTION.unique()):
        print("route:",Route,"\n","direction:", direction)
        final_df = Final_Clean(df, direction)
        Random_Forest(final_df)

//--------------------------------------//
route: 37 
 direction: 1
Mean Absolute Error (train): 10.047961955407622
Mean Absolute Error (test): 12.081099582144855
Mean Percentage Error: -0.10347585462366128
Mean Absolute Percentage Error: 0.2501763795377245
Mean Squared Error: 390.396793062042
Root Mean Squared Error: 19.758461302997304
R-square value: 0.7725304845746319
route: 37 
 direction: 2
Mean Absolute Error (train): 9.274686937133785
Mean Absolute Error (test): 11.130712406938732
Mean Percentage Error: -0.08814133488696047
Mean Absolute Percentage Error: 0.2348771763498562
Mean Squared Error: 312.98895497172145
Root Mean Squared Error: 17.691493859245504
R-square value: 0.7292456392590618
//--------------------------------------//
route: 27 
 direction: 1
Mean Absolute Error (train): 11.692702659983645
Mean Absolute Error (test): 13.459288540227845
Mean Percentage Error: -0.11760933608607849
Mean Absolute Percentage Error: 0.27596356139580513
Mean Squared Error: 472.45229947685

//--------------------------------------//
route: 70 
 direction: 1
Mean Absolute Error (train): 12.901404219608196
Mean Absolute Error (test): 15.724779674391549
Mean Percentage Error: -0.1113938207390421
Mean Absolute Percentage Error: 0.25844940613653417
Mean Squared Error: 656.4930973434706
Root Mean Squared Error: 25.622121249878408
R-square value: 0.7901234943418646
route: 70 
 direction: 2
Mean Absolute Error (train): 11.83773948867906
Mean Absolute Error (test): 14.250355102950216
Mean Percentage Error: -0.13027387205958973
Mean Absolute Percentage Error: 0.28655923091841434
Mean Squared Error: 573.9351506213897
Root Mean Squared Error: 23.956943682811247
R-square value: 0.652365590111798
//--------------------------------------//
route: 84A 
 direction: 1
Mean Absolute Error (train): 14.270555016840342
Mean Absolute Error (test): 17.71627422638579
Mean Percentage Error: -0.13358672641956978
Mean Absolute Percentage Error: 0.2931740968077862
Mean Squared Error: 960.619601787798

Mean Absolute Error (train): 10.836946054908326
Mean Absolute Error (test): 12.75250042572864
Mean Percentage Error: -0.1432266545617727
Mean Absolute Percentage Error: 0.3017846221562897
Mean Squared Error: 418.4574477856007
Root Mean Squared Error: 20.456232492460597
R-square value: 0.6948295813924406
route: 75 
 direction: 2
Mean Absolute Error (train): 9.876494468685646
Mean Absolute Error (test): 11.517083149015846
Mean Percentage Error: -0.11880371457770436
Mean Absolute Percentage Error: 0.27250558698746635
Mean Squared Error: 324.6281608924445
Root Mean Squared Error: 18.01744046451783
R-square value: 0.660250043750281
//--------------------------------------//
route: 26 
 direction: 1
Mean Absolute Error (train): 12.382489866038927
Mean Absolute Error (test): 15.485048464159423
Mean Percentage Error: -0.10835227668625357
Mean Absolute Percentage Error: 0.26485832360378736
Mean Squared Error: 598.2987366623009
Root Mean Squared Error: 24.460145883912894
R-square value: 0.747342

//--------------------------------------//
route: 25 
 direction: 1
Mean Absolute Error (train): 10.720532177906502
Mean Absolute Error (test): 13.22281200247663
Mean Percentage Error: -0.10318751104245649
Mean Absolute Percentage Error: 0.245112550414788
Mean Squared Error: 465.99717288028404
Root Mean Squared Error: 21.58696766292765
R-square value: 0.7787748708979927
route: 25 
 direction: 2
Mean Absolute Error (train): 9.496043217426072
Mean Absolute Error (test): 11.740865371939034
Mean Percentage Error: -0.1016301434853556
Mean Absolute Percentage Error: 0.23871382524291693
Mean Squared Error: 378.0338684048786
Root Mean Squared Error: 19.44309307710269
R-square value: 0.7137125342794386
//--------------------------------------//
route: 104 
 direction: 1
Mean Absolute Error (train): 9.01387421571809
Mean Absolute Error (test): 11.251221613928287
Mean Percentage Error: -0.06794833210884357
Mean Absolute Percentage Error: 0.19756184708476685
Mean Squared Error: 345.0954061983807
R

Mean Absolute Error (train): 18.035667966049388
Mean Absolute Error (test): 22.073612273606997
Mean Percentage Error: -0.1832762326149958
Mean Absolute Percentage Error: 0.3802704972222541
Mean Squared Error: 943.1345531502546
Root Mean Squared Error: 30.71049581413909
R-square value: 0.6885366336781946
//--------------------------------------//
route: 38D 
 direction: 1
Mean Absolute Error (train): 14.658264512518608
Mean Absolute Error (test): 18.205003426971675
Mean Percentage Error: -0.1535321389434798
Mean Absolute Percentage Error: 0.32727845444825576
Mean Squared Error: 795.739175710733
Root Mean Squared Error: 28.20884924470924
R-square value: 0.7181966922427336
route: 38D 
 direction: 2
Mean Absolute Error (train): 12.113478246120355
Mean Absolute Error (test): 15.186785491956162
Mean Percentage Error: -0.13653683595110605
Mean Absolute Percentage Error: 0.3053477193267385
Mean Squared Error: 569.7510698300745
Root Mean Squared Error: 23.869458934589918
R-square value: 0.56573

Mean Absolute Error (train): 12.92177853706584
Mean Absolute Error (test): 17.332542580328532
Mean Percentage Error: -0.09316409081454056
Mean Absolute Percentage Error: 0.23329601982300227
Mean Squared Error: 1164.983193921281
Root Mean Squared Error: 34.13185013914835
R-square value: 0.8093314047212503
//--------------------------------------//
route: 25X 
 direction: 1
Mean Absolute Error (train): 17.587181923208
Mean Absolute Error (test): 21.152660090132585
Mean Percentage Error: -0.1319998355292507
Mean Absolute Percentage Error: 0.30980613679428975
Mean Squared Error: 1046.2964428731545
Root Mean Squared Error: 32.346505883528664
R-square value: 0.7432830464730146
route: 25X 
 direction: 2
Mean Absolute Error (train): 19.583171009477553
Mean Absolute Error (test): 23.360040349097904
Mean Percentage Error: -0.13559242406190147
Mean Absolute Percentage Error: 0.30193634001843983
Mean Squared Error: 1337.562123183096
Root Mean Squared Error: 36.57269641663157
R-square value: 0.8050

# Running RF models on the routes with problematic Linear Regression models to get the necessary pickle files

The pickle files produced are still far too large for deployment, so the parameters will have to be tuned again to account for this.

In [42]:
def Random_Forest(df, route=None, direction=None, params=None,
                  output_dir="/home/tomah/data/Model_Testing/RandomForestPickles_2",
                  results_csv=None):
    """Train, pickle and score a smaller random-forest regressor for one route and direction.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned modelling table containing the predictor columns listed below
        and the JOURNEY_DURATION target.
    route, direction : optional
        Identifiers used in the pickle file names. When omitted they are read
        from the notebook globals ``Route`` and ``direction``, which is how
        existing ``Random_Forest(final_df)`` calls supply them.
    params : dict, optional
        Keyword arguments for ``RandomForestRegressor``. Defaults to the
        reduced configuration (max_depth 15, 64 estimators) used to keep the
        pickle files small; pass a dict to try another configuration without
        editing this function.
    output_dir : str, optional
        Directory that receives the two pickle files.
    results_csv : str, optional
        When given, one row of metrics is appended to this CSV file.

    Returns
    -------
    dict
        The evaluation metrics, keyed by the column names used for the
        results CSV header.
    """
    # Backward compatibility: fall back on the loop variables of the calling cell.
    if route is None:
        route = globals()['Route']
    if direction is None:
        direction = globals()['direction']
    if params is None:
        params = {'bootstrap': False,
                  'max_depth': 15,
                  'max_features': 'sqrt',
                  'min_samples_leaf': 3,
                  'min_samples_split': 10,
                  'n_estimators': 64}

    X = pd.get_dummies(df[['DAY','HOUR','MONTH','STOPPOINTID','NEXT_STOPPOINTID',
                           'PEAK_HOUR', 'wind_speed', 'temp', 'humidity', 'weather_main']])
    y = df.JOURNEY_DURATION

    # The one-hot column layout is pickled next to the model.
    with open("{}/RandomForest_route_{}_d{}_headers.pkl".format(output_dir, route, direction), 'wb') as handle:
        pickle.dump(X.columns, handle, pickle.HIGHEST_PROTOCOL)

    #test-train split
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    #the model is fitted on the 70% training split only
    rfc = RandomForestRegressor(**params).fit(X_train, Y_train)

    with open("{}/RandomForest_Model_route_{}_d{}.pkl".format(output_dir, route, direction), 'wb') as handle:
        pickle.dump(rfc, handle, pickle.HIGHEST_PROTOCOL)

    y_pred_rf_train = rfc.predict(X_train)
    y_pred_rf = rfc.predict(X_test)

    #each metric is computed once and reused for printing, the CSV row and the return value
    mean_squared = metrics.mean_squared_error(Y_test, y_pred_rf)
    results = {
        'Mean_Absolute_Error_Train': metrics.mean_absolute_error(Y_train, y_pred_rf_train),
        'Mean_Absolute_Error_Test': metrics.mean_absolute_error(Y_test, y_pred_rf),
        'Mean_Percentage_Error': np.mean((Y_test - y_pred_rf)/Y_test),
        'Mean_Absolute_Percentage_Error': metrics.mean_absolute_percentage_error(Y_test, y_pred_rf),
        'Mean_Squared_Error': mean_squared,
        'Root_Mean_Squared_Error': np.sqrt(mean_squared),
        'R_Square': r2_score(Y_test, y_pred_rf),
    }

    print('Mean Absolute Error (train):', results['Mean_Absolute_Error_Train'])
    print('Mean Absolute Error (test):', results['Mean_Absolute_Error_Test'])
    print('Mean Percentage Error:', results['Mean_Percentage_Error'])
    print('Mean Absolute Percentage Error:', results['Mean_Absolute_Percentage_Error'])
    print('Mean Squared Error:', results['Mean_Squared_Error'])
    print('Root Mean Squared Error:', results['Root_Mean_Squared_Error'])
    print('R-square value:', results['R_Square'])

    if results_csv is not None:
        with open(results_csv, 'a', newline='') as file:
            csv.writer(file).writerow([route, direction] + list(results.values()))

    return results

In [43]:
# Routes re-run below for direction 1 only.
direction_1 = ['41','83A','84X','40E','16D','41D']

In [44]:
# Routes re-run below for direction 2 only.
direction_2 = ['25A','46A','51D']

Max depth 90, n estimators 500, 1.5 GB

In [31]:
# Refit direction 1 of every route in direction_1 with the Random_Forest definition
# that is live in the kernel. `Route` and `direction` are module-level on purpose:
# Random_Forest reads them as globals when it names the pickle files.
for Route in direction_1:
    df = pd.read_csv("~/data/Cleaned_By_Route_Header/route_{}.csv".format(Route))
    direction = 1
    print("//--------------------------------------//")  
    print("route:",Route,"\n","direction:", direction)
    final_df = Final_Clean(df, direction)
    Random_Forest(final_df)  

//--------------------------------------//
route: 41 
 direction: 1
Mean Absolute Error (train): 12.718865668107789
Mean Absolute Error (test): 14.254145228879121
Mean Percentage Error: -0.12758247564748762
Mean Absolute Percentage Error: 0.2978640056073299
Mean Squared Error: 436.71691925358755
Root Mean Squared Error: 20.89777306924323
R-square value: 0.6376632070843579
//--------------------------------------//
route: 83A 
 direction: 1
Mean Absolute Error (train): 11.353713963325267
Mean Absolute Error (test): 12.628120029884583
Mean Percentage Error: -0.11116310960987368
Mean Absolute Percentage Error: 0.2639143879629161
Mean Squared Error: 351.6095064147361
Root Mean Squared Error: 18.751253462495143
R-square value: 0.6633724107871544
//--------------------------------------//
route: 84X 
 direction: 1
Mean Absolute Error (train): 13.707804009457709
Mean Absolute Error (test): 16.61963686375834
Mean Percentage Error: -0.05483669920649149
Mean Absolute Percentage Error: 0.16835235

In [32]:
# Refit direction 2 of every route in direction_2 with the Random_Forest definition
# that is live in the kernel. `Route` and `direction` are module-level on purpose:
# Random_Forest reads them as globals when it names the pickle files.
for Route in direction_2:
    df = pd.read_csv("~/data/Cleaned_By_Route_Header/route_{}.csv".format(Route))
    direction = 2
    print("//--------------------------------------//")  
    print("route:",Route,"\n","direction:", direction)
    final_df = Final_Clean(df, direction)
    Random_Forest(final_df)  

//--------------------------------------//
route: 25A 
 direction: 2
Mean Absolute Error (train): 11.521562102345467
Mean Absolute Error (test): 13.606780990019832
Mean Percentage Error: -0.09959782330009292
Mean Absolute Percentage Error: 0.2423042710132698
Mean Squared Error: 467.4785176525925
Root Mean Squared Error: 21.621251528359604
R-square value: 0.7187965938830049
//--------------------------------------//
route: 46A 
 direction: 2
Mean Absolute Error (train): 14.010776489420651
Mean Absolute Error (test): 15.357987396963772
Mean Percentage Error: -0.15144429250885239
Mean Absolute Percentage Error: 0.3246484374484841
Mean Squared Error: 520.4241846934201
Root Mean Squared Error: 22.812807470660424
R-square value: 0.6666322832359655
//--------------------------------------//
route: 51D 
 direction: 2
Mean Absolute Error (train): 15.943879825133584
Mean Absolute Error (test): 18.6762509650844
Mean Percentage Error: -0.1238551223080473
Mean Absolute Percentage Error: 0.263887060

Max depth 60, n estimators 128, 600MB

In [39]:
# Same direction-1 loop as before, re-run after the Random_Forest parameters were changed.
# `Route` and `direction` stay module-level because Random_Forest reads them as globals.
for Route in direction_1:
    df = pd.read_csv("~/data/Cleaned_By_Route_Header/route_{}.csv".format(Route))
    direction = 1
    print("//--------------------------------------//")  
    print("route:",Route,"\n","direction:", direction)
    final_df = Final_Clean(df, direction)
    Random_Forest(final_df)  

//--------------------------------------//
route: 41 
 direction: 1
Mean Absolute Error (train): 14.54965890648533
Mean Absolute Error (test): 15.143248999168542
Mean Percentage Error: -0.15966730189786887
Mean Absolute Percentage Error: 0.3283791389849814
Mean Squared Error: 473.44337682171874
Root Mean Squared Error: 21.758754027326994
R-square value: 0.6071918736788804
//--------------------------------------//
route: 83A 
 direction: 1
Mean Absolute Error (train): 13.192477290874917
Mean Absolute Error (test): 13.809148522708645
Mean Percentage Error: -0.14764935037596932
Mean Absolute Percentage Error: 0.30699964412323133
Mean Squared Error: 391.0623825199757
Root Mean Squared Error: 19.775297280192166
R-square value: 0.6256006033458772
//--------------------------------------//
route: 84X 
 direction: 1
Mean Absolute Error (train): 15.669176754646557
Mean Absolute Error (test): 17.213856404394917
Mean Percentage Error: -0.07681123056735174
Mean Absolute Percentage Error: 0.183741

In [40]:
# Same direction-2 loop as before, re-run after the Random_Forest parameters were changed.
# `Route` and `direction` stay module-level because Random_Forest reads them as globals.
for Route in direction_2:
    df = pd.read_csv("~/data/Cleaned_By_Route_Header/route_{}.csv".format(Route))
    direction = 2
    print("//--------------------------------------//")  
    print("route:",Route,"\n","direction:", direction)
    final_df = Final_Clean(df, direction)
    Random_Forest(final_df)  

//--------------------------------------//
route: 25A 
 direction: 2
Mean Absolute Error (train): 13.225462526456267
Mean Absolute Error (test): 14.075363823185262
Mean Percentage Error: -0.12032004116605875
Mean Absolute Percentage Error: 0.2557448029555227
Mean Squared Error: 487.234103292863
Root Mean Squared Error: 22.0733799698384
R-square value: 0.7069129719365339
//--------------------------------------//
route: 46A 
 direction: 2
Mean Absolute Error (train): 16.500691934291893
Mean Absolute Error (test): 17.03895585871692
Mean Percentage Error: -0.21158735651477184
Mean Absolute Percentage Error: 0.39060542085775335
Mean Squared Error: 583.4854888910609
Root Mean Squared Error: 24.15544429090595
R-square value: 0.6262371524660262
//--------------------------------------//
route: 51D 
 direction: 2
Mean Absolute Error (train): 18.59796183673518
Mean Absolute Error (test): 20.37199893600867
Mean Percentage Error: -0.19001035527406537
Mean Absolute Percentage Error: 0.324055247928

Even after reducing the max depth and the number of estimators, the pickle files produced are still quite large, at 600MB in total. We will therefore compare these results with those obtained from the decision tree models.

In [3]:
# Decision-tree results, one row per route and direction.
Dec_tree = pd.read_csv("~/data/Model_Testing/Decision_Tree_Results.csv")

# Remove two rows by position. Each position is looked up in the frame as it stands
# at that step, so 124 refers to the frame after the first removal.
# NOTE(review): the reason these two rows are excluded is not recorded here.
for row_position in (64, 124):
    Dec_tree = Dec_tree.drop(index=[Dec_tree.index[row_position]])

In [13]:
# Routes whose linear-regression models were problematic.
linreg_problem = ['41','83A','84X','40E','16D','41D','25A','46A','51D']

# Collect the decision-tree rows for those routes, in the order listed above.
# DataFrame.append was removed in pandas 2.0 and copied the frame on every
# iteration; a single concat over the per-route slices gives the same rows.
df = pd.concat([Dec_tree[Dec_tree['Route'] == line] for line in linreg_problem])

In [14]:
# Decision-tree results for the problematic routes (note `df` is reused for route data in the loops).
df

Unnamed: 0,Route,Direction,Mean_Absolute_Error_Train,Mean_Absolute_Error_Test,Mean_Percentage_Error,Mean_Absolute_Percentage_Error,Mean_Squared_Error,Root_Mean_Squared_Error,R_Square
42,41,1,14.865706,15.342374,-0.141056,0.323683,496.78246,22.288617,0.587828
43,41,2,15.724843,16.388909,-0.184556,0.374808,569.813828,23.870774,0.712803
136,83A,1,13.658168,14.152607,-0.126946,0.303211,416.517977,20.408772,0.60123
137,83A,2,12.242469,12.836962,-0.125528,0.29319,381.969002,19.544027,0.674075
158,84X,1,16.818695,18.360727,-0.063635,0.190043,800.534846,28.293725,0.863518
159,84X,2,21.499416,22.777924,-0.111781,0.262865,1285.260303,35.850527,0.833154
243,40E,1,6.80689,8.032067,-0.076684,0.218052,182.310131,13.502227,0.837227
244,40E,2,7.274872,8.24297,-0.0865,0.210595,204.691966,14.30706,0.880038
250,16D,1,19.318965,21.296524,-0.195219,0.41002,951.505438,30.846482,0.543279
252,41D,1,16.680873,20.419952,-0.14727,0.394753,859.29474,29.31373,0.309858


If we reduce the parameters one more time, we can see that the decision tree produces better results across the board.

Max depth 15, n estimators 64: the results are not satisfactory and the pickle files are still quite large, so the decision tree will be used instead for the problematic routes.

In [45]:
# Same direction-1 loop again, re-run after the Random_Forest parameters were reduced further.
# `Route` and `direction` stay module-level because Random_Forest reads them as globals.
for Route in direction_1:
    df = pd.read_csv("~/data/Cleaned_By_Route_Header/route_{}.csv".format(Route))
    direction = 1
    print("//--------------------------------------//")  
    print("route:",Route,"\n","direction:", direction)
    final_df = Final_Clean(df, direction)
    Random_Forest(final_df)  

//--------------------------------------//
route: 41 
 direction: 1
Mean Absolute Error (train): 17.71748655297082
Mean Absolute Error (test): 17.801321344064004
Mean Percentage Error: -0.24254556783484715
Mean Absolute Percentage Error: 0.42603249594804504
Mean Squared Error: 587.2021428871515
Root Mean Squared Error: 24.232254185014472
R-square value: 0.5128081100897828
//--------------------------------------//
route: 83A 
 direction: 1
Mean Absolute Error (train): 16.214863734760634
Mean Absolute Error (test): 16.299870291653935
Mean Percentage Error: -0.22578297439823158
Mean Absolute Percentage Error: 0.4035057710561318
Mean Squared Error: 488.6333231308832
Root Mean Squared Error: 22.105051982089595
R-square value: 0.5321871150417881
//--------------------------------------//
route: 84X 
 direction: 1
Mean Absolute Error (train): 26.67741332803646
Mean Absolute Error (test): 27.037022478415068
Mean Percentage Error: -0.23616094819774505
Mean Absolute Percentage Error: 0.38527645

In [46]:
# Same direction-2 loop again, re-run after the Random_Forest parameters were reduced further.
# `Route` and `direction` stay module-level because Random_Forest reads them as globals.
for Route in direction_2:
    df = pd.read_csv("~/data/Cleaned_By_Route_Header/route_{}.csv".format(Route))
    direction = 2
    print("//--------------------------------------//")  
    print("route:",Route,"\n","direction:", direction)
    final_df = Final_Clean(df, direction)
    Random_Forest(final_df) 

//--------------------------------------//
route: 25A 
 direction: 2
Mean Absolute Error (train): 17.458395477811386
Mean Absolute Error (test): 17.499169797712177
Mean Percentage Error: -0.22689752728474794
Mean Absolute Percentage Error: 0.3741678199679736
Mean Squared Error: 632.5352023247144
Root Mean Squared Error: 25.150252530038628
R-square value: 0.619509674421451
//--------------------------------------//
route: 46A 
 direction: 2
Mean Absolute Error (train): 20.155828884558215
Mean Absolute Error (test): 20.22825368014293
Mean Percentage Error: -0.32563500857468547
Mean Absolute Percentage Error: 0.5244651177736852
Mean Squared Error: 734.7747855873733
Root Mean Squared Error: 27.10672952584604
R-square value: 0.5293258849003242
//--------------------------------------//
route: 51D 
 direction: 2
Mean Absolute Error (train): 28.3807783981671
Mean Absolute Error (test): 29.125510742661472
Mean Percentage Error: -0.4472140967838323
Mean Absolute Percentage Error: 0.618565880551