In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
pd.options.mode.chained_assignment = None 
from matplotlib import pyplot
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
import pickle
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import csv
pd.set_option('display.max_columns', None)

In [2]:
def Final_Clean(df, direction):

    df = df.loc[df['DIRECTION'] == direction]
    
    #covert to appropriate data types
    df['DAYOFSERVICE'] = df['DAYOFSERVICE'].astype('datetime64[ns]')
    df['ROUTEID'] = df['ROUTEID'].astype('category')
    df['LINEID'] = df['LINEID'].astype('category')

    #adding an hour column so that dataframe can be merged with weather dataframe 
    df['HOUR'] = pd.to_datetime(df['DEPARTURE_TIME_ACTUAL'])
    df['HOUR'] = df['HOUR'].dt.hour

    #Grouping the trips by day of service and trip ID and then 
    #sorting the dataframe according to progrnumber (Sequential position of the stop point in the trip)
    sorted_df = df.groupby(["DAYOFSERVICE","TRIPID"]).apply(lambda x: x.sort_values(["PROGRNUMBER"], ascending = True)).reset_index(drop=True)

    #Creating a feature for the duration of the journey 
    #this is done by taking the actual arrival time of the preceding row from the arrival time of the following row.
    sorted_df['JOURNEY_DURATION'] = sorted_df['ACTUALTIME_ARR'].shift(-1) - sorted_df['ACTUALTIME_ARR'] - sorted_df['DWELL_TIME']

    #creating the peak hour feature
    sorted_df['PEAK_HOUR'] = [1 if x >'07:00:00' and x <'10:00:00' or x >'16:00:00' and x <'19:00:00' else 0 for x in sorted_df['ARRIVAL_TIME_ACTUAL']]

    #creating a public holiday indicator feature 
    dates = pd.to_datetime(['2018-01-01','2018-03-17','2018-03-19','2018-03-30','2018-04-01','2018-04-02','2018-05-07',
     '2018-06-04','2018-08-06','2018-11-29','2018-12-24','2018-12-25','2018-12-26', '2018-12-31'])

    sorted_df['PUBLIC_HOLIDAY'] = sorted_df['DAYOFSERVICE'].isin(dates)
    sorted_df['PUBLIC_HOLIDAY'] *= 1    
    
    #deleting columns that won't be used
    del sorted_df['ARRIVAL_TIME_ACTUAL']
    del sorted_df['DEPARTURE_TIME_ACTUAL']
    del sorted_df['PLANNEDTIME_DEP']
    del sorted_df['PLANNEDTIME_ARR']

    #we want to get rid of all rows where the bus has reached the final destination of the trip
    #the journey duration of these rows is not representative of the actual journey time as 
    #this is the time spent in the bay before the next trip 

    #we create a column END_OF_TRIP that will be 1 for all in transit journeys
    #and negative for the final destination as it is the subtraction of the progress number of the current section
    #from the progress number of the next section of the trip 
    sorted_df['NEXT_STOPPOINTID'] = sorted_df['STOPPOINTID'].shift(-1)
    sorted_df['NEXT_PROGNUMBER']=sorted_df["PROGRNUMBER"].shift(-1)
    sorted_df['END_OF_TRIP'] = sorted_df['NEXT_PROGNUMBER'] - sorted_df['PROGRNUMBER']

    #as the final rows journey duration, NEXT_STOPPOINTID, NEXT_PROGNUMBER and END_OF_TRIP are null
    sorted_df = sorted_df.dropna(axis=0)

    
    #merging weather data 
    weather_df = pd.read_csv('~/data/Clean_Weather_2018.csv')
    weather_df['date'] = weather_df['date'].astype('datetime64[ns]')
    weather_df['weather_main'] = weather_df['weather_main'].astype('category')

    final_df = pd.merge(sorted_df, weather_df, how='left',left_on=['DAYOFSERVICE','HOUR'],right_on=['date','hour'])

    
    del final_df['VEHICLEID']
    del final_df['date']
    del final_df['hour']

    final_df['MONTH']= final_df['MONTH'].astype('category')
    final_df['DAY']= final_df['DAY'].astype('category')
    final_df['HOUR']= final_df['HOUR'].astype('category')
    final_df['STOPPOINTID']= final_df['STOPPOINTID'].astype('category')
    final_df['NEXT_STOPPOINTID']= final_df['NEXT_STOPPOINTID'].astype('int')
    final_df['NEXT_STOPPOINTID']= final_df['NEXT_STOPPOINTID'].astype('category')
    final_df['JOURNEY_DURATION']= final_df['JOURNEY_DURATION'].astype('int')

    #removing the row at the end of each trip where journey duration is not required.
    final_df = final_df.loc[final_df['END_OF_TRIP'] == 1]
    
    #remove negative journey times
    final_df = final_df[(final_df['JOURNEY_DURATION'] > 0)]
    
    #remove outliers that are 3 standard deviations above the mean.
    final_df=final_df[np.abs(final_df.JOURNEY_DURATION-final_df.JOURNEY_DURATION.mean()) <= (3*final_df.JOURNEY_DURATION.std())]

    return final_df

# Parameter tuning for specific route 

In [3]:
df_120 = pd.read_csv("~/data/Cleaned_By_Route_Header/route_120.csv")

In [6]:
df = Final_Clean(df_120, 1)

In [7]:
X = pd.DataFrame(df[['DAY','HOUR','MONTH','STOPPOINTID','NEXT_STOPPOINTID',
                       'PEAK_HOUR', 'wind_speed', 'temp', 'clouds_all', 'humidity', 'weather_main']])

y = df.JOURNEY_DURATION

X = pd.get_dummies(X)

#test-train split 
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [8]:
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

In [11]:
# Create the random grid
random_grid = {'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
DT = DecisionTreeRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
DT_random = RandomizedSearchCV(estimator = DT, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=2, n_jobs = -1)
# Fit the random search model
DT_random.fit(X_train, Y_train)

DT_random.best_params_

{'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
Fitting 3 folds for each of 50 candidates, totalling 150 fits


{'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 30}

In [12]:
# Create the parameter grid based on the results of random search 
param_grid = {
     'min_samples_split': [9,10,11],
     'min_samples_leaf': [3,4,5],
     'max_features': ['sqrt'],
     'max_depth': [25,30,35]
}

# Create a based model
DT = DecisionTreeRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = DT, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_train, Y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


GridSearchCV(cv=3, estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid={'max_depth': [25, 30, 35], 'max_features': ['sqrt'],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [9, 10, 11]},
             verbose=2)

In [13]:
grid_search.best_params_

{'max_depth': 35,
 'max_features': 'sqrt',
 'min_samples_leaf': 5,
 'min_samples_split': 11}

# Running these parameters on all routes

In [3]:
def Decision_Tree_Regressor(df):

    X = pd.DataFrame(final_df[['DAY','HOUR','MONTH','STOPPOINTID','NEXT_STOPPOINTID',
                           'PEAK_HOUR', 'wind_speed', 'temp', 'humidity', 'weather_main']])
    
    y = df.JOURNEY_DURATION

    X = pd.get_dummies(X)
    
    dummies = X.columns
    
    with open("/home/tomah/data/Model_Testing/DecisionTreePickles/DecisionTree_route_{}_d{}_headers.pkl".format(Route, direction), 'wb') as handle:
        pickle.dump(dummies, handle, pickle.HIGHEST_PROTOCOL)

    #test-train split 
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    
    params = {'max_depth': 35,
             'max_features': 'sqrt',
             'min_samples_leaf': 5,
             'min_samples_split': 11}


    #creating and fitting the decision tree model 
    DT_reg = DecisionTreeRegressor(**params)

    DT_reg.fit(X_train, Y_train)
    
    with open("/home/tomah/data/Model_Testing/DecisionTreePickles/DecisionTree_Model_route_{}_d{}.pkl".format(Route, direction), 'wb') as handle:
        pickle.dump(DT_reg, handle, pickle.HIGHEST_PROTOCOL)

    #predictions from the training set
    y_pred_DT_train = DT_reg.predict(X_train)
    
    #predicitng from the test set 
    y_pred_DT = DT_reg.predict(X_test)
    
    #some evaluation metrics.
    print('Mean Absolute Error (train):', metrics.mean_absolute_error(Y_train, y_pred_DT_train))
    print('Mean Absolute Error (test):', metrics.mean_absolute_error(Y_test, y_pred_DT))
    print('Mean Percentage Error:', np.mean((Y_test - y_pred_DT)/Y_test))
    print('Mean Absolute Percentage Error:', metrics.mean_absolute_percentage_error(Y_test,y_pred_DT)) 
    print('Mean Squared Error:', metrics.mean_squared_error(Y_test, y_pred_DT))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(Y_test, y_pred_DT)))
    print('R-square value:', r2_score(Y_test,y_pred_DT))
    
    Mean_abs_error_train = metrics.mean_absolute_error(Y_train, y_pred_DT_train)
    Mean_abs_error_test = metrics.mean_absolute_error(Y_test, y_pred_DT)
    Mean_Percentage_Error = np.mean((Y_test - y_pred_DT)/Y_test)
    Mean_Absolute_Percentage_Error= metrics.mean_absolute_percentage_error(Y_test,y_pred_DT)  
    Mean_squared_error =  metrics.mean_squared_error(Y_test, y_pred_DT)
    Root_Mean_sqaured_error = np.sqrt(metrics.mean_squared_error(Y_test, y_pred_DT))
    R_square_value =  r2_score(Y_test,y_pred_DT)

    with open('Decision_Tree_Results.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([Route, direction, Mean_abs_error_train, Mean_abs_error_test,Mean_Percentage_Error,
                         Mean_Absolute_Percentage_Error, Mean_squared_error, Root_Mean_sqaured_error, R_square_value])

In [4]:
Trips_df = pd.read_csv("~/data/rt_trips_DB_2018.txt", ";")
Line_List = Trips_df.LINEID.unique().tolist()

In [5]:
with open('Decision_Tree_Results.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Route","Direction", "Mean_Absolute_Error_Train","Mean_Absolute_Error_Test", 
                     "Mean_Percentage_Error", "Mean_Absolute_Percentage_Error",
                     "Mean_Squared_Error", "Root_Mean_Squared_Error", "R_Square"])

In [8]:
for Route in Line_List:
    df = pd.read_csv("~/data/Cleaned_By_Route_Header/route_{}.csv".format(Route))
    print("//--------------------------------------//")
    for direction in range(1,3):
        print("route:",Route,"\n","direction:", direction)
        final_df = Final_Clean(df, direction)
        Decision_Tree_Regressor(final_df)   

//--------------------------------------//
route: 68 
 direction: 1
Mean Absolute Error (train): 12.133268827624613
Mean Absolute Error (test): 12.577672633816091
Mean Percentage Error: -0.17340752859621603
Mean Absolute Percentage Error: 0.34931996792677955
Mean Squared Error: 371.77639407858516
Root Mean Squared Error: 19.281503937156593
R-square value: 0.6826590089886932
route: 68 
 direction: 2
Mean Absolute Error (train): 13.370357767885855
Mean Absolute Error (test): 13.8669486565902
Mean Percentage Error: -0.18727628479634179
Mean Absolute Percentage Error: 0.3634052818049304
Mean Squared Error: 491.03129329651574
Root Mean Squared Error: 22.159225918260677
R-square value: 0.5657684209116157
//--------------------------------------//
route: 25B 
 direction: 1
Mean Absolute Error (train): 13.376540385955094
Mean Absolute Error (test): 14.321233968539325
Mean Percentage Error: -0.094893806152898
Mean Absolute Percentage Error: 0.2459353656672967
Mean Squared Error: 488.94263635073

KeyboardInterrupt: 

In [11]:
for Route in Line_List[32:]:

    df = pd.read_csv("~/data/Cleaned_By_Route_Header/route_{}.csv".format(Route))
    print("//--------------------------------------//")  
    
    if len(df.DIRECTION.unique()) == 1:

        direction = df.DIRECTION.unique()[0]
        print("route:",Route,"\n","direction:", direction)
        final_df = Final_Clean(df, direction)
        Decision_Tree_Regressor(final_df)  
    
    elif len(df.DIRECTION.unique()) == 2:
        for direction in range(1,3):
            print("route:",Route,"\n","direction:", direction)
            final_df = Final_Clean(df, direction)
            Decision_Tree_Regressor(final_df) 

//--------------------------------------//
route: 13 
 direction: 1
Mean Absolute Error (train): 15.662536079453938
Mean Absolute Error (test): 16.15439081540776
Mean Percentage Error: -0.235681558660353
Mean Absolute Percentage Error: 0.4367431425822173
Mean Squared Error: 567.7054785378144
Root Mean Squared Error: 23.82657085142162
R-square value: 0.6935747715111548
route: 13 
 direction: 2
Mean Absolute Error (train): 16.64176163847308
Mean Absolute Error (test): 17.034363563867462
Mean Percentage Error: -0.21245656152811274
Mean Absolute Percentage Error: 0.4101796155763269
Mean Squared Error: 577.0233770835669
Root Mean Squared Error: 24.021310894361424
R-square value: 0.603080274768355
//--------------------------------------//
route: 15B 
 direction: 1
Mean Absolute Error (train): 13.296062752494574
Mean Absolute Error (test): 14.010140905685793
Mean Percentage Error: -0.1251735054051462
Mean Absolute Percentage Error: 0.2925454361028448
Mean Squared Error: 477.0094045602811
Roo

//--------------------------------------//
route: 42 
 direction: 1
Mean Absolute Error (train): 12.171539233687474
Mean Absolute Error (test): 12.651323063058902
Mean Percentage Error: -0.13432835717918631
Mean Absolute Percentage Error: 0.29897948941873104
Mean Squared Error: 375.4793871568473
Root Mean Squared Error: 19.377290500914913
R-square value: 0.7540204179602775
route: 42 
 direction: 2
Mean Absolute Error (train): 10.924263968389749
Mean Absolute Error (test): 11.310445256688942
Mean Percentage Error: -0.11348709895725789
Mean Absolute Percentage Error: 0.27613030620553675
Mean Squared Error: 310.8133175484123
Root Mean Squared Error: 17.629898398697943
R-square value: 0.6065362123474025
//--------------------------------------//
route: 67 
 direction: 1
Mean Absolute Error (train): 12.044581535191107
Mean Absolute Error (test): 12.380809433532447
Mean Percentage Error: -0.11585275420155473
Mean Absolute Percentage Error: 0.28128979708823315
Mean Squared Error: 346.87358016

//--------------------------------------//
route: 54A 
 direction: 1
Mean Absolute Error (train): 14.375806220791933
Mean Absolute Error (test): 15.21263143788079
Mean Percentage Error: -0.09940773050421994
Mean Absolute Percentage Error: 0.25345321894393535
Mean Squared Error: 561.3698452696328
Root Mean Squared Error: 23.693244718054824
R-square value: 0.6555996778437316
route: 54A 
 direction: 2
Mean Absolute Error (train): 15.922185903333725
Mean Absolute Error (test): 16.892117768833657
Mean Percentage Error: -0.11921590082036997
Mean Absolute Percentage Error: 0.28260730669467876
Mean Squared Error: 779.6462960601259
Root Mean Squared Error: 27.922147053192845
R-square value: 0.670310656546876
//--------------------------------------//
route: 66 
 direction: 1
Mean Absolute Error (train): 14.238248399466691
Mean Absolute Error (test): 14.787715584076892
Mean Percentage Error: -0.11885792614365025
Mean Absolute Percentage Error: 0.28050063488128774
Mean Squared Error: 527.45005894

MemoryError: Unable to allocate 1.30 GiB for an array with shape (1410300, 247) and data type float32

In [16]:
for Route in Line_List[61:]:

    df = pd.read_csv("~/data/Cleaned_By_Route_Header/route_{}.csv".format(Route))
    print("//--------------------------------------//")  
    
    if len(df.DIRECTION.unique()) == 1:

        direction = df.DIRECTION.unique()[0]
        print("route:",Route,"\n","direction:", direction)
        final_df = Final_Clean(df, direction)
        Decision_Tree_Regressor(final_df)  
    
    elif len(df.DIRECTION.unique()) == 2:
        for direction in range(1,3):
            print("route:",Route,"\n","direction:", direction)
            final_df = Final_Clean(df, direction)
            Decision_Tree_Regressor(final_df) 

//--------------------------------------//
route: 27 
 direction: 1
Mean Absolute Error (train): 15.452068494166717
Mean Absolute Error (test): 15.825529837602218
Mean Percentage Error: -0.16550811751070127
Mean Absolute Percentage Error: 0.34456865334900333
Mean Squared Error: 598.8303120468927
Root Mean Squared Error: 24.471009624592376
R-square value: 0.5626871576185384
route: 27 
 direction: 2
Mean Absolute Error (train): 15.917429108480722
Mean Absolute Error (test): 16.19930591666838
Mean Percentage Error: -0.19558987232977673
Mean Absolute Percentage Error: 0.3864671421531076
Mean Squared Error: 565.906826357951
Root Mean Squared Error: 23.78879623600049
R-square value: 0.6075609298347686
//--------------------------------------//
route: 15A 
 direction: 1
Mean Absolute Error (train): 13.988858016741894
Mean Absolute Error (test): 15.04955727964218
Mean Percentage Error: -0.1518455191821835
Mean Absolute Percentage Error: 0.3339923969161343
Mean Squared Error: 541.3893127397712


Mean Absolute Percentage Error: 0.3214415928884569
Mean Squared Error: 666.3155101205114
Root Mean Squared Error: 25.813087961739708
R-square value: 0.5964105022852948
//--------------------------------------//
route: 84A 
 direction: 1
Mean Absolute Error (train): 17.76190740053102
Mean Absolute Error (test): 19.519667693457922
Mean Percentage Error: -0.14826499084977554
Mean Absolute Percentage Error: 0.3320812752065931
Mean Squared Error: 1125.4114089898353
Root Mean Squared Error: 33.54715202502047
R-square value: 0.6786125137806123
route: 84A 
 direction: 2
Mean Absolute Error (train): 14.173739658005326
Mean Absolute Error (test): 15.069309599399467
Mean Percentage Error: -0.17860453878786403
Mean Absolute Percentage Error: 0.3526353091748751
Mean Squared Error: 617.3870267437005
Root Mean Squared Error: 24.84727403043844
R-square value: 0.5178223919797182
//--------------------------------------//
route: 220 
 direction: 1
Mean Absolute Error (train): 14.090058295728285
Mean Abs

//--------------------------------------//
route: 26 
 direction: 1
Mean Absolute Error (train): 14.665529959939779
Mean Absolute Error (test): 16.306259049298404
Mean Percentage Error: -0.11061374845788219
Mean Absolute Percentage Error: 0.27769400075190764
Mean Squared Error: 670.1294789473803
Root Mean Squared Error: 25.88685919433604
R-square value: 0.7170090982087344
route: 26 
 direction: 2
Mean Absolute Error (train): 15.16294056233862
Mean Absolute Error (test): 17.06283895284901
Mean Percentage Error: -0.11580494608379263
Mean Absolute Percentage Error: 0.2837594887478318
Mean Squared Error: 821.3420158007967
Root Mean Squared Error: 28.659065159226614
R-square value: 0.6511279357904218
//--------------------------------------//
route: 66A 
 direction: 1
Mean Absolute Error (train): 13.331024043973349
Mean Absolute Error (test): 14.206848584155075
Mean Percentage Error: -0.11166463304794827
Mean Absolute Percentage Error: 0.2718565803165706
Mean Squared Error: 500.920897429475

//--------------------------------------//
route: 104 
 direction: 1
Mean Absolute Error (train): 10.87038917216915
Mean Absolute Error (test): 11.931805750087365
Mean Percentage Error: -0.06686411600995695
Mean Absolute Percentage Error: 0.20766921715705328
Mean Squared Error: 392.32787310886863
Root Mean Squared Error: 19.80726818894692
R-square value: 0.7716958987720473
route: 104 
 direction: 2
Mean Absolute Error (train): 13.403886024166628
Mean Absolute Error (test): 14.494477701525845
Mean Percentage Error: -0.1146050592554986
Mean Absolute Percentage Error: 0.2785632081136105
Mean Squared Error: 512.224589833906
Root Mean Squared Error: 22.632379234934756
R-square value: 0.710511534264695
//--------------------------------------//
route: 33A 
 direction: 1
Mean Absolute Error (train): 8.479529128259115
Mean Absolute Error (test): 8.760511501119739
Mean Percentage Error: -0.08450696176410019
Mean Absolute Percentage Error: 0.22640338560610648
Mean Squared Error: 196.504357604994

Mean Absolute Error (train): 16.870520224316664
Mean Absolute Error (test): 18.91371962989314
Mean Percentage Error: -0.14100549175308896
Mean Absolute Percentage Error: 0.3302232480299687
Mean Squared Error: 841.8498781506499
Root Mean Squared Error: 29.014649371492496
R-square value: 0.7018670344010499
route: 38D 
 direction: 2
Mean Absolute Error (train): 13.940973828981189
Mean Absolute Error (test): 15.962420075959237
Mean Percentage Error: -0.12834577015961432
Mean Absolute Percentage Error: 0.31570167598870863
Mean Squared Error: 631.7852836993846
Root Mean Squared Error: 25.135339339252706
R-square value: 0.5184492927717423
//--------------------------------------//
route: 118 
 direction: 2
Mean Absolute Error (train): 19.464153968084513
Mean Absolute Error (test): 20.886988893839792
Mean Percentage Error: -0.14543052324332031
Mean Absolute Percentage Error: 0.3329071660869437
Mean Squared Error: 1011.8414907563929
Root Mean Squared Error: 31.809455995920345
R-square value: 0.

Mean Absolute Error (train): 21.00807119492397
Mean Absolute Error (test): 22.69192879558487
Mean Percentage Error: -0.1382565724281748
Mean Absolute Percentage Error: 0.334767882104677
Mean Squared Error: 1175.1356024279305
Root Mean Squared Error: 34.28025090964082
R-square value: 0.7116713586371524
route: 25X 
 direction: 2
Mean Absolute Error (train): 23.349682152483336
Mean Absolute Error (test): 24.953443118699028
Mean Percentage Error: -0.1378774234796775
Mean Absolute Percentage Error: 0.32212239066060044
Mean Squared Error: 1498.1958701419856
Root Mean Squared Error: 38.70653523814791
R-square value: 0.7816168123654661
//--------------------------------------//
route: 40E 
 direction: 1
Mean Absolute Error (train): 6.806889523177112
Mean Absolute Error (test): 8.03206713221906
Mean Percentage Error: -0.07668421217120737
Mean Absolute Percentage Error: 0.21805238631132556
Mean Squared Error: 182.3101306367905
Root Mean Squared Error: 13.502226876955907
R-square value: 0.8372271

In [7]:
df_104 = pd.read_csv("~/data/Cleaned_By_Route_Header/route_104.csv")

Try out Decision tree model for a range of max depths, n=10 seems to be the optimal amount.

In [35]:
depths = [5,10,20,30]
for direction in range(1,3):
    for depth in depths:
        print("//--------------------------------------//")
        print("Direction:",direction, '\n', "depth:", depth)
        final_df = Final_Clean(df_120, direction)
        Decision_Tree_Regressor(final_df, depth)

//--------------------------------------//
Direction: 1 
 depth: 5
Mean Absolute Error (train): 22.46715627389422
Mean Absolute Error (test): 22.62058286111831
Mean Squared Error: 1749.935694279427
Root Mean Squared Error: 41.83223271927315
R-square value: 0.46914406431041455
//--------------------------------------//
Direction: 1 
 depth: 10
Mean Absolute Error (train): 16.78935791859173
Mean Absolute Error (test): 17.141907465323936
Mean Squared Error: 1401.5584107876946
Root Mean Squared Error: 37.437393215710074
R-square value: 0.5748268899168445
//--------------------------------------//
Direction: 1 
 depth: 20
Mean Absolute Error (train): 13.117213195689933
Mean Absolute Error (test): 15.273827585624538
Mean Squared Error: 1500.5564568014147
Root Mean Squared Error: 38.73701662236542
R-square value: 0.54479510045175
//--------------------------------------//
Direction: 1 
 depth: 30
Mean Absolute Error (train): 10.673758192623252
Mean Absolute Error (test): 15.604994842975282
Me

In [24]:
for direction in range(1,3):
    print("//--------------------------------------//")
    print("Direction:",direction, '\n')
    final_df = Final_Clean(df_120, direction)
    Decision_Tree_Regressor(final_df)

//--------------------------------------//
Direction: 1 

Mean Absolute Error (train): 16.78935791859173
Mean Absolute Error (test): 17.13298394945538
Mean Squared Error: 1396.8637773085825
Root Mean Squared Error: 37.37464083183386
R-square value: 0.576251041705062
//--------------------------------------//
Direction: 2 

Mean Absolute Error (train): 18.343329014403526
Mean Absolute Error (test): 18.703171199102258
Mean Squared Error: 1443.252699500082
Root Mean Squared Error: 37.99016582617246
R-square value: 0.4555192720771404


In [22]:
for direction in range(1,3):
    print("//--------------------------------------//")
    print("Direction:",direction, '\n')
    final_df = Final_Clean(df_104, direction)
    Decision_Tree_Regressor(final_df)

//--------------------------------------//
Direction: 1 

Mean Absolute Error (train): 18.34890933029717
Mean Absolute Error (test): 19.70109485809695
Mean Squared Error: 2483.5853485905955
Root Mean Squared Error: 49.83558315692308
R-square value: 0.3991381965438606
//--------------------------------------//
Direction: 2 

Mean Absolute Error (train): 22.33260872503538
Mean Absolute Error (test): 23.313833713671308
Mean Squared Error: 3598.9086077805196
Root Mean Squared Error: 59.99090437541778
R-square value: 0.22169437601596698
