In [1]:
import itertools
import json
import os
from random import sample

import contextily as cx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

from database import data_utils
from models import time_table_model

# Re-execute the already-imported local module so that edits made to
# models/time_table_model.py are picked up without restarting the kernel.
import importlib
importlib.reload(time_table_model)

<module 'models.time_table_model' from '/Users/zack/Desktop/valle/src/models/time_table_model.py'>

In [None]:
# Load the DeepTTE result files: space-separated, no header row.
# The two columns are named "label" and "pred" for use by the metric cells.
kcm_deeptte_preds = pd.read_csv(
    "../results/kcm2weeks/deeptte.res", sep=" ", header=None
).set_axis(["label", "pred"], axis=1)

nwy_deeptte_preds = pd.read_csv(
    "../results/nwy2weeks/deeptte.res", sep=" ", header=None
).set_axis(["label", "pred"], axis=1)

In [None]:
# Read in the per-dataset config files.
# The KCM config used to be loaded into `nwy_config` and was immediately
# overwritten by the NWY one, so `kcm_config` was never defined.
with open("../results/kcm2weeks/data/config.json") as f:
    kcm_config = json.load(f)

with open("../results/nwy2weeks/data/config.json") as f:
    nwy_config = json.load(f)

In [None]:
# Read in test data: one JSON object per line.
# The KCM file is now read into `kcm_contents` (it was assigned to
# `nwy_contents`, leaving `kcm_contents` undefined or stale on the next line),
# and files are opened in a `with` block so the handles are closed.
with open("../results/kcm2weeks/data/test", "r") as f:
    kcm_contents = f.read()
kcm_test_data = [json.loads(item) for item in kcm_contents.strip().split('\n')]

with open("../results/nwy2weeks/data/test", "r") as f:
    nwy_contents = f.read()
nwy_test_data = [json.loads(item) for item in nwy_contents.strip().split('\n')]

In [None]:
# Read in train data: five shards per dataset (train_00 .. train_04), each
# holding one JSON object per line. All shards are concatenated into a single
# flat list per dataset. Files are opened in a `with` block so every handle is
# closed as soon as its shard has been read.
kcm_train_data = []
for i in range(5):
    with open(f"../results/kcm2weeks/data/train_0{i}", "r") as f:
        kcm_contents = f.read()
    kcm_train_data.extend(json.loads(item) for item in kcm_contents.strip().split('\n'))

nwy_train_data = []
for i in range(5):
    with open(f"../results/nwy2weeks/data/train_0{i}", "r") as f:
        nwy_contents = f.read()
    nwy_train_data.extend(json.loads(item) for item in nwy_contents.strip().split('\n'))

In [None]:
def hourly_avg_speed_baseline(train_data, test_data):
    """Predict travel times from the mean training speed of each hour bucket.

    Each record is a dict read with the keys 'dist', 'time' and 'timeID'.
    The hour bucket is ``timeID // 60`` and a trip's speed is ``dist / time``
    (the original cell annotated this as km/s -- TODO confirm units).

    Parameters
    ----------
    train_data : list of dict
        Trips used to compute the mean speed per hour bucket.
    test_data : list of dict
        Trips to predict; only 'dist' and 'timeID' are read.

    Returns
    -------
    avg_speeds : dict
        ``{"speed": {hour: mean_speed}}`` as produced by ``DataFrame.to_dict()``.
    preds : list of float
        ``dist / mean_speed[hour]`` for every test trip, in input order.

    Raises
    ------
    KeyError
        If a test trip falls in an hour bucket that has no training trips.
    ZeroDivisionError
        If a training trip has ``time == 0``.
    """
    train_speeds = pd.DataFrame({
        "hour": [trip['timeID'] // 60 for trip in train_data],
        "speed": [trip['dist'] / trip['time'] for trip in train_data],
    })
    avg_speeds = train_speeds.groupby("hour").mean().to_dict()
    speed_by_hour = avg_speeds['speed']
    preds = [trip['dist'] / speed_by_hour[trip['timeID'] // 60] for trip in test_data]
    return avg_speeds, preds


# Historical-average baseline: mean speed grouped by hour on the training
# set, then travel time = distance / mean speed of the trip's hour.
nwy_avg_speeds, nwy_avg_preds = hourly_avg_speed_baseline(nwy_train_data, nwy_test_data)
kcm_avg_speeds, kcm_avg_preds = hourly_avg_speed_baseline(kcm_train_data, kcm_test_data)

In [None]:
# Resample the GPS points of every trip to a fixed count of 128,
# train split first and then test split, for each dataset.
nwy_train_data_resample, nwy_test_data_resample = [
    data_utils.resample_deeptte_gps(trips, 128)
    for trips in (nwy_train_data, nwy_test_data)
]

kcm_train_data_resample, kcm_test_data_resample = [
    data_utils.resample_deeptte_gps(trips, 128)
    for trips in (kcm_train_data, kcm_test_data)
]

In [None]:
# Turn each (raw trips, resampled GPS) pair into a feature matrix X and a
# target vector y for training/testing the additional models.
(X_train_nwy, y_train_nwy), (X_test_nwy, y_test_nwy) = [
    data_utils.format_deeptte_to_features(raw, resampled)
    for raw, resampled in (
        (nwy_train_data, nwy_train_data_resample),
        (nwy_test_data, nwy_test_data_resample),
    )
]

(X_train_kcm, y_train_kcm), (X_test_kcm, y_test_kcm) = [
    data_utils.format_deeptte_to_features(raw, resampled)
    for raw, resampled in (
        (kcm_train_data, kcm_train_data_resample),
        (kcm_test_data, kcm_test_data_resample),
    )
]

In [None]:
# Train one GBDT per dataset on the training features, then predict on the
# test features. `random_state=0` keeps the fitted trees reproducible.
# The stray `GradientBoostingRegressor(random_state=0)` lines that followed
# each `fit` call (a pasted repr of the fit output) were removed: they only
# constructed and discarded an unused estimator.
nwy_reg = GradientBoostingRegressor(random_state=0)
nwy_reg.fit(X_train_nwy, y_train_nwy)
nwy_gbdt_preds = nwy_reg.predict(X_test_nwy)

kcm_reg = GradientBoostingRegressor(random_state=0)
kcm_reg.fit(X_train_kcm, y_train_kcm)
kcm_gbdt_preds = kcm_reg.predict(X_test_kcm)

In [11]:
# Use the schedule (timetable) model to make preds on the test traces.
# NOTE: pickle.load executes arbitrary code from the file; only load
# test_traces.pkl files that were produced by this project.
with open('../results/nwy2weeks/data/test_traces.pkl', 'rb') as f:
    nwy_test_traces = pickle.load(f)
nwy_sch = time_table_model.TimeTableModel("../data/nwy_gtfs/2weeks/", "Europe/Oslo")
nwy_sch_preds, nwy_sch_labels = nwy_sch.predict_using_schedule_only(nwy_test_traces)

# The Seattle block is re-enabled: the model comparison cell reads
# kcm_sch_preds / kcm_sch_labels, which were otherwise never defined.
# NOTE(review): confirm that "../data/kcm_gtfs/" is the intended GTFS folder
# (the Trondheim path has an extra "2weeks/" component).
with open('../results/kcm2weeks/data/test_traces.pkl', 'rb') as f:
    kcm_test_traces = pickle.load(f)
kcm_sch = time_table_model.TimeTableModel("../data/kcm_gtfs/", "America/Los_Angeles")
kcm_sch_preds, kcm_sch_labels = kcm_sch.predict_using_schedule_only(kcm_test_traces)

  st = pd.read_csv(self.gtfs_folder+"stop_times.txt")
  sl = pd.read_csv(self.gtfs_folder+"stops.txt")


AttributeError: 'TimeTableModel' object has no attribute 'predict'

In [25]:
# Preview the first rows of the nwy test traces loaded from test_traces.pkl.
nwy_test_traces.head()

Unnamed: 0,datedvehiclejourney,dataframe,vehicle,mode,line,linename,direction,operator,datasource,lat,...,dist_calc,speed_m_s,dist_calc_km,time_cumulative,dist_cumulative,datetime,dateID,weekID,timeID,actual_time_from_midnight
75978,624-100001-170-2246-20220915-5852266,2022-09-15,1703,bus,ATB:Line:1600010,10,Outbound,160,ATB,63.3953888,...,23.589686,1.123318,0.02359,0.0,0.0,2022-09-15 07:00:19+02:00,15,3,420,25219
75979,624-100001-170-2246-20220915-5852266,2022-09-15,1703,bus,ATB:Line:1600010,10,Outbound,160,ATB,63.3974473,...,260.169892,8.392577,0.26017,31.0,0.26017,2022-09-15 07:00:50+02:00,15,3,420,25250
75980,624-100001-170-2246-20220915-5852266,2022-09-15,1703,bus,ATB:Line:1600010,10,Outbound,160,ATB,63.3987772,...,154.484363,7.724218,0.154484,51.0,0.414654,2022-09-15 07:01:10+02:00,15,3,421,25270
75981,624-100001-170-2246-20220915-5852266,2022-09-15,1703,bus,ATB:Line:1600010,10,Outbound,160,ATB,63.4001098,...,162.393484,8.119674,0.162393,71.0,0.577048,2022-09-15 07:01:30+02:00,15,3,421,25290
75982,624-100001-170-2246-20220915-5852266,2022-09-15,1703,bus,ATB:Line:1600010,10,Outbound,160,ATB,63.4018095,...,214.246767,10.202227,0.214247,92.0,0.791295,2022-09-15 07:01:51+02:00,15,3,421,25311


In [5]:
# Compare different methods for predicting travel times of the test data.
#
# sklearn metrics take (y_true, y_pred). MAPE divides by |y_true|, so the
# order matters: GBDT/AVG/SCH used to be passed as (pred, label) while DeepTTE
# was passed as (label, pred), which made the MAPE rows incomparable.
# Every entry below is an explicit (y_true, y_pred) pair.
nwy_test_labels = [x['time'] for x in nwy_test_data]
kcm_test_labels = [x['time'] for x in kcm_test_data]

labels_and_preds = {
    "Trondheim": {
        "DeepTTE": (nwy_deeptte_preds.label, nwy_deeptte_preds.pred),
        "GBDT": (nwy_test_labels, nwy_gbdt_preds),
        "AVG": (nwy_test_labels, nwy_avg_preds),
        "SCH": (nwy_sch_labels, nwy_sch_preds),
    },
    "Seattle": {
        "DeepTTE": (kcm_deeptte_preds.label, kcm_deeptte_preds.pred),
        "GBDT": (kcm_test_labels, kcm_gbdt_preds),
        "AVG": (kcm_test_labels, kcm_avg_preds),
        "SCH": (kcm_sch_labels, kcm_sch_preds),
    },
}

# Metric name -> callable(y_true, y_pred). RMSE is the square root of MSE.
metric_functions = {
    "MAPE": metrics.mean_absolute_percentage_error,
    "RMSE": lambda y_true, y_pred: np.sqrt(metrics.mean_squared_error(y_true, y_pred)),
    "MAE": metrics.mean_absolute_error,
}

# Nested result: metric -> city -> model -> score.
model_performances = {
    metric_name: {
        city: {
            model: score(y_true, y_pred)
            for model, (y_true, y_pred) in model_pairs.items()
        }
        for city, model_pairs in labels_and_preds.items()
    }
    for metric_name, score in metric_functions.items()
}

# Flatten to long format: one row per (Model, City, Metric).
metric_frames = []
for metric, city_scores in model_performances.items():
    df = pd.DataFrame(city_scores).reset_index()
    df = df.melt(id_vars="index", value_vars=['Trondheim', 'Seattle'])
    df['Metric'] = metric
    metric_frames.append(df)
model_performances_df = pd.concat(metric_frames)
model_performances_df.columns = ["Model","City","value","Metric"]
model_performances_df

NameError: name 'nwy_deeptte_preds' is not defined

In [None]:
# Bar chart of MAPE per model, one bar colour per city, saved to disk.
fig, ax = plt.subplots(1,1)
is_mape = model_performances_df['Metric']=='MAPE'
sns.barplot(
    data=model_performances_df.loc[is_mape],
    x="Model",
    y="value",
    hue="City",
    ax=ax,
)
fig.savefig("../plots/model_performances.png", dpi=1800, bbox_inches='tight')

In [None]:
# Names for the GBDT feature columns, used to label feature importances:
# five trip-level attributes followed by 128 latitude and 128 longitude
# columns (128 matches the GPS resampling count used earlier).
feature_names = [
    'timeID', 'weekID', 'dateID', 'driverID', 'dist',
    *(f"lat_{x}" for x in range(128)),
    *(f"lng_{x}" for x in range(128)),
]

In [None]:
# Rank features by GBDT importance, highest first. Each model is sorted once
# as (importance, name) pairs and then split into two parallel lists
# (previously every model was sorted twice and `_` was read as a real value).
nwy_ranked = sorted(zip(nwy_reg.feature_importances_, feature_names), reverse=True)
nwy_importances_sorted = [importance for importance, _ in nwy_ranked]
nwy_features_sorted = [name for _, name in nwy_ranked]

kcm_ranked = sorted(zip(kcm_reg.feature_importances_, feature_names), reverse=True)
kcm_importances_sorted = [importance for importance, _ in kcm_ranked]
kcm_features_sorted = [name for _, name in kcm_ranked]

# Read out names of the top 10 features per model
print(nwy_features_sorted[:10])
print(kcm_features_sorted[:10])

In [None]:
# Bar chart of the 10 most important features of the nwy GBDT, saved to disk.
fig, ax = plt.subplots(1,1)
ax.bar(nwy_features_sorted[:10], nwy_importances_sorted[:10])
fig.savefig("../plots/nwy_gbdt_importances.png", dpi=1800, bbox_inches='tight')

In [None]:
# Bar chart of the 10 most important features of the kcm GBDT, saved to disk.
fig, ax = plt.subplots(1,1)
ax.bar(kcm_features_sorted[:10], kcm_importances_sorted[:10])
fig.savefig("../plots/kcm_gbdt_importances.png", dpi=1800, bbox_inches='tight')

In [None]:
# Plot nwy GBDT importances for every feature except 'dist'.
# 'dist' is excluded by name; slicing off the first ranked entry would drop
# the wrong feature whenever 'dist' is not the top-ranked one.
nwy_non_dist = [
    (name, importance)
    for name, importance in zip(nwy_features_sorted, nwy_importances_sorted)
    if name != 'dist'
]
fig, ax = plt.subplots(1,1)
ax.bar(
    [name for name, _ in nwy_non_dist],
    [importance for _, importance in nwy_non_dist],
)
fig.savefig("../plots/nwy_gbdt_non_dist_importances.png", dpi=1800, bbox_inches='tight')

In [None]:
# Plot kcm GBDT importances for every feature except 'dist'.
# 'dist' is excluded by name; slicing off the first ranked entry would drop
# the wrong feature whenever 'dist' is not the top-ranked one.
kcm_non_dist = [
    (name, importance)
    for name, importance in zip(kcm_features_sorted, kcm_importances_sorted)
    if name != 'dist'
]
fig, ax = plt.subplots(1,1)
ax.bar(
    [name for name, _ in kcm_non_dist],
    [importance for _, importance in kcm_non_dist],
)
fig.savefig("../plots/kcm_gbdt_non_dist_importances.png", dpi=1800, bbox_inches='tight')