In [None]:
import itertools
import json
import os
import pickle
from math import radians, cos, sin, asin, sqrt
from random import sample
from zipfile import ZipFile

import contextily as cx
import geopandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shapely.geometry
from matplotlib import pyplot
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn import metrics
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

from database import data_utils

import importlib
importlib.reload(data_utils)

In [None]:
# Load the DeepTTE result files for both networks. Each file is
# space-separated with no header row; the two columns are named
# "label" and "pred" after loading.
deeptte_result_columns = ["label", "pred"]

kcm_results = pd.read_csv("../results/kcm2weeks/deeptte.res", sep=" ", header=None)
kcm_results.columns = deeptte_result_columns

nwy_results = pd.read_csv("../results/nwy2weeks/deeptte.res", sep=" ", header=None)
nwy_results.columns = deeptte_result_columns

In [None]:
# Load the JSON config stored next to each network's data files.
with open("../results/kcm2weeks/data/config.json") as f:
    kcm_config = json.loads(f.read())

with open("../results/nwy2weeks/data/config.json") as f:
    nwy_config = json.loads(f.read())

In [None]:
# Read in test data: each non-empty line of the file is one JSON document.
# The files are opened with context managers so the handles are closed as
# soon as the contents have been read (the bare open(...).read() leaked them).
with open("../results/kcm2weeks/data/test", "r") as f:
    kcm_contents = f.read()
kcm_test_data = [json.loads(item) for item in kcm_contents.strip().split('\n')]

with open("../results/nwy2weeks/data/test", "r") as f:
    nwy_contents = f.read()
nwy_test_data = [json.loads(item) for item in nwy_contents.strip().split('\n')]

In [None]:
# Read in train data: five shard files (train_00 .. train_04) per network,
# each holding one JSON document per line. All shards are concatenated into
# a single flat list. Every shard is opened with a context manager so its
# handle is closed before the next one is read.
kcm_train_data = []
for i in range(5):
    with open(f"../results/kcm2weeks/data/train_0{i}", "r") as f:
        kcm_contents = f.read()
    kcm_train_data.extend(json.loads(item) for item in kcm_contents.strip().split('\n'))

nwy_train_data = []
for i in range(5):
    with open(f"../results/nwy2weeks/data/train_0{i}", "r") as f:
        nwy_contents = f.read()
    nwy_train_data.extend(json.loads(item) for item in nwy_contents.strip().split('\n'))

In [None]:
# Baseline predictor: average speed grouped by time of day, applied to the
# test trips. The same procedure is run for both networks via one helper.
def _hourly_speed_baseline(train_data, test_data):
    """Predict test travel times from hour-of-day average training speeds.

    Every trip is a dict; the keys read here are 'dist', 'time' and 'timeID'.
    Trips are bucketed by ``timeID // 60``. A trip's speed is dist / time
    (annotated as km/s in the original notebook; units are not checked here).

    Args:
        train_data: iterable of training trips (needs 'dist', 'time', 'timeID').
        test_data: iterable of test trips (needs 'dist', 'timeID').

    Returns:
        A tuple ``(avg_speeds, preds)`` where ``avg_speeds`` is the dict
        ``{"speed": {hour: mean_speed}}`` and ``preds`` is a list with one
        predicted travel time (dist / mean speed of the trip's hour) per
        test trip, in the order of ``test_data``.
    """
    train_speeds = pd.DataFrame({
        "hour": [trip['timeID'] // 60 for trip in train_data],
        "speed": [trip['dist'] / trip['time'] for trip in train_data],
    })
    avg_speeds = train_speeds.groupby("hour").mean().to_dict()
    speed_by_hour = avg_speeds["speed"]
    # An hour that never occurs in the training data has no bucket; use the
    # overall mean training speed for it instead of failing with a KeyError.
    fallback_speed = train_speeds["speed"].mean()
    preds = [
        trip['dist'] / speed_by_hour.get(trip['timeID'] // 60, fallback_speed)
        for trip in test_data
    ]
    return avg_speeds, preds


nwy_avg_speeds, nwy_avg_preds = _hourly_speed_baseline(nwy_train_data, nwy_test_data)
kcm_avg_speeds, kcm_avg_preds = _hourly_speed_baseline(kcm_train_data, kcm_test_data)

In [None]:
# Pass every train/test split through data_utils.resample_deeptte_gps with a
# target of 128 (per the original note: resample GPS points to a fixed number).
n_gps_points = 128
resample_gps = data_utils.resample_deeptte_gps

nwy_train_data_resample = resample_gps(nwy_train_data, n_gps_points)
nwy_test_data_resample = resample_gps(nwy_test_data, n_gps_points)

kcm_train_data_resample = resample_gps(kcm_train_data, n_gps_points)
kcm_test_data_resample = resample_gps(kcm_test_data, n_gps_points)

In [None]:
# Build (X, y) pairs for the additional models from each split's raw trips
# and their resampled GPS data (per the original note, X is a 2d np array).
to_features = data_utils.format_deeptte_to_features

X_train_nwy, y_train_nwy = to_features(nwy_train_data, nwy_train_data_resample)
X_test_nwy, y_test_nwy = to_features(nwy_test_data, nwy_test_data_resample)

X_train_kcm, y_train_kcm = to_features(kcm_train_data, kcm_train_data_resample)
X_test_kcm, y_test_kcm = to_features(kcm_test_data, kcm_test_data_resample)

In [None]:
# Train one GBDT regressor per network on its training features, then
# predict travel times for that network's test features. random_state is
# fixed so repeated runs give the same model.
nwy_reg = GradientBoostingRegressor(random_state=0)
nwy_reg.fit(X_train_nwy, y_train_nwy)
nwy_gbdt_preds = nwy_reg.predict(X_test_nwy)

kcm_reg = GradientBoostingRegressor(random_state=0)
kcm_reg.fit(X_train_kcm, y_train_kcm)
kcm_gbdt_preds = kcm_reg.predict(X_test_kcm)

In [None]:
# Compare the three prediction methods (DeepTTE, hourly-average baseline,
# GBDT) on the test data of both networks.
def _print_scores(header, y_true, y_pred):
    """Print a header line followed by MAPE, RMSE and MAE of y_pred vs y_true."""
    print(header)
    print(f"MAPE: {metrics.mean_absolute_percentage_error(y_true, y_pred)}")
    print(f"RMSE: {np.sqrt(metrics.mean_squared_error(y_true, y_pred))}")
    print(f"MAE: {metrics.mean_absolute_error(y_true, y_pred)}")


# DeepTTE is scored against the labels stored in its own result files.
print("DeepTTE")
_print_scores("-KCM-", kcm_results.label, kcm_results.pred)
_print_scores("-NWY-", nwy_results.label, nwy_results.pred)
print()

# The remaining methods are scored against the 'time' field of the test trips.
print("Average")
kcm_test_times = [trip['time'] for trip in kcm_test_data]
_print_scores("-KCM-", kcm_test_times, kcm_avg_preds)
nwy_test_times = [trip['time'] for trip in nwy_test_data]
_print_scores("-NWY-", nwy_test_times, nwy_avg_preds)
print()

print("GBDT")
_print_scores("-KCM-", kcm_test_times, kcm_gbdt_preds)
_print_scores("-NWY-", nwy_test_times, nwy_gbdt_preds)


In [None]:
# Names used to interpret feature importances: five trip attributes followed
# by 128 latitude and then 128 longitude entries.
# NOTE(review): presumably mirrors the column order produced by
# data_utils.format_deeptte_to_features -- confirm against that helper.
trip_attribute_names = ['timeID', 'weekID', 'dateID', 'driverID', 'dist']
feature_names = [
    *trip_attribute_names,
    *(f"lat_{idx}" for idx in range(128)),
    *(f"lng_{idx}" for idx in range(128)),
]
feature_names

In [None]:
# Bar chart of the NWY GBDT feature importances, one bar per feature index.
# Uses the notebook-wide `plt` import rather than re-importing matplotlib
# in the middle of the notebook.
nwy_importances = nwy_reg.feature_importances_
fig, ax = plt.subplots()
ax.bar(range(len(nwy_importances)), nwy_importances)
ax.set_xlabel("Feature index")
ax.set_ylabel("Importance")
ax.set_title("NWY GBDT feature importances")
plt.show()

In [None]:
# Index of the feature with the largest importance in the NWY GBDT model.
nwy_reg.feature_importances_.argmax()

In [None]:
# Values of the most important NWY feature across the NWY test set.
# The index comes from nwy_reg, so it must be applied to the NWY test matrix;
# the bare name `X_test` is never defined in this notebook and raised NameError.
X_test_nwy[:, np.argmax(nwy_reg.feature_importances_)]

In [None]:
# Echo the resampled NWY test data as the cell output for inspection.
# NOTE(review): this displays the whole object; its type and size depend on
# data_utils.resample_deeptte_gps -- consider showing only a small slice.
nwy_test_data_resample