In [None]:
import importlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from models import grid
from utils import data_utils, shape_utils

# Folder passed to data_utils.load_all_inputs as the root of the run to inspect.
RUN_FOLDER = "../results/debug/"
# Network selector; in the visible cells it is only read by the commented-out
# adjacent trip analysis at the bottom of the notebook.
NETWORK = "kcm"

In [None]:
def _load_network_inputs(network_folder):
    """Load all inputs for one network sub-folder of RUN_FOLDER.

    The trailing 0 is forwarded unchanged to data_utils.load_all_inputs;
    NOTE(review): its meaning is defined by that helper -- confirm there.
    """
    return data_utils.load_all_inputs(RUN_FOLDER, network_folder, 0)


# One input bundle per network, both read from the same run folder.
kcm_inputs = _load_network_inputs("kcm/")
atb_inputs = _load_network_inputs("atb/")

In [None]:
# Inspect one KCM shingle: the id at position 55 of the sorted unique shingle ids
test_traces = kcm_inputs['test_traces']
unique_shingle_ids = np.unique(test_traces['shingle_id'])
in_selected_shingle = test_traces['shingle_id'] == unique_shingle_ids[55]
shingle_data = test_traces[in_selected_shingle]
# Overview plot of the selected shingle on a single axes
plot_data = shingle_data
fig, axes = plt.subplots(nrows=1, ncols=1)
# shape_utils.plot_gtfs_trip(axes, plot_data['trip_id'].iloc[0], kcm_inputs['gtfs_data'], kcm_inputs['config']['epsg'])
kcm_epsg = kcm_inputs['config']['epsg']
shape_utils.plot_gtfsrt_trip(axes, plot_data, kcm_epsg)

In [None]:
# Inspect one AtB shingle: the first id in the sorted unique shingle ids
test_traces = atb_inputs['test_traces']
first_shingle_id = np.unique(test_traces['shingle_id'])[0]
shingle_data = test_traces[test_traces['shingle_id'] == first_shingle_id]
# Plot overview of the shingle
plot_data = shingle_data
fig, axes = plt.subplots(1, 1)
# shape_utils.plot_gtfs_trip(axes, plot_data['trip_id'].iloc[0], atb_inputs['gtfs_data'], atb_inputs['config']['epsg'])
shape_utils.plot_gtfsrt_trip(axes, plot_data, atb_inputs['config']['epsg'])

In [None]:
# Examine the forward-filled training grids of both networks
grid_kcm = kcm_inputs['train_grid_ffill']
grid_atb = atb_inputs['train_grid_ffill']


def _format_stats(values):
    """Return the values followed by their mean, std, min and max as one string."""
    return (
        f"{values}, mean: {np.mean(values)}, std: {np.std(values)}, "
        f"min: {np.min(values)}, max: {np.max(values)}"
    )


# (label, grid) pairs in the order their lines are printed.
labelled_grids = (("KCM", grid_kcm), ("AtB", grid_atb))

for grid_label, network_grid in labelled_grids:
    print(f"{grid_label} grid density: {network_grid.get_density()}, fill density: {network_grid.get_fill_density()}")
# Each masked array is fetched once per grid and reused for every statistic,
# instead of calling the getter again for each number on the line.
for grid_label, network_grid in labelled_grids:
    print(f"{grid_label} grid cell speeds: {_format_stats(network_grid.get_masked_content())}")
for grid_label, network_grid in labelled_grids:
    print(f"{grid_label} grid sample sizes: {_format_stats(network_grid.get_masked_counts())}")

In [None]:
# Grid features along one KCM training sample (index 1 of train_data)
sample = kcm_inputs['train_data'][1]
tbins, xbins, ybins = (sample[bin_key] for bin_key in ('tbin_idx', 'xbin_idx', 'ybin_idx'))
per_step_features = grid.extract_grid_features(
    grid_kcm.get_fill_content(),
    tbins,
    xbins,
    ybins,
    kcm_inputs['config'],
    buffer=3,
)
# Join the per-step results along a new leading axis.
grid_features = np.stack(list(per_step_features), axis=0)
print(grid_features.shape) # (tsteps, channels, latsteps, lonsteps)
# The min/max passed to the animation are taken over the first four channels only.
first_four_channels = grid_features[:, :4, :, :]
grid.save_grid_anim(
    grid_features,
    "grid_speeds_shingle.mp4",
    np.min(first_four_channels),
    np.max(first_four_channels),
)

In [None]:
# # Save plots of full grid features
# grid_features = grid_kcm.get_fill_content()
# print(grid_features.shape) # (tsteps, channels, latsteps, lonsteps)
# grid.save_grid_anim(grid_features, "grid_speeds_kcm.mp4", np.min(grid_features[:,:4,:,:]), np.max(grid_features[:,:4,:,:]))

In [None]:
# # Save plots of full grid features
# grid_features = grid_atb.get_fill_content()
# print(grid_features.shape) # (tsteps, channels, latsteps, lonsteps)
# grid.save_grid_anim(grid_features, "grid_speeds_atb.mp4", np.min(grid_features[:,:4,:,:]), np.max(grid_features[:,:4,:,:]))

In [None]:
# Histogram of scheduled travel times (KCM training traces)
plot_data = kcm_inputs['train_traces']
# Draw on an explicit figure/axes so the saved image never picks up artists
# left on pyplot's implicit current figure by another cell.
fig, ax = plt.subplots()
sns.histplot(plot_data['scheduled_time_s'], ax=ax)
ax.set_title(f"Scheduled Travel Time (s) (KCM) [{np.min(plot_data['scheduled_time_s'])}, {np.max(plot_data['scheduled_time_s'])}]")
ax.set_xlabel("Travel Time (s)")
fig.savefig("../plots/kcm_scheduled_time_dist.png")

In [None]:
# Histogram of scheduled travel times (AtB training traces)
plot_data = atb_inputs['train_traces']
# Draw on an explicit figure/axes so the saved image never picks up artists
# left on pyplot's implicit current figure by another cell.
fig, ax = plt.subplots()
sns.histplot(plot_data['scheduled_time_s'], ax=ax)
ax.set_title(f"Scheduled Travel Time (s) (AtB) [{np.min(plot_data['scheduled_time_s'])}, {np.max(plot_data['scheduled_time_s'])}]")
ax.set_xlabel("Travel Time (s)")
fig.savefig("../plots/atb_scheduled_time_dist.png")

In [None]:
# Shingle distance: per-shingle sum of dist_calc_m over the KCM training traces
# Select the column before aggregating so only dist_calc_m is summed.
metric = kcm_inputs['train_traces'].groupby('shingle_id')['dist_calc_m'].sum()
# Explicit figure/axes: the saved image does not depend on pyplot's current figure.
fig, ax = plt.subplots()
sns.histplot(metric, ax=ax)
ax.set_title(f"Shingle Distances (KCM) [{np.min(metric)}, {np.round(np.max(metric))}]")
ax.set_xlabel("Travel Dist (m)")
fig.savefig("../plots/kcm_shingle_dists.png")

In [None]:
# Shingle distance: per-shingle sum of dist_calc_m over the AtB training traces
# Select the column before aggregating so only dist_calc_m is summed.
metric = atb_inputs['train_traces'].groupby('shingle_id')['dist_calc_m'].sum()
# Explicit figure/axes: the saved image does not depend on pyplot's current figure.
fig, ax = plt.subplots()
sns.histplot(metric, ax=ax)
ax.set_title(f"Shingle Distances (AtB) [{np.min(metric)}, {np.round(np.max(metric))}]")
ax.set_xlabel("Travel Dist (m)")
fig.savefig("../plots/atb_shingle_dists.png")

In [None]:
# Shingle travel time: last (non-null) time_cumulative_s of each KCM training shingle
# Select the column before aggregating so only time_cumulative_s is reduced.
metric = kcm_inputs['train_traces'].groupby('shingle_id')['time_cumulative_s'].last()
# Explicit figure/axes: the saved image does not depend on pyplot's current figure.
fig, ax = plt.subplots()
sns.histplot(metric, ax=ax)
ax.set_title(f"Shingle Travel Times (KCM) [{np.min(metric)}, {np.round(np.max(metric))}]")
ax.set_xlabel("Travel Time (s)")
fig.savefig("../plots/kcm_shingle_times.png")

In [None]:
# Shingle travel time: last (non-null) time_cumulative_s of each AtB training shingle
# Select the column before aggregating so only time_cumulative_s is reduced.
metric = atb_inputs['train_traces'].groupby('shingle_id')['time_cumulative_s'].last()
# Explicit figure/axes: the saved image does not depend on pyplot's current figure.
fig, ax = plt.subplots()
sns.histplot(metric, ax=ax)
ax.set_title(f"Shingle Travel Times (AtB) [{np.min(metric)}, {np.round(np.max(metric))}]")
ax.set_xlabel("Travel Time (s)")
fig.savefig("../plots/atb_shingle_times.png")

In [None]:
# Points per trajectory: count of non-null lat values in each KCM training shingle
# Select the column before aggregating so only lat is counted.
metric = kcm_inputs['train_traces'].groupby(['shingle_id'])['lat'].count()
# Explicit figure/axes: the saved image does not depend on pyplot's current figure.
fig, ax = plt.subplots()
sns.histplot(metric, ax=ax)
ax.set_title(f"Observations per Shingle (KCM) [{np.min(metric)}, {np.round(np.max(metric))}]")
ax.set_xlabel("Observations (n)")
fig.savefig("../plots/kcm_shingle_n.png")

In [None]:
# Points per trajectory: count of non-null lat values in each AtB training shingle
# Select the column before aggregating so only lat is counted.
metric = atb_inputs['train_traces'].groupby(['shingle_id'])['lat'].count()
# Explicit figure/axes: the saved image does not depend on pyplot's current figure.
fig, ax = plt.subplots()
sns.histplot(metric, ax=ax)
ax.set_title(f"Observations per Shingle (AtB) [{np.min(metric)}, {np.round(np.max(metric))}]")
ax.set_xlabel("Observations (n)")
fig.savefig("../plots/atb_shingle_n.png")

In [None]:
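# # Histogram of scheduled gaps between consecutive stop arrivals within a trip (KCM, gaps <= 250 s)
# # NOTE: re-enabling this cell also requires `import pandas as pd` (it calls pd.concat), which the import cell does not provide.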
# gtfs_data = kcm_inputs['gtfs_data']
# x = gtfs_data[['trip_id','arrival_s']]
# y = gtfs_data[['trip_id','arrival_s']].shift()
# y.columns = [colname+"_shift" for colname in y.columns]
# z = pd.concat([x,y], axis=1)
# z = z[z['trip_id']==z['trip_id_shift']]
# z['tt'] = z['arrival_s'] - z['arrival_s_shift']
# z = z[z['tt']<=250]
# z = z.dropna()
# sns.histplot(z.tt)
# plt.title(f"Stop Arrival Gaps (KCM) [{np.min(z['tt'])}, {np.max(z['tt'])}]")
# plt.xlabel("Travel Time (s)")
# plt.axvline(30, 0.0, 20000, color="black")
# plt.savefig("../plots/kcm_gtfs_arrival_gaps.png")

In [None]:
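# # Histogram of scheduled gaps between consecutive stop arrivals within a trip (AtB, gaps <= 250 s)
# # NOTE: re-enabling this cell also requires `import pandas as pd` (it calls pd.concat), which the import cell does not provide.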
# gtfs_data = atb_inputs['gtfs_data']

# x = gtfs_data[['trip_id','arrival_s']]
# y = gtfs_data[['trip_id','arrival_s']].shift()
# y.columns = [colname+"_shift" for colname in y.columns]
# z = pd.concat([x,y], axis=1)
# z = z[z['trip_id']==z['trip_id_shift']]
# z['tt'] = z['arrival_s'] - z['arrival_s_shift']
# z = z[z['tt']<=250]
# z = z.dropna()
# sns.histplot(z.tt)
# plt.title(f"Stop Arrival Gaps (AtB) [{np.min(z['tt'])}, {np.max(z['tt'])}]")
# plt.xlabel("Travel Time (s)")
# plt.axvline(30, 0.0, 20000, color="black")
# plt.savefig("../plots/atb_gtfs_arrival_gaps.png")

In [None]:
# Adjacent trip analysis

In [None]:
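# # NOTE: re-enabling this cell requires names the import cell does not provide:
# # `pd` (pandas), `scipy.stats`, and `Parallel` / `delayed` (presumably from joblib -- confirm).
# # Run parameters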
# shingle_sample_size = 200
# d_buffers = np.linspace(1,1000,20)
# t_buffers = np.linspace(1,60*20,20)
# b_buffer = None
# orthogonal = False

# # Look at speed correlation between adjacent bus trips
# if NETWORK=="kcm":
#     traces = kcm_inputs['test_traces']
# else:
#     traces = atb_inputs['test_traces']
# # Look at only a single hour for the sake of speed (assumes loaded test data comes from single day)
# traces = traces[traces.time<10*60]
# traces = traces[traces.time>=9*60]
# shingle_ids = pd.unique(traces['shingle_id'])
# shingle_id_sample = np.random.choice(shingle_ids, shingle_sample_size, replace=False)

# # Test shingle data against all other data
# shingle_traces = traces[traces['shingle_id'].isin(shingle_id_sample)][['x','y','locationtime','bearing','speed_m_s','shingle_id']]
# shingle_groups = shingle_traces.groupby('shingle_id')
# adj_traces = traces[~traces['shingle_id'].isin(shingle_id_sample)][['x','y','locationtime','bearing','speed_m_s']].values
# # Save results for each shingle, and buffer combination
# targets_res = np.zeros((len(shingle_id_sample), len(d_buffers), len(t_buffers)))
# preds_res = np.zeros((len(shingle_id_sample), len(d_buffers), len(t_buffers)))

# # Parallel: 1min with 200 samples, 20x20 buffers, 1hr data
# # Each item in the results corresponds to a distance buffer; it has n subitems, 1 for each time buffer
# # Within a d-t buffer, there are 4 values; (targets, preds, d_buffer idx, t_buffer idx)
# def parallel_get_metrics(d):
#     dist_buffer = d_buffers[d]
#     results = []
#     for t, t_buffer in enumerate(t_buffers):
#         res = shingle_groups.apply(shape_utils.get_adjacent_metric, adj_traces, dist_buffer, t_buffer, b_buffer=b_buffer, orthogonal=orthogonal)
#         targets = np.array([x[0] for x in res])
#         preds = np.array([x[1] for x in res])
#         results.append((targets, preds, d, t))
#     return results
# res_parallel = Parallel(n_jobs=-2)(delayed(parallel_get_metrics)(x) for x in range(len(d_buffers)))

# # Expand parallel adjacent metric results
# for d_idx in range(len(res_parallel)):
#     for t_idx in range(len(res_parallel[0])):
#         targets = res_parallel[d_idx][t_idx][0]
#         preds = res_parallel[d_idx][t_idx][1]
#         targets_res[:,d_idx,t_idx] = targets
#         preds_res[:,d_idx,t_idx] = preds
# # Get R2 value for each combination of preds and targets and for each d and t buffer
# r_values = np.empty((preds_res.shape[1], preds_res.shape[2]))
# slope_mean = 0.0
# slope_ct = 0
# for d in range(preds_res.shape[1]):
#     for t in range(preds_res.shape[2]):
#         try:
#             non_nan_preds = preds_res[:,d,t][~np.isnan(preds_res[:,d,t])]
#             non_nan_targs = targets_res[:,d,t][~np.isnan(preds_res[:,d,t])]
#             if len(non_nan_preds)>1:
#                 slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(non_nan_preds, non_nan_targs)
#                 r_values[d,t] = r_value**2
#                 slope_mean += slope
#                 slope_ct += 1
#             else:
#                 r_values[d,t] = np.nan
#         except:
#             r_values[d,t] = np.nan
            
# # Plot results for all d/t combos
# print(f"Average slope: {slope_mean / slope_ct}")
# fig, ax = plt.subplots()
# im = ax.imshow(r_values, origin="lower", cmap="plasma")
# fig.suptitle("R2 of Linear Fit to Mean Adjacent Speeds")
# ax.set_title(f"B={b_buffer} deg, Orth={str(orthogonal)}, Net={NETWORK}")
# ax.set_xlabel("Time (s)")
# ax.set_xticks([x for x in range(len(t_buffers))])
# ax.set_xticklabels([str(int(x)) for x in t_buffers], rotation=45)
# ax.set_ylabel("Distance (m)")
# ax.set_yticks([y for y in range(len(d_buffers))])
# ax.set_yticklabels([str(int(y)) for y in d_buffers])
# cbar = fig.colorbar(im, ax=ax)
# plt.draw()
# plt.savefig(f"../plots/R2_adj_{b_buffer}_{str(orthogonal)}_{NETWORK}.png")
# plt.show()