In [1]:
import os
import importlib
import json
import matplotlib.pyplot as plt
import geopandas as gpd
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import pyproj
import seaborn as sns
import lightning.pytorch as pl
import numpy as np
from shapely.geometry import Point
from sklearn import metrics
import torch
from torch.utils.data import DataLoader, SequentialSampler
from dotenv import load_dotenv
load_dotenv()

from models import grids
from utils import data_loader, data_utils, model_utils

# Root folder holding the outputs of the run being analysed; every data and
# checkpoint path in this notebook is built by prefixing it.
run_folder = "../results/debug/"
# Sub-folder of run_folder for the AtB network (formatted data and trained models).
atb_network_folder = "atb/"

In [2]:
# --- Run settings: which trained model/fold to evaluate and how to load data ---
model_type = "GRU"
fold_num = 4
grid_s_size = 500
skip_gtfs = False
pin_memory = False
num_workers = 0

# --- Network summary config stored next to the formatted training data ---
config_path = f"{run_folder}{atb_network_folder}deeptte_formatted/train_summary_config.json"
with open(config_path, "r") as config_file:
    atb_config = json.load(config_file)

# --- Coordinate reference systems ---
# default_crs is WGS84 lon/lat (EPSG:4326); atb_crs is taken from the first EPSG
# code listed in the network config.
default_crs = pyproj.crs.CRS.from_epsg(4326)
atb_crs = pyproj.crs.CRS.from_epsg(atb_config['epsg'][0])

# The Mapbox token is read from the environment (.env is loaded in the import cell).
px.set_mapbox_access_token(os.getenv("MAPBOX_TOKEN"))

# Categorical inputs that the network models embed, with the vocabulary size and
# embedding width used for each. 1440 matches the number of minutes in a day
# (timeID // 60 is used as the hour elsewhere in this notebook); 7 is presumably
# the number of weekdays -- confirm against the data preparation code.
embed_dict = {
    'timeID': dict(vocab_size=1440, embed_dims=27),
    'weekID': dict(vocab_size=7, embed_dims=4),
}

# Per-architecture hyperparameters, keyed by the model_type string.
# DEEPTTE only specifies a batch size here.
hyperparameter_dict = {
    'FF': dict(batch_size=1024, hidden_size=128, num_layers=2, dropout_rate=.2),
    'CONV': dict(batch_size=1024, hidden_size=64, num_layers=3, dropout_rate=.1),
    'GRU': dict(batch_size=1024, hidden_size=64, num_layers=2, dropout_rate=.05),
    'TRSF': dict(batch_size=1024, hidden_size=64, num_layers=3, dropout_rate=.1),
    'DEEPTTE': dict(batch_size=1024),
}

### Inference on Sample of Shingles

In [5]:
# Set up model: rebuild the configured architecture and load its trained weights
# from the checkpoint folder of the selected fold.
weight_folder = f"{run_folder}{atb_network_folder}models/{model_type}/logs/{model_type}/version_{fold_num}/checkpoints/"
atb_base_model_list, atb_nn_model = model_utils.make_one_model(
    model_type,
    hyperparameter_dict=hyperparameter_dict,
    embed_dict=embed_dict,
    config=atb_config,
    skip_gtfs=skip_gtfs,
    load_weights=True,
    weight_folder=weight_folder,
    # Pass the configured fold (not a literal) so it cannot drift from weight_folder.
    fold_num=fold_num,
)
print(f"Evaluating: {atb_nn_model.model_name}")

# Set up dataset: the held-out test split of the AtB network.
atb_dataset = data_loader.LoadSliceDataset(
    f"{run_folder}{atb_network_folder}deeptte_formatted/test",
    atb_config,
    skip_gtfs=skip_gtfs,
)
# Build the grid from the test samples and attach it to the dataset; grid features
# are only added when the model declares that it needs them.
atb_ngrid = grids.NGridBetter(atb_config['grid_bounds'][0], grid_s_size)
atb_ngrid.add_grid_content(
    atb_dataset.get_all_samples(keep_cols=['shingle_id', 'locationtime', 'x', 'y', 'speed_m_s', 'bearing']),
    trace_format=True,
)
atb_ngrid.build_cell_lookup()
atb_dataset.grid = atb_ngrid
atb_dataset.add_grid_features = atb_nn_model.requires_grid

# Sequential sampling keeps predictions in dataset order so they can be joined
# back onto the samples afterwards.
loader = DataLoader(
    atb_dataset,
    sampler=SequentialSampler(atb_dataset),
    collate_fn=atb_nn_model.collate_fn,
    batch_size=atb_nn_model.batch_size,
    pin_memory=pin_memory,
    num_workers=num_workers,
    drop_last=False,
)
# Inference is run on CPU and without a logger.
trainer = pl.Trainer(
    accelerator="cpu",
    logger=False,
)

# Extract predictions: keep only the entries selected by each batch's mask and
# flatten all batches into one array of predictions and one of labels.
preds_and_labels = trainer.predict(model=atb_nn_model, dataloaders=loader)
atb_preds = np.concatenate([batch['out'][batch['mask']] for batch in preds_and_labels])
atb_labels = np.concatenate([batch['y'][batch['mask']] for batch in preds_and_labels])

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Evaluating: GRU


  rank_zero_warn(
  rank_zero_warn(


Predicting: 0it [00:00, ?it/s]

In [9]:
# Materialise every dataset item once, then stack the per-sample feature rows so
# that each row can be paired with its prediction and label.
atb_data = list(atb_dataset)
atb_feats = np.concatenate([sample['samp'] for sample in atb_data])
atb_ys = np.array([sample['norm_label'] for sample in atb_data])
atb_res = pd.DataFrame(atb_feats, columns=atb_dataset.col_names)
atb_res['preds'] = atb_preds
atb_res['labels'] = atb_labels

# Overall accuracy across all data points
for metric_name, metric_fn in (
    ("MAE", metrics.mean_absolute_error),
    ("MAPE", metrics.mean_absolute_percentage_error),
):
    print(f"{metric_name}: {metric_fn(atb_labels, atb_preds)}")
atb_res

MAE: 8.943105697631836
MAPE: 0.23745349049568176


Unnamed: 0,shingle_id,weekID,timeID,timeID_s,locationtime,lon,lat,x,y,x_cent,...,time_cumulative_s,speed_m_s,bearing,stop_x_cent,stop_y_cent,scheduled_time_s,stop_dist_km,passed_stops_n,preds,labels
0,0.0,0.0,351.0,21117.0,1.679288e+09,10.396793,63.388836,569804.3750,7029674.0,332.347473,...,0.0,4.251283,127.441689,473.993958,-4657.806641,-57.0,0.142810,0.0,34.638683,31.0
1,0.0,0.0,352.0,21179.0,1.679288e+09,10.403745,63.388042,570153.6250,7029593.5,681.605103,...,62.0,5.781717,-13.015158,819.965820,-4854.753906,3.0,0.169565,1.0,48.577423,62.0
2,0.0,0.0,353.0,21210.0,1.679288e+09,10.406557,63.387131,570296.3750,7029495.0,824.360657,...,93.0,5.590468,-34.540405,819.965820,-4854.753906,3.0,0.004401,0.0,32.311226,31.0
3,0.0,0.0,354.0,21261.0,1.679288e+09,10.408208,63.385395,570383.1250,7029303.5,911.108521,...,144.0,4.123089,-65.635513,910.008667,-5004.914551,63.0,0.041642,1.0,34.126015,51.0
4,0.0,0.0,354.0,21292.0,1.679288e+09,10.404663,63.385056,570206.8125,7029261.5,734.819031,...,175.0,5.844251,-166.668304,731.514038,-5205.553711,123.0,0.117282,1.0,32.621185,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150523,18101.0,2.0,1323.0,79380.0,1.679519e+09,10.375283,63.430183,568630.6250,7034257.5,-841.391541,...,481.0,2.100926,23.891922,-1011.080750,-146.867905,301.0,0.178154,0.0,30.059349,31.0
150524,18101.0,2.0,1323.0,79401.0,1.679519e+09,10.378190,63.430222,568775.5625,7034264.5,-696.437622,...,502.0,6.910987,2.828478,-457.915497,-79.355148,421.0,0.238600,1.0,33.088455,21.0
150525,18101.0,2.0,1323.0,79432.0,1.679519e+09,10.382559,63.430237,568993.5000,7034271.0,-478.513123,...,533.0,7.032803,1.668054,-457.915497,-79.355148,421.0,0.020599,0.0,37.053215,31.0
150526,18101.0,2.0,1324.0,79464.0,1.679519e+09,10.383121,63.430244,569021.5625,7034272.5,-450.455200,...,565.0,0.878457,3.508643,-457.915497,-79.355148,421.0,0.007718,0.0,29.314407,32.0


In [None]:
# Get geometries and other derived features for every prediction point.
# Speeds are distance (km -> m) divided by the predicted / labelled time.
atb_res['pred_speeds'] = atb_res['dist_calc_km'] * 1000 / atb_res['preds']
atb_res['label_speeds'] = atb_res['dist_calc_km'] * 1000 / atb_res['labels']
atb_res['absolute_error'] = (atb_res['preds'] - atb_res['labels']).abs()
# timeID // 60 is used as the hour of day (timeID is embedded with 1440 values).
atb_res['hour'] = atb_res['timeID'] // 60
points = gpd.points_from_xy(atb_res['lon'], atb_res['lat'], crs="EPSG:4326")
atb_res = gpd.GeoDataFrame(atb_res, geometry=points)
# Down-sample for plotting. Cap the sample size at the number of available rows
# (sample() raises when asked for more rows than exist without replacement) and
# fix the seed so the figures below are reproducible across runs.
atb_res = atb_res.sample(n=min(100000, len(atb_res)), random_state=0)

In [None]:
# Base data variance vs model variance in time/space:
# prints "mean, std" for the labels first, then for the predictions.
for column in ('labels', 'preds'):
    column_values = atb_res[column]
    print(f"{np.mean(column_values)}, {np.std(column_values)}")

In [None]:
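# NOTE: the geoplot snippets in this cell are disabled. `geoplot` is not among this
# notebook's imports, so import it in the import cell before re-enabling them.
# axes = geoplot.pointplot(atb_res, projection=geoplot.crs.AlbersEqualArea(), s=0.1)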
# geoplot.kdeplot(atb_res, fill=True, cmap='coolwarm', alpha=0.5, bw_adjust=0.5, ax=axes)
# axes.set_title(f"Point Heatmap(atb)")
# plt.savefig("../plots/model_spatial_performance_atb.png")

# axes = geoplot.pointplot(atb_res, projection=geoplot.crs.AlbersEqualArea(), s=0.1)
# geoplot.kdeplot(atb_res, fill=True, cmap='coolwarm', alpha=0.5, bw_adjust=0.5, ax=axes)
# axes.set_title(f"Point Heatmap (AtB)")
# plt.savefig("../plots/model_spatial_performance_atb.png")

In [None]:
# Hexbin map: standard deviation of the predicted speeds within each hexagon.
fig = ff.create_hexbin_mapbox(
    data_frame=atb_res, lat="lat", lon="lon",
    color="pred_speeds", agg_func=np.std,
    nx_hexagon=30,
    labels={"color": "SD of Predictions"},
    color_continuous_scale="Icefire", range_color=[0, 10],
    opacity=0.7,
)
fig.show()

In [None]:
# Hexbin map: standard deviation of the labelled speeds within each hexagon.
fig = ff.create_hexbin_mapbox(
    data_frame=atb_res, lat="lat", lon="lon",
    color="label_speeds", agg_func=np.std,
    nx_hexagon=30,
    labels={"color": "SD of Labels"},
    color_continuous_scale="Icefire", range_color=[0, 10],
    opacity=0.7,
)
fig.show()

In [None]:
# Hexbin map: mean absolute error of the predictions within each hexagon.
fig = ff.create_hexbin_mapbox(
    data_frame=atb_res, lat="lat", lon="lon",
    color="absolute_error", agg_func=np.mean,
    nx_hexagon=30,
    labels={"color": "MAE"},
    color_continuous_scale="Icefire", range_color=[0, 100],
    opacity=0.7,
)
fig.show()
# fig.write_image("hexbin_map.png")

In [None]:
# Histograms of predicted speeds, label speeds and absolute error for the AtB sample.
# The figure uses a 3x2 grid of which only the left column (flattened axes 0, 2, 4)
# is drawn; the right column (axes 1, 3, 5) is left empty and can be filled by adding
# (axis index, column, title, x-label) entries for a second network.
hist_panels = [
    (0, "pred_speeds", "Predicted Speeds(atb)", "Speed (m/s)"),
    (2, "label_speeds", "Label Speeds (atb)", "Speed (m/s)"),
    (4, "absolute_error", "Absolute Error (atb)", "Error (s)"),
]

fig, axes = plt.subplots(3, 2, figsize=(8, 10))
axes = axes.flatten()
for axis_idx, column, title, xlabel in hist_panels:
    sns.histplot(data=atb_res, x=column, ax=axes[axis_idx])
    axes[axis_idx].set_title(title)
    axes[axis_idx].set_xlabel(xlabel)

# Run tight_layout only after the figure has its final size and all titles/labels
# exist; calling it on the empty default-size figure leaves the panels overlapping.
fig.tight_layout()
# Make sure the output directory exists so savefig does not fail on a fresh checkout.
os.makedirs("../plots", exist_ok=True)
fig.savefig("../plots/model_distribution_comparison.png")

In [None]:
# Hour-of-day profiles of predicted speed, label speed and absolute error for the
# AtB sample. The figure uses a 3x2 grid of which only the left column (flattened
# axes 0, 2, 4) is drawn; the right column (axes 1, 3, 5) is left empty and can be
# filled by adding (axis index, column, title, y-label) entries for a second network.
hourly_panels = [
    (0, "pred_speeds", "Predicted Speeds (atb)", "Speed (m/s)"),
    (2, "label_speeds", "Label Speeds (atb)", "Speed (m/s)"),
    (4, "absolute_error", "Mean Absolute Error (atb)", "MAE (s)"),
]

fig, axes = plt.subplots(3, 2, figsize=(8, 10))
axes = axes.flatten()
for axis_idx, column, title, ylabel in hourly_panels:
    panel = axes[axis_idx]
    # `data` is passed by keyword: older seaborn releases do not accept it positionally.
    sns.lineplot(data=atb_res, x="hour", y=column, ax=panel)
    panel.set_title(title)
    panel.set_xlabel("Hour of Day")
    panel.set_ylabel(ylabel)
    # Shared axis limits keep the three panels directly comparable.
    panel.set_xlim(0, 24)
    panel.set_ylim(0, 50)

# Run tight_layout only after the figure has its final size and all titles/labels
# exist; calling it on the empty default-size figure leaves the panels overlapping.
fig.tight_layout()
# Make sure the output directory exists so savefig does not fail on a fresh checkout.
os.makedirs("../plots", exist_ok=True)
fig.savefig("../plots/model_hourly_comparison.png")

### Inference on Entire Network

In [None]:
# Development aid: re-execute the data_loader module so edits made to it on disk
# are picked up without restarting the kernel. Objects created from the module
# before the reload keep their old class definitions.
importlib.reload(data_loader)

In [None]:
# Build a grid of regularly spaced synthetic ("fake") shingles to feed the model.
# NOTE(review): the first argument (100) is presumably the grid resolution or
# spacing -- confirm against data_utils.create_grid_of_shingles.
network_bounds = atb_config['grid_bounds'][0]
reference_center = atb_config['coord_ref_center'][0]
inference_shingles = data_utils.create_grid_of_shingles(100, network_bounds, reference_center)

In [None]:
# Wrap the synthetic shingles in a dataset and run the trained model over them.
atb_dataset = data_loader.ContentDataset(inference_shingles, atb_config, skip_gtfs=True)
loader = DataLoader(
    atb_dataset,
    sampler=SequentialSampler(atb_dataset),
    collate_fn=atb_nn_model.collate_fn,
    batch_size=atb_nn_model.batch_size,
    # Use the lower-case settings from the configuration cell; the upper-case
    # PIN_MEMORY / NUM_WORKERS names are not defined in this notebook.
    pin_memory=pin_memory,
    num_workers=num_workers,
    drop_last=False,
)
# Inference is run on CPU and without a logger.
trainer = pl.Trainer(
    accelerator="cpu",
    logger=False,
)
preds_and_labels = trainer.predict(model=atb_nn_model, dataloaders=loader)

In [None]:
print(f"Evaluating: {atb_nn_model.model_name}")
# Set up dataset: the held-out test split of the AtB network. Paths and loader
# settings use the lower-case names from the configuration cells; the upper-case
# RUN_FOLDER / atb_NETWORK_FOLDER / PIN_MEMORY / NUM_WORKERS names are not defined
# in this notebook.
atb_dataset = data_loader.LoadSliceDataset(
    f"{run_folder}{atb_network_folder}deeptte_formatted/test",
    atb_config,
    skip_gtfs=skip_gtfs,
)
# Build the grid from the test samples and attach it to the dataset; grid features
# are only added when the model declares that it needs them.
atb_ngrid = grids.NGridBetter(atb_config['grid_bounds'][0], grid_s_size)
atb_ngrid.add_grid_content(
    atb_dataset.get_all_samples(keep_cols=['shingle_id', 'locationtime', 'x', 'y', 'speed_m_s', 'bearing']),
    trace_format=True,
)
atb_ngrid.build_cell_lookup()
atb_dataset.grid = atb_ngrid
atb_dataset.add_grid_features = atb_nn_model.requires_grid
loader = DataLoader(
    atb_dataset,
    sampler=SequentialSampler(atb_dataset),
    collate_fn=atb_nn_model.collate_fn,
    batch_size=atb_nn_model.batch_size,
    pin_memory=pin_memory,
    num_workers=num_workers,
    drop_last=False,
)
# Inference is run on CPU and without a logger.
trainer = pl.Trainer(
    accelerator="cpu",
    logger=False,
)
preds_and_labels = trainer.predict(model=atb_nn_model, dataloaders=loader)

In [None]:
# Inspect which fields a single synthetic shingle carries (first element only).
inference_shingles[0].keys()

In [None]:
# The model object created earlier in this notebook is `atb_nn_model`; `atb_model`,
# BATCH_SIZE, PIN_MEMORY and NUM_WORKERS are not defined in this notebook, so the
# loaded model and the lower-case loader settings are used instead.
print(f"Evaluating: {atb_nn_model.model_name}")
# Set up dataset: an empty GenericDataset whose content is replaced by the
# synthetic inference shingles.
atb_dataset = data_loader.GenericDataset([], atb_config)
atb_dataset.content = inference_shingles
atb_dataset.add_grid_features = atb_nn_model.requires_grid
loader = DataLoader(
    atb_dataset,
    sampler=SequentialSampler(atb_dataset),
    collate_fn=atb_nn_model.collate_fn,
    batch_size=atb_nn_model.batch_size,
    pin_memory=pin_memory,
    num_workers=num_workers,
    drop_last=False,
)
atb_data = atb_dataset.content
# Test model
atb_labels, atb_preds, avg_batch_loss, seq_lens = model_utils.predict(atb_nn_model, loader, sequential_flag=True)
# Convert normalised outputs back using the travel-time mean/std from the config.
atb_labels = data_utils.de_normalize(atb_labels, atb_config['time_calc_s_mean'], atb_config['time_calc_s_std'])
atb_preds = data_utils.de_normalize(atb_preds, atb_config['time_calc_s_mean'], atb_config['time_calc_s_std'])
atb_mask = data_utils.create_tensor_mask(torch.cat(seq_lens)).numpy()
# Select the valid (unpadded) positions with the mask itself. Multiplying by the
# mask and then dropping zeros would also discard genuine 0.0 values, leaving
# preds, labels and the per-point coordinates with different lengths.
valid_positions = atb_mask.astype(bool)
atb_preds = atb_preds[valid_positions]
atb_labels = atb_labels[valid_positions]

In [None]:
# Get geometries and other features for every prediction point: flatten each
# per-shingle field into one long array covering all synthetic shingles.
sample_keys = ('lon', 'lat', 'x', 'y', 'dist_calc_km', 'bearing')
lon, lat, x_coords, y_coords, dist_calc_km, bearing = (
    np.concatenate([np.array(sample[key]) for sample in atb_data])
    for key in sample_keys
)
atb_res = pd.DataFrame({
    "lon": lon,
    "lat": lat,
    "x_coords": x_coords,
    "y_coords": y_coords,
    "dist_calc_km": dist_calc_km,
    "bearing": bearing,
    "preds": atb_preds,
    "labels": atb_labels
})
# Predicted speed: distance (km -> m) divided by the predicted time.
atb_res['speed_m_s'] = atb_res['dist_calc_km'] * 1000 / atb_res['preds']

# Transform x and y to replace dummy lat and lon for mapping. The transformer
# output is unpacked as (lat, lon), in that order.
transformer = pyproj.Transformer.from_crs(atb_crs, default_crs)
mapped_lat, mapped_lon = transformer.transform(atb_res['x_coords'], atb_res['y_coords'])
atb_res['lat'] = mapped_lat
atb_res['lon'] = mapped_lon
points = gpd.points_from_xy(atb_res['lon'], atb_res['lat'], crs=default_crs)
atb_res = gpd.GeoDataFrame(atb_res, geometry=points)

In [None]:
# Hexbin map: mean predicted speed for the synthetic shingles with bearing == 90.
bearing_90_points = atb_res[atb_res['bearing'] == 90]
fig = ff.create_hexbin_mapbox(
    data_frame=bearing_90_points, lat="lat", lon="lon",
    color="speed_m_s", agg_func=np.mean,
    nx_hexagon=50,
    labels={"color": "Mean Speed (m/s)"},
    color_continuous_scale="Icefire_r",
    opacity=0.5,
)
fig.show()

In [None]:
# Hexbin map: mean predicted speed for the synthetic shingles with bearing == 0.
bearing_0_points = atb_res[atb_res['bearing'] == 0]
fig = ff.create_hexbin_mapbox(
    data_frame=bearing_0_points, lat="lat", lon="lon",
    color="speed_m_s", agg_func=np.mean,
    nx_hexagon=50,
    labels={"color": "Mean Speed (m/s)"},
    color_continuous_scale="Icefire_r",
    opacity=0.5,
)
fig.show()