In [None]:
import os
import importlib
import json
import matplotlib.pyplot as plt
import geopandas as gpd
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import pyproj
import seaborn as sns
import lightning.pytorch as pl
import numpy as np
from shapely.geometry import Point
from sklearn import metrics
import torch
from torch.utils.data import DataLoader, SequentialSampler
from dotenv import load_dotenv
load_dotenv()

from models import grids
from utils import data_loader, data_utils, model_utils

# Root folder of the experiment run and the per-network sub-folders inside it.
RUN_FOLDER = "../results/full_run/"
KCM_NETWORK_FOLDER = "kcm/"
ATB_NETWORK_FOLDER = "atb/"


def _read_summary_config(network_folder):
    """Return the parsed train_summary_config.json stored for one network sub-folder of RUN_FOLDER."""
    config_path = f"{RUN_FOLDER}{network_folder}deeptte_formatted/train_summary_config.json"
    with open(config_path, "r") as config_file:
        return json.load(config_file)


# Each network's CRS is built from the first entry of the 'epsg' list in its config.
kcm_config = _read_summary_config(KCM_NETWORK_FOLDER)
kcm_crs = pyproj.crs.CRS.from_epsg(kcm_config['epsg'][0])

atb_config = _read_summary_config(ATB_NETWORK_FOLDER)
atb_crs = pyproj.crs.CRS.from_epsg(atb_config['epsg'][0])

# EPSG:4326 is the CRS used for the lat/lon columns plotted on the maps.
default_crs = pyproj.crs.CRS.from_epsg(4326)
# The Mapbox token is read from the environment (populated by load_dotenv above).
px.set_mapbox_access_token(os.getenv("MAPBOX_TOKEN"))

In [None]:
# Embedding specification for the categorical inputs of the network models.
# NOTE(review): the vocabulary sizes presumably correspond to minute-of-day (1440)
# and day-of-week (7) -- confirm against the data pipeline.
embed_dict = {
    'timeID': dict(vocab_size=1440, embed_dims=27),
    'weekID': dict(vocab_size=7, embed_dims=4),
}

# Hyperparameters per model architecture, keyed by model type.
hyperparameter_dict = {
    'FF': dict(batch_size=1024, hidden_size=128, num_layers=2, dropout_rate=0.2),
    'CONV': dict(batch_size=1024, hidden_size=64, num_layers=3, dropout_rate=0.1),
    'GRU': dict(batch_size=1024, hidden_size=64, num_layers=2, dropout_rate=0.05),
    'TRSF': dict(batch_size=1024, hidden_size=64, num_layers=3, dropout_rate=0.1),
    'DEEPTTE': dict(batch_size=1024),
}

# Run settings shared by the cells below.
fold_num = 4          # cross-validation fold to evaluate
grid_s_size = 500     # spatial size passed to the grid constructor
model_type = "GRU"    # key into hyperparameter_dict
skip_gtfs = False
NUM_WORKERS = 0       # DataLoader worker processes
PIN_MEMORY = False    # DataLoader pin_memory flag

In [None]:
# Load trained models for both networks.
# The fold is taken from `fold_num` (configuration cell) rather than a repeated
# literal, so changing the setting in one place changes which weights are loaded.
kcm_base_model_list, kcm_nn_model = model_utils.make_one_model(
    model_type,
    hyperparameter_dict=hyperparameter_dict,
    embed_dict=embed_dict,
    config=kcm_config,
    load_weights=True,
    weight_folder=f"{RUN_FOLDER}{KCM_NETWORK_FOLDER}models/{model_type}/",
    fold_num=fold_num,
    skip_gtfs=skip_gtfs,
)

# The AtB model is loaded the same way as the KCM one. Previously no weight
# arguments were passed here, so the call did not request trained weights
# even though this cell is meant to load trained models.
# NOTE(review): assumes AtB weights exist under the mirrored folder layout -- confirm.
atb_base_model_list, atb_nn_model = model_utils.make_one_model(
    model_type,
    hyperparameter_dict=hyperparameter_dict,
    embed_dict=embed_dict,
    config=atb_config,
    load_weights=True,
    weight_folder=f"{RUN_FOLDER}{ATB_NETWORK_FOLDER}models/{model_type}/",
    fold_num=fold_num,
    skip_gtfs=skip_gtfs,
)

### Inference on Sample of Shingles

In [None]:
print(f"Evaluating: {kcm_nn_model.model_name}")

# Test split of the KCM network.
kcm_test_folder = f"{RUN_FOLDER}{KCM_NETWORK_FOLDER}deeptte_formatted/test"
kcm_dataset = data_loader.LoadSliceDataset(kcm_test_folder, kcm_config, skip_gtfs=skip_gtfs)

# Build the spatial grid from the test traces and attach it to the dataset;
# grid features are only added when the model declares that it needs them.
grid_trace_cols = ['shingle_id', 'locationtime', 'x', 'y', 'speed_m_s', 'bearing']
kcm_ngrid = grids.NGridBetter(kcm_config['grid_bounds'][0], grid_s_size)
kcm_ngrid.add_grid_content(
    kcm_dataset.get_all_samples(keep_cols=grid_trace_cols),
    trace_format=True,
)
kcm_ngrid.build_cell_lookup()
kcm_dataset.grid = kcm_ngrid
kcm_dataset.add_grid_features = kcm_nn_model.requires_grid

# Sequential sampling keeps the prediction order identical to the dataset order.
loader = DataLoader(
    kcm_dataset,
    sampler=SequentialSampler(kcm_dataset),
    collate_fn=kcm_nn_model.collate_fn,
    batch_size=kcm_nn_model.batch_size,
    pin_memory=PIN_MEMORY,
    num_workers=NUM_WORKERS,
    drop_last=False,
)
trainer = pl.Trainer(accelerator="cpu", logger=False)
preds_and_labels = trainer.predict(model=kcm_nn_model, dataloaders=loader)

In [None]:
# Extract predictions: for every predicted batch keep only the entries selected
# by the batch's mask, then join all batches into flat arrays.
masked_preds = []
masked_labels = []
for batch in preds_and_labels:
    batch_mask = batch['mask']
    masked_preds.append(batch['out'][batch_mask])
    masked_labels.append(batch['y'][batch_mask])
kcm_preds = np.concatenate(masked_preds)
kcm_labels = np.concatenate(masked_labels)

In [None]:
# Extract data points: materialise every dataset item once, stack the per-item
# feature arrays row-wise and label the columns with the dataset's column names.
kcm_data = list(kcm_dataset)
kcm_feats = np.concatenate([item['samp'] for item in kcm_data])
kcm_ys = np.array([item['norm_label'] for item in kcm_data])
# Predictions and labels are attached as the last two columns.
kcm_res = pd.DataFrame(kcm_feats, columns=kcm_dataset.col_names).assign(
    preds=kcm_preds,
    labels=kcm_labels,
)
kcm_res

In [None]:
# Overall accuracy of the KCM predictions against the labels.
for metric_name, metric_fn in (
    ("MAE", metrics.mean_absolute_error),
    ("MAPE", metrics.mean_absolute_percentage_error),
):
    print(f"{metric_name}: {metric_fn(kcm_labels, kcm_preds)}")

# print(f"MAE: {sklearn.metrics.mean_absolute_error(atb_labels,atb_preds)}")
# print(f"MAPE: {sklearn.metrics.mean_absolute_percentage_error(atb_labels,atb_preds)}")

In [None]:
# Get geometries and other features for every prediction point
# Speeds are distance (km -> m) divided by the predicted / labelled value.
kcm_res['pred_speeds'] = kcm_res['dist_calc_km']*1000 / kcm_res['preds']
kcm_res['label_speeds'] = kcm_res['dist_calc_km']*1000 / kcm_res['labels']
kcm_res['absolute_error'] = abs(kcm_res['preds'] - kcm_res['labels'])
# timeID is integer-divided by 60 to obtain the hour used for the hourly plots.
kcm_res['hour'] = kcm_res['timeID']//60
points = gpd.points_from_xy(kcm_res['lon'], kcm_res['lat'], crs="EPSG:4326")
kcm_res = gpd.GeoDataFrame(kcm_res, geometry=points)
# Subsample for plotting. The sample size is capped at the number of rows,
# because DataFrame.sample without replacement raises ValueError when asked for
# more rows than exist, and the draw is seeded so the figures are reproducible.
kcm_res = kcm_res.sample(n=min(100000, len(kcm_res)), random_state=0)

In [None]:
# Base data variance vs model variance in time/space:
# print "mean, standard deviation" for the labels first, then for the predictions.
for column in ('labels', 'preds'):
    column_values = kcm_res[column]
    print(f"{np.mean(column_values)}, {np.std(column_values)}")

In [None]:
# axes = geoplot.pointplot(kcm_res, projection=geoplot.crs.AlbersEqualArea(), s=0.1)
# geoplot.kdeplot(kcm_res, fill=True, cmap='coolwarm', alpha=0.5, bw_adjust=0.5, ax=axes)
# axes.set_title(f"Point Heatmap(KCM)")
# plt.savefig("../plots/model_spatial_performance_kcm.png")

# axes = geoplot.pointplot(atb_res, projection=geoplot.crs.AlbersEqualArea(), s=0.1)
# geoplot.kdeplot(atb_res, fill=True, cmap='coolwarm', alpha=0.5, bw_adjust=0.5, ax=axes)
# axes.set_title(f"Point Heatmap (AtB)")
# plt.savefig("../plots/model_spatial_performance_atb.png")

In [None]:
# Hexbin map of the predicted speeds, aggregated per hexagon with np.std.
pred_sd_hexbin_kwargs = dict(
    data_frame=kcm_res,
    lat="lat",
    lon="lon",
    color="pred_speeds",
    agg_func=np.std,
    nx_hexagon=30,
    opacity=0.7,
    labels={"color": "SD of Predictions"},
    color_continuous_scale="Icefire",
    range_color=[0, 10],
)
fig = ff.create_hexbin_mapbox(**pred_sd_hexbin_kwargs)
fig.show()

In [None]:
# Hexbin map of the label speeds, aggregated per hexagon with np.std.
label_sd_hexbin_kwargs = dict(
    data_frame=kcm_res,
    lat="lat",
    lon="lon",
    color="label_speeds",
    agg_func=np.std,
    nx_hexagon=30,
    opacity=0.7,
    labels={"color": "SD of Labels"},
    color_continuous_scale="Icefire",
    range_color=[0, 10],
)
fig = ff.create_hexbin_mapbox(**label_sd_hexbin_kwargs)
fig.show()

In [None]:
# Hexbin map of the absolute error, averaged per hexagon (i.e. MAE per cell).
mae_hexbin_kwargs = dict(
    data_frame=kcm_res,
    lat="lat",
    lon="lon",
    color="absolute_error",
    agg_func=np.mean,
    nx_hexagon=30,
    opacity=0.7,
    labels={"color": "MAE"},
    color_continuous_scale="Icefire",
    range_color=[0, 100],
)
fig = ff.create_hexbin_mapbox(**mae_hexbin_kwargs)
fig.show()
# fig.write_image("hexbin_map.png")

In [None]:
# Histograms of predicted speeds, label speeds and absolute error for KCM.
# The 3x2 grid keeps the right-hand column (axes 1, 3, 5) free for the matching
# AtB panels, which are not drawn because no `atb_res` frame is built in the
# cells shown here.
fig, axes = plt.subplots(3,2)
axes = axes.flatten()
fig.set_figheight(10)
fig.set_figwidth(8)
sns.histplot(kcm_res, x="pred_speeds", ax=axes[0])
axes[0].set_title(f"Predicted Speeds(KCM)")
axes[0].set_xlabel("Speed (m/s)")
sns.histplot(kcm_res, x="label_speeds", ax=axes[2])
axes[2].set_title(f"Label Speeds (KCM)")
axes[2].set_xlabel("Speed (m/s)")
sns.histplot(kcm_res, x="absolute_error", ax=axes[4])
axes[4].set_title(f"Absolute Error (KCM)")
axes[4].set_xlabel("Error (s)")
# tight_layout adjusts the spacing once, at call time, so it has to run after
# the figure has its final size and every title/label exists; calling it right
# after plt.subplots (as before) laid out an empty, default-sized figure.
fig.tight_layout()
plt.savefig("../plots/model_distribution_comparison.png")

In [None]:
# Hourly line plots of predicted speeds, label speeds and absolute error for KCM.
# The 3x2 grid keeps the right-hand column (axes 1, 3, 5) free for the matching
# AtB panels, which are not drawn because no `atb_res` frame is built in the
# cells shown here.
fig, axes = plt.subplots(3,2)
axes = axes.flatten()
fig.set_figheight(10)
fig.set_figwidth(8)

# (axis index, plotted column, panel title, y-axis label) for each KCM panel.
kcm_hourly_panels = [
    (0, "pred_speeds", "Predicted Speeds (KCM)", "Speed (m/s)"),
    (2, "label_speeds", "Label Speeds (KCM)", "Speed (m/s)"),
    (4, "absolute_error", "Mean Absolute Error (KCM)", "MAE (s)"),
]
for axis_idx, column, panel_title, y_label in kcm_hourly_panels:
    panel_ax = axes[axis_idx]
    sns.lineplot(kcm_res, x="hour", y=column, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Hour of Day")
    panel_ax.set_ylabel(y_label)
    # All panels share the same axis limits so they can be compared directly.
    panel_ax.set_xlim(0,24)
    panel_ax.set_ylim(0,50)

# tight_layout adjusts the spacing once, at call time, so it has to run after
# the figure has its final size and every title/label exists; calling it right
# after plt.subplots (as before) laid out an empty, default-sized figure.
fig.tight_layout()
plt.savefig("../plots/model_hourly_comparison.png")

### Inference on Entire Network

In [None]:
# Re-execute the data_loader module so that edits made to it on disk are picked
# up in this kernel without a restart (development convenience).
importlib.reload(data_loader)

In [None]:
# Create grid of regularly spaced fake shingles to feed model.
# Bounds and reference centre are the first entries of the respective config lists.
kcm_grid_bounds = kcm_config['grid_bounds'][0]
kcm_coord_ref_center = kcm_config['coord_ref_center'][0]
# NOTE(review): the leading 100 is presumably the grid resolution/spacing --
# confirm against data_utils.create_grid_of_shingles.
inference_shingles = data_utils.create_grid_of_shingles(100, kcm_grid_bounds, kcm_coord_ref_center)

In [None]:
# Run the KCM model over the generated shingles.
# NOTE(review): skip_gtfs is forced to True here, independent of the notebook-level
# `skip_gtfs` setting -- confirm this is intended for the generated shingles.
kcm_dataset = data_loader.ContentDataset(inference_shingles, kcm_config, skip_gtfs=True)
loader = DataLoader(
    kcm_dataset,
    sampler=SequentialSampler(kcm_dataset),
    collate_fn=kcm_nn_model.collate_fn,
    batch_size=kcm_nn_model.batch_size,
    pin_memory=PIN_MEMORY,
    num_workers=NUM_WORKERS,
    drop_last=False,
)
trainer = pl.Trainer(accelerator="cpu", logger=False)
preds_and_labels = trainer.predict(model=kcm_nn_model, dataloaders=loader)

In [None]:
print(f"Evaluating: {kcm_nn_model.model_name}")

# Rebuild the KCM test dataset, its grid and the predictions.
# Running this cell reassigns kcm_dataset, kcm_ngrid, loader, trainer and
# preds_and_labels, replacing whatever those names held before.
kcm_test_folder = f"{RUN_FOLDER}{KCM_NETWORK_FOLDER}deeptte_formatted/test"
kcm_dataset = data_loader.LoadSliceDataset(kcm_test_folder, kcm_config, skip_gtfs=skip_gtfs)

grid_trace_cols = ['shingle_id', 'locationtime', 'x', 'y', 'speed_m_s', 'bearing']
kcm_ngrid = grids.NGridBetter(kcm_config['grid_bounds'][0], grid_s_size)
kcm_ngrid.add_grid_content(
    kcm_dataset.get_all_samples(keep_cols=grid_trace_cols),
    trace_format=True,
)
kcm_ngrid.build_cell_lookup()
kcm_dataset.grid = kcm_ngrid
# Grid features are only added when the model declares that it needs them.
kcm_dataset.add_grid_features = kcm_nn_model.requires_grid

loader = DataLoader(
    kcm_dataset,
    sampler=SequentialSampler(kcm_dataset),
    collate_fn=kcm_nn_model.collate_fn,
    batch_size=kcm_nn_model.batch_size,
    pin_memory=PIN_MEMORY,
    num_workers=NUM_WORKERS,
    drop_last=False,
)
trainer = pl.Trainer(accelerator="cpu", logger=False)
preds_and_labels = trainer.predict(model=kcm_nn_model, dataloaders=loader)

In [None]:
# Inspect which fields the first element of inference_shingles carries.
inference_shingles[0].keys()

In [None]:
# Predict on the generated shingles through model_utils.predict.
# This cell previously referred to `kcm_model` and `BATCH_SIZE`, neither of which
# is defined in the cells shown (they only appear in commented-out code), so it
# raised NameError on a fresh kernel. It now uses the loaded `kcm_nn_model` and
# that model's own batch size, like the other inference cells.
# NOTE(review): assumes model_utils.predict and data_loader.GenericDataset still
# accept this model/dataset -- confirm against the current utils.
print(f"Evaluating: {kcm_nn_model.model_name}")
# Set up dataset
kcm_dataset = data_loader.GenericDataset([], kcm_config)
kcm_dataset.content = inference_shingles
kcm_dataset.add_grid_features = kcm_nn_model.requires_grid
loader = DataLoader(
    kcm_dataset,
    sampler=SequentialSampler(kcm_dataset),
    collate_fn=kcm_nn_model.collate_fn,
    batch_size=kcm_nn_model.batch_size,
    pin_memory=PIN_MEMORY,
    num_workers=NUM_WORKERS,
    drop_last=False,
)
kcm_data = kcm_dataset.content
# Test model
kcm_labels, kcm_preds, avg_batch_loss, seq_lens = model_utils.predict(kcm_nn_model, loader, sequential_flag=True)
# Map the normalised outputs back through the travel-time mean/std from the config.
kcm_labels = data_utils.de_normalize(kcm_labels, kcm_config['time_calc_s_mean'], kcm_config['time_calc_s_std'])
kcm_preds = data_utils.de_normalize(kcm_preds, kcm_config['time_calc_s_mean'], kcm_config['time_calc_s_std'])
kcm_mask = data_utils.create_tensor_mask(torch.cat(seq_lens)).numpy()
# Select the valid entries with the mask itself. The previous approach
# (multiply by the mask, then drop every value equal to 0.0) also discarded
# genuine zeros inside the valid region, and could leave preds and labels with
# different lengths.
# NOTE(review): assumes the mask has the same shape as the prediction/label arrays.
valid_steps = kcm_mask.astype(bool)
kcm_preds = kcm_preds[valid_steps]
kcm_labels = kcm_labels[valid_steps]

In [None]:
# Get geometries and other features for every prediction point
def _stack_field(field):
    """Concatenate one per-sample field of kcm_data into a single flat array."""
    return np.concatenate([np.asarray(shingle[field]) for shingle in kcm_data])


lon = _stack_field('lon')
lat = _stack_field('lat')
x_coords = _stack_field('x')
y_coords = _stack_field('y')
dist_calc_km = _stack_field('dist_calc_km')
bearing = _stack_field('bearing')

kcm_res = pd.DataFrame(dict(
    lon=lon,
    lat=lat,
    x_coords=x_coords,
    y_coords=y_coords,
    dist_calc_km=dist_calc_km,
    bearing=bearing,
    preds=kcm_preds,
    labels=kcm_labels,
))
# Speed implied by the prediction: distance (km -> m) divided by the predicted value.
kcm_res['speed_m_s'] = kcm_res['dist_calc_km']*1000 / kcm_res['preds']

# Transform x and y to replace dummy lat and lon for mapping.
# The transformer output is unpacked as (lat, lon), in that order.
transformer = pyproj.Transformer.from_crs(kcm_crs, default_crs)
transformed_lat, transformed_lon = transformer.transform(kcm_res['x_coords'], kcm_res['y_coords'])
kcm_res['lat'] = transformed_lat
kcm_res['lon'] = transformed_lon

points = gpd.points_from_xy(kcm_res['lon'], kcm_res['lat'], crs=default_crs)
kcm_res = gpd.GeoDataFrame(kcm_res, geometry=points)

In [None]:
# Hexbin map of the mean predicted speed, restricted to rows whose bearing equals 90.
bearing_90_rows = kcm_res[kcm_res['bearing']==90]
bearing_90_hexbin_kwargs = dict(
    data_frame=bearing_90_rows,
    lat="lat",
    lon="lon",
    color="speed_m_s",
    agg_func=np.mean,
    nx_hexagon=50,
    opacity=0.5,
    labels={"color": "Mean Speed (m/s)"},
    color_continuous_scale="Icefire_r",
)
fig = ff.create_hexbin_mapbox(**bearing_90_hexbin_kwargs)
fig.show()

In [None]:
# Hexbin map of the mean predicted speed, restricted to rows whose bearing equals 0.
bearing_0_rows = kcm_res[kcm_res['bearing']==0]
bearing_0_hexbin_kwargs = dict(
    data_frame=bearing_0_rows,
    lat="lat",
    lon="lon",
    color="speed_m_s",
    agg_func=np.mean,
    nx_hexagon=50,
    opacity=0.5,
    labels={"color": "Mean Speed (m/s)"},
    color_continuous_scale="Icefire_r",
)
fig = ff.create_hexbin_mapbox(**bearing_0_hexbin_kwargs)
fig.show()