In [None]:
import pickle
import sys
from zoneinfo import ZoneInfo
sys.path.append("../")

from dotenv import load_dotenv
load_dotenv()
import geopandas as gpd
import importlib
import contextily as cx
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import plotly.figure_factory as ff
import plotly.express as px
import numpy as np
px.set_mapbox_access_token(os.environ["MAPBOX_TOKEN"])
import plotly.express as px
import lightning.pytorch as pl
import rasterio as rio
from rasterio.plot import show
import seaborn as sns
import shapely
import statsmodels.api as sm
import torch
from torch.utils.data import DataLoader

from openbustools import plotting, spatial, standardfeeds
from openbustools.traveltime import data_loader, model_utils
from openbustools.drivecycle import trajectory
from openbustools.drivecycle.physics import conditions, energy, vehicle

In [None]:
# Choose DataLoader / Trainer settings from CUDA availability:
# (num_workers, pin_memory, accelerator).
num_workers, pin_memory, accelerator = (
    (4, True, "cuda") if torch.cuda.is_available() else (0, False, "cpu")
)

# Raise the Lightning-related loggers to ERROR so lower-severity messages are hidden.
for logger_name in (
    "lightning",
    "pytorch_lightning",
    "lightning.pytorch.accelerators.cuda",
):
    logging.getLogger(logger_name).setLevel(logging.ERROR)

# Load the saved model via the project helper.
# NOTE(review): "GRU" and 0 presumably select the model type and a fold/version
# index -- confirm against model_utils.load_model.
model = model_utils.load_model(
    "../logs/saved_models/",
    "kcm_1month_resampled",
    "GRU",
    0,
)
# Kept as the cell's final expression so the notebook displays its return value.
model.eval()

In [None]:
# Folders holding the processed realtime data, and the date lists used to pick files.
train_data_folders = ["../data/kcm_realtime/processed/"]
test_data_folders = ["../data/atb_realtime/processed/"]
# NOTE(review): the second argument is presumably a number of days starting at
# the given date -- confirm against standardfeeds.get_date_list.
train_dates = standardfeeds.get_date_list("2023_03_15", 30)
test_dates = standardfeeds.get_date_list("2023_04_15", 7)

### Inference on Holdout Routes

In [None]:
# Load the holdout-only samples for the test dates, reusing the model's own
# holdout route list and config.
# NOTE(review): both loads below read from train_data_folders (with test_dates);
# test_data_folders is not used in this cell -- confirm that is intended.
holdout_data, holdout_routes, holdout_config = data_loader.load_h5(
    train_data_folders,
    test_dates,
    only_holdout=True,
    holdout_routes=model.holdout_routes,
    config=model.config,
)
holdout_dataset = data_loader.H5Dataset(holdout_data)
holdout_dataset.include_grid = model.include_grid

# Same load with only_holdout=False. Note that holdout_routes is rebound by
# this second call.
test_data, holdout_routes, test_config = data_loader.load_h5(
    train_data_folders,
    test_dates,
    only_holdout=False,
    holdout_routes=model.holdout_routes,
    config=model.config,
)
test_dataset = data_loader.H5Dataset(test_data)
test_dataset.include_grid = model.include_grid

In [None]:
# Read the shapes lookup and the shapes themselves from the 2023_01_23 KCM GTFS
# folder. The shapes are requested with epsg=32148, the same EPSG code used for
# the point geometries built later in this notebook.
gtfs = standardfeeds.get_gtfs_shapes_lookup("../data/kcm_gtfs/2023_01_23/")
gtfs_shapes = standardfeeds.get_gtfs_shapes(
    "../data/kcm_gtfs/2023_01_23/",
    epsg=32148,
)

In [None]:
# # Read the pickle files, splitting holdout and non-holdout samples
# res = {}
# all_holdout_shingles = []
# all_other_shingles = []
# for day in train_dates:
#     print(day)
#     shingles = pd.read_pickle(Path('..', 'data', 'kcm_realtime', 'processed', day))
#     holdout_shingles = shingles[shingles['route_id'].isin(model.holdout_routes)].sample(10)
#     other_shingles = shingles[~shingles['route_id'].isin(model.holdout_routes)].sample(10)
#     all_holdout_shingles.append(holdout_shingles)
#     all_other_shingles.append(other_shingles)
# all_holdout_shingles = pd.concat(all_holdout_shingles)
# all_other_shingles = pd.concat(all_other_shingles)

# print(pd.unique(all_holdout_shingles['route_id']))
# print([x in pd.unique(all_other_shingles['route_id']) for x in pd.unique(all_holdout_shingles['route_id'])])
# print(pd.unique(all_other_shingles['route_id']))
# holdout_routes_in_data = pd.unique(all_holdout_shingles['route_id'])

In [None]:
# all_other_shingles

In [None]:
# # Plot the holdout shapes over heatmap of all other shingles
# df = all_other_shingles
# holdout_shapes = gtfs_shapes[gtfs_shapes['route_id'].isin(holdout_routes_in_data)].groupby('route_id').nth(0)

# fig, axes = plt.subplots(1, 1, figsize=(10,10))
# sns.kdeplot(ax=axes, x=df.x, y=df.y, cmap="Reds", fill=True, bw_adjust=.4)
# holdout_shapes.plot(ax=axes, column='route_id', linewidth=3)
# cx.add_basemap(ax=axes, crs=holdout_shapes.crs.to_string(), alpha=0.3, source=cx.providers.MapBox(accessToken=os.getenv(key="MAPBOX_TOKEN")))
# plt.show()

In [None]:
# Run the model over the holdout-route shingles.
# shuffle=False / drop_last=False: the point-level predictions are attached
# positionally to features read from holdout_data further down, so every sample
# must be kept in dataset order.
loader = DataLoader(
    holdout_dataset,
    batch_size=model.batch_size,
    collate_fn=model.collate_fn,
    num_workers=num_workers,
    pin_memory=pin_memory,
    shuffle=False,
    drop_last=False,
)
trainer = pl.Trainer(
    accelerator=accelerator,
    inference_mode=True,
    logger=False,
    enable_model_summary=False,
    enable_progress_bar=False,
)
preds_and_labels = trainer.predict(model=model, dataloaders=loader)

# Shingle-level predictions/labels: stack the per-batch arrays.
all_preds, all_labels = (
    np.concatenate([batch[key] for batch in preds_and_labels])
    for key in ('preds', 'labels')
)
# Point-level predictions/labels: keep only entries selected by each batch's mask.
all_preds_raw, all_labels_raw = (
    np.concatenate([batch[key][batch['mask']] for batch in preds_and_labels])
    for key in ('preds_raw', 'labels_raw')
)

# Error metrics at the shingle level and at the point level ("PT" in the output).
shingle_err = all_preds - all_labels
point_err = all_preds_raw - all_labels_raw

mape = np.mean(np.abs(shingle_err) / all_labels)
mape_raw = np.mean(np.abs(point_err) / all_labels_raw)
mae = np.mean(np.abs(shingle_err))
mae_raw = np.mean(np.abs(point_err))
rmse = np.sqrt(np.mean(np.square(shingle_err)))
rmse_raw = np.sqrt(np.mean(np.square(point_err)))

print(f"MAPE: {mape:.2f}, MAPE PT {mape_raw:.2f}")
print(f"MAE: {mae:.2f}, MAE PT {mae_raw:.2f}")
print(f"RMSE: {rmse:.2f}, RMSE PT {rmse_raw:.2f}")

# Point-level input features: drop the first element of each shingle (the start
# point) and stack the rest.
# NOTE(review): assumes the masks above select exactly the points kept by [1:],
# so that rows and predictions line up -- confirm.
shingles = np.concatenate(
    [holdout_data[idx]['feats_n'][1:] for idx in range(len(holdout_data))]
)

# Analysis frame: features plus predictions, labels and residuals.
point_df = pd.DataFrame(shingles, columns=data_loader.NUM_FEAT_COLS).assign(
    preds=all_preds_raw,
    labels=all_labels_raw,
)
signed_err = point_df['preds'] - point_df['labels']
point_df['residuals'] = signed_err.abs()
point_df['residuals_sq'] = signed_err ** 2

# Build point geometries from x/y in EPSG:32148, then reproject to WGS84 and
# expose the coordinates as plain lon/lat columns for plotting.
df = gpd.GeoDataFrame(
    point_df,
    geometry=gpd.points_from_xy(point_df['x'], point_df['y']),
    crs="EPSG:32148",
).to_crs("EPSG:4326")
df['lon'] = df.geometry.x
df['lat'] = df.geometry.y

In [None]:
# Histogram of the absolute point-level residuals currently stored in df.
sns.histplot(data=df['residuals'])

In [None]:
# Hexbin map: mean of the absolute residuals per hexagon (labelled "MAE"),
# drawn on an OpenStreetMap base layer.
fig = ff.create_hexbin_mapbox(
    # data and coordinates
    data_frame=df,
    lat="lat",
    lon="lon",
    # aggregation
    color="residuals",
    agg_func=np.mean,
    nx_hexagon=30,
    # colouring
    labels={"color": "MAE"},
    color_continuous_scale="Icefire",
    range_color=[2, 3],
    # layout
    mapbox_style="open-street-map",
    width=500,
    height=700,
)
fig.show()

In [None]:
# Hexbin map: standard deviation of the absolute residuals per hexagon.
fig = ff.create_hexbin_mapbox(
    # data and coordinates
    data_frame=df,
    lat="lat",
    lon="lon",
    # aggregation
    color="residuals",
    agg_func=np.std,
    nx_hexagon=20,
    # colouring
    labels={"color": "Std. Residuals"},
    color_continuous_scale="Icefire",
    range_color=[2, 5],
    opacity=0.9,
    # layout
    width=500,
    height=700,
)
fig.show()

In [None]:
# Hexbin map: mean of the calc_speed_m_s feature per hexagon.
fig = ff.create_hexbin_mapbox(
    # data and coordinates
    data_frame=df,
    lat="lat",
    lon="lon",
    # aggregation
    color="calc_speed_m_s",
    agg_func=np.mean,
    nx_hexagon=20,
    # colouring
    labels={"color": "Avg. Speed (m/s)"},
    color_continuous_scale="Icefire",
    range_color=[0, 20],
    opacity=0.9,
    # layout
    width=500,
    height=700,
)
fig.show()

In [None]:
# Hexbin map: standard deviation of the calc_speed_m_s feature per hexagon.
fig = ff.create_hexbin_mapbox(
    # data and coordinates
    data_frame=df,
    lat="lat",
    lon="lon",
    # aggregation
    color="calc_speed_m_s",
    agg_func=np.std,
    nx_hexagon=20,
    # colouring
    labels={"color": "Std. Speed (m/s)"},
    color_continuous_scale="Icefire",
    range_color=[0, 8],
    opacity=0.9,
    # layout
    width=500,
    height=700,
)
fig.show()

In [None]:
# plot_df = df[df['MAPE']<1].copy()
# fig, axes = plt.subplots(1, 1, figsize=(10,10))
# plt.hexbin(plot_df.x, plot_df.y, plot_df.MAPE, cmap='plasma', gridsize=15)
# sns.kdeplot(ax=axes, x=plot_df.x, y=plot_df.y, weights=plot_df.MAPE, cmap="plasma", fill=True, bw_adjust=.2)
# cx.add_basemap(ax=axes, crs=plot_df.crs.to_string(), alpha=0.3, source=cx.providers.MapBox(accessToken=os.getenv(key="MAPBOX_TOKEN")))
# plt.show()

### Inference on All Routes

In [None]:
# Run the model over the shingles from all routes (test_dataset).
# Order is preserved (shuffle=False, drop_last=False) because the point-level
# predictions are matched positionally to features read from test_data below.
loader = DataLoader(
    test_dataset,
    batch_size=model.batch_size,
    collate_fn=model.collate_fn,
    num_workers=num_workers,
    pin_memory=pin_memory,
    shuffle=False,
    drop_last=False,
)
trainer = pl.Trainer(
    accelerator=accelerator,
    inference_mode=True,
    logger=False,
    enable_model_summary=False,
    enable_progress_bar=False,
)
preds_and_labels = trainer.predict(model=model, dataloaders=loader)

# Shingle-level predictions/labels: stack the per-batch arrays.
all_preds, all_labels = (
    np.concatenate([batch[key] for batch in preds_and_labels])
    for key in ('preds', 'labels')
)
# Point-level predictions/labels: keep only entries selected by each batch's mask.
all_preds_raw, all_labels_raw = (
    np.concatenate([batch[key][batch['mask']] for batch in preds_and_labels])
    for key in ('preds_raw', 'labels_raw')
)

# Error metrics at the shingle level and at the point level ("PT" in the output).
shingle_err = all_preds - all_labels
point_err = all_preds_raw - all_labels_raw

mape = np.mean(np.abs(shingle_err) / all_labels)
mape_raw = np.mean(np.abs(point_err) / all_labels_raw)
mae = np.mean(np.abs(shingle_err))
mae_raw = np.mean(np.abs(point_err))
rmse = np.sqrt(np.mean(np.square(shingle_err)))
rmse_raw = np.sqrt(np.mean(np.square(point_err)))

print(f"MAPE: {mape:.2f}, MAPE PT {mape_raw:.2f}")
print(f"MAE: {mae:.2f}, MAE PT {mae_raw:.2f}")
print(f"RMSE: {rmse:.2f}, RMSE PT {rmse_raw:.2f}")

# Point-level input features: drop the first element of each shingle (the start
# point) and stack the rest.
# NOTE(review): assumes the masks above select exactly the points kept by [1:],
# so that rows and predictions line up -- confirm.
shingles = np.concatenate(
    [test_data[idx]['feats_n'][1:] for idx in range(len(test_data))]
)

# Analysis frame: features plus predictions, labels, residuals and the speed
# implied by the prediction (calc_dist_m divided by preds).
point_df = pd.DataFrame(shingles, columns=data_loader.NUM_FEAT_COLS).assign(
    preds=all_preds_raw,
    labels=all_labels_raw,
)
signed_err = point_df['preds'] - point_df['labels']
point_df['residuals'] = signed_err.abs()
point_df['residuals_sq'] = signed_err ** 2
point_df['pred_speed_m_s'] = point_df['calc_dist_m'] / point_df['preds']

# Build point geometries from x/y in EPSG:32148, then reproject to WGS84 and
# expose the coordinates as plain lon/lat columns for plotting.
df = gpd.GeoDataFrame(
    point_df,
    geometry=gpd.points_from_xy(point_df['x'], point_df['y']),
    crs="EPSG:32148",
).to_crs("EPSG:4326")
df['lon'] = df.geometry.x
df['lat'] = df.geometry.y

In [None]:
# fig = ff.create_hexbin_mapbox(
#     data_frame=df,
#     lat="lat",
#     lon="lon",
#     width=1000,
#     height=1200,
#     nx_hexagon=200,
#     labels={"color": "MAE"},
#     color="residuals",
#     agg_func=np.mean,
#     color_continuous_scale="Icefire",
#     range_color=[1.5,3.5],
#     mapbox_style='open-street-map',
# )
# fig.show()

In [None]:
# fig = ff.create_hexbin_mapbox(
#     data_frame=df,
#     lat="lat",
#     lon="lon",
#     width=1000,
#     height=1200,
#     nx_hexagon=100,
#     labels={"color": "MAE"},
#     color="calc_speed_m_s",
#     agg_func=np.mean,
#     color_continuous_scale="Icefire",
#     range_color=[0,30],
#     mapbox_style='open-street-map',
# )
# fig.show()

In [None]:
# fig = ff.create_hexbin_mapbox(
#     data_frame=df,
#     lat="lat",
#     lon="lon",
#     width=1000,
#     height=1200,
#     nx_hexagon=100,
#     labels={"color": "MAE"},
#     color="calc_speed_m_s",
#     agg_func=np.std,
#     color_continuous_scale="Icefire",
#     range_color=[0,10],
#     mapbox_style='open-street-map',
# )
# fig.show()

In [None]:
# fig = ff.create_hexbin_mapbox(
#     data_frame=df,
#     lat="lat",
#     lon="lon",
#     width=1000,
#     height=1200,
#     nx_hexagon=100,
#     labels={"color": "MAE"},
#     color="pred_speed_m_s",
#     agg_func=np.mean,
#     color_continuous_scale="Icefire",
#     range_color=[0,30],
#     mapbox_style='open-street-map',
# )
# fig.show()

In [None]:
# fig = ff.create_hexbin_mapbox(
#     data_frame=df,
#     lat="lat",
#     lon="lon",
#     width=1000,
#     height=1200,
#     nx_hexagon=100,
#     labels={"color": "MAE"},
#     color="pred_speed_m_s",
#     agg_func=np.std,
#     color_continuous_scale="Icefire",
#     range_color=[0,100],
#     mapbox_style='open-street-map',
# )
# fig.show()