In [None]:
import pickle
import sys
from zoneinfo import ZoneInfo
sys.path.append("../")

from dotenv import load_dotenv
load_dotenv()
import geopandas as gpd
import importlib
import contextily as cx
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import plotly.figure_factory as ff
import plotly.express as px
import numpy as np
px.set_mapbox_access_token(os.environ["MAPBOX_TOKEN"])
import plotly.express as px
import lightning.pytorch as pl
import rasterio as rio
from rasterio.plot import show
import seaborn as sns
import shapely
import statsmodels.api as sm
import torch
from torch.utils.data import DataLoader

from openbustools import plotting, spatial, standardfeeds
from openbustools.traveltime import data_loader, model_utils
from openbustools.drivecycle import trajectory
from openbustools.drivecycle.physics import conditions, energy, vehicle

### Choose Model and Dates

In [None]:
# Choose the accelerator once and derive the DataLoader settings from it:
# worker processes and pinned memory are only enabled when CUDA is available.
accelerator = "cuda" if torch.cuda.is_available() else "cpu"
pin_memory = accelerator == "cuda"
num_workers = 4 if pin_memory else 0

# Only let errors through from the Lightning-related loggers.
for logger_name in (
    "lightning",
    "pytorch_lightning",
    "lightning.pytorch.accelerators.cuda",
):
    logging.getLogger(logger_name).setLevel(logging.ERROR)

# Folders holding the processed realtime data.
# NOTE(review): test_data_folders is not referenced by any cell visible in this
# notebook (the datasets below are built from train_data_folders) — confirm
# whether it is still needed.
train_data_folders = ["../data/kcm_realtime/processed/"]
test_data_folders = ["../data/atb_realtime/processed/"]

# Date entries for the train/test windows; whatever follows the first "." in
# each entry returned by get_date_list is dropped.
train_days = [entry.split(".")[0] for entry in standardfeeds.get_date_list('2023_03_15', 30)]
test_days = [entry.split(".")[0] for entry in standardfeeds.get_date_list('2023_04_15', 7)]

# Load the model and switch it to evaluation mode.
model = model_utils.load_model("../logs/", "kcm", "GRU", 0)
model.eval()

### Inference on Holdout Routes

In [None]:
# Dataset limited to the model's holdout routes (only_holdouts=True), read from
# train_data_folders for the dates in test_days.
test_dataset = data_loader.NumpyDataset(
    train_data_folders,
    test_days,
    config=model.config,
    holdout_routes=model.holdout_routes,
    only_holdouts=True,
    load_in_memory=True,
)

# shuffle=False / drop_last=False keep the predictions in dataset order; the
# dataframe-building cell that follows assigns them back positionally.
test_loader = DataLoader(
    test_dataset,
    batch_size=model.batch_size,
    collate_fn=model.collate_fn,
    num_workers=num_workers,
    pin_memory=pin_memory,
    shuffle=False,
    drop_last=False,
)
trainer = pl.Trainer(accelerator=accelerator, inference_mode=True, logger=False)
preds_and_labels = trainer.predict(model=model, dataloaders=test_loader)

# Stack the per-batch outputs: 'preds'/'labels' are taken as-is, while the
# '*_raw' arrays are filtered by each batch's 'mask' before concatenation.
all_preds = np.concatenate([batch['preds'] for batch in preds_and_labels])
all_labels = np.concatenate([batch['labels'] for batch in preds_and_labels])
all_preds_raw = np.concatenate([batch['preds_raw'][batch['mask']] for batch in preds_and_labels])
all_labels_raw = np.concatenate([batch['labels_raw'][batch['mask']] for batch in preds_and_labels])

# Signed errors, shared by every metric below.
shingle_err = all_preds - all_labels
point_err = all_preds_raw - all_labels_raw

# MAPE is reported as a fraction of the label (not multiplied by 100).
mape = np.mean(np.abs(shingle_err) / all_labels)
mape_raw = np.mean(np.abs(point_err) / all_labels_raw)
mae = np.mean(np.abs(shingle_err))
mae_raw = np.mean(np.abs(point_err))
rmse = np.sqrt(np.mean(np.square(shingle_err)))
rmse_raw = np.sqrt(np.mean(np.square(point_err)))

print(f"MAPE: {mape:.2f}, MAPE PT {mape_raw:.2f}")
print(f"MAE: {mae:.2f}, MAE PT {mae_raw:.2f}")
print(f"RMSE: {rmse:.2f}, RMSE PT {rmse_raw:.2f}")

In [None]:
# Build the analysis dataframe: one row per point, made of the features returned
# by find_sample(i) plus a trailing shingle_id column holding the dataset index i.
shingle_rows = []
for shingle_idx in range(len(test_dataset)):
    feats = test_dataset.find_sample(shingle_idx)
    id_col = np.full((feats.shape[0], 1), float(shingle_idx))
    shingle_rows.append(np.concatenate([feats, id_col], axis=1))
all_samples = pd.DataFrame(
    np.concatenate(shingle_rows),
    columns=data_loader.NUM_FEAT_COLS + ["shingle_id"],
)

# 'n' is the position of each row within its shingle. The first row of every
# shingle gets 0 for both columns; the remaining rows receive the mask-filtered
# predictions/labels in order.
# NOTE(review): the zero-filled first rows yield a residual of 0 and are
# included in the histogram / hexbin aggregates further down — confirm intended.
all_samples['n'] = all_samples.groupby('shingle_id').cumcount()
is_first_point = all_samples['n'] == 0
for column, masked_values in (('preds', all_preds_raw), ('labels', all_labels_raw)):
    all_samples.loc[is_first_point, column] = 0
    all_samples.loc[~is_first_point, column] = masked_values

# Absolute and squared residuals from the signed error.
signed_err = all_samples['preds'] - all_samples['labels']
all_samples['residuals'] = signed_err.abs()
all_samples['residuals_sq'] = signed_err ** 2

# Attach point geometry (coordinates declared as EPSG:32148), then reproject to
# WGS84 (EPSG:4326) and expose the result as plain lon/lat columns.
point_geoms = gpd.points_from_xy(all_samples['x'], all_samples['y'])
all_samples = gpd.GeoDataFrame(all_samples, geometry=point_geoms, crs="EPSG:32148").to_crs("EPSG:4326")
all_samples['lon'] = all_samples.geometry.x
all_samples['lat'] = all_samples.geometry.y

In [None]:
# Distribution of the per-point absolute residuals from the holdout-route run.
sns.histplot(data=all_samples['residuals'])

In [None]:
# Hexbin map of the holdout-route residuals: each hexagon is coloured by the
# mean of 'residuals' (absolute error) over the points that fall inside it.
fig = ff.create_hexbin_mapbox(
    # Data and coordinates
    data_frame=all_samples,
    lon="lon",
    lat="lat",
    # Aggregation
    color="residuals",
    agg_func=np.mean,
    nx_hexagon=30,
    # Appearance
    labels={"color": "MAE"},
    range_color=[0,20],
    color_continuous_scale="Icefire",
    mapbox_style='open-street-map',
    width=500,
    height=700,
)
fig.show()

In [None]:
# gtfs = standardfeeds.get_gtfs_shapes_lookup("../data/kcm_gtfs/2023_01_23/")
# gtfs_shapes = standardfeeds.get_gtfs_shapes("../data/kcm_gtfs/2023_01_23/", epsg=32148)

In [None]:
# # Read the pickle files, splitting holdout and non-holdout samples
# res = {}
# all_holdout_shingles = []
# all_other_shingles = []
# for day in train_days:
#     print(day)
#     shingles = pd.read_pickle(Path('..', 'data', 'kcm_realtime', 'processed', 'analysis',f"{day}.pkl"))
#     holdout_shingles = shingles[shingles['route_id'].isin(model.holdout_routes)].sample(10)
#     other_shingles = shingles[~shingles['route_id'].isin(model.holdout_routes)].sample(10)
#     all_holdout_shingles.append(holdout_shingles)
#     all_other_shingles.append(other_shingles)
# all_holdout_shingles = pd.concat(all_holdout_shingles)
# all_other_shingles = pd.concat(all_other_shingles)

# print(pd.unique(all_holdout_shingles['route_id']))
# print([x in pd.unique(all_other_shingles['route_id']) for x in pd.unique(all_holdout_shingles['route_id'])])
# print(pd.unique(all_other_shingles['route_id']))
# holdout_routes_in_data = pd.unique(all_holdout_shingles['route_id'])

In [None]:
# # Plot the holdout shapes over heatmap of all other shingles
# df = all_other_shingles
# holdout_shapes = gtfs_shapes[gtfs_shapes['route_id'].isin(holdout_routes_in_data)].groupby('route_id').nth(0)

# fig, axes = plt.subplots(1, 1, figsize=(10,10))
# sns.kdeplot(ax=axes, x=df.x, y=df.y, cmap="Reds", fill=True, bw_adjust=.4)
# holdout_shapes.plot(ax=axes, column='route_id', linewidth=3)
# cx.add_basemap(ax=axes, crs=holdout_shapes.crs.to_string(), alpha=0.3, source=cx.providers.MapBox(accessToken=os.getenv(key="MAPBOX_TOKEN")))
# plt.show()

### Inference on All Routes

In [None]:
# Dataset over all routes (only_holdouts is not passed here), read from
# train_data_folders for the dates in test_days.
test_dataset = data_loader.NumpyDataset(
    train_data_folders,
    test_days,
    config=model.config,
    holdout_routes=model.holdout_routes,
    load_in_memory=True,
)

# shuffle=False / drop_last=False keep the predictions in dataset order; the
# dataframe-building cell that follows assigns them back positionally.
test_loader = DataLoader(
    test_dataset,
    batch_size=model.batch_size,
    collate_fn=model.collate_fn,
    num_workers=num_workers,
    pin_memory=pin_memory,
    shuffle=False,
    drop_last=False,
)
trainer = pl.Trainer(accelerator=accelerator, inference_mode=True, logger=False)
preds_and_labels = trainer.predict(model=model, dataloaders=test_loader)

# Stack the per-batch outputs: 'preds'/'labels' are taken as-is, while the
# '*_raw' arrays are filtered by each batch's 'mask' before concatenation.
all_preds = np.concatenate([batch['preds'] for batch in preds_and_labels])
all_labels = np.concatenate([batch['labels'] for batch in preds_and_labels])
all_preds_raw = np.concatenate([batch['preds_raw'][batch['mask']] for batch in preds_and_labels])
all_labels_raw = np.concatenate([batch['labels_raw'][batch['mask']] for batch in preds_and_labels])

# Signed errors, shared by every metric below.
shingle_err = all_preds - all_labels
point_err = all_preds_raw - all_labels_raw

# MAPE is reported as a fraction of the label (not multiplied by 100).
mape = np.mean(np.abs(shingle_err) / all_labels)
mape_raw = np.mean(np.abs(point_err) / all_labels_raw)
mae = np.mean(np.abs(shingle_err))
mae_raw = np.mean(np.abs(point_err))
rmse = np.sqrt(np.mean(np.square(shingle_err)))
rmse_raw = np.sqrt(np.mean(np.square(point_err)))

print(f"MAPE: {mape:.2f}, MAPE PT {mape_raw:.2f}")
print(f"MAE: {mae:.2f}, MAE PT {mae_raw:.2f}")
print(f"RMSE: {rmse:.2f}, RMSE PT {rmse_raw:.2f}")

In [None]:
# Build the analysis dataframe: one row per point, made of the features returned
# by find_sample(i) plus a trailing shingle_id column holding the dataset index i.
shingle_rows = []
for shingle_idx in range(len(test_dataset)):
    feats = test_dataset.find_sample(shingle_idx)
    id_col = np.full((feats.shape[0], 1), float(shingle_idx))
    shingle_rows.append(np.concatenate([feats, id_col], axis=1))
all_samples = pd.DataFrame(
    np.concatenate(shingle_rows),
    columns=data_loader.NUM_FEAT_COLS + ["shingle_id"],
)

# 'n' is the position of each row within its shingle. The first row of every
# shingle gets 0 for both columns; the remaining rows receive the mask-filtered
# predictions/labels in order.
# NOTE(review): the zero-filled first rows yield a residual of 0 and are
# included in the histogram / hexbin aggregates further down — confirm intended.
all_samples['n'] = all_samples.groupby('shingle_id').cumcount()
is_first_point = all_samples['n'] == 0
for column, masked_values in (('preds', all_preds_raw), ('labels', all_labels_raw)):
    all_samples.loc[is_first_point, column] = 0
    all_samples.loc[~is_first_point, column] = masked_values

# Absolute and squared residuals from the signed error.
signed_err = all_samples['preds'] - all_samples['labels']
all_samples['residuals'] = signed_err.abs()
all_samples['residuals_sq'] = signed_err ** 2

# Attach point geometry (coordinates declared as EPSG:32148), then reproject to
# WGS84 (EPSG:4326) and expose the result as plain lon/lat columns.
point_geoms = gpd.points_from_xy(all_samples['x'], all_samples['y'])
all_samples = gpd.GeoDataFrame(all_samples, geometry=point_geoms, crs="EPSG:32148").to_crs("EPSG:4326")
all_samples['lon'] = all_samples.geometry.x
all_samples['lat'] = all_samples.geometry.y

In [None]:
# Distribution of the per-point absolute residuals from the all-routes run.
sns.histplot(data=all_samples['residuals'])

In [None]:
# Hexbin map of the all-routes residuals: each hexagon is coloured by the mean
# of 'residuals' (absolute error) over the points that fall inside it.
#
# The frame is subsampled to at most 100,000 rows before plotting. The sample
# size is capped at len(all_samples) because DataFrame.sample raises ValueError
# when asked for more rows than exist (sampling without replacement), and
# random_state is fixed so the map is reproducible across Restart & Run All.
fig = ff.create_hexbin_mapbox(
    data_frame=all_samples.sample(
        n=min(100_000, len(all_samples)),
        random_state=0,
    ),
    lat="lat",
    lon="lon",
    width=500,
    height=700,
    nx_hexagon=200,
    labels={"color": "MAE"},
    color="residuals",
    agg_func=np.mean,
    color_continuous_scale="Icefire",
    range_color=[0,20],
    mapbox_style='open-street-map',
)
fig.show()