In [None]:
import datetime
import os
import pickle
import random
import sys
sys.path.append("../")

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from openbustools.traveltime import grid, data_loader
from openbustools import plotting, spatial, standardfeeds

In [None]:
# Notebook configuration: network label, processed-data folders, and the
# date window handed to the data loader further down.
network_name = 'kcm'
data_folders = [
    '../data/kcm_realtime/processed/',
    '../data/atb_realtime/processed/',
]
train_date, train_n = '2023_03_15', 3
# Built by the project helper from the start date and a count
# (presumably train_n consecutive days beginning at train_date -- confirm
# against standardfeeds.get_date_list).
train_dates = standardfeeds.get_date_list(train_date, int(train_n))

### Metadata

In [None]:
# Summarise every data folder from a random sample of its per-day pickle
# files: number of days on disk, points/samples per day (mean and std over the
# sampled days), totals extrapolated to all days, and a random ~5% of the
# route ids seen in the sampled days.
route_ids = {k: [] for k in data_folders}
sample_size = 5
for folder_name in data_folders:
    # Keep only the per-day .pkl files *before* sampling. Sampling first and
    # filtering afterwards can silently read fewer than sample_size files (or
    # none at all), and counts non-pickle files as "days" in the totals.
    data_dates = [
        f for f in os.listdir(folder_name)
        if f.endswith('.pkl') and os.path.isfile(os.path.join(folder_name, f))
    ]
    n_dates = len(data_dates)
    print(folder_name)
    print(f"{n_dates} days")
    if n_dates == 0:
        # Nothing to sample: the statistics below would be NaN and
        # np.concatenate would raise on an empty list.
        print("no .pkl files found, skipping")
        print()
        continue
    n_samples = []
    n_points = []
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the draw at the number of available days.
    for file_name in random.sample(data_dates, min(sample_size, n_dates)):
        # NOTE: unpickling can execute arbitrary code -- only point this at
        # trusted, locally produced files.
        data = pd.read_pickle(os.path.join(folder_name, file_name))
        n_samples.append(len(pd.unique(data['shingle_id'])))
        n_points.append(len(data))
        route_ids[folder_name].append(pd.unique(data['route_id']))
    print(f"{np.mean(n_points) = :.0f}, {np.std(n_points) = :.0f} points per day")
    print(f"{np.mean(n_samples) = :.0f}, {np.std(n_samples) = :.0f} samples per day")
    # Totals are extrapolated: mean of the sampled days times the day count.
    print(f"{np.mean(n_points) * n_dates = :.0f} points")
    print(f"{np.mean(n_samples) * n_dates = :.0f} samples")
    unique_routes = np.unique(np.concatenate(route_ids[folder_name]))
    # Show a random 5% of the routes (rounded down, so this is empty when
    # fewer than 20 distinct routes were seen).
    print(np.random.choice(unique_routes, int(len(unique_routes)*.05), replace=False))
    print()

### Traces

In [None]:
# Load the H5 data for the first configured folder only (the KCM one) over
# the configured training dates, passing the loader's default holdout routes,
# then wrap the result in the project's dataset class.
train_data, holdout_routes, train_config = data_loader.load_h5(
    [data_folders[0]],
    train_dates,
    holdout_routes=data_loader.HOLDOUT_ROUTES,
)
train_dataset = data_loader.H5Dataset(train_data)

In [None]:
# Pick one random trajectory from the (KCM) training dataset and turn its
# numeric features into a GeoDataFrame for plotting.
sample = np.random.randint(len(train_dataset))
sample_feats = train_dataset.data[sample]['feats_n']
# Label the feature columns with the loader's numeric feature names.
sample_df = pd.DataFrame(sample_feats).set_axis(data_loader.NUM_FEAT_COLS, axis=1)
# Point geometry comes from the 'x'/'y' columns; the CRS is hard-coded
# (presumably the projected CRS of the KCM data -- confirm before reusing
# this cell for another network).
sample_points = gpd.points_from_xy(
    sample_df['x'].to_numpy(),
    sample_df['y'].to_numpy(),
)
sample_df = gpd.GeoDataFrame(sample_df, geometry=sample_points, crs='EPSG:32148')
sample_df.head()

In [None]:
# Plot the single randomly sampled trajectory (sample_df) with the project's
# scatterplot helper; the figure layout is defined in openbustools.plotting.
plotting.formatted_shingle_scatterplot(sample_df)

### Feature Distributions

In [None]:
# Flatten the training dataset into one table and add helper columns.
data_df = train_dataset.to_df()
# Tag every row with the folder it was loaded from (only the first folder
# was loaded above).
data_df['data_folder'] = data_folders[0]
# Running distance within each shingle, in metres and in kilometres.
data_df['cumul_dist_m'] = data_df.groupby('shingle_id')['calc_dist_m'].cumsum()
data_df['cumul_dist_km'] = data_df['cumul_dist_m'].div(1000)

In [None]:
# Hand the full feature table to the project's distribution-plot helper
# (presumably one histogram per feature, going by the helper's name -- the
# exact columns plotted are decided inside openbustools.plotting).
plotting.formatted_feature_distributions_histplot(data_df)

### Grid

In [None]:
# Load the pickled grid object for the training date from the KCM processed
# folder. The context manager guarantees the file handle is closed; the
# previous pickle.load(open(...)) form left it open until garbage collection.
# NOTE: unpickling can execute arbitrary code -- only load trusted,
# locally produced files.
with open(f"../data/kcm_realtime/processed/grid/{train_date}.pkl", 'rb') as grid_file:
    g = pickle.load(grid_file)

In [None]:
# Convert the loaded grid object into frames with the project helper. The
# layout of `res` is defined in openbustools.traveltime.grid; the animation
# cell below indexes it along three axes.
res = grid.convert_to_frames(g)

In [None]:
# Animate only indices 500-599 along the first axis of `res` (presumably the
# frame/time axis -- confirm), taking the other two axes whole.
# NOTE(review): the 500:600 window is hard-coded; consider naming it in the
# configuration cell if it needs tuning.
plotting.formatted_grid_animation(res[500:600,:,:])