In [None]:
from pathlib import Path
import pickle
import random
import sys
sys.path.append("../")

import numpy as np
import pandas as pd

from openbustools.traveltime import grid
from openbustools import plotting, standardfeeds

In [None]:
# Notebook configuration.
# `network_name` is not referenced by the cells visible in this notebook section.
network_name = 'kcm'
# Folder names under ../data that the summary cells loop over.
data_folders = ['kcm_realtime', 'atb_realtime']

# Training window: a start-day string and a day count, handed to
# standardfeeds.get_date_list (presumably expands to a list of dates — confirm).
train_day, train_n = '2023_03_15', 3
train_days = standardfeeds.get_date_list(train_day, int(train_n))

### Metadata

In [None]:
# Summarise each data folder from a random sample of its daily pickle files:
# how many days exist, the mean/std of rows and unique shingles per sampled
# day, totals extrapolated from the sampled mean, and a random 5% of the
# route IDs seen in the sample.
sample_size = 10
route_ids = {k:[] for k in data_folders}
for folder_name in data_folders:
    folder_path = Path("..", "data", folder_name, "processed", "analysis")
    folder_days = list(folder_path.glob("*.pkl"))
    n_days = len(folder_days)
    print(folder_name)
    print(f"{n_days} days")
    if n_days == 0:
        # Nothing to sample: the statistics below would fail on empty lists,
        # so report the problem and move on to the next folder.
        print(f"No .pkl files found in {folder_path}; skipping")
        print()
        continue
    n_samples = []
    n_points = []
    # random.sample raises ValueError when asked for more items than the
    # population holds, so never request more files than the folder has.
    for file_name in random.sample(folder_days, min(sample_size, n_days)):
        data = pd.read_pickle(file_name)
        n_samples.append(len(pd.unique(data['shingle_id'])))
        n_points.append(len(data))
        route_ids[folder_name].append(pd.unique(data['route_id']))
    print(f"{np.mean(n_points) = :_.0f}, {np.std(n_points) = :_.0f} points per day")
    print(f"{np.mean(n_samples) = :_.0f}, {np.std(n_samples) = :_.0f} samples per day")
    # Totals are estimates: sampled per-day mean multiplied by the day count.
    print(f"{np.mean(n_points) * n_days = :_.0f} points")
    print(f"{np.mean(n_samples) * n_days = :_.0f} samples")
    unique_routes = np.unique(np.concatenate(route_ids[folder_name]))
    # Show a random 5% of the routes (rounded down, so possibly none).
    print(np.random.choice(unique_routes, int(len(unique_routes)*.05), replace=False))
    print()

### Traces

In [None]:
# For each data folder, load one random day and plot one random shingle
# (the rows sharing a single shingle_id) from it.
for folder_name in data_folders:
    folder_path = Path("..", "data", folder_name, "processed", "analysis")
    folder_days = list(folder_path.glob("*.pkl"))
    day_sample = pd.read_pickle(random.choice(folder_days))
    # Draw from a plain list: random.choice picks with seq[i], which on a
    # pandas Series is a label lookup and breaks (or picks the wrong row)
    # whenever the index is not the default 0..n-1 range. Sampling over rows
    # keeps the original weighting (shingles with more points are likelier).
    shingle_id = random.choice(day_sample['shingle_id'].tolist())
    shingle_sample = day_sample[day_sample['shingle_id'] == shingle_id]
    print(f"{folder_name} Route ID: {shingle_sample['route_id'].iloc[0]}")
    plotting.formatted_shingle_scatterplot(shingle_sample)

### Feature Distributions

In [None]:
# Collect one randomly picked day per data folder, reprojected to EPSG:4326,
# then stack the per-folder frames into a single frame named `all_data`.
all_data = []
for folder_name in data_folders:
    folder_path = Path("..") / "data" / folder_name / "processed" / "analysis"
    folder_days = [*folder_path.glob("*.pkl")]
    day_sample = pd.read_pickle(random.choice(folder_days))
    day_sample = day_sample.to_crs("EPSG:4326")
    all_data += [day_sample]
all_data = pd.concat(all_data)

In [None]:
# Plot feature distributions for `all_data`, the concatenation of one sampled
# day per data folder (presumably drawn as histograms, per the helper's name — confirm).
plotting.formatted_feature_distributions_histplot(all_data)

### Grid

In [None]:
# For each data folder, load one random day's pickled grid, convert it to
# frames, and animate a 100-frame window of the result.
for folder_name in data_folders:
    folder_path = Path("..", "data", folder_name, "processed", "grid")
    folder_days = list(folder_path.glob("*.pkl"))
    # Use a context manager so the file handle is closed right after loading
    # instead of being left open until garbage collection.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(random.choice(folder_days), 'rb') as grid_file:
        day_sample = pickle.load(grid_file)
    grid_frames = grid.convert_to_frames(day_sample)
    # Indices 500..599 along the first axis (assumed to be the frame/time
    # axis — TODO confirm against grid.convert_to_frames).
    plotting.formatted_grid_animation(grid_frames[500:600,:,:])