In [1]:
import datetime
import os
import random
import sys
sys.path.append("../")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from openbustools.traveltime import grid, data_loader
from openbustools import plotting, spatial, standardfeeds

In [6]:
# --- Notebook configuration -------------------------------------------------
# Which network label to use, where the processed realtime data lives for each
# network, and the date settings handed to standardfeeds.get_date_list.
network_name = 'kcm'
data_folders = [
    '../data/kcm_realtime/processed/',
    '../data/atb_realtime/processed/',
]
train_date, train_n = '2023_03_15', 3
# Date list derived from train_date and train_n by the project helper.
train_dates = standardfeeds.get_date_list(train_date, int(train_n))

### Metadata

In [29]:
# Summarise every processed-data folder from a small random sample of its files:
# number of files ("days"), mean/std of rows and unique shingle_ids per sampled
# file, totals extrapolated to all files, and a random ~5% of the route ids seen.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling; only
# point this cell at files produced by a trusted pipeline.
route_ids = {k:[] for k in data_folders}
sample_size = 5
for folder_name in data_folders:
    # os.path.join keeps this correct whether or not folder_name ends with a slash.
    data_dates = [f for f in os.listdir(folder_name) if os.path.isfile(os.path.join(folder_name, f))]
    n_dates = len(data_dates)
    if n_dates == 0:
        # Nothing to sample: the means below would be NaN and int(NaN) raises,
        # and np.concatenate refuses an empty list. Report and move on.
        print(folder_name)
        print(f"{n_dates} days")
        continue
    n_samples = []
    n_points = []
    # random.sample raises ValueError when asked for more items than exist,
    # so never request more files than the folder holds.
    for file_name in random.sample(data_dates, min(sample_size, n_dates)):
        data = pd.read_pickle(os.path.join(folder_name, file_name))
        n_samples.append(len(pd.unique(data['shingle_id'])))
        n_points.append(len(data))
        route_ids[folder_name].append(pd.unique(data['route_id']))
    print(folder_name)
    print(f"{n_dates} days")
    print(f"{np.mean(n_points)}, {np.std(n_points)} points per day")
    print(f"{np.mean(n_samples)}, {np.std(n_samples)} samples per day")
    # Totals are estimates: sampled per-file mean scaled by the number of files.
    print(f"{int(np.mean(n_points) * n_dates)} points")
    print(f"{int(np.mean(n_samples) * n_dates)} samples")
    unique_routes = np.unique(np.concatenate(route_ids[folder_name]))
    # Show a random 5% of the distinct route ids (may be empty for few routes).
    print(np.random.choice(unique_routes, int(len(unique_routes)*.05), replace=False))

../data/kcm_realtime/processed/
230 days
345127.4, 104873.11225972079 points per day
21122.4, 5311.272600799171 samples per day
79379302 points
4858152 samples
['100246' '102705' '100113' '102720' '100031' '100068']
../data/atb_realtime/processed/
230 days
55001.6, 17375.701270452366 points per day
3975.6, 826.9221486935757 samples per day
12650368 points
914388 samples
['ATB:Line:2_26' 'ATB:Line:2_25' 'ATB:Line:2_116']


### Traces

In [None]:
# Build the project dataset object from every data folder and the training
# dates, then keep a handle on its underlying table for the cells below.
train_dataset = data_loader.DictDataset(data_folders, train_dates)
# NOTE(review): later cells call .to_crs on a subset of this table, so it is
# presumably a GeoDataFrame — confirm against DictDataset.
data_df = train_dataset.data
# Preview the first rows.
data_df.head()

In [None]:
# Sample one trajectory for plotting, use KCM network
kcm_rows = data_df[data_df['data_folder']=='../data/kcm_realtime/processed/']
if kcm_rows.empty:
    # Fail with a clear message instead of an opaque error from .sample(1).
    raise ValueError("No rows found for '../data/kcm_realtime/processed/' in data_df")
sample = kcm_rows.sample(1)
sample_trip_id = sample['trip_id'].iloc[0]
sample_file = sample['file'].iloc[0]
# Match on trip_id AND file within the KCM rows only: file names may repeat
# across data folders, so searching the full table could pull in rows from
# another network. (The original applied the file filter twice; once suffices.)
sample_df = kcm_rows[(kcm_rows['trip_id']==sample_trip_id)&(kcm_rows['file']==sample_file)]
# Reproject to EPSG:32148 for plotting; reset_index() keeps the old index as a column.
sample_df = sample_df.to_crs('EPSG:32148').reset_index()
sample_df.head()

In [None]:
# Plot the single sampled trajectory using the project's scatterplot helper.
plotting.formatted_shingle_scatterplot(sample_df)

### Feature Distributions

In [None]:
# Plot feature histograms over the full loaded table (all data folders),
# using the project's plotting helper.
plotting.formatted_feature_distributions_histplot(data_df)

### Grid

In [None]:
# Look up the grid stored for the first data folder (the KCM path in
# data_folders) on train_date; grids is indexed by folder, then by date.
g = train_dataset.grids[data_folders[0]][train_date]

In [None]:
# Convert the selected grid with the project helper; the result is indexed
# with three axes in the animation cell below.
res = grid.convert_to_frames(g)

In [None]:
# Animate a 100-long slice (indices 500-599) along the first axis of res.
# NOTE(review): the first axis is presumably the frame/time axis and 500:600 an
# arbitrary window — confirm, and consider naming these bounds as constants.
plotting.formatted_grid_animation(res[500:600,:,:])