In [None]:
from pathlib import Path
import pickle
import random
import sys
sys.path.append("../")

import gtfs_kit as gk
from matplotlib import animation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from openbustools.traveltime import grid
from openbustools import plotting, standardfeeds, spatial

In [None]:
# Names of the realtime network folders (looked up under ../data) used by the cells below.
data_folders = [
    'kcm_realtime',
    'atb_realtime',
]

### Metadata

In [None]:
# Summarize dataset size for each network from a random sample of its processed day files:
# points and samples (unique shingle_id values) per day, totals extrapolated to all days,
# and a random 5% of the route ids seen in the sampled days.
sample_size = 10
route_ids = {k: [] for k in data_folders}
for folder_name in data_folders:
    folder_path = Path("..", "data", folder_name, "processed", "analysis")
    folder_days = list(folder_path.glob("*.pkl"))
    n_days = len(folder_days)
    print(folder_name)
    print(f"{n_days} days")
    if n_days == 0:
        # Nothing to sample: the statistics below would be NaN and np.concatenate would raise.
        print()
        continue
    n_samples = []
    n_points = []
    # random.sample raises ValueError when asked for more items than are available,
    # so never request more days than the folder actually holds.
    for file_name in random.sample(folder_days, min(sample_size, n_days)):
        data = pd.read_pickle(file_name)
        n_samples.append(len(pd.unique(data['shingle_id'])))
        n_points.append(len(data))
        route_ids[folder_name].append(pd.unique(data['route_id']))
    print(f"{np.mean(n_points) = :_.0f}, {np.std(n_points) = :_.0f} points per day")
    print(f"{np.mean(n_samples) = :_.0f}, {np.std(n_samples) = :_.0f} samples per day")
    # Totals are extrapolated: sampled per-day mean times the number of available days.
    print(f"{np.mean(n_points) * n_days = :_.0f} points")
    print(f"{np.mean(n_samples) * n_days = :_.0f} samples")
    unique_routes = np.unique(np.concatenate(route_ids[folder_name]))
    # Show a random 5% of the observed route ids (may be empty for fewer than 20 routes).
    print(np.random.choice(unique_routes, int(len(unique_routes)*.05), replace=False))
    print()

### Traces

In [None]:
# For each network, load one random processed day and plot one randomly chosen shingle from it.
for folder_name in data_folders:
    folder_path = Path("..", "data", folder_name, "processed", "analysis")
    folder_days = list(folder_path.glob("*.pkl"))
    if not folder_days:
        # random.choice raises IndexError on an empty list; report and move on instead.
        print(f"{folder_name}: no processed days found")
        continue
    day_sample = pd.read_pickle(random.choice(folder_days))
    # Choose a random row by position and read its shingle_id. random.choice() on a Series
    # indexes by label, which breaks when the index is not 0..n-1. Picking a row (rather
    # than a unique id) keeps the original weighting by number of points per shingle.
    chosen_shingle_id = day_sample['shingle_id'].iloc[random.randrange(len(day_sample))]
    shingle_sample = day_sample[day_sample['shingle_id'] == chosen_shingle_id]
    print(f"{folder_name} Route ID: {shingle_sample['route_id'].iloc[0]}")
    plotting.formatted_shingle_scatterplot(shingle_sample)

### Feature Distributions

In [None]:
# Pool one random processed day per network, reprojected to EPSG:4326, into a single frame.
day_samples = []
for folder_name in data_folders:
    folder_path = Path("..", "data", folder_name, "processed", "analysis")
    folder_days = list(folder_path.glob("*.pkl"))
    if not folder_days:
        # random.choice raises IndexError on an empty list; report and move on instead.
        print(f"{folder_name}: no processed days found")
        continue
    # glob() already yields paths that include folder_path, so read the chosen path directly;
    # joining it onto folder_path again would point at a location that does not exist.
    day_sample = pd.read_pickle(random.choice(folder_days)).to_crs("EPSG:4326")
    day_samples.append(day_sample)
all_data = pd.concat(day_samples)

In [None]:
# Draw the feature-distribution histograms for the pooled per-network sample.
plotting.formatted_feature_distributions_histplot(
    all_data,
    title_text="network_sample_distributions",
)

### Grid

In [None]:
# import importlib
# importlib.reload(plotting)

# cleaned_sources = pd.read_csv("../data/cleaned_sources.csv")

# for i,row in cleaned_sources.iloc[:].iterrows():
#     try:
#         # Load network realtime data from a random available day
#         folder_name = f"{row['uuid']}_realtime"
#         file_path = Path("..", "data", "other_feeds", folder_name, "processed", "grid", "2024_04_10.pkl")
#         print(f"Processing {folder_name}")
#         day_sample = pickle.load(open(file_path, 'rb'))
#         grid_frames = grid.convert_to_frames(day_sample)
#         # grid_frames = grid.ffill_array(grid_frames)

#         # Make the plot
#         plotting.formatted_grid_animation(grid_frames, title_text=f"grid_animation_{folder_name}", start_frame=120, end_frame=900, location_str=f"{row['provider']} - ({row['municipality']}, {row['country_code']})", fps=20)
#     except:
#         print(f"Error processing {folder_name}")

### Other Networks

In [None]:
# # Interpolate location and time
# realtime_data['timestamp'] = pd.to_datetime(realtime_data['locationtime'], unit='s')
# realtime_data['x_interp'] = realtime_data['x'].resample()
# sns.histplot(all_realtime_data['calc_time_s'])
# realtime_data

In [None]:
import importlib
# Reload so edits to the local plotting module are picked up without restarting the kernel.
importlib.reload(plotting)

cleaned_sources = pd.read_csv("../data/cleaned_sources.csv")

# Shape lengths are divided by this factor, i.e. 1e-5 coordinate units are counted as one meter.
# NOTE(review): presumably the shape geometries are in WGS84 degrees, in which case this is a
# rough, latitude-independent approximation — confirm before relying on absolute distances.
COORD_UNITS_PER_METER = .00001


def _clock_to_minutes(clock_str):
    """Convert an 'HH:MM[:SS]' time string to whole minutes (the seconds field is ignored)."""
    parts = clock_str.split(":")
    return int(parts[0]) * 60 + int(parts[1])


# Per network: load one random realtime day plus its matching static feed, then derive
# block-level service statistics from the static feed's busiest service_id.
realtime_frames = []
static_records = []
for i, row in cleaned_sources.iterrows():
    # Load network realtime data from a random available day
    realtime_file_path = Path("..", "data", "other_feeds", f"{row['uuid']}_realtime", "processed", "analysis")
    realtime_files = list(realtime_file_path.glob("*.pkl"))
    if len(realtime_files) == 0:
        print(f"No realtime data found for {row['uuid']}")
        continue
    file_path = random.choice(realtime_files)
    print(f"Processing {file_path}")
    realtime_data = pd.read_pickle(file_path).to_crs("EPSG:4326")
    # Load network static data corresponding to the realtime data
    static_file_path = Path("..", "data", "other_feeds", f"{row['uuid']}_static")
    # Positional access: the first row's filename regardless of how the frame is indexed.
    # The last four characters (presumably a file extension — confirm) are stripped.
    realtime_name = realtime_data['realtime_filename'].iloc[0][:-4]
    best_static = standardfeeds.latest_available_static(realtime_name, static_file_path)
    static_feed = gk.read_feed(Path(static_file_path, best_static), dist_units="km")
    # Add calculated params
    bbox = static_feed.compute_bounds()
    area_sqkm = spatial.bbox_area(bbox[0], bbox[1], bbox[2], bbox[3])
    # All from most active service ID, measured by number of trips. Counting non-null
    # block_id values instead would pick an arbitrary service when block ids are absent.
    largest_service_id = static_feed.trips.groupby('service_id').size().idxmax()
    trips = static_feed.trips[static_feed.trips['service_id'] == largest_service_id].copy()
    # Each trip is a block if no block IDs (column missing, or present but entirely empty)
    if 'block_id' not in trips.columns or trips['block_id'].nunique() == 0:
        no_block_ids = 1
        trips['block_id'] = trips['trip_id']
    else:
        no_block_ids = 0
    # Trip distance
    shapes = static_feed.geometrize_shapes()
    shapes['line_dist_m'] = shapes['geometry'].apply(lambda geom: geom.length / COORD_UNITS_PER_METER)
    trips = pd.merge(trips, shapes, on='shape_id', how='left')
    # Trip active duration: first arrival to last departure, in minutes
    stop_times = static_feed.stop_times.dropna(subset=['arrival_time', 'departure_time']).copy()
    # 'first'/'last' below depend on row order, so order each trip's stops explicitly.
    stop_times = stop_times.sort_values(['trip_id', 'stop_sequence'])
    stop_times['start'] = stop_times['arrival_time'].apply(_clock_to_minutes)
    stop_times['end'] = stop_times['departure_time'].apply(_clock_to_minutes)
    trip_spans = stop_times.groupby('trip_id').agg({'start': 'first', 'end': 'last'})
    trip_spans['trip_duration_min'] = trip_spans['end'] - trip_spans['start']
    trips = pd.merge(trips, trip_spans, on='trip_id', how='left')
    # Trip deadhead: gap between a trip's end and the start of the next trip in its block
    trips = trips.sort_values(['block_id', 'start'])
    # groupby.shift keeps the original row index, so the subtraction aligns row by row even
    # when some trips have no block_id; the last trip of each block gets NaN -> 0.
    next_start = trips.groupby('block_id')['start'].shift(-1)
    trips['trip_deadhead_min'] = (next_start - trips['end']).fillna(0)
    static_data = {
        "provider": row['provider'],
        "municipality": row['municipality'],
        "country_code": row['country_code'],
        "n_realtime_points": len(realtime_data),
        "n_realtime_trips": len(pd.unique(realtime_data['trip_id'])),
        "n_static_trips": len(trips),
        "n_static_blocks": trips['block_id'].nunique(),
        "no_block_ids": no_block_ids,
        "service_veh_km": trips['line_dist_m'].sum() / 1000,
        "mean_block_veh_km": trips.groupby('block_id')['line_dist_m'].sum().mean() / 1000,
        "mean_block_duration_min": trips.groupby('block_id')['trip_duration_min'].sum().mean(),
        "mean_block_deadhead_min": trips.groupby('block_id')['trip_deadhead_min'].sum().mean(),
        "area_sqkm": area_sqkm,
    }
    realtime_data['provider'] = row['provider']
    realtime_frames.append(realtime_data)
    static_records.append(static_data)
# One frame of all sampled realtime points, and one summary row per processed network.
all_realtime_data = pd.concat(realtime_frames)
all_static_data = pd.DataFrame(static_records)

In [None]:
# Bar chart of mean block deadhead minutes per provider (largest first), plus summary stats.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 18))
provider_order = (
    all_static_data
    .groupby('provider')['mean_block_deadhead_min']
    .mean()
    .sort_values(ascending=False)
    .index
)
sns.barplot(
    x=all_static_data['mean_block_deadhead_min'],
    y=all_static_data['provider'],
    order=provider_order,
    ax=axes,
)
print(all_static_data['mean_block_deadhead_min'].describe())

In [None]:
# Bar chart of mean block duration minutes per provider (largest first), plus summary stats.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 18))
provider_order = (
    all_static_data
    .groupby('provider')['mean_block_duration_min']
    .mean()
    .sort_values(ascending=False)
    .index
)
sns.barplot(
    x=all_static_data['mean_block_duration_min'],
    y=all_static_data['provider'],
    order=provider_order,
    ax=axes,
)
print(all_static_data['mean_block_duration_min'].describe())

In [None]:
# Bar chart of mean vehicle-km per block for each provider (largest first), plus summary stats.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 18))
provider_order = (
    all_static_data
    .groupby('provider')['mean_block_veh_km']
    .mean()
    .sort_values(ascending=False)
    .index
)
sns.barplot(
    x=all_static_data['mean_block_veh_km'],
    y=all_static_data['provider'],
    order=provider_order,
    ax=axes,
)
print(all_static_data['mean_block_veh_km'].describe())

In [None]:
# Bar chart of total service vehicle-km per provider (largest first), plus summary stats.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 18))
provider_order = (
    all_static_data
    .groupby('provider')['service_veh_km']
    .mean()
    .sort_values(ascending=False)
    .index
)
sns.barplot(
    x=all_static_data['service_veh_km'],
    y=all_static_data['provider'],
    order=provider_order,
    ax=axes,
)
print(all_static_data['service_veh_km'].describe())

In [None]:
# Bar chart of the number of static blocks per provider (largest first), plus summary stats.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 18))
provider_order = (
    all_static_data
    .groupby('provider')['n_static_blocks']
    .mean()
    .sort_values(ascending=False)
    .index
)
sns.barplot(
    x=all_static_data['n_static_blocks'],
    y=all_static_data['provider'],
    order=provider_order,
    ax=axes,
)
print(all_static_data['n_static_blocks'].describe())

In [None]:
# Bar chart of bounding-box area (sq km) per provider (largest first), plus summary stats.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 18))
provider_order = (
    all_static_data
    .groupby('provider')['area_sqkm']
    .mean()
    .sort_values(ascending=False)
    .index
)
sns.barplot(
    x=all_static_data['area_sqkm'],
    y=all_static_data['provider'],
    order=provider_order,
    ax=axes,
)
print(all_static_data['area_sqkm'].describe())

In [None]:
# Bar chart of sampled realtime point counts per provider (largest first), plus summary stats.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 18))
provider_order = (
    all_static_data
    .groupby('provider')['n_realtime_points']
    .mean()
    .sort_values(ascending=False)
    .index
)
sns.barplot(
    x=all_static_data['n_realtime_points'],
    y=all_static_data['provider'],
    order=provider_order,
    ax=axes,
)
print(all_static_data['n_realtime_points'].describe())