# Defs

In [30]:
import re
import sys
import json
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
# Make the directory above the working directory importable so that the
# project-local `src` package resolves (assumes the notebook is run from a
# sub-folder of the repository root — TODO confirm).
root = Path.cwd().parent
if str(root) not in sys.path:  # guard: re-running the cell must not stack duplicates
    sys.path.append(str(root))
from src.utils.time import hhmmss_to_seconds

# Render all figure text through LaTeX with the Computer Modern family.
plt.rcParams.update({"text.usetex": True, "font.family": "Computer Modern"})

# Data locations, relative to the working directory.
parent = Path('..','data')
dynamic_path = parent / 'dynamic_gtfs'
static_path = parent / 'static_gtfs'

# Load static GTFS

In [31]:
# Read the five static GTFS tables in one pass; the unpacking order matches
# the order of the file names below.
routes, trips, stops, shapes, stop_times = (
    pd.read_csv(static_path / filename, sep=',', low_memory=False)
    for filename in ('routes.txt', 'trips.txt', 'stops.txt', 'shapes.txt', 'stop_times.txt')
)

# Convert both schedule time columns with the project helper.
for time_col in ('arrival_time', 'departure_time'):
    stop_times[time_col] = stop_times[time_col].apply(hhmmss_to_seconds)

KeyboardInterrupt: 

# Filtering static data (just buses)

In [6]:
# Routes whose route_type equals 3 are the ones kept as "bus" routes.
is_bus_route = routes['route_type'] == 3
bus_route_ids = list(routes.loc[is_bus_route, 'route_id'].unique())

# Trips (and the shapes they reference) that belong to those routes.
bus_trips = trips[trips['route_id'].isin(bus_route_ids)]
bus_trip_ids  = list(bus_trips['trip_id'].unique())
bus_shape_ids = list(bus_trips['shape_id'].unique())

# Stops visited by at least one of those trips.
on_bus_trip = stop_times['trip_id'].isin(bus_trip_ids)
bus_stop_ids = list(stop_times.loc[on_bus_trip, 'stop_id'].unique())

In [7]:
# Row counts before filtering; insertion order is the order used when the
# counts are reported.
prev_lens = {
    name: len(frame)
    for name, frame in [
        ('routes', routes),
        ('trips', trips),
        ('shapes', shapes),
        ('stops', stops),
        ('stop_times', stop_times),
    ]
}

# Restrict every table to the bus-related ids collected earlier.
routes = routes.loc[routes['route_id'].isin(bus_route_ids)]
trips = trips.loc[trips['trip_id'].isin(bus_trip_ids)]
shapes = shapes.loc[shapes['shape_id'].isin(bus_shape_ids)]
stops = stops.loc[stops['stop_id'].isin(bus_stop_ids)]
stop_times = stop_times.loc[stop_times['trip_id'].isin(bus_trip_ids)]

# Row counts after filtering, same key order as above.
filtered_lens = {
    name: len(frame)
    for name, frame in [
        ('routes', routes),
        ('trips', trips),
        ('shapes', shapes),
        ('stops', stops),
        ('stop_times', stop_times),
    ]
}

In [8]:
# Fixed-width report: one line per table with its size before and after the
# bus filter and the number of rows removed.
header = f"{'Table':<12} {'Before':>8} {'After':>8} {'Diff':>8}"
print(header)
print("-" * 36)
for table, before in prev_lens.items():
    after = filtered_lens[table]
    print(f"{table:<12} {before:>8} {after:>8} {before - after:>8}")

Table          Before    After     Diff
------------------------------------
routes            367      303       64
trips          260366   175048    85318
shapes         631719   485517   146202
stops            6181     4501     1680
stop_times    5326956  3693058  1633898


# Graph construction

### Plot map

In [6]:
# White-on-black map: stops as small dots, every shape as a thin polyline.
fig, ax = plt.subplots(1, figsize=(16,9), dpi=450, facecolor='black')
ax.set_facecolor('black')

ax.scatter(stops['stop_lon'], stops['stop_lat'], s=2, c='white', alpha=0.7)
for _, shape_pts in shapes.groupby("shape_id"):
    # Points must be drawn in sequence order to trace the shape correctly.
    ordered = shape_pts.sort_values("shape_pt_sequence")
    ax.plot(ordered["shape_pt_lon"], ordered["shape_pt_lat"], linewidth=0.3, c='white')

ax.set_aspect('equal')
fig.tight_layout()
# Pass the figure's own face colour so the saved file keeps the black background.
fig.savefig('graph.png', facecolor=fig.get_facecolor())
plt.close(fig)  # release the high-dpi figure


### Construct graph from static GTFS

In [25]:
# Build the stop-to-stop edge table and the node table from the filtered
# static schedule.
#
# NOTE: no filter on `stop_headsign` is applied. Only the columns listed
# below are needed, so the table does not have to contain that column.

# Necessary columns only, ordered so that consecutive rows within a trip are
# consecutive stops.
segment_cols = ['trip_id','stop_id','arrival_time','departure_time','stop_sequence','shape_dist_traveled']
segments = stop_times[segment_cols].copy()
segments = segments.sort_values(['trip_id', 'stop_sequence'])

# Next stop, its arrival time and its cumulative shape distance (per trip;
# the last stop of each trip gets NaN).
segments['next_stop'] = segments.groupby('trip_id')['stop_id'].shift(-1)
segments['next_arrival_time'] = segments.groupby('trip_id')['arrival_time'].shift(-1)
segments['next_shape_dist'] = segments.groupby('trip_id')['shape_dist_traveled'].shift(-1)

# Travel time and distance until the next stop.
# dt is in whatever unit hhmmss_to_seconds returns (presumably seconds).
segments['dt'] = segments['next_arrival_time'] - segments['departure_time']
segments['ds'] = segments['next_shape_dist'] - segments['shape_dist_traveled']

# Drop rows without a successor or without a travel time, and rename the cols
segments = segments.dropna(subset=['next_stop', 'dt'])[['stop_id', 'next_stop', 'dt', 'ds']]
segments = segments.rename(columns={'stop_id': 'src', 'next_stop': 'dst'})

# Edges: average travel time and distance between each ordered pair of stops
edges = segments.groupby(['src', 'dst'], as_index=False).agg({
    'dt': 'mean',
    'ds': 'mean'
})
del segments  # free the large intermediate table

# Nodes: stops
nodes = stops[['stop_id','stop_lat','stop_lon']]

In [27]:
# Sanity check: stop ids used as edge endpoints versus stop ids in the node
# table. Two empty sets mean both tables cover exactly the same stops.
edge_endpoints = pd.concat([edges['src'], edges['dst']])
nodes_of_edges = set(edge_endpoints.unique())
nodes_itself = set(nodes['stop_id'])
(nodes_of_edges - nodes_itself, nodes_itself - nodes_of_edges)

(set(), set())

In [28]:
# Persist the graph tables. The output folder is created first because
# `to_csv` does not create missing directories.
graphs_dir = parent / "graphs"
graphs_dir.mkdir(parents=True, exist_ok=True)
edges.to_csv(graphs_dir / "edges.csv", index=False)
nodes.to_csv(graphs_dir / "nodes.csv", index=False)

### Find edge features

In [None]:
# Reload the saved graph tables and preview the first four rows of each.
graphs_dir = parent / "graphs"
edges = pd.read_csv(graphs_dir / "edges.csv")
nodes = pd.read_csv(graphs_dir / "nodes.csv")
display(edges.head(4), nodes.head(4))

In [18]:
import networkx as nx

# Directed stop graph: one node per stop id, one edge per (src, dst) pair,
# weighted by the mean travel time stored in the `dt` column of the edge table.
G = nx.DiGraph()

# Add the stop ids themselves. Iterating the DataFrame directly would yield
# its column labels ('stop_id', 'stop_lat', 'stop_lon') rather than the stops.
G.add_nodes_from(nodes['stop_id'].tolist())

# Add edges with weight. The edge table has columns src/dst/dt/ds, so the
# weight is read from `dt`. Column-wise `tolist()` keeps the ids as plain
# Python values; row-wise iteration with `iterrows` can upcast integer ids to
# float when mixed with float columns.
G.add_weighted_edges_from(
    zip(edges['src'].tolist(), edges['dst'].tolist(), edges['dt'].tolist()),
    weight='weight',
)

In [26]:
from pyvis.network import Network

# Interactive, directed rendering of the stop graph.
net = Network(directed=True, height="800px", width="100%")
net.from_nx(G)

# Disable labels: blank the label of every node entry.
for vis_node in net.nodes:
    vis_node.update(label="")

net.save_graph("large_graph.html")