In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# data directory
# Relative path to the folder holding the input CSV files; it is joined with
# individual file names via os.path.join when files are read, so it is
# interpreted relative to the kernel's current working directory.
data_dir = "../data"

In [3]:
# Load the historical ridership table; `transit_timestamp` is parsed into
# datetimes at read time so the `.dt` accessor can be used on it later.
history_csv = os.path.join(data_dir, "2020_2025.csv")
df = pd.read_csv(history_csv, parse_dates=["transit_timestamp"])

  df = pd.read_csv(os.path.join(data_dir, "2020_2025.csv"), parse_dates=["transit_timestamp"])


In [4]:
# Order rows by timestamp first, then by station complex id within each timestamp.
sort_keys = ["transit_timestamp", "station_complex_id"]
df = df.sort_values(by=sort_keys)

In [15]:
def remove_commas(x):
    """Convert a single ridership value to ``int``, dropping thousands separators.

    Parameters
    ----------
    x : int, float or str
        A raw cell value. Integers (Python or NumPy) are returned as plain
        ``int``. Strings may contain ``","`` separators and surrounding
        whitespace (``"1,234"``, ``" 12,345 "``) or a whole-number decimal
        form (``"237.0"``). Floats are accepted when they hold a whole number.

    Returns
    -------
    int
        The numeric value of ``x``.

    Raises
    ------
    ValueError
        If ``x`` is NaN/infinite, has a fractional part, or is not numeric.
        Raising here (instead of truncating) keeps silent data corruption out
        of the ridership column.
    """
    if isinstance(x, (int, np.integer)):
        return int(x)

    if isinstance(x, (float, np.floating)):
        number = float(x)
    else:
        text = str(x).replace(",", "").strip()
        try:
            return int(text)
        except ValueError:
            # Not a plain integer literal (e.g. "1234.0"); fall back to float
            # parsing, which itself raises ValueError for non-numeric text.
            number = float(text)

    # NaN and +/-inf both report is_integer() == False, so they are rejected here.
    if not number.is_integer():
        raise ValueError(f"ridership value {x!r} is not a whole number")
    return int(number)
# Clean every ridership cell with remove_commas, then store the column as integers.
ridership_clean = df["ridership"].apply(remove_commas)
df["ridership"] = ridership_clean.astype(int)

In [127]:

# Collapse rows that share the same (timestamp, station complex) into a single
# row whose ridership is the sum of the group. Only the two key columns and
# `ridership` survive this step.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt and its
# execution count is out of sequence with the surrounding cells — confirm the
# aggregation actually completed before the later cells were run.
group_keys = ["transit_timestamp", "station_complex_id"]
df = df.groupby(group_keys, as_index=False)["ridership"].sum()

KeyboardInterrupt: 

In [19]:
# Per-station z-score normalisation of ridership.
#
# `stats` has one row per station_complex_id with the mean and the sample
# standard deviation (pandas default, ddof=1) of that station's ridership.
stats = (
    df.groupby("station_complex_id")["ridership"]
      .agg(["mean", "std"])
      .reset_index()
)
# A station with a single observation has an undefined (NaN) sample std, which
# would turn its ridership_norm into NaN. Treat it as zero spread instead, so
# the normalised value below becomes 0.
stats["std"] = stats["std"].fillna(0.0)

# Drop mean/std columns left behind by an earlier run of this cell; otherwise
# the merge would produce mean_x/mean_y (and std_x/std_y) and the column
# lookups below would raise KeyError.
df = df.drop(columns=["mean", "std"], errors="ignore")
df = df.merge(stats, on="station_complex_id", how="left")

# The 1e-6 term keeps the division finite for stations whose std is 0.
df["ridership_norm"] = (
    (df["ridership"] - df["mean"]) / (df["std"] + 1e-6)
)

Unnamed: 0,transit_timestamp,station_complex_id,ridership,mean,std,ridership_norm
0,2020-01-31 21:00:00,1,237,322.984314,356.672864,-0.241073
1,2020-01-31 21:00:00,2,105,225.071762,250.889779,-0.478584
2,2020-01-31 21:00:00,3,238,289.131624,316.989901,-0.161304
3,2020-01-31 21:00:00,4,214,259.488151,284.206943,-0.160053
4,2020-01-31 21:00:00,5,128,140.768783,138.165089,-0.092417
...,...,...,...,...,...,...
17583106,2024-12-31 23:00:00,628,759,1621.510567,1879.448684,-0.458917
17583107,2024-12-31 23:00:00,629,255,364.325241,300.372848,-0.363965
17583108,2024-12-31 23:00:00,630,184,481.450077,434.032815,-0.685317
17583109,2024-12-31 23:00:00,635,158,564.122551,616.238983,-0.659034


In [21]:
# Encode the hour of day as a point on the unit circle (sin/cos pair), so that
# hour 23 and hour 0 end up next to each other.
df["hour"] = df["transit_timestamp"].dt.hour
hour_angle = 2 * np.pi * df["hour"] / 24
df["sin_hour"] = np.sin(hour_angle)
df["cos_hour"] = np.cos(hour_angle)

In [22]:
# Distinct station complex ids present in df.
all_nodes = df["station_complex_id"].unique()
num_nodes = len(all_nodes)
# Assign each station_complex_id a contiguous node index 0 ... N-1, following
# the order of `all_nodes`.
cmplx_to_node_id = {cmplx: node_idx for node_idx, cmplx in enumerate(all_nodes)}

In [126]:
# Persist the per-station mean/std table (no index column) under
# <data_dir>/processed. The directory is created if missing so the write does
# not fail on a fresh checkout, and the path is built from `data_dir` like the
# other file accesses in this notebook.
processed_dir = os.path.join(data_dir, "processed")
os.makedirs(processed_dir, exist_ok=True)
stats.to_csv(os.path.join(processed_dir, "stats.csv"), index=False)

In [23]:
# add additional column for node id
# Series.map with a dict performs the lookup inside pandas instead of calling a
# Python lambda once per row. The explicit int64 cast makes an unmapped
# station id fail loudly here (map would yield NaN, which cannot be cast)
# rather than surface later as a float index.
df["node_id"] = df["station_complex_id"].map(cmplx_to_node_id).astype(np.int64)

In [24]:
# Load the edge list. Later cells read column 0 as the start station complex id
# and column 1 as the end station complex id of each edge.
edges_csv = os.path.join(data_dir, "complex_edges.csv")
edges = pd.read_csv(edges_csv)

In [25]:
# construct tensor of edges based on node id not complex id
#
# Column 0 / column 1 of `edges` hold the start / end station complex ids.
# Both are translated to node ids with a vectorised dict lookup; an id that is
# not in `cmplx_to_node_id` maps to NaN and the whole edge is dropped.
start_ids = edges.iloc[:, 0].map(cmplx_to_node_id)
end_ids = edges.iloc[:, 1].map(cmplx_to_node_id)
known = start_ids.notna() & end_ids.notna()

start = start_ids[known].to_numpy(dtype=np.int64)
end = end_ids[known].to_numpy(dtype=np.int64)

# Every edge is emitted in both directions, interleaved as
# [start, end], [end, start], ... The final reshape guarantees an (E, 2) array
# even when no edge survives the filter, so a later transpose is always (2, E).
edge_tensor = np.stack(
    [np.stack([start, end], axis=1), np.stack([end, start], axis=1)],
    axis=1,
).reshape(-1, 2)
    

In [26]:
import torch
# Turn the (E, 2) NumPy edge list into a contiguous (2, E) int64 tensor, the
# layout passed to the model as `edge_index`.
# The isinstance guard makes the cell safe to re-run: once `edge_tensor` is
# already a torch tensor it is left untouched (torch.from_numpy would raise,
# and a second transpose would flip the layout back).
# reshape(-1, 2) also normalises an empty edge list to shape (0, 2).
if isinstance(edge_tensor, np.ndarray):
    edge_tensor = torch.from_numpy(
        np.ascontiguousarray(edge_tensor.reshape(-1, 2).T, dtype=np.int64)
    )

In [27]:
# Size node tensors by the number of known stations, not by the largest node id
# that happens to appear in an edge: a station without any edge would
# otherwise fall outside the feature tensors and break indexing by node_id.
# Edges only reference ids from `cmplx_to_node_id`, so this is never smaller
# than edge_tensor.max() + 1. It is also a plain int rather than a 0-dim tensor.
num_nodes = len(cmplx_to_node_id)

In [28]:
# Release the intermediate station array and the raw edge table.
# Re-running this cell raises NameError because the names are already gone.
del all_nodes, edges

In [61]:
from tqdm import tqdm

# Build one (features, target) pair for every pair of consecutive timestamps:
#   features[k] : (num_nodes, 3) tensor for timestamp k with columns
#                 [ridership_norm, sin_hour, cos_hour]
#   targets[k]  : (num_nodes,) tensor of ridership_norm at timestamp k + 1
# NOTE(review): rows start as zeros and only stations present in a timestamp's
# group are filled, so a station with no row keeps 0 in every column (including
# the sin/cos hour columns) and 0 as its target — confirm this is intended.
# NOTE(review): pairs are consecutive entries of the sorted timestamp list; if
# the data has gaps, the target is not necessarily one hour ahead — verify.
features = []
targets = []

# DataFrame slice per timestamp, plus the timestamps in ascending order.
groups = dict(iter(df.groupby("transit_timestamp")))
timestamps = sorted(groups.keys())

feature_columns = ("ridership_norm", "sin_hour", "cos_hour")
snapshot_pairs = zip(timestamps[:-1], timestamps[1:])

for t0, t1 in tqdm(
        snapshot_pairs,
        total=len(timestamps) - 1,
        desc="Building graph snapshots"
    ):

    g0, g1 = groups[t0], groups[t1]

    # Node ids observed at the input timestamp and at the target timestamp.
    idx0 = torch.tensor(g0["node_id"].values)
    idx1 = torch.tensor(g1["node_id"].values)

    X = torch.zeros(num_nodes, 3)
    for col_idx, col_name in enumerate(feature_columns):
        X[idx0, col_idx] = torch.tensor(g0[col_name].values.astype(np.float32))

    y = torch.zeros(num_nodes)
    y[idx1] = torch.tensor(g1["ridership_norm"].values.astype(np.float32))

    features.append(X)
    targets.append(y)

Building graph snapshots: 100%|██████████| 43101/43101 [00:11<00:00, 3796.32it/s]


In [57]:
import torch.nn as nn
from torch_geometric.nn import GCNConv

class GNN(nn.Module):
    """Two-layer graph convolutional network with a per-node scalar output.

    Parameters
    ----------
    in_dim : int
        Number of input features per node.
    hidden_dim : int
        Width of both graph-convolution layers.
    """

    def __init__(self, in_dim, hidden_dim):
        super().__init__()
        self.conv1 = GCNConv(in_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        # Linear read-out applied to every node independently.
        self.mlp   = nn.Linear(hidden_dim, 1)

    def forward(self, x, edge_index):
        """Return one prediction per node.

        Parameters
        ----------
        x : torch.Tensor
            Node feature matrix; one row per node, ``in_dim`` columns.
        edge_index : torch.Tensor
            Edge list passed through unchanged to both GCNConv layers.

        Returns
        -------
        torch.Tensor
            Tensor with the trailing size-1 output dimension removed, i.e. one
            value per node.
        """
        h = torch.relu(self.conv1(x, edge_index))
        h = torch.relu(self.conv2(h, edge_index))
        # Squeeze only the last (size-1) dimension. A bare .squeeze() would
        # also drop the node dimension for a single-node graph and return a
        # 0-dim scalar.
        return self.mlp(h).squeeze(-1)

In [58]:
# Load IPython's autoreload extension and switch it to mode 2, which re-imports
# modules before each cell execution so edits to imported .py files are picked
# up without restarting the kernel.
# NOTE(review): no project-local module import is visible in this part of the
# notebook — confirm this is still needed; config magics normally sit at the top.
%load_ext autoreload
%autoreload 2

In [59]:
# Display whether PyTorch can see a CUDA device. The value is only shown as the
# cell output; it is not stored or used to select a device here.
torch.cuda.is_available()

True

In [63]:
# Train the GNN to predict the next snapshot's normalised ridership from the
# current snapshot, one graph snapshot per optimisation step.
model = GNN(in_dim=3, hidden_dim=64)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.MSELoss()

# Create the checkpoint directory before training starts: otherwise the first
# torch.save (after two full epochs) fails if the folder does not exist.
checkpoint_path = '../models/ichack_{}.pt'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

model.train()

for epoch in range(10):
    total_loss = 0

    for X, y in tqdm(zip(features, targets), total=len(features)):
        optimizer.zero_grad()

        y_hat = model(X, edge_tensor)
        loss = loss_fn(y_hat, y)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean loss per snapshot; max(..., 1) avoids a ZeroDivisionError when
    # `features` is empty.
    print(f"Epoch {epoch}: loss = {total_loss/max(len(features), 1):.4f}")
    # Checkpoint on odd epochs (1, 3, 5, 7, 9), which includes the final epoch.
    if epoch % 2 != 0:
        torch.save(model.state_dict(), checkpoint_path.format(epoch))

100%|██████████| 43101/43101 [01:22<00:00, 521.23it/s]


Epoch 0: loss = 0.1851


100%|██████████| 43101/43101 [01:22<00:00, 520.99it/s]


Epoch 1: loss = 0.1380


100%|██████████| 43101/43101 [01:22<00:00, 520.47it/s]


Epoch 2: loss = 0.1341


100%|██████████| 43101/43101 [01:22<00:00, 520.15it/s]


Epoch 3: loss = 0.1314


100%|██████████| 43101/43101 [01:22<00:00, 520.59it/s]


Epoch 4: loss = 0.1299


100%|██████████| 43101/43101 [01:22<00:00, 520.52it/s]


Epoch 5: loss = 0.1290


100%|██████████| 43101/43101 [01:22<00:00, 519.67it/s]


Epoch 6: loss = 0.1282


100%|██████████| 43101/43101 [01:22<00:00, 519.86it/s]


Epoch 7: loss = 0.1278


100%|██████████| 43101/43101 [01:22<00:00, 520.26it/s]


Epoch 8: loss = 0.1274


100%|██████████| 43101/43101 [01:22<00:00, 519.94it/s]


Epoch 9: loss = 0.1270
