In [2]:
import numpy as np
import pandas as pd
import os

In [3]:
# Base directory for input data files (relative to the notebook's working
# directory); file names are joined onto it with os.path.join further down.
data_dir = "../data"

In [13]:
# Load the ridership records. "transit_timestamp" is parsed into datetimes so
# the .dt accessor can be used on it later. NOTE(review): this path is relative
# to the notebook's working directory rather than data_dir — confirm intended.
df = pd.read_csv('test.csv', parse_dates=["transit_timestamp"])

Unnamed: 0,transit_timestamp,transit_mode,station_complex_id,ridership,transfers
0,2024-12-31 15:00:00,subway,127,10,0
1,2024-12-31 15:00:00,subway,127,1,0
2,2024-12-31 15:00:00,subway,127,31,2
3,2024-12-31 15:00:00,subway,127,21,0
4,2024-12-31 15:00:00,subway,127,12,0
...,...,...,...,...,...
29995,2024-12-31 23:00:00,subway,636,2,0
29996,2024-12-31 23:00:00,subway,636,9,1
29997,2024-12-31 23:00:00,subway,636,1,0
29998,2024-12-31 23:00:00,subway,636,40,0


In [15]:
def remove_commas(x):
    """Convert a ridership value to a Python ``int``, dropping thousands separators.

    Parameters
    ----------
    x : int, numpy integer, float, or str
        A raw cell value. Strings may contain commas (e.g. ``"1,234"``).

    Returns
    -------
    int
        The integer value. Plain ``int`` inputs are returned unchanged.

    Raises
    ------
    ValueError
        If ``x`` is a float that is NaN/inf or has a fractional part, or a
        string that is not an integer once commas are removed.
    """
    if isinstance(x, int):
        return x
    # NumPy integer scalars (e.g. np.int64) are not subclasses of int, so they
    # used to fall through to .replace() and raise AttributeError.
    if isinstance(x, np.integer):
        return int(x)
    # A column containing missing values is read as float; accept whole-number
    # floats and reject anything that cannot be represented as a count.
    if isinstance(x, (float, np.floating)):
        as_float = float(x)
        if not as_float.is_integer():  # also False for NaN and +/-inf
            raise ValueError(f"cannot convert {x!r} to an integer ridership count")
        return int(as_float)
    return int(x.replace(',', ''))
# Run every ridership entry through remove_commas, then force an integer dtype.
df["ridership"] = df["ridership"].map(remove_commas).astype(int)

In [16]:
# Collapse rows sharing a (timestamp, station) pair into a single row whose
# ridership is the sum over those rows; the keys stay as ordinary columns.
df = df.groupby(
    ["transit_timestamp", "station_complex_id"], as_index=False
)["ridership"].sum()

In [17]:
# Per-station statistics table (columns used below: station_complex_id, mean, std).
# The path is built from data_dir so the data location is configured in one
# place, in the same way the edge list is loaded.
stats = pd.read_csv(os.path.join(data_dir, "processed", "stats.csv"))

In [20]:
# Lookup tables keyed by station_complex_id: one for the 'mean' column of
# `stats`, one for the 'std' column.
stn_mean, stn_std = (
    dict(zip(stats['station_complex_id'], stats[column]))
    for column in ('mean', 'std')
)

In [22]:
# Standardise each row's ridership with its own station's mean and std:
#   ridership_norm = (ridership - mean[station]) / std[station]
# The per-station lookups are vectorised with Series.map instead of a row-wise
# df.apply(axis=1), which performs two dict lookups in Python for every row.
_stn_ids = df['station_complex_id']
# Series.map yields NaN for keys missing from the dict, so check explicitly to
# keep failing fast (KeyError) when a station has no statistics.
_unknown = set(_stn_ids.unique()) - (stn_mean.keys() & stn_std.keys())
if _unknown:
    raise KeyError(f"no stats for station_complex_id(s): {sorted(_unknown)}")
df['ridership_norm'] = (df['ridership'] - _stn_ids.map(stn_mean)) / _stn_ids.map(stn_std)

In [24]:
# Cyclical hour-of-day encoding: place the hour on a 24-step circle and keep
# its sine and cosine.
df["hour"] = df["transit_timestamp"].dt.hour
hour_angle = 2 * np.pi * df["hour"] / 24
df["sin_hour"] = np.sin(hour_angle)
df["cos_hour"] = np.cos(hour_angle)

In [25]:
# Mapping complex_id -> node_id, read from ComplexNodes.csv in the working
# directory and turned into a plain dict in one chained expression.
ComplexNodes = (
    pd.read_csv('ComplexNodes.csv')
      .pipe(lambda nodes: dict(zip(nodes['complex_id'], nodes['node_id'])))
)

In [26]:
# Translate each station complex id into its graph node id; a complex id that
# is absent from ComplexNodes raises KeyError.
df['node_id'] = df['station_complex_id'].map(
    lambda complex_id: ComplexNodes[complex_id]
)

In [29]:
# load edges
edges = pd.read_csv(os.path.join(data_dir, "ComplexEdges.csv"))
# Build the edge list in node-id space (not complex-id space). The first two
# columns of the CSV hold the two endpoint complex ids of each edge.
edge_tensor = []
for cmplx_start, cmplx_end in zip(edges.iloc[:, 0], edges.iloc[:, 1]):
    # Skip edges touching a complex that has no node id.
    if cmplx_start not in ComplexNodes or cmplx_end not in ComplexNodes:
        continue
    start = ComplexNodes[cmplx_start]
    end = ComplexNodes[cmplx_end]
    # Store both directions so the graph is undirected.
    edge_tensor.append([start, end])
    edge_tensor.append([end, start])

# Fix the dtype and shape explicitly: the default integer dtype of np.array is
# platform dependent, and an empty list would otherwise become a float64 array
# of shape (0,) instead of an (0, 2) index array.
edge_tensor = np.array(edge_tensor, dtype=np.int64).reshape(-1, 2)

In [30]:
import torch
# Convert the (num_edges, 2) NumPy edge list into a (2, num_edges) torch index
# tensor. The isinstance guard keeps this cell re-runnable: on a second run
# edge_tensor is already a Tensor and torch.from_numpy would raise TypeError.
if isinstance(edge_tensor, np.ndarray):
    # .T yields a strided view; make the tensor contiguous and ensure int64.
    edge_tensor = torch.from_numpy(edge_tensor.T).long().contiguous()

In [31]:
# Number of graph nodes, taken as (largest node id appearing in any edge) + 1.
# Converted to a plain Python int: edge_tensor.max() is a 0-dim tensor, and the
# value is used below as a size argument to torch.zeros.
num_nodes = int(edge_tensor.max()) + 1

In [32]:
from tqdm import tqdm

# One (features, target) pair per pair of consecutive timestamps: features come
# from the earlier timestamp, the target from the later one.
features, targets = [], []

groups = dict(iter(df.groupby("transit_timestamp")))
timestamps = sorted(groups)

snapshot_pairs = zip(timestamps[:-1], timestamps[1:])
for t0, t1 in tqdm(
        snapshot_pairs,
        total=len(timestamps) - 1,
        desc="Building graph snapshots"
    ):
    g0, g1 = groups[t0], groups[t1]

    # Node features at t0; nodes without a row at t0 keep an all-zero vector.
    X = torch.zeros(num_nodes, 3)
    idx0 = torch.tensor(g0["node_id"].values)
    for col_pos, col_name in enumerate(("ridership_norm", "sin_hour", "cos_hour")):
        X[idx0, col_pos] = torch.tensor(g0[col_name].values.astype(np.float32))

    # Target at t1; nodes without a row at t1 keep a target of zero.
    y = torch.zeros(num_nodes)
    idx1 = torch.tensor(g1["node_id"].values)
    y[idx1] = torch.tensor(g1["ridership_norm"].values.astype(np.float32))

    features.append(X)
    targets.append(y)

Building graph snapshots: 100%|██████████| 8/8 [00:00<00:00, 2129.36it/s]


In [33]:
import torch.nn as nn
from torch_geometric.nn import GCNConv

class GNN(nn.Module):
    """Two-layer graph convolutional network with a linear per-node read-out.

    Parameters
    ----------
    in_dim : int
        Number of input features per node.
    hidden_dim : int
        Width of both GCNConv layers.
    """

    def __init__(self, in_dim, hidden_dim):
        super().__init__()
        self.conv1 = GCNConv(in_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.mlp   = nn.Linear(hidden_dim, 1)

    def forward(self, x, edge_index):
        """Return one scalar prediction per node.

        Parameters
        ----------
        x : torch.Tensor
            Node feature matrix whose last dimension is ``in_dim``.
        edge_index : torch.Tensor
            Graph connectivity, passed through to both GCNConv layers.

        Returns
        -------
        torch.Tensor
            Predictions with the trailing size-1 dimension removed.
        """
        h = torch.relu(self.conv1(x, edge_index))
        h = torch.relu(self.conv2(h, edge_index))
        # Squeeze only the last dimension: a bare .squeeze() would also drop the
        # node dimension for a one-node graph and return a 0-dim scalar.
        return self.mlp(h).squeeze(-1)

  from .autonotebook import tqdm as notebook_tqdm


In [34]:
# in_dim=3 matches the three per-node features built above (ridership_norm,
# sin_hour, cos_hour). hidden_dim presumably has to match the checkpoint loaded
# in the next cell — verify against the training configuration.
model = GNN(in_dim=3, hidden_dim=64)

In [35]:
# Restore the trained weights. map_location="cpu" lets a checkpoint that was
# saved from a GPU be loaded on a CPU-only machine.
# NOTE(security): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True on torch versions that support it).
model.load_state_dict(torch.load('../models/ichack_9.pt', map_location="cpu"))

<All keys matched successfully>

In [37]:
# Evaluate the model on every snapshot and report the mean of the per-snapshot
# MSE values. Gradients are disabled because nothing is trained here.
loss_fn = nn.MSELoss()
model.eval()
total_loss = 0
with torch.no_grad():
    for X, y in tqdm(zip(features, targets), total=len(features)):
        y_hat = model(X, edge_tensor)
        total_loss += loss_fn(y_hat, y).item()

mean_mse = total_loss / len(features)
print(f"MSE = {mean_mse:.4f}")

100%|██████████| 8/8 [00:00<00:00, 1024.59it/s]

MSE = 0.1304



