# Sankey Builder

The goal is to get Montreal's financial data from the raw PDF files into the same structure used by Canada Spends' Sankey data (<a href="https://github.com/BuildCanada/CanadaSpends/blob/main/data/ontario/sankey.json">see here</a>).

The Excel file was built by scraping the PDF data of the financial statements.

This script is used to build out the sankey json structure.

In [12]:
import pandas as pd
import json
from pathlib import Path

# ========= Config =========
# Workbook to read; a relative path, so it is resolved against the working directory.
excel_path = Path("./montreal_financial_statement_data_2024.xlsx")  # <-- change if needed
# Worksheet inside the workbook that is loaded.
sheet_name = "cleaned_data"
# Name of the column holding the amounts; it is coerced to numeric on load.
value_col  = "value"


In [None]:
# Sankey levels in nesting order: "sankey_1" is the outermost level and
# "sankey_5" the innermost. build_tree() walks the columns in this order,
# so reordering the list changes how rows are grouped.
column_order = ["sankey_1", "sankey_2", "sankey_3", "sankey_4", "sankey_5"]

# ========= Load =========
# Read the sheet, then force the value column to numeric; anything that
# cannot be parsed as a number is counted as 0.
df = pd.read_excel(excel_path, sheet_name=sheet_name).assign(
    **{value_col: lambda frame: pd.to_numeric(frame[value_col], errors="coerce").fillna(0)}
)

In [14]:
# Preview the first rows to confirm the sheet loaded as expected.
df.head()

Unnamed: 0,sankey_5,sankey_4,sankey_3,sankey_2,sankey_1,value,source
0,,,,Taxes,revenue,4174298,
1,,,,Payments in lieu of taxes,revenue,287923,
2,,,,Quota shares,revenue,544237,
3,,,,Service rendered,revenue,2155564,
4,,,,Fee Collection,revenue,416493,


# Data Cleaning

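We need to clean the data a little bit. The values are reported in thousands of dollars, where Ontario's was in millions. Note that the cell below divides by 1,000,000, which converts thousands of dollars into billions of dollars (dividing by 1,000 would give millions), so confirm which unit the target Sankey file expects.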

In [15]:
# Source figures are in thousands of dollars, so dividing by 1,000,000
# expresses them in billions of dollars.
# NOTE: this rescales `df` in place, so re-running this cell divides again;
# re-run the load cell first to start over from the raw values.
THOUSANDS_PER_BILLION = 1_000_000
# Use the configured column name rather than a hard-coded 'value' so this step
# stays in sync with the load cell if `value_col` is ever changed.
df[value_col] = df[value_col] / THOUSANDS_PER_BILLION
df.head()

Unnamed: 0,sankey_5,sankey_4,sankey_3,sankey_2,sankey_1,value,source
0,,,,Taxes,revenue,4.174298,
1,,,,Payments in lieu of taxes,revenue,0.287923,
2,,,,Quota shares,revenue,0.544237,
3,,,,Service rendered,revenue,2.155564,
4,,,,Fee Collection,revenue,0.416493,


In [20]:
# Partition the rows by their top-level flow ("sankey_1") into independent copies,
# so later edits to either subset cannot touch `df`.
is_revenue = df["sankey_1"].eq("revenue")
is_spending = df["sankey_1"].eq("spending")
rev_df = df.loc[is_revenue].copy()
exp_df = df.loc[is_spending].copy()

In [21]:
def build_tree(sub_df: pd.DataFrame, cols, value_col="value", root_name="Root", round_to=3,
               residual_name="Other"):
    """
    Build a nested dict {"name": root_name, "children": [...]} for Sankey.

    Each row adds its value to the path formed by the non-null labels found in
    `cols`, read left to right. Internal nodes become
    {"name": ..., "children": [...]}; leaves become {"name": ..., "amount": ...}.

    Parameters
    ----------
    sub_df : pd.DataFrame
        Rows to aggregate.
    cols : sequence of str
        Label columns in nesting order (outermost first). A column that is
        absent from `sub_df` is treated as null for every row.
    value_col : str
        Column holding the numeric amount. Rows whose amount is zero or
        missing (NaN/None) are skipped.
    root_name : str
        Name given to the top-level node.
    round_to : int
        Number of decimals kept on leaf amounts.
    residual_name : str
        Name of the leaf that carries amounts attached directly to a node that
        also has children (for example a row labelled only "A" next to a row
        labelled "A" > "B"), or to rows with no labels at all. Without this
        leaf those amounts would vanish from the output, because only leaves
        carry an "amount".

    Returns
    -------
    dict
        The root node; it always has a "children" list (possibly empty).
        Children appear in first-seen order.
    """
    def make_node():
        # "__amount": everything routed through the node.
        # "__own": the part from rows whose path stops exactly at the node.
        return {"__children": {}, "__amount": 0.0, "__own": 0.0}

    root = make_node()

    for _, row in sub_df.iterrows():
        raw = row[value_col]
        # A NaN is truthy as a float, so it must be filtered explicitly or it
        # would turn every ancestor total into NaN.
        if pd.isna(raw):
            continue
        amt = float(raw)
        if not amt:
            continue

        # Path from chosen columns, skipping nulls
        path = []
        for c in cols:
            val = row.get(c)
            if pd.notna(val):
                path.append(str(val))

        # Accumulate down the trie
        node = root
        node["__amount"] += amt
        for label in path:
            if label not in node["__children"]:
                node["__children"][label] = make_node()
            node = node["__children"][label]
            node["__amount"] += amt
        # Remember where the row ended so the amount survives if this node
        # later turns out to have children as well.
        node["__own"] += amt

    def fold_residual(node):
        # Move the node's own amount into a child named `residual_name`
        # (reusing an existing child with that name if there is one).
        residual = node["__children"].setdefault(residual_name, make_node())
        residual["__amount"] += node["__own"]
        residual["__own"] += node["__own"]
        node["__own"] = 0.0

    # Collapse trie -> Sankey schema
    def collapse(node, name):
        if not node["__children"]:
            return {"name": name, "amount": round(node["__amount"], round_to)}
        if node["__own"]:
            fold_residual(node)
        return {
            "name": name,
            "children": [
                collapse(child_node, child_name)
                for child_name, child_node in node["__children"].items()
            ],
        }

    # The root is always emitted with "children", so rows without any label
    # are surfaced through the residual leaf as well.
    if root["__own"]:
        fold_residual(root)

    return {
        "name": root_name,
        "children": [
            collapse(child_node, child_name)
            for child_name, child_node in root["__children"].items()
        ],
    }

In [22]:
# Build both sides
# NOTE(review): `column_order` starts with "sankey_1", whose value is the same
# ("revenue" or "spending") for every row of each subset, so each tree gets a
# single node of that name directly under its root. Confirm that extra level
# is wanted in the output.
revenue_data  = build_tree(rev_df, column_order, value_col=value_col, root_name="Revenue")
spending_data = build_tree(exp_df, column_order, value_col=value_col, root_name="Spending")
total_spend = round(float(exp_df[value_col].sum()), 3)
total_revenue = round(float(rev_df[value_col].sum()), 3)

# Compose output
out = {
    # Round the difference as well: subtracting two rounded floats can leave
    # binary noise such as -0.8879999999999999 instead of -0.888.
    "total":    round(total_revenue - total_spend, 3),
    "spending": total_spend,
    "revenue":  total_revenue,
    "spending_data": spending_data,
    "revenue_data":  revenue_data,
}

In [23]:
# ========= Save & Preview =========
out_path = Path("../sankey.json")  # rename if you like
# Serialise with non-ASCII characters kept as-is and write the file as UTF-8.
out_path.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")

print("Wrote:", out_path)
totals_preview = {key: out[key] for key in ("revenue", "spending", "total")}
print("Totals:", totals_preview)


Wrote: ..\sankey.json
Totals: {'revenue': 9.804, 'spending': 10.692, 'total': -0.8879999999999999}
