# Sankey Builder

This notebook assumes you have done steps 1 and 2 and have created the final_data spreadsheet in order to start building the sankey.json.

## Purpose

Purpose of this notebook is to create the sankey json to feed into the front-end.

In [5]:
import pandas as pd
import json
from pathlib import Path


# Helper function to build out the sankey data

def build_tree(sub_df: pd.DataFrame, cols, value_col="value", root_name="Root", round_to=8):
    """
    Build a nested dict {"name": root_name, "children":[...]} for Sankey.
    - Each row contributes its value to a path formed by non-null labels in `cols`.
    - Internal nodes get "children"; leaves get {"name": ..., "amount": ...}.
    """
    def make_node():
        return {"__children": {}, "__amount": 0.0}

    root = make_node()

    for _, row in sub_df.iterrows():
        amt = float(row[value_col])
        if not amt:
            continue

        # Path from chosen columns, skipping nulls
        path = []
        for c in cols:
            val = row.get(c)
            if pd.notna(val):
                path.append(str(val))

        # Accumulate down the trie
        node = root
        node["__amount"] += amt
        for label in path:
            if label not in node["__children"]:
                node["__children"][label] = make_node()
            node = node["__children"][label]
            node["__amount"] += amt

    # Collapse trie -> Sankey schema
    def collapse(node, name):
        if node["__children"]:
            return {
                "name": name,
                "children": [
                    collapse(child_node, child_name)
                    for child_name, child_node in node["__children"].items()
                ],
            }
        else:
            return {"name": name, "amount": round(node["__amount"], round_to)}

    return {
        "name": root_name,
        "children": [
            collapse(child_node, child_name)
            for child_name, child_node in root["__children"].items()
        ],
    }

# ========= Config =========
excel_path = Path("./processed_data/final_data.xlsx")  # <-- change if needed
sheet_name = "Sheet1"
value_cols  = ["2023_24","2024_25"]


In [6]:

# Hierarchy (left -> right). This is the flipped order you asked for:
# Category first, then the specific source, then capital/op, then transfer.
# You can tweak this list if you want a different grouping precedence.
column_order = [
    "sankey_2",
    "sankey_3",
    "sankey_4",
]

# ========= Load =========
for value_col in value_cols:
    df = pd.read_excel(excel_path, sheet_name=sheet_name)
    df[value_col] = pd.to_numeric(df[value_col], errors="coerce").fillna(0)
    # Split revenue vs spending
    rev_df = df[df["sankey_1"] == "revenue"].copy()
    exp_df = df[df["sankey_1"] == "spending"].copy()
    # Build both sides
    revenue_data  = build_tree(rev_df, column_order, value_col=value_col, root_name="Revenue")
    spending_data = build_tree(exp_df, column_order, value_col=value_col, root_name="Spending")
    total_spend = round(float(exp_df[value_col].sum()), 8)
    total_revenue = round(float(rev_df[value_col].sum()), 8)

    # Compose output
    out = {
        "total":    total_revenue - total_spend,
        "spending": total_spend,
        "revenue":  total_revenue,
        "spending_data": spending_data,
        "revenue_data":  revenue_data,
    }
    # ========= Save & Preview =========
    out_path = Path("../"+ value_col + "/sankey.json")  # rename if you like
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("Wrote:", out_path)
    print("Totals:", {"revenue": out["revenue"], "spending": out["spending"], "total": out["total"]})

Wrote: ..\2023_24\sankey.json
Totals: {'revenue': 208.975, 'spending': 209.668, 'total': -0.693000000000012}
Wrote: ..\2024_25\sankey.json
Totals: {'revenue': 226.161, 'spending': 227.251, 'total': -1.0900000000000034}
