This project takes a CSV export of transactions from [Mint, Intuit's expense-tracking app](https://mint.intuit.com) and summarizes the cash flow as a Sankey diagram. Specifically, it:
* exports a text file that can be used to generate a Sankey diagram at [SankeyMatic](https://sankeymatic.com/build)
* displays a Sankey plot using Python's Plotly library (not as beautiful, in my opinion)

In [None]:
# Import dependencies
import yaml
import pandas as pd

TRANSACTION_CSV = ""
CATEGORIES_YAML = ""

# Read the exported transactions into a dataframe
df = pd.read_csv(TRANSACTION_CSV)

# Build a lookup that normalizes the 'Category' column of the dataset.
# A 'Category' value in the transaction csv may name either a main category or
# a sub category; the lookup resolves both to a (Main Category, Sub Category) pair:
#   Shopping (main category) -> (Shopping, None)
#   Clothing (sub category)  -> (Shopping, Clothing)
category_map = {}
with open(CATEGORIES_YAML, "r") as rstream:
    category_tree = yaml.safe_load(rstream)

for main, subs in category_tree.items():
    # The main category always maps to itself, with no sub category
    category_map[main] = (main, None)
    # A falsy `subs` (e.g. a main category listed without sub categories) adds nothing
    for sub_category in subs or ():
        category_map[sub_category] = (main, sub_category)


In [None]:
# Add "Main Category" and "Sub Category" columns to every row (transaction) using category_map.
#
# The "Category" column is iterated directly instead of using a row-wise
# df.apply(..., axis=1): on a dataframe with no rows, a row-wise apply returns a
# DataFrame rather than a Series, which cannot be assigned to a single column.

# Fail early and name every category the map does not cover, instead of stopping
# at the first unmapped row with a bare KeyError.
unmapped_categories = sorted(
    {
        str(category)
        for category in df["Category"].unique()
        if category not in category_map
    }
)
if unmapped_categories:
    raise KeyError(
        f"Categories missing from the category map: {unmapped_categories}"
    )

# One (Main Category, Sub Category) pair per transaction, in row order
resolved_categories = [category_map[category] for category in df["Category"]]

df["Main Category"] = [
    main_category for main_category, sub_category in resolved_categories
]

# Transactions filed directly under a main category have no sub category (None in
# category_map); label those "<Main Category> (uncategorized)".
df["Sub Category"] = [
    f"{main_category} (uncategorized)" if sub_category is None else sub_category
    for main_category, sub_category in resolved_categories
]


In [None]:
# Filter out unwanted categories and accounts

# Rows whose "Account Name" is in this (currently empty) list are dropped
is_unwanted_accounts = df["Account Name"].isin([])
# Rows whose "Main Category" is in this list are dropped
is_unwanted_categories = df["Main Category"].isin(["Transfer"])

# Take an explicit copy of the filtered rows: the filtered dataframe is modified
# in place afterwards (debit amounts are negated), and writing into the result of
# a boolean selection otherwise triggers pandas' SettingWithCopyWarning.
df = df[~is_unwanted_accounts & ~is_unwanted_categories].copy()


In [None]:
# Aggregate the transaction amounts per Main Category and per Sub Category

# Flip the sign of debit amounts so the sums below are accurate
is_debit = df["Transaction Type"] == "debit"
df.loc[is_debit, "Amount"] = df.loc[is_debit, "Amount"] * -1


grouped_by_main = df.groupby("Main Category", dropna=False, as_index=False)
sums_by_main_category = grouped_by_main["Amount"].sum(numeric_only=True)


# Note: "Main Category" is kept as a grouping key next to "Sub Category" for
# convenience: each sub-category row keeps its parent category, which is needed
# later when generating the sankey input
grouped_by_sub = df.groupby(
    ["Sub Category", "Main Category"], dropna=False, as_index=False
)
sums_by_subcategory = grouped_by_sub["Amount"].sum(numeric_only=True)


# Bare expression: displays the per-sub-category sums when run as a notebook cell
sums_by_subcategory


In [None]:
# Transform the dataframe to Sankey Diagrams

import plotly.graph_objects as go

# Main categories treated as inflows; every other main category is an outflow
INFLOW_CATEGORIES = ["Income", "Taxes"]

# Split the per-main-category sums into inflow rows and outflow rows
is_inflow_main = sums_by_main_category["Main Category"].isin(INFLOW_CATEGORIES)
inflows = sums_by_main_category[is_inflow_main]
outflows = sums_by_main_category[~is_inflow_main]

# Only sub categories whose parent is an outflow category are kept
is_inflow_sub = sums_by_subcategory["Main Category"].isin(INFLOW_CATEGORIES)
outflows_subcategory = sums_by_subcategory[~is_inflow_sub]


def cashflow_to_sankeymatic(
    inflow_sums=None, outflow_sums=None, outflow_subcategory_sums=None
):
    """Render the cash flow as SankeyMatic input text.

    SankeyMatic (https://sankeymatic.com/build) uses a text-based format with
    one flow per line: ``Source [amount] Sink``. Amounts are written as
    absolute values rounded to 2 decimals.

    Args:
        inflow_sums: Dataframe with "Main Category" and "Amount" columns, one
            row per inflow category. Defaults to the module-level ``inflows``.
        outflow_sums: Dataframe with "Main Category" and "Amount" columns, one
            row per outflow category. Defaults to the module-level ``outflows``.
        outflow_subcategory_sums: Dataframe with "Main Category",
            "Sub Category" and "Amount" columns, one row per outflow sub
            category. Defaults to the module-level ``outflows_subcategory``.

    Returns:
        All flow lines as one string, each line newline-terminated; an empty
        string when there are no flows.
    """
    # Fall back to the notebook's module-level dataframes when not given explicitly
    if inflow_sums is None:
        inflow_sums = inflows
    if outflow_sums is None:
        outflow_sums = outflows
    if outflow_subcategory_sums is None:
        outflow_subcategory_sums = outflows_subcategory

    def displayed(amount):
        # Flow direction is carried by source/sink, so only the magnitude is written
        return abs(round(amount, 2))

    lines = []

    # Layer 1: Route all inflows: <inflow> -> Net Income
    lines.extend(
        f"{mc} [{displayed(amount)}] Net Income\n"
        for mc, amount in zip(inflow_sums["Main Category"], inflow_sums["Amount"])
    )

    # Layer 2: Route outflows (main category): Net Income -> <main category>
    lines.extend(
        f"Net Income [{displayed(amount)}] {mc}\n"
        for mc, amount in zip(outflow_sums["Main Category"], outflow_sums["Amount"])
    )

    # Layer 3: Route outflows (sub category) under each main category:
    # <main category> -> <sub category>
    lines.extend(
        f"{mc} [{displayed(amount)}] {sc}\n"
        for mc, sc, amount in zip(
            outflow_subcategory_sums["Main Category"],
            outflow_subcategory_sums["Sub Category"],
            outflow_subcategory_sums["Amount"],
        )
    )

    # Join once at the end instead of growing a string with += in every iteration
    return "".join(lines)


def cashflow_to_plotly():
    """Display the cash flow as a Plotly Sankey diagram.

    Reads the module-level ``inflows``, ``outflows`` and
    ``outflows_subcategory`` dataframes, turns every row into a link between
    two labelled nodes (amounts as absolute values rounded to 2 decimals) and
    shows the resulting figure. Returns nothing.
    """
    # Node 0 is always "Net Income"; other nodes get the next free index the
    # first time their label is seen.
    node_labels = ["Net Income"]
    node_ids = {"Net Income": 0}

    def node_id(label):
        found = node_ids.get(label)
        if found is None:
            found = len(node_labels)
            node_labels.append(label)
            node_ids[label] = found
        return found

    # Each link is a (source node, target node, amount) triple
    links = []

    # Layer 1: Route all inflows: <inflow> -> Net Income
    for _, flow in inflows.iterrows():
        origin = node_id(flow["Main Category"])
        magnitude = abs(round(flow["Amount"], 2))
        links.append((origin, 0, magnitude))

    # Layer 2: Route outflows (main category): Net Income -> <main category>
    for _, flow in outflows.iterrows():
        magnitude = abs(round(flow["Amount"], 2))
        destination = node_id(flow["Main Category"])
        links.append((0, destination, magnitude))

    # Layer 3: Route outflows (sub category) under each main category:
    # <main category> -> <sub category>
    for _, flow in outflows_subcategory.iterrows():
        origin = node_id(flow["Main Category"])
        magnitude = abs(round(flow["Amount"], 2))
        destination = node_id(flow["Sub Category"])
        links.append((origin, destination, magnitude))

    sankey_trace = go.Sankey(
        arrangement="fixed",
        node={
            "pad": 15,
            "thickness": 5,
            "line": {"color": "black", "width": 0.5},
            "label": node_labels,
            "color": "blue",
        },
        link={
            "source": [origin for origin, _, _ in links],
            "target": [destination for _, destination, _ in links],
            "value": [magnitude for _, _, magnitude in links],
        },
    )

    fig = go.Figure(data=[sankey_trace])
    fig.update_layout(title_text="Sankey Diagram of Cash Flow", width=1000, height=1200)
    fig.show()


# Save the SankeyMatic output to a text file; its contents are the text input to
# feed into SankeyMatic (if you want that option)
sankey = cashflow_to_sankeymatic()
# Write UTF-8 explicitly so category names containing non-ASCII characters do not
# depend on the platform's default text encoding
with open("sankeymatic-input.txt", "w", encoding="utf-8") as wstream:
    wstream.write(sankey)

# Also display a Sankey diagram locally using plotly
cashflow_to_plotly()
