In [1]:
import pandas as pd
import os

os.chdir("..")
os.getcwd()


'/Users/suryayalavarthi/Downloads/household_finance_lab'

In [2]:
df_tx = pd.read_csv("data/cleaned/transactions_clean.csv")
df_tx.head()


Unnamed: 0,transaction_id,account_id,date,amount,merchant,raw_category
0,bdd640fb-0667-4ad1-9c80-317fa3b1799d,IUPM04409079772781,2023-11-05 15:54:38,3198.94,houston group,transport
1,23b8c1e9-3924-46de-beb1-3b9046685257,BLAT22216107051843,2024-04-21 22:21:55,129.93,anderson-phillips,grocery
2,bd9c66b3-ad3c-4d6d-9a3d-1fa7bc8960a9,UTXA55295806601382,2023-07-17 13:25:56,1378.77,jensen group,shopping
3,972a8469-1641-4f82-8b9d-2434e465e150,XICF70493862044851,2023-06-27 16:09:52,1119.94,"nelson, gomez and rodriguez",healthcare
4,17fc695a-07a0-4a6e-8822-e8f36c031199,KOSW19711121259020,2024-03-26 23:45:31,3683.67,caldwell group,entertainment


In [3]:
df_tx.columns
df_tx.shape


(5389, 6)

In [4]:
def categorize_spending(merchant, category):
    text = f"{merchant} {category}".lower()

    if any(k in text for k in ["rent", "mortgage"]):
        return "housing"
    elif any(k in text for k in ["grocery", "walmart", "kroger", "restaurant", "food"]):
        return "food"
    elif any(k in text for k in ["gas", "uber", "lyft", "transport"]):
        return "transport"
    elif any(k in text for k in ["electric", "internet", "utility"]):
        return "utilities"
    elif any(k in text for k in ["hospital", "pharmacy", "medical"]):
        return "healthcare"
    elif any(k in text for k in ["netflix", "spotify", "movie", "entertainment"]):
        return "entertainment"
    elif any(k in text for k in ["amazon", "shopping", "retail"]):
        return "shopping"
    else:
        return "other"


In [5]:
df_tx["spend_category"] = df_tx.apply(
    lambda row: categorize_spending(row["merchant"], row["raw_category"]),
    axis=1
)

df_tx["spend_category"].value_counts()


spend_category
other            3469
food              744
entertainment     407
shopping          389
transport         380
Name: count, dtype: int64

In [6]:
needs_categories = [
    "housing",
    "food",
    "transport",
    "utilities",
    "healthcare"
]

df_tx["needs_or_wants"] = df_tx["spend_category"].apply(
    lambda x: "needs" if x in needs_categories else "wants"
)

df_tx["needs_or_wants"].value_counts()


needs_or_wants
wants    4265
needs    1124
Name: count, dtype: int64

In [7]:
fixed_categories = [
    "housing",
    "utilities",
    "healthcare"
]

df_tx["fixed_or_variable"] = df_tx["spend_category"].apply(
    lambda x: "fixed" if x in fixed_categories else "variable"
)

df_tx["fixed_or_variable"].value_counts()


fixed_or_variable
variable    5389
Name: count, dtype: int64

In [8]:
df_tx[["spend_category", "needs_or_wants", "fixed_or_variable"]].head(10)


Unnamed: 0,spend_category,needs_or_wants,fixed_or_variable
0,transport,needs,variable
1,food,needs,variable
2,shopping,wants,variable
3,other,wants,variable
4,entertainment,wants,variable
5,other,wants,variable
6,other,wants,variable
7,transport,needs,variable
8,entertainment,wants,variable
9,other,wants,variable


In [9]:
expense_by_category = (
    df_tx.groupby("spend_category")["amount"]
    .sum()
    .sort_values()
)

expense_by_category


spend_category
shopping          952695.92
transport         961520.24
entertainment    1016578.19
food             1940103.98
other            8626656.21
Name: amount, dtype: float64

In [10]:
fixed_vs_variable = (
    df_tx.groupby("fixed_or_variable")["amount"]
    .sum()
)

fixed_vs_variable


fixed_or_variable
variable    13497554.54
Name: amount, dtype: float64

In [11]:
total_income = df_tx[df_tx["amount"] > 0]["amount"].sum()
total_expense = abs(df_tx[df_tx["amount"] < 0]["amount"].sum())

savings_rate = round((total_income - total_expense) / total_income * 100, 2)

total_income, total_expense, savings_rate


(13497554.540000001, 0.0, 100.0)

In [12]:
df_debts = pd.read_csv("data/cleaned/debts_clean.csv")
df_debts


Unnamed: 0,debt_id,account_id,current_balance,interest_rate,minimum_payment
0,D1,IUPM04409079772781,5200.0,22.9,160.0
1,D2,BLAT22216107051843,12400.0,7.2,275.0
2,D3,UTXA55295806601382,8600.0,9.9,210.0
3,D4,XICF70493862044851,3100.0,19.5,95.0
4,D5,KOSW19711121259020,17800.0,5.9,320.0


In [13]:
total_debt = df_debts["current_balance"].sum()
debt_burden_ratio = round(total_debt / total_income * 100, 2)

total_debt, debt_burden_ratio


(47100.0, 0.35)

In [14]:
diversification_score = min(len(df_tx["spend_category"].unique()) * 10, 100)
diversification_score


50

In [15]:
savings_score = min(max(savings_rate, 0), 100)
debt_score = max(0, 100 - debt_burden_ratio)

financial_wellness_score = round(
    0.4 * savings_score +
    0.3 * diversification_score +
    0.3 * debt_score,
    2
)

financial_wellness_score


84.9