In [1]:
import pandas as pd
import numpy as np
import os

os.chdir("..")


In [2]:
df_tx = pd.read_csv("data/cleaned/transactions_clean.csv")
df_debts = pd.read_csv("data/cleaned/debts_clean.csv")


In [3]:
df_tx.shape, df_debts.shape


((5389, 6), (5, 5))

In [4]:
df_tx["date"] = pd.to_datetime(df_tx["date"])
df_tx["month"] = df_tx["date"].dt.to_period("M")


In [5]:
monthly_cashflow = (
    df_tx.groupby("month")
    .agg(
        income=("amount", lambda x: x[x > 0].sum()),
        expenses=("amount", lambda x: abs(x[x < 0].sum()))
    )
    .reset_index()
)

monthly_cashflow


Unnamed: 0,month,income,expenses
0,2023-01,170478.36,0.0
1,2023-02,539168.83,0.0
2,2023-03,568602.98,0.0
3,2023-04,608623.38,0.0
4,2023-05,493361.16,0.0
5,2023-06,505794.42,0.0
6,2023-07,591276.01,0.0
7,2023-08,698684.18,0.0
8,2023-09,491826.0,0.0
9,2023-10,565523.14,0.0


In [6]:
avg_monthly_income = monthly_cashflow["income"].mean()
avg_monthly_expense = monthly_cashflow["expenses"].mean()

avg_monthly_income, avg_monthly_expense


(539902.1816, 0.0)

In [9]:
df_tx.columns


Index(['transaction_id', 'account_id', 'date', 'amount', 'merchant',
       'raw_category', 'month'],
      dtype='object')

In [10]:
def categorize_spending(merchant, category):
    text = f"{merchant} {category}".lower()

    if any(k in text for k in ["rent", "mortgage"]):
        return "housing"
    elif any(k in text for k in ["grocery", "walmart", "kroger", "restaurant", "food"]):
        return "food"
    elif any(k in text for k in ["gas", "uber", "lyft", "transport"]):
        return "transport"
    elif any(k in text for k in ["electric", "internet", "utility"]):
        return "utilities"
    elif any(k in text for k in ["hospital", "pharmacy", "medical"]):
        return "healthcare"
    elif any(k in text for k in ["netflix", "spotify", "movie", "entertainment"]):
        return "entertainment"
    elif any(k in text for k in ["amazon", "shopping", "retail"]):
        return "shopping"
    else:
        return "other"

df_tx["spend_category"] = df_tx.apply(
    lambda row: categorize_spending(row["merchant"], row["raw_category"]),
    axis=1
)


In [11]:
fixed_categories = ["housing", "utilities", "healthcare"]

df_tx["fixed_or_variable"] = df_tx["spend_category"].apply(
    lambda x: "fixed" if x in fixed_categories else "variable"
)


In [12]:
df_tx[["spend_category", "fixed_or_variable"]].head()


Unnamed: 0,spend_category,fixed_or_variable
0,transport,variable
1,food,variable
2,shopping,variable
3,other,variable
4,entertainment,variable


In [13]:
fixed_variable_monthly = (
    df_tx.groupby(["month", "fixed_or_variable"])["amount"]
    .sum()
    .unstack(fill_value=0)
    .reset_index()
)

fixed_variable_monthly["fixed"] = abs(fixed_variable_monthly.get("fixed", 0))
fixed_variable_monthly["variable"] = abs(fixed_variable_monthly.get("variable", 0))

fixed_variable_monthly.head()


fixed_or_variable,month,variable,fixed
0,2023-01,170478.36,0
1,2023-02,539168.83,0
2,2023-03,568602.98,0
3,2023-04,608623.38,0
4,2023-05,493361.16,0


In [15]:
avg_fixed_cost = fixed_variable_monthly["fixed"].mean()
avg_variable_cost = fixed_variable_monthly["variable"].mean()

avg_fixed_cost, avg_variable_cost


(0.0, 539902.1816)

In [16]:
baseline_monthly_income = avg_monthly_income
baseline_fixed_cost = avg_fixed_cost
baseline_variable_cost = avg_variable_cost

baseline_monthly_savings = baseline_monthly_income - (
    baseline_fixed_cost + baseline_variable_cost
)

baseline_savings_rate = round(
    baseline_monthly_savings / baseline_monthly_income * 100,
    2
)

baseline_monthly_income, baseline_monthly_savings, baseline_savings_rate


(539902.1816, 0.0, 0.0)

In [17]:
income_drop_factor = 0.8  # 20% drop

stress1_income = baseline_monthly_income * income_drop_factor
stress1_savings = stress1_income - (
    baseline_fixed_cost + baseline_variable_cost
)

stress1_savings_rate = round(
    stress1_savings / stress1_income * 100,
    2
)

stress1_income, stress1_savings, stress1_savings_rate


(431921.74528000003, -107980.43631999998, -25.0)

In [18]:
expense_spike_factor = 1.3  # 30% increase

stress2_variable_cost = baseline_variable_cost * expense_spike_factor
stress2_savings = baseline_monthly_income - (
    baseline_fixed_cost + stress2_variable_cost
)

stress2_savings_rate = round(
    stress2_savings / baseline_monthly_income * 100,
    2
)

stress2_variable_cost, stress2_savings, stress2_savings_rate


(701872.83608, -161970.65448000003, -30.0)

In [19]:
stress_test_summary = pd.DataFrame([
    {
        "scenario": "baseline",
        "monthly_income": baseline_monthly_income,
        "monthly_expenses": baseline_fixed_cost + baseline_variable_cost,
        "monthly_savings": baseline_monthly_savings,
        "savings_rate_percent": baseline_savings_rate
    },
    {
        "scenario": "income_drop_20pct",
        "monthly_income": stress1_income,
        "monthly_expenses": baseline_fixed_cost + baseline_variable_cost,
        "monthly_savings": stress1_savings,
        "savings_rate_percent": stress1_savings_rate
    },
    {
        "scenario": "expense_spike_30pct",
        "monthly_income": baseline_monthly_income,
        "monthly_expenses": baseline_fixed_cost + stress2_variable_cost,
        "monthly_savings": stress2_savings,
        "savings_rate_percent": stress2_savings_rate
    }
])

stress_test_summary


Unnamed: 0,scenario,monthly_income,monthly_expenses,monthly_savings,savings_rate_percent
0,baseline,539902.1816,539902.1816,0.0,0.0
1,income_drop_20pct,431921.74528,539902.1816,-107980.43632,-25.0
2,expense_spike_30pct,539902.1816,701872.83608,-161970.65448,-30.0


In [20]:
# Reuse diversification logic from Subtask C
diversification_score = min(len(df_tx["spend_category"].unique()) * 10, 100)

# Debt burden (monthly proxy)
total_debt = df_debts["current_balance"].sum()
debt_burden_ratio = total_debt / baseline_monthly_income * 100


In [21]:
def compute_wellness_score(savings_rate, diversification_score, debt_burden_ratio):
    savings_score = min(max(savings_rate, 0), 100)
    debt_score = max(0, 100 - debt_burden_ratio)

    return round(
        0.4 * savings_score +
        0.3 * diversification_score +
        0.3 * debt_score,
        2
    )


In [22]:
stress_test_summary["wellness_score"] = stress_test_summary.apply(
    lambda row: compute_wellness_score(
        row["savings_rate_percent"],
        diversification_score,
        debt_burden_ratio
    ),
    axis=1
)

stress_test_summary


Unnamed: 0,scenario,monthly_income,monthly_expenses,monthly_savings,savings_rate_percent,wellness_score
0,baseline,539902.1816,539902.1816,0.0,0.0,42.38
1,income_drop_20pct,431921.74528,539902.1816,-107980.43632,-25.0,42.38
2,expense_spike_30pct,539902.1816,701872.83608,-161970.65448,-30.0,42.38


In [24]:
# Baseline monthly debt payment budget
# Same value used in Subtask D for consistency
monthly_budget = 800.0
from math import isnan

timeline_results = []

for scenario, budget in stress_budgets.items():
    sim = simulate_payoff(
        debts,
        monthly_budget=budget,
        strategy="avalanche"
    )

    months = sim["month"].max()
    interest = sim["total_interest_paid_to_date"].iloc[-1]

    timeline_results.append({
        "scenario": scenario,
        "monthly_budget": round(budget, 2),
        "months_to_payoff": int(months),
        "total_interest_paid": round(interest, 2)
    })

timeline_impact = pd.DataFrame(timeline_results)
timeline_impact


In [25]:
baseline_budget = monthly_budget  # from Subtask D

stress_budgets = {
    "baseline": baseline_budget,
    "income_drop_20pct": baseline_budget * 0.75,
    "expense_spike_30pct": baseline_budget * 0.7
}

stress_budgets


{'baseline': 800.0, 'income_drop_20pct': 600.0, 'expense_spike_30pct': 560.0}

In [27]:
def simulate_payoff(debts_df: pd.DataFrame, monthly_budget: float, strategy: str, max_months: int = 600):
    df = debts_df.copy()
    df["balance"] = df["current_balance"].astype(float)
    df["apr"] = df["interest_rate"].astype(float)
    df["min_pay"] = df["minimum_payment"].astype(float)

    total_interest_paid = 0.0
    rows = []

    for month in range(1, max_months + 1):
        if (df["balance"] <= 0.01).all():
            break

        # Monthly interest
        monthly_rate = (df["apr"] / 100.0) / 12.0
        interest = df["balance"].clip(lower=0) * monthly_rate
        df["balance"] += interest
        total_interest_paid += interest.sum()

        active = df["balance"] > 0.01
        min_due = df.loc[active, "min_pay"].sum()
        budget = monthly_budget

        payments = pd.Series(0.0, index=df.index)

        if budget <= min_due:
            payments.loc[active] = budget * (df.loc[active, "min_pay"] / min_due)
        else:
            payments.loc[active] = df.loc[active, "min_pay"]
            extra = budget - min_due

            candidates = df.loc[active]

            if strategy == "snowball":
                target = candidates["balance"].idxmin()
            elif strategy == "avalanche":
                target = candidates["apr"].idxmax()
            else:
                raise ValueError("Invalid strategy")

            payments.loc[target] += extra

        actual_payments = payments.clip(upper=df["balance"])
        df["balance"] -= actual_payments

        rows.append({
            "month": month,
            "total_balance": df["balance"].sum(),
            "total_interest_paid_to_date": total_interest_paid
        })

    return pd.DataFrame(rows)


In [30]:
# Load debts data (from Subtask B)
debts = pd.read_csv("data/cleaned/debts_clean.csv")

debts


Unnamed: 0,debt_id,account_id,current_balance,interest_rate,minimum_payment
0,D1,IUPM04409079772781,5200.0,22.9,160.0
1,D2,BLAT22216107051843,12400.0,7.2,275.0
2,D3,UTXA55295806601382,8600.0,9.9,210.0
3,D4,XICF70493862044851,3100.0,19.5,95.0
4,D5,KOSW19711121259020,17800.0,5.9,320.0


In [31]:
timeline_results = []

for scenario, budget in stress_budgets.items():
    sim = simulate_payoff(
        debts,
        monthly_budget=budget,
        strategy="avalanche"
    )

    timeline_results.append({
        "scenario": scenario,
        "monthly_budget": round(budget, 2),
        "months_to_payoff": int(sim["month"].max()),
        "total_interest_paid": round(sim["total_interest_paid_to_date"].iloc[-1], 2)
    })

timeline_impact = pd.DataFrame(timeline_results)
timeline_impact


Unnamed: 0,scenario,monthly_budget,months_to_payoff,total_interest_paid
0,baseline,800.0,82,17996.47
1,income_drop_20pct,600.0,143,37868.9
2,expense_spike_30pct,560.0,179,52332.46


In [32]:
import os
os.makedirs("outputs", exist_ok=True)

stress_test_summary.to_csv("outputs/subtask_e_stress_test_summary.csv", index=False)
timeline_impact.to_csv("outputs/subtask_e_timeline_impact.csv", index=False)
