In [2]:
import polars as pl
import altair as alt
from datetime import datetime


# Notebook-wide display and plotting configuration.
pl.Config.set_fmt_str_lengths(80)  # widen string columns in printed frames
alt.theme.enable('ggplot2')

# Ten-colour palette, handed to alt.Scale(range=...) for the category lines.
jco_colors = [
    "#0073C2", "#EFC000", "#868686", "#CD534C", "#7AA6DC",  # blue, yellow, gray, red, light blue
    "#003C67", "#8F7700", "#3B3B3B", "#A73030", "#4A6990",  # dark blue, dark yellow, dark gray, dark red, slate blue
]

# Shared mark colour and opacity for the charts below.
base_color, base_opacity = '#f2c45f', 0.6

# Statement export to analyse, and the reporting window [start_date, end_date).
file = '~/repos/code/eda/acct/stmt20240413to20251013.parquet'
start_date = datetime(2024, 5, 1)
end_date = datetime(2025, 10, 1)

# Number of calendar months between the two window boundaries.
year_span = end_date.year - start_date.year
month_span = end_date.month - start_date.month
elapsed_months = year_span * 12 + month_span

# Load the statement and keep rows dated on/after start_date and before end_date.
in_window = (pl.col('Date') >= start_date) & (pl.col('Date') < end_date)
df = pl.read_parquet(file).filter(in_window)

df.head(4)

Date,Description,Amount
date,str,f64
2024-05-01,"""PAYPAL DES:INST XFER ID:GOOGLE YOUTUBE INDN:ROBERT EDMONDS CO ID:PAYPALSI77 WEB""",-7.99
2024-05-02,"""NATIONAL ORGANIZATION F 04/30 PURCHASE XXX-XX88669 DC""",-10.0
2024-05-02,"""ACLU 04/30 PURCHASE XXX-XX92543 NY""",-20.0
2024-05-02,"""SQ *R&G OAKLAND - E18TH 05/01 PURCHASE Oakland CA""",-5.5


In [3]:
# Total spending per calendar month.
# Each transaction is bucketed by the first day of its month; the summed
# `Amount` is rounded to cents and negated so that money going out (stored as
# negative amounts) is reported as a positive `Expenses` figure.
month_start = pl.col('Date').dt.truncate(every='1mo').alias('Month')
monthly_expenses = (-pl.col('Amount').sum().round(2)).alias('Expenses')

df_monthly = (
    df.with_columns(month_start)
    .group_by('Month')
    .agg(monthly_expenses)
    .sort('Month')
)

display(df_monthly)

Month,Expenses
date,f64
2024-05-01,9306.65
2024-06-01,7419.68
2024-07-01,12988.34
2024-08-01,12262.87
2024-09-01,9311.64
…,…
2025-05-01,5869.16
2025-06-01,11572.24
2025-07-01,13126.75
2025-08-01,6670.83


In [None]:
# Bar chart of total expenses per month, one bar per row of `df_monthly`.
monthly_totals_title = alt.Title(
    'monthly expenses (bofa) — may 2024 thru sep 2025',
    subtitle='Expenses are generally lower as we come to the end of 2025.',
)

monthly_totals = (
    alt.Chart(df_monthly, title=monthly_totals_title, width=850, height=450)
    .mark_bar(
        color=base_color,
        stroke='black',
        opacity=base_opacity,
        width=25,
        align='left',
    )
    .encode(
        x=alt.X('Month', title=''),
        y=alt.Y('Expenses', title='', axis=alt.Axis(format='$,d')),
        tooltip='custom_tooltip:N',
    )
    # The tooltip text is the dollar sign followed by the month's total.
    .transform_calculate(custom_tooltip='"$" + datum.Expenses')
)

monthly_totals

In [18]:
# Description substrings that mark a transaction as a grocery purchase.
food_groceries_list = [
    "INSTACART", "BERKELEYB", "LUCKY", "GROCERY", "LAKESHORE PROD",
]

# Description substrings that mark a transaction as a restaurant / food
# delivery purchase.
#
# These are passed to `str.contains_any`, which matches each entry as a
# literal, case-sensitive substring (not a regular expression), so entries
# must be written exactly as they appear in the statement, with no regex
# escaping.
food_restaurants_list = [
    'SEAFOOD',
    'BURGER',
    'CUISINE',
    'UBER EATS',
    'UBER *EATS',  # literal asterisk; a regex-escaped '\*' would never match
    'POSTMATES',
    'HANG TEN',
    'LIMON',
    'PARCHE',
    'THE FAT LADY',
    'HARVEST TABLE',
    'PARADISO',
    'ALMOND AND OAK',
    'POMET',
    'YOSHIS',
    'SCOTTS',
    'BARDO',
    'MUA',
    'QUINNS',
    'GOOSE & GANDER',
    'RESTAUR',
    'LEMON DROP',
    'STEAKHOUSE',
    'CHEESE STEAK',
    'JAJI',
    'BURMA',
    'CESAR',
    'MIJORI',
    'LUCHINI',
    'SOBRE MESA',
    'DOOR DASH',
    'DOORDASH',
    'XOCHI THE DOG',
    'DONA',
    'BRENDA\'S',
    'OLE OLE',
    'TACOS',
    'CHAMPA',
    'UCCELLO',
    'PORTAL',
    'YOSHI\'S',
    'THAI',
    'TAQUERIA',
    'BURDELL',
    'Grill',
    'BBQ',
    'RAMEN',
    'AHNS',
    'MITAMA',
    'LYNN & LU',
    'NIDO\'S',
    'ROCKIN CRAWFISH',
    'ROSE LA MOON',
    'VIKS CHAAT',
    'STAR ON GRAND',
    'FENTONS',
    'SEABREEZE ON THE DO',
    'PROPOSITION CHICKEN',
    'SAND BAR',
    'SHAKA SHACK',
    'IN-N-OUT',
    'BROTHER\'S DELI',
    'CANCUN SABOR',
    'COMAL NEXT DOOR',
    'Cracker Barrel',
]

# Description substrings that mark a transaction as travel spending.
travel_list = [
    "EXPEDIA", "UNITED.COM", "DELTA", "HOTEL", "SPIRIT AIR",
    "MGM GRAND", "LAS VEGAS NV", "AUSTIN TX", "JOHNNY ROCKETS",
]

# Description substrings that mark a transaction as car-related transportation
# spending (the `trans_auto` category).
trans_auto_list = [
    "UNITED FIN", "PROG DIRECT INS", "WEATHERFORD", "TIRE RACK", "DMV",
    "Bridgecrest", "DONS TIRE", "AAA", "PARKING", "PARK METER",
    "CITY OF ALAMEDA", "ARAMCO", "CHEVRON", "SHELL OIL", "SHELL SERVICE",
    "ARCO", "PETROLEUM", "BROADWAY CARWA", "Check 607", "CAR WASH",
]

# Build the categorized expense table.
#
# 1. Negate `Amount` (purchases are stored as negative values) and rename the
#    column to `Expense`.
# 2. Label every row with the FIRST rule in `category_rules` whose predicate
#    matches; rows that match no rule get a null `Category`.
# 3. Keep only rows with a positive `Expense`.
#
# Matching is case-sensitive. `str.contains` patterns are regular expressions,
# so literal dots are written as `\.`; `str.contains_any` matches the entries
# of the pattern lists as plain substrings.
description = pl.col('Description')
expense = pl.col('Expense')

# Ordered (label, predicate) pairs -- order matters because the first match wins.
category_rules = [
    # food
    ('food_groceries', description.str.contains_any(food_groceries_list)),
    ('food_restaurants', description.str.contains_any(food_restaurants_list)),
    ('food_other', description.str.contains('FACTOR')),
    # rent: a check whose amount is within half a cent of 1942.95 or 1988.00
    (
        'rent',
        description.str.contains('Check')
        & (
            expense.is_between(1942.945, 1942.955)
            | expense.is_between(1987.995, 1988.005)
        ),
    ),
    # travel
    ('travel', description.str.contains_any(travel_list)),
    # photography
    ('photography', description.str.contains(
        r'ROYAL WE|B H PHOTO|FREESTYLE|CINESTILL|EPSON|LOOKING GLASS|MATBOARD|KEH|FINEART|VISTAPRINT|DIGITAL SILVER|SQUAREHOOD'
    )),
    # health & fitness
    ('health&fitness', description.str.contains(
        r'RAPHA|TYR|SILCA|HIMS|24 HOUR FITNESS|24HourFitness|24 Hour Fitness|ZENNI|AMAIN\.COM|USASSOSCOM|SIROKO|Performance Bic|MILLICAN|ULTRADYNAMICO|GOD AND FAMOUS|COCOFLOSS|ALMSTHRE|PEAKDESIGN'
    )),
    # home
    ('home', description.str.contains(r'IKEA|HOME DEPOT|TARGET|CRIMSON HORT')),
    # clothes
    ('clothes', description.str.contains(
        r"DUER|VUORI|WILDLING|RHOBACK|SBD|LULULEMON|EARTHINGSAN|OPTICPLANET|MEUNDIES|KOHL'S"
    )),
    # spa
    ('spa', description.str.contains(
        r'Woodhouse Spas|PIEDMONT SPRINGS|PIEDMONT FAMILY SPA|Osmosis Day|NAILS'
    )),
    # utilities (recurring bills and subscriptions)
    ('utilities', description.str.contains(
        r'COMCAST|TMOBILE|GODADDY|DROPBOX|APPLE\.COM|GITHUB|ZOOM|TODOIST|Amazon web services|LASTPASS\.COM|Evernote|SQUARESPACE|SQSP'
    )),
    # transportation
    ('trans_rideshare', description.str.contains(r'UBER INDN|UBER TRIP')),
    ('trans_auto', description.str.contains_any(trans_auto_list)),
    # education
    ('education', description.str.contains(
        r'Google One|GOOGLE \*ONE|Audible|LEONARDO|EXERCISM|NYTIMES|NYTimes|WASHPOST|COURSRA|ABEBOOKS|Kindle|OPENAI|ADAFRUIT|Adafruit|RBFTP|BUSUU|MEDIUM\.COM|WALDEN POND'
    )),
    # entertainment
    ('entertainment', description.str.contains(
        r'YOUTUBE|YouTubePremium|YouTube|Netflix|NETFLIX|MELO MELO|EAZE|ONLYFANS|CAFE|COFFEE SHOP|DREXL|VTSUP\.COM|LOW BAR|ORIGINAL PATTERN|BLUE BOTTLE|PENNYWEIGHT'
    )),
    # amazon ('Amazon web services' is already claimed by utilities above)
    ('amazon', description.str.contains(r'AMAZON|AMZN|Amazon Prime')),
    # ebay
    ('ebay', description.str.contains('EBAY')),
    # career
    ('career', description.str.contains('LINKEDIN')),
    # holiday and gifts
    ('gifts&holiday', description.str.contains(
        r'SEPHORA|BRENTS CHRIST|Kate Spade|MACYS \.COM 12/03 PURCHASE|BOUQS|THE SAINT|U-HAUL|SPIRIT HALLOWEEN'
    )),
    # donations
    ('donations', description.str.contains(
        r'KQED|ACLU|NATIONAL MERIT|NATIONAL ORGANIZATION|PINOLE VALLEY HIGH|ACTBLUE'
    )),
    # bank
    ('bank_creditcard', description.str.contains(
        r'CREDIT CARD|Credit Card|CREDITCARD|CRD 8015'
    )),
    ('bank_cash', description.str.contains('WITHDRWL')),
    ('bank_transfers', description.str.contains('Zelle')),
]

# Fold the rule table into a single when/then/.../otherwise expression.
(first_label, first_predicate), *remaining_rules = category_rules
category_expr = pl.when(first_predicate).then(pl.lit(first_label))
for rule_label, rule_predicate in remaining_rules:
    category_expr = category_expr.when(rule_predicate).then(pl.lit(rule_label))
category_expr = category_expr.otherwise(pl.lit(None)).alias('Category')

df_categorized = (
    df.with_columns(-pl.col('Amount'))
    .rename({'Amount': 'Expense'})
    .with_columns(category_expr)
    # Drop credits / zero rows: only money going out counts as an expense.
    .filter(expense > 0.0)
)

display(df_categorized.head())

Date,Description,Expense,Category
date,str,f64,str
2024-05-01,"""PAYPAL DES:INST XFER ID:GOOGLE YOUTUBE INDN:ROBERT EDMONDS CO ID:PAYPALSI77 WEB""",7.99,"""entertainment"""
2024-05-02,"""NATIONAL ORGANIZATION F 04/30 PURCHASE XXX-XX88669 DC""",10.0,"""donations"""
2024-05-02,"""ACLU 04/30 PURCHASE XXX-XX92543 NY""",20.0,"""donations"""
2024-05-02,"""SQ *R&G OAKLAND - E18TH 05/01 PURCHASE Oakland CA""",5.5,
2024-05-03,"""FACTOR75 05/02 PURCHASE 188-857-3572 IL""",99.8,"""food_other"""


In [19]:
# Average monthly spend per category: each category's total divided by the
# number of months in the reporting window, rounded to cents.
avg_expense_by_category = df_categorized.group_by('Category').agg(
    (pl.col('Expense').sum() / elapsed_months).round(2).alias('Expenses')
)

# Horizontal bar chart, largest average at the top.
monthly_average_by_category = (
    alt.Chart(
        avg_expense_by_category,
        title=alt.Title(
            'monthly average expenses (bofa) by category — may 2024 thru sep 2025',
            subtitle='The most significant opportunities for savings are in restaurant food expenses vs groceries.'
        ),
        width=750,
        height=550,
    )
    .mark_bar(
        color=base_color,
        opacity=base_opacity,
        stroke='black',
        height=18,
    )
    .encode(
        x=alt.X('Expenses', title='', axis=alt.Axis(format='$,d')),
        y=alt.Y('Category', title='', sort=alt.SortField('Expenses', order='descending')),
        tooltip='custom_tooltip:N',
    )
    # The tooltip text is the dollar sign followed by the category's average.
    .transform_calculate(custom_tooltip='"$" + datum.Expenses')
)

monthly_average_by_category

In [8]:
# How much spending did the category rules fail to label?
# `total` is the sum of every expense, categorized or not.
total = df_categorized['Expense'].sum()

# Count, dollar total and share of all spending for rows with a null Category.
uncategorized_summary = df_categorized.filter(
    pl.col('Category').is_null()
).select(
    pl.col('Expense').len().alias('num_of_uncategorized_expenses'),
    pl.col('Expense').sum().alias('total_uncategorized_expenses'),
    (pl.col('Expense').sum() / total * 100).round(3).alias('%_uncategorized'),
)

display(uncategorized_summary)

num_of_uncategorized_expenses,total_uncategorized_expenses,%_uncatorgorized
u32,f64,f64
148,2844.39,1.515


In [9]:
# Show the first two rows of the categorized frame.
display(df_categorized.head(2))

# Categories plotted in the per-category monthly chart below.
categories = [
    "health&fitness", "photography", "amazon", "rent",
    "trans_auto", "food_groceries", "food_restaurants",
]

# Clicking a legend entry selects that category; the selection drives the
# line opacity encoding below.
selection = alt.selection_point(fields=['Category'], bind='legend')

# Monthly expense total for each selected category.
# `group_by_dynamic` requires the index column to be sorted, so sort
# explicitly rather than relying on the order of the source file.
df_monthly_by_category = (
    df_categorized
    .filter(pl.col('Category').is_in(categories))
    .sort('Date')
    .group_by_dynamic(index_column='Date', every='1mo', group_by='Category')
    .agg(pl.col('Expense').sum().round(2))
)

# One smoothed line per category, coloured from the `jco_colors` palette.
monthly_totals_by_category = alt.Chart(
    df_monthly_by_category,
    title=alt.Title(
        'monthly expenses (bofa) by selected category — may 2024 thru sep 2025',
        subtitle=[
            'Monthly costs are lower, and further opportunity exists in restaurant food expenditures vs groceries.',
            '(Click inside the legend to isolate an individual category. Click anywhere else to show all.)',
        ]
    ),
    width=950,
    height=650,
).mark_line(
    size=4.5,
    opacity=base_opacity,
    interpolate='catmull-rom',
    point=True,
).encode(
    x=alt.X('Date', title=''),
    y=alt.Y('Expense', title='', axis=alt.Axis(format="$,d")),
    color=alt.Color(
        'Category:N',
        title='(click to isolate)',
        scale=alt.Scale(range=jco_colors),
        legend=alt.Legend(symbolType='stroke')
    ),
    tooltip='custom_tooltip:N',
    # Selected category is fully opaque; the others fade into the background.
    opacity=alt.when(selection).then(alt.value(1)).otherwise(alt.value(0.1)),
).add_params(
    selection
).transform_calculate(
    # The tooltip reads e.g. "$123.45 (rent)".
    custom_tooltip='"$" + datum.Expense + " (" + datum.Category + ")"'
)

# Month whose totals anchor the category labels. It is used both to select the
# rows being summed and as the x position of every label, so it is defined once.
label_month = datetime(2025, 9, 1)

# One text label per selected category, placed just right of the point at
# (label_month, that category's total expense from label_month onward).
text_overlay = alt.Chart(
    df_categorized.filter(
        pl.col('Category').is_in(categories),
        pl.col('Date') >= label_month,
    ).group_by('Category').agg(
        pl.lit(label_month).alias('Date'),
        pl.col('Expense').sum(),
    ),
).mark_text(
    baseline='middle',
    align='left',
    dx=8,
    size=13,
    fontWeight='bold',
).encode(
    x='Date',
    y='Expense',
    text='Category',
    color='Category',
)

# Layer the labels on top of the line chart.
monthly_totals_by_category + text_overlay

Date,Description,Expense,Category
date,str,f64,str
2024-05-01,"""PAYPAL DES:INST XFER ID:GOOGLE YOUTUBE INDN:ROBERT EDMONDS CO ID:PAYPALSI77 WEB""",7.99,"""entertainment"""
2024-05-02,"""NATIONAL ORGANIZATION F 04/30 PURCHASE XXX-XX88669 DC""",10.0,"""donations"""
