In [8]:
import polars as pl
import altair as alt
import calendar

alt.theme.enable('latimes')

file = '/home/dedmonds/repos/code/eda/amazon-orders-20251008/Retail.OrderHistory.1/Retail.OrderHistory.1.csv'

schema_changes = {
    'Total Owed': pl.Utf8,
    'Unit Price': pl.Utf8,
    'Order Date': pl.Datetime,
}

df = pl.read_csv(
    file,
    has_header=True,
    schema_overrides=schema_changes,
)

#display(df.head(4))

null_strings = ['Not Available', 'Not Applicable']

df = df.with_columns(
        pl.col('Unit Price').str.replace_all(',', '').cast(pl.Float64),
        pl.col('Total Owed').str.replace_all(',', '').cast(pl.Float64),
        pl.col('Total Discounts').str.replace_all("'", "").cast(pl.Float64),
        pl.col('Purchase Order Number').replace(null_strings, None),
        pl.col('Shipment Item Subtotal').replace(null_strings, None).str.replace_all(',', '').cast(pl.Float64),
        pl.col('Shipment Item Subtotal Tax').replace(null_strings, None).str.replace_all(',', '').cast(pl.Float64),
        pl.col('Ship Date').replace(null_strings, None).str.head(10).cast(pl.Date),
        pl.col('Order Date').cast(pl.Date),
)

display(df.head(4))

Website,Order ID,Order Date,Purchase Order Number,Currency,Unit Price,Unit Price Tax,Shipping Charge,Total Discounts,Total Owed,Shipment Item Subtotal,Shipment Item Subtotal Tax,ASIN,Product Condition,Quantity,Payment Instrument Type,Order Status,Shipment Status,Ship Date,Shipping Option,Shipping Address,Billing Address,Carrier Name & Tracking Number,Product Name,Gift Message,Gift Sender Name,Gift Recipient Contact Details,Item Serial Number
str,str,date,str,str,f64,f64,f64,f64,f64,f64,f64,str,str,i64,str,str,str,date,str,str,str,str,str,str,str,str,str
"""Amazon.com""","""114-0760626-7787434""",2025-10-06,,"""USD""",6.99,0.75,0.0,0.0,7.74,6.99,0.75,"""B00J5J9DAK""","""New""",1,"""Visa - 8166""","""Authorized""","""Paid""",2025-10-06,"""next-1dc""","""Robert Duane Edmonds 2208 CARR…","""Robert Duane Edmonds 2208 CARR…","""Not Available""","""Pepsodent Complete Care Antica…","""Not Available""","""Not Available""","""Not Available""","""Not Available"""
"""Amazon.com""","""114-5496154-2389867""",2025-10-06,,"""USD""",54.0,5.81,0.0,0.0,59.81,54.0,5.81,"""B0D911QH19""","""New""",1,"""Visa - 8166""","""Closed""","""Shipped""",2025-10-07,"""next-1dc""","""Robert Duane Edmonds 2208 CARR…","""Robert Duane Edmonds 2208 CARR…","""USPS(9341989704006510833705)""","""Effective Polars: Optimized Da…","""Not Available""","""Not Available""","""Not Available""","""Not Available"""
"""Amazon.com""","""114-0384001-8252269""",2025-10-03,,"""USD""",12.99,1.4,0.0,0.0,14.39,12.99,1.4,"""1976884128""","""New""",1,"""Visa - 8166""","""Closed""","""Shipped""",2025-10-05,"""next-1dc""","""Robert Duane Edmonds 2208 CARR…","""Robert Duane Edmonds 2208 CARR…","""USPS(9341989704006508028502)""","""THE WAY: A Modern Tao Te Ching""","""Not Available""","""Not Available""","""Not Available""","""Authenticity_2D=01097819768841…"
"""Amazon.com""","""114-6132900-0965005""",2025-10-02,,"""USD""",9.98,1.07,0.0,0.0,11.05,19.96,2.14,"""B0F3DBPYL9""","""New""",1,"""Visa - 8166""","""Closed""","""Shipped""",2025-10-02,"""next-1dc""","""Robert Duane Edmonds 2208 CARR…","""Robert Duane Edmonds 2208 CARR…","""AMZN_US(TBA324793113883)""","""M&SENSE Natural Soy Candle| Ba…","""Not Available""","""Not Available""","""Not Available""","""Not Available"""


In [3]:
(
    df
    .sort('Total Owed', descending=True)
    .head(10)
    .select(
        pl.col('Product Name'),
        pl.col('Total Owed').alias('Cost'),
    )
)

Product Name,Cost
str,f64
"""Philips HeartStart Home Defibr…",1285.15
"""Dell U-Series 38"" Screen LED-L…",1063.93
"""Polycom SoundStation IP 6000 w…",1055.64
"""Optoma HD39Darbee 1080p 3500 L…",976.5
"""Taylor GS Mini-e Koa""",749.0
"""Hamilton Khaki Pilot Pioneer H…",639.55
"""Lenovo X1 Carbon 14 Inch Busin…",545.16
"""Insta360 ONE R Twin Edition - …",529.19
"""Polycom SoundStation IP 6000 w…",484.23
"""Logitech G923 Racing Wheel and…",419.57


In [4]:
df.select(
    pl.col('Total Owed').len().alias('No. Purchases'),
    pl.col('Total Owed').sum().alias('Total Paid'),
)

No. Purchases,Total Paid
u32,f64
3464,102175.04


In [5]:
df_date_totals = (
    df
    .filter(pl.col('Order Status') != 'Cancelled')
    .group_by('Order Date')
    .agg(
        pl.col('Product Name').first(),
        pl.col('Website').first(),
        pl.col('Total Owed').sum(),
    )
    .with_columns(
        pl.col('Order Date').dt.month().alias('Order Month'),
        #pl.col('Order Date').dt.strftime('%b').alias('Order Month'),
    )
)

display(df_date_totals.head(4))

Order Date,Product Name,Website,Total Owed,Order Month
date,str,str,f64,i8
2017-06-08,"""NIKE Men's Dry DF Swoosh Heath…","""Amazon.com""",70.52,6
2022-06-01,"""Logitech MX Master 2S Wireless…","""Amazon.com""",64.15,6
2025-04-23,"""ThermoPro TP49 Digital Hygrome…","""Amazon.com""",33.5,4
2023-06-23,"""Greater Goods Food Scale for K…","""Amazon.com""",12.0,6


In [73]:
alt.Chart(
    df_date_totals.with_columns(
        pl.col('Order Date').dt.year().alias('Year'),
    ).group_by('Year').agg(
        pl.col('Total Owed').sum().round(2).alias('Total Spent')
    ),
    title='Order Totals by Year'
).mark_bar(
    color='#87ffaf',
    stroke='black',
    opacity=0.75,
).transform_calculate(
    custom_tooltip="'$' + datum['Total Spent']",
).encode(
    x=alt.X('Year:O', title=''),
    y=alt.Y('Total Spent', title='Total (USD)', axis=alt.Axis(format='$,.0f')),
    tooltip='custom_tooltip:N',
).properties(
    width=1200,
    height=500,
)

In [132]:
(
    df
    .group_by('Product Name')
    .agg(
        pl.col('Quantity').sum(),
        pl.col('Quantity').len().alias('No. Orders'),
        (pl.col('Total Owed').sum() / pl.col('Quantity').sum()).round(2).alias('Avg. Price'),
        pl.col('Total Owed').sum().alias('Total Paid'),
        pl.col('Order Date').first().alias('Date First Ordered'),
        pl.col('Order Date').last().alias('Date Last Ordered'),
    )
    .sort('Quantity', descending=True)
    .head(10)
)

Product Name,Quantity,No. Orders,Avg. Price,Total Paid,Date First Ordered,Date Last Ordered
str,i64,u32,f64,f64,date,date
"""thinkThin High Protein, Chunky…",28,7,16.98,475.49,2015-04-26,2015-02-09
"""Kodak Tri-X 400TX Professional…",20,1,5.42,108.4,2016-06-18,2016-06-18
"""The Boy, the Mole, the Fox and…",15,6,12.98,194.67,2022-12-22,2021-05-09
"""Perfect Bar, Peanut Butter Pro…",13,6,20.69,269.0,2025-06-15,2024-08-20
"""Chicago Comb Model 7 Carbon Fi…",13,9,13.45,174.8,2023-11-26,2021-02-10
"""Luna Sundara Handmade Palo San…",13,7,14.74,191.66,2025-08-04,2022-12-19
"""thinkThin Chunky Peanut Butter…",10,6,14.23,142.3,2015-02-04,2014-11-21
"""AmazonBasics High-Speed HDMI C…",10,1,5.98,59.8,2015-02-28,2015-02-28
"""Perfect Bar Peanut Butter Prot…",9,4,22.4,201.6,2024-06-17,2024-01-16
"""thinkThin High Protein Bars, C…",8,3,13.42,107.35,2016-10-04,2015-11-26


In [65]:
df_month_totals = (
    df_date_totals
    .group_by('Order Month')
    .agg(
        pl.col('Total Owed').sum().round(2).alias('Amount Paid'),
    )
    .sort('Amount Paid')
)

display(df_month_totals.head())

Order Month,Amount Paid
i8,f64
11,6680.37
3,6839.49
6,7217.82
9,7379.62
2,7701.13


In [None]:
alt.Chart(
    df_month_totals.with_columns(
        pl.col('Order Month').map_elements(lambda n: calendar.month_abbr[n]).alias('Month'),
        pl.col('Order Month').alias('month_number'),
    ),
    title='aggregated costs by month'
).mark_bar(
    color='#87ffaf',
    stroke='black',
    opacity=0.75,
).transform_calculate(
    custom_tooltip="'$' + datum['Amount Paid']",
).encode(
    x=alt.X('Month', title='', sort=alt.SortField(field='month_number', order='ascending')),
    y=alt.Y('Amount Paid', title='total paid (USD)', axis=alt.Axis(format='$,.0f')),
    tooltip='custom_tooltip:N',
).properties(
    width=800,
    height=500,
)