In [2]:
import polars as pl
import duckdb

## load data

In [4]:
con = duckdb.connect('./case-2.duckdb')

runners = con.sql("select * from pizza_runner.runners").pl()
pizza_names = con.sql("select * from pizza_runner.pizza_names").pl()
pizza_toppings = con.sql("select * from pizza_runner.pizza_toppings").pl()
runner_orders = con.sql("select * from pizza_runner.runner_orders").pl()
customer_orders = con.sql("select * from pizza_runner.customer_orders").pl()
pizza_recipes = con.sql("select * from pizza_runner.pizza_recipes").pl()

con.close()

## clean data

In [6]:
clean_customer_orders = customer_orders.with_columns(
    pl.when(pl.col('exclusions').is_in(['', 'null']))
      .then(pl.lit(None))
      .otherwise(pl.col('exclusions'))
      .alias('exclusions'),
    pl.when(pl.col('extras').is_in(['', 'null']))
      .then(pl.lit(None))
      .otherwise(pl.col('extras'))
      .alias('extras'),
)

#display(clean_customer_orders)

In [7]:
clean_runner_orders = runner_orders.with_columns(
    pl.when(pl.col('pickup_time').is_in(['', 'null']))
      .then(pl.lit(None))
      .otherwise(pl.col('pickup_time'))
      .str.to_datetime(format='%Y-%m-%d %H:%M:%S')
      .alias('pickup_time'),
    pl.when(pl.col('distance').is_in(['', 'null']))
      .then(pl.lit(None))
      .otherwise(pl.col('distance'))
      .str.replace(r"[^0-9\\.]*$", "")
      .cast(pl.Float64)
      .alias('distance'),
    pl.when(pl.col('duration').is_in(['', 'null']))
      .then(pl.lit(None))
      .otherwise(pl.col('duration'))
      .str.replace(r"[^0-9\\.]*$", "")
      .cast(pl.Int64)
      .alias('duration'),
    pl.when(pl.col('cancellation').is_in(['', 'null']))
      .then(pl.lit(None))
      .otherwise(pl.col('cancellation'))
      .alias('cancellation'),
)

#display(clean_runner_orders)

## analysis

## A.1. pizzas ordered

In [59]:
clean_customer_orders.select(
    pl.col('pizza_id').len()
                      .alias('pizzas_ordered')
)

pizzas_ordered
u32
14


## A.2. number of unique customer orders

In [61]:
clean_customer_orders.select(
    pl.col('customer_id').n_unique()
                         .alias('customer_orders')
)

customer_orders
u32
5


## A.3. number of successful order deliveries by each runner

In [66]:
(clean_runner_orders
    .filter(pl.col('cancellation').is_null())
    .group_by('runner_id')
    .agg(
        pl.col('duration').len().alias('orders_delivered')
    )
    .sort('runner_id')
)

runner_id,orders_delivered
i32,u32
1,4
2,3
3,1


## A.4. deliveries by pizza type

In [16]:
(clean_runner_orders
    .join(clean_customer_orders, how='left', on='order_id')
    .filter(pl.col('cancellation').is_null())
    .group_by('pizza_id')
    .agg(pl.len().alias('deliveries'))
    .sort('pizza_id')
)

pizza_id,deliveries
i32,u32
1,9
2,3


## A.5. pizza types ordered by each customer

In [11]:
(clean_customer_orders
    .group_by('customer_id', 'pizza_id')
    .agg(pl.len().alias('orders'))
    .join(pizza_names, how='left', on='pizza_id')
    .select('customer_id', 'pizza_name', 'orders')
    .sort('customer_id', 'pizza_name')
)

customer_id,pizza_name,orders
i32,str,u32
101,"""Meatlovers""",2
101,"""Vegetarian""",1
102,"""Meatlovers""",2
102,"""Vegetarian""",1
103,"""Meatlovers""",3
103,"""Vegetarian""",1
104,"""Meatlovers""",3
105,"""Vegetarian""",1


## A.6. most pizzas delivered on a single order

In [18]:
(clean_runner_orders
    .filter(pl.col('cancellation').is_null())
    .join(clean_customer_orders, how='left', on='order_id')
    .group_by('order_id')
    .agg(pl.len().alias('pizzas_delivered'))
    .filter(pl.col('pizzas_delivered') == pl.col('pizzas_delivered').max())
)

order_id,pizzas_delivered
i32,u32
4,3


## A.7. changed vs unchanged pizzas delivered per customer

In [26]:
(clean_runner_orders
    .filter(pl.col('cancellation').is_null())
    .join(clean_customer_orders, how='left', on='order_id')
    .group_by('customer_id')
    .agg(
        pl.when(pl.col('exclusions').is_null() & pl.col('extras').is_null())
          .then(pl.lit(1))
          .otherwise(pl.lit(0))
          .sum()
          .alias('unchanged'),
        pl.when(pl.col('exclusions').is_not_null() | pl.col('extras').is_not_null())
          .then(pl.lit(1))
          .otherwise(pl.lit(0))
          .sum()
          .alias('changed'),
    )
    .sort('customer_id')
)

customer_id,unchanged,changed
i32,i32,i32
101,2,0
102,3,0
103,0,3
104,1,2
105,0,1


## A.8. delivered with both exclusions and extras

In [35]:
(clean_runner_orders
    .join(clean_customer_orders, how='left', on='order_id')
    .filter(pl.col('cancellation').is_null() & pl.col('exclusions').is_not_null() & pl.col('extras').is_not_null())
    .select(pl.len().alias('pizzas_w_exclusions_and_extras'))
)

pizzas_w_exclusions_and_extras
u32
1


## A.9. pizzas orders by hour of the day