In [1]:
import polars as pl
import duckdb

## load data

In [2]:
con = duckdb.connect('./case-2.duckdb')

In [None]:
con.sql("""
--sqlbegin

select distinct table_name
from information_schema.columns
where table_schema = 'pizza_runner'

--sqlend
""")

In [15]:
runners = con.sql("select * from pizza_runner.runners").pl()
pizza_names = con.sql("select * from pizza_runner.pizza_names").pl()
pizza_toppings = con.sql("select * from pizza_runner.pizza_toppings").pl()
runner_orders = con.sql("select * from pizza_runner.runner_orders").pl()
customer_orders = con.sql("select * from pizza_runner.customer_orders").pl()
pizza_recipes = con.sql("select * from pizza_runner.pizza_recipes").pl()

## clean data

In [44]:
clean_customer_orders = customer_orders.with_columns(
    pl.when(pl.col('exclusions').is_in(['', 'null']))
      .then(pl.lit(None))
      .otherwise(pl.col('exclusions'))
      .alias('exclusions'),
    pl.when(pl.col('extras').is_in(['', 'null']))
      .then(pl.lit(None))
      .otherwise(pl.col('extras'))
      .alias('extras'),
)

#display(clean_customer_orders)

In [52]:
clean_runner_orders = runner_orders.with_columns(
    pl.when(pl.col('pickup_time').is_in(['', 'null']))
      .then(pl.lit(None))
      .otherwise(pl.col('pickup_time'))
      .str.to_datetime(format='%Y-%m-%d %H:%M:%S')
      .alias('pickup_time'),
    pl.when(pl.col('distance').is_in(['', 'null']))
      .then(pl.lit(None))
      .otherwise(pl.col('distance'))
      .str.replace(r"[^0-9\\.]*$", "")
      .cast(pl.Float64)
      .alias('distance'),
    pl.when(pl.col('duration').is_in(['', 'null']))
      .then(pl.lit(None))
      .otherwise(pl.col('duration'))
      .str.replace(r"[^0-9\\.]*$", "")
      .cast(pl.Int64)
      .alias('duration'),
    pl.when(pl.col('cancellation').is_in(['', 'null']))
      .then(pl.lit(None))
      .otherwise(pl.col('cancellation'))
      .alias('cancellation'),
)

#display(clean_runner_orders)

## analysis

## 1. pizzas ordered

In [59]:
clean_customer_orders.select(
    pl.col('pizza_id').len()
                      .alias('pizzas_ordered')
)

pizzas_ordered
u32
14


## 2. number of unique customer orders

In [61]:
clean_customer_orders.select(
    pl.col('customer_id').n_unique()
                         .alias('customer_orders')
)

customer_orders
u32
5


## 3. number of successful order deliveries by each runner

In [66]:
(clean_runner_orders
    .filter(pl.col('cancellation').is_null())
    .group_by('runner_id')
    .agg(
        pl.col('duration').len().alias('orders_delivered')
    )
    .sort('runner_id')
)

runner_id,orders_delivered
i32,u32
1,4
2,3
3,1
