#Chaptor 3  Window Function


Load SQL Module and Database

In [2]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# If running in Google Colab, install PostgreSQL and restore the database
if 'google.colab' in sys.modules:
    # Update package installer
    !sudo apt-get update -qq > /dev/null 2>&1

    # Install PostgreSQL
    !sudo apt-get install postgresql -qq > /dev/null 2>&1

    # Start PostgreSQL service (suppress output)
    !sudo service postgresql start > /dev/null 2>&1

    # Set password for the 'postgres' user to avoid authentication errors (suppress output)
    !sudo -u postgres psql -c "ALTER USER postgres WITH PASSWORD 'password';" > /dev/null 2>&1

    # Create the 'colab_db' database (suppress output)
    !sudo -u postgres psql -c "CREATE DATABASE contoso_100k;" > /dev/null 2>&1

    # Download the PostgreSQL .sql dump
    !wget -q -O contoso_100k.sql https://github.com/lukebarousse/Int_SQL_Data_Analytics_Course/releases/download/v.0.0.0/contoso_100k.sql

    # Restore the dump file into the PostgreSQL database (suppress output)
    !sudo -u postgres psql contoso_100k < contoso_100k.sql > /dev/null 2>&1

    # Shift libraries from ipython-sql to jupysql
    !pip uninstall -y ipython-sql > /dev/null 2>&1
    !pip install jupysql > /dev/null 2>&1

# Load the sql extension for SQL magic
%load_ext sql

# Connect to the PostgreSQL database
%sql postgresql://postgres:password@localhost:5432/contoso_100k

# Enable automatic conversion of SQL results to pandas DataFrames
%config SqlMagic.autopandas = True

# Disable named parameters for SQL magic
%config SqlMagic.named_parameters = "disabled"

# Display pandas number to two decimal places
pd.options.display.float_format = '{:.2f}'.format

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


## 3.1 Find revenue for each order line item and their proportion of daily sales

In [None]:
%%sql

SELECT
    *,
    100 * net_revenue / daily_net_revenue AS pct_daily_revenue
FROM
    (
    SELECT
        orderdate,
        orderkey * 10 + linenumber AS order_line_item,
        (quantity * netprice * exchangerate) AS net_revenue,
        SUM(quantity * netprice * exchangerate) OVER(PARTITION BY orderdate) AS daily_net_revenue
    FROM
        sales
    ) AS revenue_by_day
ORDER BY
    orderdate,
    pct_daily_revenue DESC


Unnamed: 0,orderdate,order_line_item,net_revenue,daily_net_revenue,pct_daily_revenue
0,2015-01-01,10043,2395.10,11640.80,20.58
1,2015-01-01,10061,1552.32,11640.80,13.34
2,2015-01-01,10022,1302.91,11640.80,11.19
3,2015-01-01,10020,1146.75,11640.80,9.85
4,2015-01-01,10050,975.16,11640.80,8.38
...,...,...,...,...,...
199868,2024-04-20,33980141,12.00,96879.43,0.01
199869,2024-04-20,33980074,9.29,96879.43,0.01
199870,2024-04-20,33980080,8.35,96879.43,0.01
199871,2024-04-20,33980142,8.34,96879.43,0.01


##3.2 Find yearly revenue and customer numbers from customers of each cohort year.

In [72]:
%%sql

WITH yearly_cohort AS (
    SELECT DISTINCT
        customerkey,
        EXTRACT(YEAR FROM MIN(orderdate) OVER (PARTITION BY customerkey)) AS cohort_year
    FROM sales
)
SELECT
    y.cohort_year,
    EXTRACT(YEAR FROM orderdate) AS purchase_year,
    SUM(s.quantity * s.netprice * s.exchangerate) AS net_revenue,
    COUNT(DISTINCT s.customerkey) AS num_customers
FROM
    sales s
LEFT JOIN
    yearly_cohort y
ON  s.customerkey = y.customerkey
GROUP BY
    y.cohort_year,
    purchase_year
ORDER BY
    y.cohort_year,
    purchase_year


Unnamed: 0,cohort_year,purchase_year,net_revenue,num_customers
0,2015,2015,7370979.48,2825
1,2015,2016,392623.48,126
2,2015,2017,479841.31,149
3,2015,2018,1069850.87,348
4,2015,2019,1235991.48,388
5,2015,2020,386489.6,171
6,2015,2021,872845.99,295
7,2015,2022,1569787.72,600
8,2015,2023,1157633.91,499
9,2015,2024,356186.62,146


In [74]:
%%sql

    SELECT DISTINCT
        customerkey,
        EXTRACT(YEAR FROM MIN(orderdate) OVER (PARTITION BY customerkey)) AS cohort_year
    FROM sales


Unnamed: 0,customerkey,cohort_year
0,1884827,2022
1,883756,2023
2,1309988,2018
3,848767,2019
4,1955010,2021
...,...,...
49482,2045997,2019
49483,2060016,2023
49484,2071081,2017
49485,1984329,2023


##3.3 Find average total spending per customer of each cohort year before 2020

In [5]:
%%sql

WITH customer_total_spending AS
(
  SELECT
    customerkey,
    SUM(quantity * netprice * exchangerate) AS total_spending,
    EXTRACT(YEAR FROM MIN(orderdate)) AS cohort_year
  FROM
    sales
  GROUP BY
    customerkey
)
, customer_avg_spending AS
(
SELECT
  *,
  AVG(total_spending) OVER(PARTITION BY cohort_year) AS avg_spending
FROM
  customer_total_spending
)
SELECT DISTINCT
  cohort_year,
  avg_spending
FROM
  customer_avg_spending
WHERE
  cohort_year <= 2020


Unnamed: 0,cohort_year,avg_spending
0,2015,5271.59
1,2016,5404.92
2,2017,5403.08
3,2018,4896.64
4,2019,4731.95
5,2020,3933.32


Note: Filter after the window function to maintain cohort data integrity.

##3.4 Get the running order count and running total/average sales from each customer

In [113]:
%%sql

SELECT
  customerkey,
  productkey,
  orderdate,
  COUNT(orderkey) OVER(PARTITION BY customerkey ORDER BY orderdate, productkey) AS running_order_count,
  quantity * netprice * exchangerate AS sales,
  SUM(quantity * netprice * exchangerate) OVER(PARTITION BY customerkey ORDER BY orderdate, productkey) AS running_total_sales,
  AVG(quantity * netprice * exchangerate) OVER(PARTITION BY customerkey ORDER BY orderdate, productkey) AS running_avg_sales
FROM
  sales
ORDER BY
  orderdate,
  customerkey,
  running_order_count

Unnamed: 0,customerkey,productkey,orderdate,running_order_count,sales,running_total_sales,running_avg_sales
0,254117,128,2015-01-01,1,263.11,263.11,263.11
1,254117,450,2015-01-01,2,2395.10,2658.20,1329.10
2,254117,1633,2015-01-01,3,9.65,2667.85,889.28
3,254117,2079,2015-01-01,4,578.52,3246.38,811.59
4,947009,48,2015-01-01,1,63.49,63.49,63.49
...,...,...,...,...,...,...,...
199868,2004167,2499,2024-04-20,7,113.86,9335.43,1333.63
199869,2066845,441,2024-04-20,5,204.61,4124.33,824.87
199870,2066845,1057,2024-04-20,6,588.00,4712.33,785.39
199871,2076477,363,2024-04-20,8,699.00,7140.37,892.55


##3.5 Assign row number to everyday order line items

In [122]:
%%sql

SELECT
  ROW_NUMBER() OVER(PARTITION BY orderdate ORDER BY customerkey, orderkey, linenumber) AS row_num,
  orderdate,
  customerkey,
  orderkey,
  linenumber
FROM
  sales
ORDER BY
  orderdate,
  row_num

Unnamed: 0,row_num,orderdate,customerkey,orderkey,linenumber
0,1,2015-01-01,254117,1004,0
1,2,2015-01-01,254117,1004,1
2,3,2015-01-01,254117,1004,2
3,4,2015-01-01,254117,1004,3
4,5,2015-01-01,947009,1000,0
...,...,...,...,...,...
199868,93,2024-04-20,2004167,3398030,6
199869,94,2024-04-20,2066845,3398021,0
199870,95,2024-04-20,2066845,3398021,1
199871,96,2024-04-20,2076477,3398009,0


In [None]:
##3.6 Rank the customers by total sales and order number

In [123]:
%%sql

SELECT
  customerkey,
  COUNT(orderkey) AS order_count,
  SUM(netprice*quantity*exchangerate) AS total_sales,
  RANK() OVER(ORDER BY COUNT(orderkey) DESC) AS rank_order,
  RANK() OVER(ORDER BY SUM(netprice*quantity*exchangerate) DESC) AS rank_sales
FROM
  sales
GROUP BY
  customerkey
ORDER BY
  rank_sales,
  rank_order DESC;

Unnamed: 0,customerkey,order_count,total_sales,rank_order,rank_sales
0,72844,8,82057.67,4104,1
1,399184,21,79201.82,16,2
2,1743963,22,65431.98,11,3
3,1232832,15,62460.01,287,4
4,326979,16,61349.65,170,5
...,...,...,...,...,...
49482,1064373,1,3.27,39985,49483
49483,1035383,1,3.14,39985,49484
49484,881751,1,2.98,39985,49485
49485,447646,1,2.55,39985,49486


##3.7 Find YoY growth of sales of contoso

In [6]:
%%sql

SELECT
  order_year,
  total_sales,
  LAG(total_sales) OVER(ORDER BY order_year) AS prev_year_sales,
  (total_sales - LAG(total_sales) OVER(ORDER BY order_year)) / LAG(total_sales) OVER(ORDER BY order_year) * 100 AS yoy_growth
FROM
(
  SELECT
    EXTRACT(YEAR FROM orderdate) AS order_year,
    SUM(netprice*quantity*exchangerate) AS total_sales
  FROM
    sales
  GROUP BY
      order_year
  ORDER BY
      order_year
) AS sales_by_year
ORDER BY
  order_year

Unnamed: 0,order_year,total_sales,prev_year_sales,yoy_growth
0,2015,7370979.48,,
1,2016,10383613.67,7370979.48,40.87
2,2017,13221339.05,10383613.67,27.33
3,2018,24667447.84,13221339.05,86.57
4,2019,31818095.97,24667447.84,28.99
5,2020,11218435.79,31818095.97,-64.74
6,2021,21357976.66,11218435.79,90.38
7,2022,44864557.21,21357976.66,110.06
8,2023,33108565.51,44864557.21,-26.2
9,2024,8396527.38,33108565.51,-74.64


In [7]:
##3.8 Find seasonal sales development of every year of Contoso.

In [10]:
%%sql

SELECT
  order_year,
  order_month,
  total_sales,
  (total_sales - LAG(total_sales) OVER(ORDER BY order_year, order_month)) / LAG(total_sales) OVER(ORDER BY order_year, order_month) * 100
  AS monthly_growth
FROM
(
  SELECT
    EXTRACT (YEAR FROM orderdate) AS order_year,
    EXTRACT (MONTH FROM orderdate) AS order_month,
    SUM(netprice*quantity*exchangerate) AS total_sales
  FROM
    sales
  GROUP BY
    order_year,
    order_month
  ORDER BY
    order_year,
    order_month
) AS sales_by_month
ORDER BY
  order_year,
  order_month


Unnamed: 0,order_year,order_month,total_sales,monthly_growth
0,2015,1,384092.66,
1,2015,2,706374.12,83.91
2,2015,3,332961.59,-52.86
3,2015,4,160767.00,-51.72
4,2015,5,548632.63,241.26
...,...,...,...,...
107,2023,12,2928550.93,8.46
108,2024,1,2677498.55,-8.57
109,2024,2,3542322.55,32.30
110,2024,3,1692854.89,-52.21
