<a href="https://colab.research.google.com/github/sup25/data_visualization/blob/main/Resources/Blank_SQL_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a target="_blank" href="https://colab.research.google.com/github/lukebarousse/Int_SQL_Data_Analytics_Course/blob/main/Resources/Blank_SQL_Notebook.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# Blank SQL Notebook

#### Import Libraries & Database

In [1]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Colab-only bootstrap: when the google.colab module is loaded, install a local
# PostgreSQL server and restore the course database into it. Outside Colab this
# branch is skipped; the connection further down still targets localhost:5432.
# Every shell step redirects stdout and stderr to /dev/null, so a failing step
# produces no visible message here.
if 'google.colab' in sys.modules:
    # Install the PostgreSQL package via apt (quiet mode, output discarded)
    !sudo apt-get install postgresql -qq > /dev/null 2>&1

    # Start the PostgreSQL service (output discarded)
    !sudo service postgresql start > /dev/null 2>&1

    # Set the 'postgres' user's password to 'password'; the connection string
    # below uses the same credentials (output discarded)
    !sudo -u postgres psql -c "ALTER USER postgres WITH PASSWORD 'password';" > /dev/null 2>&1

    # Create the 'contoso_100k' database (output discarded)
    !sudo -u postgres psql -c "CREATE DATABASE contoso_100k;" > /dev/null 2>&1

    # Download the PostgreSQL .sql dump from the course's GitHub release
    !wget -q -O contoso_100k.sql https://github.com/lukebarousse/Int_SQL_Data_Analytics_Course/releases/download/v.0.0.0/contoso_100k.sql

    # Restore the dump file into the 'contoso_100k' database (output discarded)
    !sudo -u postgres psql contoso_100k < contoso_100k.sql > /dev/null 2>&1

    # Replace ipython-sql with jupysql: uninstall the former, install the latter
    !pip uninstall -y ipython-sql > /dev/null 2>&1
    !pip install jupysql > /dev/null 2>&1

# Load the sql extension that provides the %sql / %%sql magics
%load_ext sql

# Connect to the local PostgreSQL database 'contoso_100k' as user 'postgres'
%sql postgresql://postgres:password@localhost:5432/contoso_100k

# Enable automatic conversion of SQL results to pandas DataFrames
%config SqlMagic.autopandas = True

# Disable named parameters for SQL magic
# NOTE(review): presumably so that ':' sequences in queries (e.g. '::date' casts)
# are not treated as parameters - confirm against the JupySQL documentation
%config SqlMagic.named_parameters = "disabled"

# Display pandas floats with two decimal places
pd.options.display.float_format = '{:.2f}'.format

In [15]:
%%sql
-- Total revenue per product category and revenue tier.
-- Tier thresholds are the 25th and 75th percentiles of line revenue
-- (quantity * netprice * exchangerate) over orders dated 2022-01-01 to 2023-12-31.
-- NOTE(review) the outer query is not restricted to that date range, so every
-- sale is bucketed against the 2022-2023 thresholds; confirm this is intended.
WITH percentiles AS (
    SELECT
        PERCENTILE_CONT(0.25) WITHIN GROUP (
            ORDER BY (s.quantity * s.netprice * s.exchangerate)
        ) AS revenue_25th_percentile,
        PERCENTILE_CONT(0.75) WITHIN GROUP (
            ORDER BY (s.quantity * s.netprice * s.exchangerate)
        ) AS revenue_75th_percentile
    FROM sales AS s
    WHERE orderdate BETWEEN '2022-01-01' AND '2023-12-31'
)
SELECT
    p.categoryname AS category,
    CASE
        WHEN (s.quantity * s.netprice * s.exchangerate) <= bounds.revenue_25th_percentile THEN '3-LOW'
        WHEN (s.quantity * s.netprice * s.exchangerate) >= bounds.revenue_75th_percentile THEN '1-HIGH'
        ELSE '2-MEDIUM'
    END AS revenue_tier,
    SUM(s.quantity * s.netprice * s.exchangerate) AS total_revenue
FROM sales AS s
LEFT JOIN product AS p
    ON s.productkey = p.productkey
-- Single-row CTE, so the cross join only attaches the thresholds to each sale
CROSS JOIN percentiles AS bounds
GROUP BY
    p.categoryname,
    revenue_tier
ORDER BY
    p.categoryname,
    revenue_tier



Unnamed: 0,category,revenue_tier,total_revenue
0,Audio,1-HIGH,1213265.71
1,Audio,2-MEDIUM,3832415.38
2,Audio,3-LOW,267217.01
3,Cameras and camcorders,1-HIGH,15050781.63
4,Cameras and camcorders,2-MEDIUM,3388546.1
5,Cameras and camcorders,3-LOW,81032.92
6,Cell phones,1-HIGH,21874993.15
7,Cell phones,2-MEDIUM,10338963.22
8,Cell phones,3-LOW,410309.35
9,Computers,1-HIGH,79607760.89


In [23]:
%%sql
-- Monthly net revenue and number of distinct customers across all sales.
-- The explicit ORDER BY keeps the months in chronological order; without it
-- the database is free to return the grouped rows in any order.
SELECT
  DATE_TRUNC('month', orderdate)::date AS order_month,
  SUM(quantity * netprice * exchangerate) AS net_revenue,
  COUNT(DISTINCT customerkey) AS total_unique_customers
FROM sales
GROUP BY
  order_month
ORDER BY
  order_month


Unnamed: 0,order_month,net_revenue,total_unique_customers
0,2015-01-01,384092.66,200
1,2015-02-01,706374.12,291
2,2015-03-01,332961.59,139
3,2015-04-01,160767.00,78
4,2015-05-01,548632.63,236
...,...,...,...
107,2023-12-01,2928550.93,1484
108,2024-01-01,2677498.55,1340
109,2024-02-01,3542322.55,1718
110,2024-03-01,1692854.89,877


In [26]:
%%sql
-- Ten randomly chosen orders with the order date also rendered as YYYY-MM text.
SELECT
    s.orderdate,
    TO_CHAR(s.orderdate, 'YYYY-MM')
FROM sales AS s
ORDER BY RANDOM()
LIMIT 10


Unnamed: 0,orderdate,to_char
0,2016-08-19,2016-08
1,2022-05-05,2022-05
2,2022-08-13,2022-08
3,2020-03-11,2020-03
4,2022-01-21,2022-01
5,2023-02-18,2023-02
6,2024-02-09,2024-02
7,2016-12-29,2016-12
8,2019-02-27,2019-02
9,2020-02-26,2020-02


#### Date and Time Filtering

**DATE_PART**

In [28]:
%%sql
-- Ten randomly chosen orders, with the order date split into its
-- year, month and day parts using DATE_PART.
SELECT
    s.orderdate,
    DATE_PART('year', s.orderdate) AS order_year,
    DATE_PART('month', s.orderdate) AS order_month,
    DATE_PART('day', s.orderdate) AS order_day
FROM sales AS s
ORDER BY RANDOM()
LIMIT 10

Unnamed: 0,orderdate,order_year,order_month,order_day
0,2022-08-06,2022.0,8.0,6.0
1,2023-09-02,2023.0,9.0,2.0
2,2017-01-04,2017.0,1.0,4.0
3,2018-04-25,2018.0,4.0,25.0
4,2022-09-25,2022.0,9.0,25.0
5,2022-07-14,2022.0,7.0,14.0
6,2022-06-18,2022.0,6.0,18.0
7,2024-03-13,2024.0,3.0,13.0
8,2016-05-04,2016.0,5.0,4.0
9,2018-07-09,2018.0,7.0,9.0


**EXTRACT**

In [30]:
%%sql
-- Ten randomly chosen orders, with the order date split into its
-- year, month and day parts using EXTRACT.
SELECT
    s.orderdate,
    EXTRACT(YEAR FROM s.orderdate) AS order_year,
    EXTRACT(MONTH FROM s.orderdate) AS order_month,
    EXTRACT(DAY FROM s.orderdate) AS order_day
FROM sales AS s
ORDER BY RANDOM()
LIMIT 10

Unnamed: 0,orderdate,order_year,order_month,order_day
0,2022-11-22,2022,11,22
1,2023-12-16,2023,12,16
2,2021-11-18,2021,11,18
3,2024-02-05,2024,2,5
4,2017-08-14,2017,8,14
5,2018-05-31,2018,5,31
6,2020-02-29,2020,2,29
7,2022-10-17,2022,10,17
8,2016-09-23,2016,9,23
9,2022-03-05,2022,3,5


In [37]:
%%sql
-- Net revenue per calendar year and month, in chronological order.
SELECT
    EXTRACT(YEAR FROM s.orderdate) AS order_year,
    EXTRACT(MONTH FROM s.orderdate) AS order_month,
    SUM(s.quantity * s.netprice * s.exchangerate) AS net_revenue
FROM sales AS s
GROUP BY order_year, order_month
ORDER BY order_year, order_month


Unnamed: 0,order_year,order_month,net_revenue
0,2015,1,384092.66
1,2015,2,706374.12
2,2015,3,332961.59
3,2015,4,160767.00
4,2015,5,548632.63
...,...,...,...
107,2023,12,2928550.93
108,2024,1,2677498.55
109,2024,2,3542322.55
110,2024,3,1692854.89


**CURRENT_DATE**

In [38]:
%%sql
-- Return the current date as reported by the database.
SELECT
    CURRENT_DATE

Unnamed: 0,current_date
0,2025-04-18


In [39]:
%%sql
-- Return the current timestamp as reported by the database.
SELECT
    NOW()

Unnamed: 0,now
0,2025-04-18 15:48:53.813626+00:00


In [49]:
%%sql
-- Net revenue per order date and product category for recent orders.
-- The filter compares calendar years only, so it keeps every order whose year
-- is at least (current year - 5), regardless of month and day.
SELECT
    s.orderdate,
    p.categoryname,
    SUM(s.quantity * s.netprice * s.exchangerate) AS net_revenue
FROM sales AS s
LEFT JOIN product AS p
    ON s.productkey = p.productkey
WHERE EXTRACT(YEAR FROM s.orderdate) >= EXTRACT(YEAR FROM CURRENT_DATE) - 5
GROUP BY
    s.orderdate,
    p.categoryname


Unnamed: 0,orderdate,categoryname,net_revenue
0,2020-11-19,Home Appliances,3853.71
1,2021-09-22,Cameras and camcorders,2161.73
2,2022-04-21,Games and Toys,334.05
3,2020-04-23,Audio,1064.52
4,2022-10-17,Audio,926.46
...,...,...,...
11166,2023-11-20,Games and Toys,469.88
11167,2022-05-08,Cameras and camcorders,226.15
11168,2023-03-03,Cell phones,13360.38
11169,2023-05-13,Audio,3560.74


**DATE AND TIME DIFFERENCES**

In [52]:
%%sql
-- Every sale ordered on or after the date five years before today,
-- shown next to the current date.
SELECT
    CURRENT_DATE,
    s.orderdate
FROM sales AS s
WHERE s.orderdate >= CURRENT_DATE - INTERVAL '5 years'


Unnamed: 0,current_date,orderdate
0,2025-04-18,2020-04-18
1,2025-04-18,2020-04-18
2,2025-04-18,2020-04-18
3,2025-04-18,2020-04-18
4,2025-04-18,2020-04-18
...,...,...
118213,2025-04-18,2024-04-20
118214,2025-04-18,2024-04-20
118215,2025-04-18,2024-04-20
118216,2025-04-18,2024-04-20


In [53]:
%%sql
-- Net revenue per order date and product category, limited to orders placed
-- on or after the date five years before today.
SELECT
    s.orderdate,
    p.categoryname,
    SUM(s.quantity * s.netprice * s.exchangerate) AS net_revenue
FROM sales AS s
LEFT JOIN product AS p
    ON s.productkey = p.productkey
WHERE s.orderdate >= CURRENT_DATE - INTERVAL '5 years'
GROUP BY
    s.orderdate,
    p.categoryname


Unnamed: 0,orderdate,categoryname,net_revenue
0,2020-11-19,Home Appliances,3853.71
1,2021-09-22,Cameras and camcorders,2161.73
2,2022-04-21,Games and Toys,334.05
3,2020-04-23,Audio,1064.52
4,2022-10-17,Audio,926.46
...,...,...,...
10425,2023-11-20,Games and Toys,469.88
10426,2022-05-08,Cameras and camcorders,226.15
10427,2023-03-03,Cell phones,13360.38
10428,2023-05-13,Audio,3560.74


In [65]:
%%sql
-- Per order year, the average processing time in days (order date to delivery
-- date) and the net revenue, for orders placed in the last five years.
-- Processing time uses date subtraction, which yields the total number of days.
-- EXTRACT(DAYS FROM AGE(...)) would return only the days field of the interval
-- and drop whole months and years, understating any delay longer than a month.
SELECT
  DATE_PART('year', orderdate) AS order_year,
  ROUND(AVG(deliverydate::date - orderdate::date), 2) AS avg_processing_time,
  CAST(SUM(quantity * netprice * exchangerate) AS INTEGER) AS net_revenue
FROM
  sales
WHERE
  orderdate >= CURRENT_DATE - INTERVAL '5 years'
GROUP BY
  order_year
ORDER BY
  order_year
LIMIT 10

Unnamed: 0,order_year,avg_processing_time,net_revenue
0,2020.0,0.96,5082042
1,2021.0,1.36,21357977
2,2022.0,1.62,44864557
3,2023.0,1.75,33108566
4,2024.0,1.67,8396527
