[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wasim/Data-Science/blob/main/data-analyst-roadmap/06_sql_for_analytics/03_window_functions.ipynb)

# Window Functions in SQL

Window functions perform calculations across a set of rows 
related to the current row, without collapsing those rows 
into a single output row the way `GROUP BY` does.

## Topics Covered
- ROW_NUMBER, RANK, DENSE_RANK
- LAG and LEAD
- Running totals
- Moving averages
- PARTITION BY

In [None]:
import sqlite3
import pandas as pd

# In-memory SQLite database: nothing is written to disk, and all data is
# discarded when the connection is closed.
conn = sqlite3.connect(':memory:')

## Setup: Sales Data

In [None]:
# Create and populate the sales table used by every query below.
cursor = conn.cursor()

# Drop any earlier copy first so this cell can be re-run safely; without
# this, a second run fails with "table sales already exists".
cursor.execute('DROP TABLE IF EXISTS sales')

cursor.execute('''
CREATE TABLE sales (
    id INTEGER PRIMARY KEY,
    salesperson TEXT,
    region TEXT,
    product TEXT,
    amount INTEGER,
    sale_date TEXT
)
''')

# Sample rows: (id, salesperson, region, product, amount, sale_date).
# sale_date is stored as ISO-8601 text (YYYY-MM-DD), so sorting the text
# also sorts the dates chronologically.
sales_data = [
    (1, 'Alice', 'East', 'Laptop', 1200, '2023-01-15'),
    (2, 'Bob', 'West', 'Phone', 800, '2023-01-16'),
    (3, 'Alice', 'East', 'Tablet', 500, '2023-01-17'),
    (4, 'Charlie', 'East', 'Laptop', 1200, '2023-01-18'),
    (5, 'Bob', 'West', 'Laptop', 1100, '2023-01-19'),
    (6, 'Alice', 'East', 'Phone', 850, '2023-01-20'),
    (7, 'David', 'South', 'Tablet', 600, '2023-01-21'),
    (8, 'Charlie', 'East', 'Phone', 900, '2023-01-22'),
    (9, 'Bob', 'West', 'Tablet', 550, '2023-01-23'),
    (10, 'David', 'South', 'Laptop', 1150, '2023-01-24'),
]

# Parameterized bulk insert: one "?" placeholder per column.
cursor.executemany(
    'INSERT INTO sales VALUES (?,?,?,?,?,?)',
    sales_data,
)
conn.commit()

# Show the loaded table as a DataFrame to confirm the insert.
pd.read_sql_query('SELECT * FROM sales', conn)

## 1. ROW_NUMBER()

Assigns a unique sequential number to each row.

In [None]:
# Number all sales chronologically.
# The OVER clause has no PARTITION BY, so the whole table is one window
# and row_num counts 1..N in sale_date order.
query = '''
SELECT 
    ROW_NUMBER() OVER (
        ORDER BY sale_date
    ) AS row_num,
    salesperson,
    product,
    amount,
    sale_date
FROM sales
'''

pd.read_sql_query(query, conn)

In [None]:
# Number sales per salesperson.
# PARTITION BY restarts the counter at 1 for each salesperson; within a
# partition the rows are numbered in sale_date order.
query = '''
SELECT 
    ROW_NUMBER() OVER (
        PARTITION BY salesperson 
        ORDER BY sale_date
    ) AS sale_number,
    salesperson,
    product,
    amount,
    sale_date
FROM sales
ORDER BY salesperson, sale_date
'''

pd.read_sql_query(query, conn)

## 2. RANK() and DENSE_RANK()

Assign ranks with different tie handling: `RANK` leaves gaps after ties, while `DENSE_RANK` does not.

In [None]:
# Compare RANK vs DENSE_RANK vs ROW_NUMBER on the same ordering.
#   RANK       - tied rows share a rank; the next rank skips ahead (1, 1, 3)
#   DENSE_RANK - tied rows share a rank; no gap afterwards     (1, 1, 2)
#   ROW_NUMBER - every row gets a distinct number, even on ties
# The sample data has two sales of 1200, so ties really occur. ROW_NUMBER
# therefore gets `id` as a tiebreaker, and the output is ordered by
# row_num, so the result is deterministic and row_num never appears out
# of order. RANK/DENSE_RANK keep the plain ordering so ties stay ties.
query = '''
SELECT 
    salesperson,
    amount,
    RANK() OVER (
        ORDER BY amount DESC
    ) AS rank,
    DENSE_RANK() OVER (
        ORDER BY amount DESC
    ) AS dense_rank,
    ROW_NUMBER() OVER (
        ORDER BY amount DESC, id
    ) AS row_num
FROM sales
ORDER BY amount DESC, row_num
'''

pd.read_sql_query(query, conn)

In [None]:
# Rank sales within each region.
# Ranking restarts at 1 per region; the largest amount in a region gets
# rank 1, and tied amounts share a rank.
query = '''
SELECT 
    region,
    salesperson,
    amount,
    RANK() OVER (
        PARTITION BY region 
        ORDER BY amount DESC
    ) AS region_rank
FROM sales
ORDER BY region, region_rank
'''

pd.read_sql_query(query, conn)

## 3. Top N per Group

Find top performers in each category.

In [None]:
# Top 2 sales per region.
# A window function cannot be referenced in the WHERE clause of the same
# SELECT, so the rank is computed in a CTE and filtered in the outer query.
# NOTE: RANK() gives tied amounts the same rank, so a tie at rank 2 would
# return more than two rows for that region.
query = '''
WITH ranked_sales AS (
    SELECT 
        region,
        salesperson,
        product,
        amount,
        RANK() OVER (
            PARTITION BY region 
            ORDER BY amount DESC
        ) AS rank
    FROM sales
)
SELECT 
    region,
    salesperson,
    product,
    amount
FROM ranked_sales
WHERE rank <= 2
ORDER BY region, rank
'''

pd.read_sql_query(query, conn)

## 4. LAG() and LEAD()

Access previous/next row values.

In [None]:
# Compare each sale with the same salesperson's previous sale.
# LAG(amount, 1) reads the row one position earlier within the partition.
# No default is supplied, so a salesperson's first sale has a NULL
# prev_amount and therefore a NULL difference.
query = '''
SELECT 
    salesperson,
    sale_date,
    amount,
    LAG(amount, 1) OVER (
        PARTITION BY salesperson 
        ORDER BY sale_date
    ) AS prev_amount,
    amount - LAG(amount, 1) OVER (
        PARTITION BY salesperson 
        ORDER BY sale_date
    ) AS difference
FROM sales
ORDER BY salesperson, sale_date
'''

pd.read_sql_query(query, conn)

In [None]:
# Look ahead to the same salesperson's next sale.
# LEAD(x, 1) reads the row one position later within the partition; a
# salesperson's last sale has no following row, so next_amount and
# next_date are NULL there.
query = '''
SELECT 
    salesperson,
    sale_date,
    amount,
    LEAD(amount, 1) OVER (
        PARTITION BY salesperson 
        ORDER BY sale_date
    ) AS next_amount,
    LEAD(sale_date, 1) OVER (
        PARTITION BY salesperson 
        ORDER BY sale_date
    ) AS next_date
FROM sales
ORDER BY salesperson, sale_date
'''

pd.read_sql_query(query, conn)

## 5. Running Totals

Calculate cumulative sums.

In [None]:
# Running total of all sales in date order.
# The explicit ROWS frame sums from the first row up to and including the
# current row, so each row adds exactly its own amount to the total.
query = '''
SELECT 
    sale_date,
    salesperson,
    amount,
    SUM(amount) OVER (
        ORDER BY sale_date
        ROWS BETWEEN UNBOUNDED PRECEDING 
        AND CURRENT ROW
    ) AS running_total
FROM sales
ORDER BY sale_date
'''

pd.read_sql_query(query, conn)

In [None]:
# Running total per salesperson.
# Same cumulative frame as the overall running total, but PARTITION BY
# resets the sum to zero at the start of each salesperson's rows.
query = '''
SELECT 
    salesperson,
    sale_date,
    amount,
    SUM(amount) OVER (
        PARTITION BY salesperson 
        ORDER BY sale_date
        ROWS BETWEEN UNBOUNDED PRECEDING 
        AND CURRENT ROW
    ) AS running_total
FROM sales
ORDER BY salesperson, sale_date
'''

pd.read_sql_query(query, conn)

## 6. Moving Averages

Calculate rolling statistics.

In [None]:
# 3-day moving average.
# The frame is defined in days, not in rows: julianday() converts the ISO
# date text to a day number, and RANGE BETWEEN 2 PRECEDING AND CURRENT ROW
# covers every sale dated within the current day and the two days before.
# A ROWS frame would only be "3-day" if there were exactly one sale per
# consecutive day; the RANGE frame stays correct with gaps or several
# sales on one day. For the sample data both give the same result.
# The frame is declared once in a named WINDOW so AVG and COUNT are
# guaranteed to use the same definition.
# Requires SQLite 3.28+ (RANGE frames with a numeric offset).
query = '''
SELECT
    sale_date,
    amount,
    AVG(amount) OVER three_days AS moving_avg_3day,
    COUNT(*) OVER three_days AS window_size
FROM sales
WINDOW three_days AS (
    ORDER BY julianday(sale_date)
    RANGE BETWEEN 2 PRECEDING
    AND CURRENT ROW
)
ORDER BY sale_date
'''

pd.read_sql_query(query, conn)

## 7. Percentage of Total

In [None]:
# Calculate each salesperson's % of total sales.
# SUM(amount) is the per-salesperson aggregate produced by GROUP BY.
# SUM(SUM(amount)) OVER () is a window function evaluated over those
# grouped rows; the empty OVER () spans all of them, giving the grand total.
# Multiplying by 100.0 forces floating-point division (amount is INTEGER).
query = '''
SELECT 
    salesperson,
    SUM(amount) AS total_sales,
    ROUND(
        100.0 * SUM(amount) / 
        SUM(SUM(amount)) OVER (), 
        2
    ) AS pct_of_total
FROM sales
GROUP BY salesperson
ORDER BY total_sales DESC
'''

pd.read_sql_query(query, conn)

In [None]:
# Each salesperson's % of their region's total.
# Rows are first grouped by (region, salesperson); the window
# SUM(SUM(amount)) OVER (PARTITION BY region) then adds up the grouped
# totals within each region to form the denominator.
query = '''
SELECT 
    region,
    salesperson,
    SUM(amount) AS sales,
    ROUND(
        100.0 * SUM(amount) / 
        SUM(SUM(amount)) OVER (
            PARTITION BY region
        ), 
        2
    ) AS pct_of_region
FROM sales
GROUP BY region, salesperson
ORDER BY region, sales DESC
'''

pd.read_sql_query(query, conn)

## 8. Real-World Example

Comprehensive sales analysis.

In [None]:
# Complete sales performance dashboard: one row per (salesperson, region).
# Plain aggregates (COUNT, SUM, AVG) are computed per group; the window
# functions then run over the grouped rows:
#   overall_rank - rank by total sales across all groups
#   region_rank  - rank by total sales within the group's region
#   pct_of_total - group total as a percentage of the grand total
query = '''
SELECT 
    salesperson,
    region,
    COUNT(*) AS num_sales,
    SUM(amount) AS total_sales,
    ROUND(AVG(amount), 2) AS avg_sale,
    RANK() OVER (
        ORDER BY SUM(amount) DESC
    ) AS overall_rank,
    RANK() OVER (
        PARTITION BY region 
        ORDER BY SUM(amount) DESC
    ) AS region_rank,
    ROUND(
        100.0 * SUM(amount) / 
        SUM(SUM(amount)) OVER (), 
        2
    ) AS pct_of_total
FROM sales
GROUP BY salesperson, region
ORDER BY total_sales DESC
'''

pd.read_sql_query(query, conn)

## Practice Exercises

### Exercise 1
Find the 2nd highest sale per region.

In [None]:
# Your code here


### Exercise 2
Calculate the 7-day moving average of sales.

In [None]:
# Your code here


### Exercise 3
Find the growth rate from the previous sale 
for each salesperson.

In [None]:
# Your code here


In [None]:
# Close the connection; because the database is in-memory, the sales
# table is discarded along with it.
conn.close()

## Key Takeaways

✅ **ROW_NUMBER** - Unique sequential numbers  
✅ **RANK/DENSE_RANK** - Ranking with ties  
✅ **LAG/LEAD** - Access adjacent rows  
✅ **Running totals** - Cumulative sums  
✅ **Moving averages** - Rolling statistics  
✅ **PARTITION BY** - Group calculations  

**Next:** CTEs and Complex Joins →