We use window functions to perform row-level analytics without collapsing rows (unlike `groupBy`, which reduces each group to a single row).

In [0]:
# Why Window functions ?
# groupBy -> reduces rows
# Window functions -> does not reduce rows -> keeps all rows + adds insight
# Window functions
# - rank, dense_rank, row_number
# - lead, lag
# - first_value, last_value
# - sum, avg, min, max
# - count

# You will use a window when you need a rank, the latest record per group, deduplication, a moving average, a running total, etc.

"""

| Function       | What it does           |
| -------------- | ---------------------- |
| `row_number()` | Unique row number      |
| `rank()`       | Ranking with gaps      |
| `dense_rank()` | Ranking without gaps   |
| `lag(col, n)`  | Previous row value     |
| `lead(col, n)` | Next row value         |
| `sum(col)`     | Running / windowed sum |
| `avg(col)`     | Window average         |

"""

A window spec (window specification) defines how rows are grouped and ordered when you apply window functions such as row_number, rank, lag, lead, or running totals.

A window spec can have three main parts:

1. partitionBy – how to group rows (like GROUP BY, but without collapsing rows)

2. orderBy – how rows are ordered within each partition

3. rowsBetween / rangeBetween – frame boundaries (optional)

In [0]:
# Load the two source tables used throughout the window-function examples.
df_orders = spark.read.table("dev_data.test.orders_online")
df_customers = spark.read.table("dev_data.test.customers_online")

# Preview the orders only; the customers preview stays disabled because every
# show() is a separate Spark action, and one action per cell is enough.
df_orders.show()
# df_customers.show()

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Shared window spec: one partition per customer, rows sorted by order_date.
# orderBy on a plain column name sorts ascending, so row 1 is the earliest order.
window = (
    Window
    .partitionBy("customer_id")
    .orderBy("order_date")
)


In [0]:
# row_number() is the most important window function: it assigns a unique,
# gap-free sequence number to every row inside its partition.
# Goal of this cell: the latest order per customer.
from pyspark.sql.functions import col

# Sort each customer's orders newest-first so that rn == 1 is the latest order.
# The shared ascending `window` spec would make rn == 1 the EARLIEST order,
# which contradicts the stated goal.
latest_first_window = (
    Window
    .partitionBy("customer_id")
    .orderBy(col("order_date").desc())
)

df_lastest = df_orders.withColumn("rn", row_number().over(latest_first_window))

# rn == 1 -> most recent order of each customer.
df_lastest.filter(col("rn") == 1).show()

# row_number is used to identify the latest (or top) record per group.

In [0]:
# Notebook table view of every order together with its per-customer row number `rn`.
display(df_lastest)

In [0]:
# Plain-text preview of df_lastest; the arguments spell out show()'s defaults.
df_lastest.show(n=20, truncate=True)

In [0]:
from pyspark.sql.functions import rank, dense_rank

# Example (not run): d = rank().over(window)  -> rank() gives ties the same rank and leaves gaps after them

In [0]:
# Example (not run): dense_rank().over(window)  -> dense_rank() gives ties the same rank without leaving gaps

In [0]:
# Deduplicate: keep only the latest order per customer.
# The partition is sorted newest-first so that rn == 1 is the most recent order;
# with the ascending `window` spec, rn == 1 would keep the earliest order instead.
dedup_window = (
    Window
    .partitionBy("customer_id")
    .orderBy(col("order_date").desc())
)

df_dedup = (
    df_orders
    .withColumn("rn", row_number().over(dedup_window))
    .filter(col("rn") == 1)
    .drop("rn")  # helper column is only needed for the filter
)

df_dedup.show()

In [0]:
# Running total of order amounts per customer, accumulated in order_date order.
# NOTE: this import rebinds `sum` to the Spark column function, hiding Python's builtin sum.
from pyspark.sql.functions import sum, col

# Frame: from the first row of the partition up to and including the current row.
runnin_window = (
    Window
    .partitionBy("customer_id")
    .orderBy("order_date")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# NOTE(review): the int cast drops any fractional part of order_amount —
# confirm the column only holds whole numbers.
df_running = df_orders.withColumn(
    "running_total",
    sum(col("order_amount").cast("int")).over(runnin_window),
)

df_running.show()

In [0]:
# Latest order per customer: number each customer's orders newest-first.
from pyspark.sql.window import Window
from pyspark.sql.functions import col

win_spec = (
    Window
    .partitionBy("customer_id")
    .orderBy(col("order_date").desc())
)

lastest_ord = df_orders.withColumn("rn", row_number().over(win_spec))

# rn == 1 is each customer's most recent order.
display(lastest_ord.filter(col("rn") == 1))
# Full numbered set, to see how rn is assigned inside each partition.
lastest_ord.show()

In [0]:
# Latest order per customer without the helper column:
# number the rows newest-first, keep rn == 1, then drop rn.
lastest_ord = (
    df_orders
    .withColumn("rn", row_number().over(win_spec))
    .filter(col("rn") == 1)
    .drop("rn")
)

# One action per cell: an additional lastest_ord.show() would re-run the same
# query only to print the identical rows again as plain text.
display(lastest_ord)

In [0]:
# 2. Rank each customer's orders by amount, largest first.
# rank() gives tied amounts the same rank and leaves a gap after them.
from pyspark.sql.functions import rank, sum

win_rank = (
    Window
    .partitionBy("customer_id")
    .orderBy(col("order_amount").desc())
)

# NOTE(review): the running-total cell casts order_amount to int; if the column
# is stored as a string, this sort is lexicographic — confirm the column type.
df_rank = df_orders.withColumn("rank", rank().over(win_rank))
display(df_rank)

In [0]:

# 3️⃣ Deduplicate orders, keeping the latest one per customer.

# Newest order first inside each customer partition.
window_spec = (
    Window
    .partitionBy("customer_id")
    .orderBy(col("order_date").desc())
)

df_dedup1 = (
    df_orders
    .withColumn("rn", row_number().over(window_spec))
    .filter(col("rn") == 1)  # rn == 1 -> most recent order
    .drop("rn")
)

display(df_dedup1)

In [0]:
# 4️⃣ Running total per customer.

# Frame: every row from the start of the customer's partition through the
# current row, with rows taken in ascending order_date order.
wind_run = (
    Window
    .partitionBy("customer_id")
    .orderBy(col("order_date"))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# `sum` here is the pyspark.sql.functions.sum imported in an earlier cell,
# not Python's builtin.
running_total_df = df_orders.withColumn(
    "Running_total",
    sum(col("order_amount")).over(wind_run),
)

display(running_total_df)

