# Window Functions in PySpark - Part 2

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
import pyspark.sql.functions as F

StatementMeta(, fe5c5b4e-cdad-4ad1-a88d-d6a63cba8a4b, 3, Finished, Available, Finished)

## Creating a Spark Session

In [2]:
# getOrCreate() reuses an already-running session if there is one; otherwise it
# starts a new session with the application name given here.
spark = (
    SparkSession.builder
    .appName('WindowFunctionsPart2')
    .getOrCreate()
)

StatementMeta(, fe5c5b4e-cdad-4ad1-a88d-d6a63cba8a4b, 4, Finished, Available, Finished)

## Creating a Sample DataFrame

In [3]:
# Ten (name, score) rows. Scores 200 and 500 each appear twice, so the data
# contains ties.
data = [
    ('Alice', 100), ('Bob', 200),
    ('Charlie', 200), ('David', 300),
    ('Eve', 400), ('Frank', 500),
    ('Grace', 500), ('Hank', 600),
    ('Ivy', 700), ('Jack', 800),
]

columns = ['Name', 'Score']

# Only column names are supplied; Spark infers the column types from the values.
df = spark.createDataFrame(data, schema=columns)
df.show()

StatementMeta(, fe5c5b4e-cdad-4ad1-a88d-d6a63cba8a4b, 5, Finished, Available, Finished)

+-------+-----+
|   Name|Score|
+-------+-----+
|  Alice|  100|
|    Bob|  200|
|Charlie|  200|
|  David|  300|
|    Eve|  400|
|  Frank|  500|
|  Grace|  500|
|   Hank|  600|
|    Ivy|  700|
|   Jack|  800|
+-------+-----+



## Defining a Window Specification
The window specification orders the rows by the `Score` column in ascending order. No `partitionBy` is used, so the whole DataFrame is treated as a single window.

In [4]:
# Order rows by ascending Score. There is no partitionBy, so every row belongs
# to one window (Spark moves all data to a single partition for this).
window_spec = Window.orderBy(F.col('Score'))

StatementMeta(, fe5c5b4e-cdad-4ad1-a88d-d6a63cba8a4b, 6, Finished, Available, Finished)

## Applying Window Functions
### 1. Rank
Assigns the same rank to rows with the same score, with gaps for the next rank.

In [5]:
# rank(): rows with equal Score share a rank, and the rank after a tie skips
# ahead by the number of tied rows.
rank_by_score = F.rank().over(window_spec)
df1 = df.withColumn('Rank', rank_by_score)
df1.show()

StatementMeta(, fe5c5b4e-cdad-4ad1-a88d-d6a63cba8a4b, 7, Finished, Available, Finished)

+-------+-----+----+
|   Name|Score|Rank|
+-------+-----+----+
|  Alice|  100|   1|
|    Bob|  200|   2|
|Charlie|  200|   2|
|  David|  300|   4|
|    Eve|  400|   5|
|  Frank|  500|   6|
|  Grace|  500|   6|
|   Hank|  600|   8|
|    Ivy|  700|   9|
|   Jack|  800|  10|
+-------+-----+----+



### 2. Dense Rank
Similar to `rank()`, but does not leave gaps between ranks.

In [6]:
# dense_rank(): rows with equal Score share a rank, and the next distinct Score
# gets the next consecutive rank (no gaps).
dense_rank_by_score = F.dense_rank().over(window_spec)
df2 = df.withColumn('DenseRank', dense_rank_by_score)
df2.show()

StatementMeta(, fe5c5b4e-cdad-4ad1-a88d-d6a63cba8a4b, 8, Finished, Available, Finished)

+-------+-----+---------+
|   Name|Score|DenseRank|
+-------+-----+---------+
|  Alice|  100|        1|
|    Bob|  200|        2|
|Charlie|  200|        2|
|  David|  300|        3|
|    Eve|  400|        4|
|  Frank|  500|        5|
|  Grace|  500|        5|
|   Hank|  600|        6|
|    Ivy|  700|        7|
|   Jack|  800|        8|
+-------+-----+---------+



### 3. Row Number
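Assigns a unique, sequential row number (starting at 1) to each record. Unlike `rank()`, rows with the same score still receive different numbers; the order among tied rows is arbitrary unless the window ordering includes a tie-breaker column.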

In [7]:
# row_number() has to pick an order among rows with equal Score. Ordering by
# 'Score' alone leaves that choice unspecified, so the numbers given to tied
# rows could differ between runs. 'Name' is added as a tie-breaker to make the
# numbering reproducible.
row_number_window = Window.orderBy('Score', 'Name')
df3 = df.withColumn('RowNumber', F.row_number().over(row_number_window))
df3.show()

StatementMeta(, fe5c5b4e-cdad-4ad1-a88d-d6a63cba8a4b, 9, Finished, Available, Finished)

+-------+-----+---------+
|   Name|Score|RowNumber|
+-------+-----+---------+
|  Alice|  100|        1|
|    Bob|  200|        2|
|Charlie|  200|        3|
|  David|  300|        4|
|    Eve|  400|        5|
|  Frank|  500|        6|
|  Grace|  500|        7|
|   Hank|  600|        8|
|    Ivy|  700|        9|
|   Jack|  800|       10|
+-------+-----+---------+



### 4. Lead Function - Score Difference with Next Row
Calculates the next row's score minus the current row's score. The last row has no next row, so its result is NULL.

In [8]:
# lead() reads the following row, so the result depends on the exact row order.
# Ordering by 'Score' alone leaves the order of tied rows unspecified; 'Name'
# is added as a tie-breaker so each row always sees the same successor.
lead_window = Window.orderBy('Score', 'Name')
next_score = F.lead('Score').over(lead_window)

# The last row has no successor: lead() yields NULL there, and so does the
# subtraction.
df4 = df.withColumn('ScoreDifferenceWithNext', next_score - F.col('Score'))
df4.show()

StatementMeta(, fe5c5b4e-cdad-4ad1-a88d-d6a63cba8a4b, 10, Finished, Available, Finished)

+-------+-----+-----------------------+
|   Name|Score|ScoreDifferenceWithNext|
+-------+-----+-----------------------+
|  Alice|  100|                    100|
|    Bob|  200|                      0|
|Charlie|  200|                    100|
|  David|  300|                    100|
|    Eve|  400|                    100|
|  Frank|  500|                      0|
|  Grace|  500|                    100|
|   Hank|  600|                    100|
|    Ivy|  700|                    100|
|   Jack|  800|                   NULL|
+-------+-----+-----------------------+



### 5. Lag Function - Score Difference with Previous Row
Calculates the current row's score minus the previous row's score. The first row has no previous row, so its result is NULL.

In [9]:
# lag() reads the preceding row, so the result depends on the exact row order.
# Ordering by 'Score' alone leaves the order of tied rows unspecified; 'Name'
# is added as a tie-breaker so each row always sees the same predecessor.
lag_window = Window.orderBy('Score', 'Name')
previous_score = F.lag('Score').over(lag_window)

# The first row has no predecessor: lag() yields NULL there, and so does the
# subtraction.
df5 = df.withColumn('ScoreDifferenceWithPrevious', F.col('Score') - previous_score)
df5.show()

StatementMeta(, fe5c5b4e-cdad-4ad1-a88d-d6a63cba8a4b, 11, Finished, Available, Finished)

+-------+-----+---------------------------+
|   Name|Score|ScoreDifferenceWithPrevious|
+-------+-----+---------------------------+
|  Alice|  100|                       NULL|
|    Bob|  200|                        100|
|Charlie|  200|                          0|
|  David|  300|                        100|
|    Eve|  400|                        100|
|  Frank|  500|                        100|
|  Grace|  500|                          0|
|   Hank|  600|                        100|
|    Ivy|  700|                        100|
|   Jack|  800|                        100|
+-------+-----+---------------------------+

