In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("windowing").getOrCreate()

2021-09-24 08:05:16,163 WARN util.Utils: Your hostname, tb-LinuxBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
2021-09-24 08:05:16,165 WARN util.Utils: Set SPARK_LOCAL_IP if you need to bind to another address
2021-09-24 08:05:18,734 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
columns = ["name","department","salary"]
data = [
        ("Jonathan","Developer",3000),\
        ("Tony","Developer",4600),\
        ("Arya","Developer",4100,),\
        ("Bruce","Sales",3000),\
        ("Natasha","Sales",3000),\
        ("Thor","Finance",3300),\
        ("Steve","Sales",3900),\
        ("Thanos","Finance",3000),\
        ("Sansa","Developer",2000),\
        ("Gandalf","Finance",4100)
       ]

In [3]:
df = spark.createDataFrame(data,columns)

In [4]:
df.show()

                                                                                

+--------+----------+------+
|    name|department|salary|
+--------+----------+------+
|Jonathan| Developer|  3000|
|    Tony| Developer|  4600|
|    Arya| Developer|  4100|
|   Bruce|     Sales|  3000|
| Natasha|     Sales|  3000|
|    Thor|   Finance|  3300|
|   Steve|     Sales|  3900|
|  Thanos|   Finance|  3000|
|   Sansa| Developer|  2000|
| Gandalf|   Finance|  4100|
+--------+----------+------+



# row_number

In [5]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

In [6]:
window_object = Window.partitionBy("department").orderBy("salary")

In [7]:
window_object 
'''
This windowSpec object is an instance of:
class WindowSpec(object):
    """
    A window specification that defines the partitioning, ordering,
    and frame boundaries.
'''

'\nThis windowSpec object is an instance of:\nclass WindowSpec(object):\n    """\n    A window specification that defines the partitioning, ordering,\n    and frame boundaries.\n'

In [8]:
windowspec = Window.partitionBy("department").orderBy("salary")

In [9]:
df.withColumn("row_number", row_number().over(Window.partitionBy("department").orderBy("salary"))).show()



+--------+----------+------+----------+
|    name|department|salary|row_number|
+--------+----------+------+----------+
|   Bruce|     Sales|  3000|         1|
| Natasha|     Sales|  3000|         2|
|   Steve|     Sales|  3900|         3|
|  Thanos|   Finance|  3000|         1|
|    Thor|   Finance|  3300|         2|
| Gandalf|   Finance|  4100|         3|
|   Sansa| Developer|  2000|         1|
|Jonathan| Developer|  3000|         2|
|    Arya| Developer|  4100|         3|
|    Tony| Developer|  4600|         4|
+--------+----------+------+----------+



                                                                                

In [22]:
df.withColumn("row_number",row_number().over(Window.partitionBy("department").orderBy(df.salary.desc()))).filter("row_number == 1").show()

                                                                                

+-------+----------+------+----------+
|   name|department|salary|row_number|
+-------+----------+------+----------+
|  Steve|     Sales|  3900|         1|
|Gandalf|   Finance|  4100|         1|
|   Tony| Developer|  4600|         1|
+-------+----------+------+----------+



In [10]:
# or

In [11]:
df.withColumn("row_number", row_number().over(windowspec)).show()

                                                                                

+--------+----------+------+----------+
|    name|department|salary|row_number|
+--------+----------+------+----------+
|   Bruce|     Sales|  3000|         1|
| Natasha|     Sales|  3000|         2|
|   Steve|     Sales|  3900|         3|
|  Thanos|   Finance|  3000|         1|
|    Thor|   Finance|  3300|         2|
| Gandalf|   Finance|  4100|         3|
|   Sansa| Developer|  2000|         1|
|Jonathan| Developer|  3000|         2|
|    Arya| Developer|  4100|         3|
|    Tony| Developer|  4600|         4|
+--------+----------+------+----------+





# rank

In [12]:
# next rank after tie is skipped

In [13]:
from pyspark.sql.functions import rank

In [14]:
df.withColumn("rank",rank().over(Window.partitionBy("department").orderBy("salary"))).show()



+--------+----------+------+----+
|    name|department|salary|rank|
+--------+----------+------+----+
|   Bruce|     Sales|  3000|   1|
| Natasha|     Sales|  3000|   1|
|   Steve|     Sales|  3900|   3|
|  Thanos|   Finance|  3000|   1|
|    Thor|   Finance|  3300|   2|
| Gandalf|   Finance|  4100|   3|
|   Sansa| Developer|  2000|   1|
|Jonathan| Developer|  3000|   2|
|    Arya| Developer|  4100|   3|
|    Tony| Developer|  4600|   4|
+--------+----------+------+----+



                                                                                

# dense_rank

In [15]:
# next rank after tie is not skipped

In [16]:
from pyspark.sql.functions import dense_rank

In [17]:
df.withColumn("dense_rank",dense_rank().over(Window.partitionBy("department").orderBy("salary"))).show()



+--------+----------+------+----------+
|    name|department|salary|dense_rank|
+--------+----------+------+----------+
|   Bruce|     Sales|  3000|         1|
| Natasha|     Sales|  3000|         1|
|   Steve|     Sales|  3900|         2|
|  Thanos|   Finance|  3000|         1|
|    Thor|   Finance|  3300|         2|
| Gandalf|   Finance|  4100|         3|
|   Sansa| Developer|  2000|         1|
|Jonathan| Developer|  3000|         2|
|    Arya| Developer|  4100|         3|
|    Tony| Developer|  4600|         4|
+--------+----------+------+----------+



                                                                                

# percent_rank

In [18]:
from pyspark.sql.functions import percent_rank

In [19]:
df.withColumn("percent_rank",percent_rank().over(Window.partitionBy("department").orderBy("salary"))).show()



+--------+----------+------+------------------+
|    name|department|salary|      percent_rank|
+--------+----------+------+------------------+
|   Bruce|     Sales|  3000|               0.0|
| Natasha|     Sales|  3000|               0.0|
|   Steve|     Sales|  3900|               1.0|
|  Thanos|   Finance|  3000|               0.0|
|    Thor|   Finance|  3300|               0.5|
| Gandalf|   Finance|  4100|               1.0|
|   Sansa| Developer|  2000|               0.0|
|Jonathan| Developer|  3000|0.3333333333333333|
|    Arya| Developer|  4100|0.6666666666666666|
|    Tony| Developer|  4600|               1.0|
+--------+----------+------+------------------+



                                                                                