## Window Functions in PySpark - Part 4

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
import pyspark.sql.functions as F

StatementMeta(, a509921b-37e1-463b-aff0-233cdebd7408, 3, Finished, Available, Finished)

In [2]:
# Obtain the SparkSession used by every cell below; getOrCreate() hands back
# an already-running session when one exists instead of starting another.
spark = (
    SparkSession.builder
    .appName('WindowFunctionsPart4')
    .getOrCreate()
)

StatementMeta(, a509921b-37e1-463b-aff0-233cdebd7408, 4, Finished, Available, Finished)

In [3]:
# Employee sample rows, one tuple per employee, in emp_columns order.
emp_columns = ['EmpId', 'EmpName', 'DeptId', 'Salary']
emp_data = [
    (1, 'Alice', 1, 6300), (2, 'Bob', 1, 6200),
    (3, 'Charlie', 2, 7000), (4, 'David', 2, 7200),
    (5, 'Eve', 1, 6300), (6, 'Frank', 2, 7100),
]

# Department lookup rows, in dept_columns order.
dept_columns = ['DeptId', 'DeptName']
dept_data = [(1, 'HR'), (2, 'Finance')]

# Column types are inferred by Spark from the Python values; only names are given.
emp_df = spark.createDataFrame(emp_data, schema=emp_columns)
dept_df = spark.createDataFrame(dept_data, schema=dept_columns)

# Print both frames so the starting data is visible in the notebook.
for frame in (emp_df, dept_df):
    frame.show()

StatementMeta(, a509921b-37e1-463b-aff0-233cdebd7408, 5, Finished, Available, Finished)

+-----+-------+------+------+
|EmpId|EmpName|DeptId|Salary|
+-----+-------+------+------+
|    1|  Alice|     1|  6300|
|    2|    Bob|     1|  6200|
|    3|Charlie|     2|  7000|
|    4|  David|     2|  7200|
|    5|    Eve|     1|  6300|
|    6|  Frank|     2|  7100|
+-----+-------+------+------+

+------+--------+
|DeptId|DeptName|
+------+--------+
|     1|      HR|
|     2| Finance|
+------+--------+



## Finding the Highest Salary in Each Department
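We use a window function to rank salaries within each department, highest salary first. Note that `rank()` gives tied salaries the same rank and then skips the following rank(s), so a department can have more than one employee at rank 1.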

In [4]:
# One window per department, rows ordered from highest to lowest salary.
salary_high_to_low = F.col('Salary').desc()
window_spec = Window.partitionBy('DeptId').orderBy(salary_high_to_low)

# rank() numbers rows inside each window; equal salaries receive the same rank.
salary_rank = F.rank().over(window_spec)
ranked_salary_df = emp_df.withColumn('Rank', salary_rank)
ranked_salary_df.show()

StatementMeta(, a509921b-37e1-463b-aff0-233cdebd7408, 6, Finished, Available, Finished)

+-----+-------+------+------+----+
|EmpId|EmpName|DeptId|Salary|Rank|
+-----+-------+------+------+----+
|    1|  Alice|     1|  6300|   1|
|    5|    Eve|     1|  6300|   1|
|    2|    Bob|     1|  6200|   3|
|    4|  David|     2|  7200|   1|
|    6|  Frank|     2|  7100|   2|
|    3|Charlie|     2|  7000|   3|
+-----+-------+------+------+----+



## Filtering the Top Salary in Each Department

In [5]:
# Keep only the rows ranked first in their department (ties are all kept).
is_top_ranked = F.col('Rank') == 1
result_df = ranked_salary_df.where(is_top_ranked)
result_df.show()

StatementMeta(, a509921b-37e1-463b-aff0-233cdebd7408, 7, Finished, Available, Finished)

+-----+-------+------+------+----+
|EmpId|EmpName|DeptId|Salary|Rank|
+-----+-------+------+------+----+
|    1|  Alice|     1|  6300|   1|
|    5|    Eve|     1|  6300|   1|
|    4|  David|     2|  7200|   1|
+-----+-------+------+------+----+



## Joining with Department Names

In [6]:
# Build the joined frame from ranked_salary_df instead of re-joining result_df
# onto itself: a self-referential `result_df = result_df.join(...)` adds a
# second DeptName column every time the cell is re-run, which makes the
# select below ambiguous. Deriving it from the ranked frame keeps the cell
# idempotent while still leaving result_df bound to the joined result.
top_ranked_df = ranked_salary_df.filter(F.col('Rank') == 1)

# Left join keeps every top-ranked employee even when DeptId has no match in dept_df.
result_df = top_ranked_df.join(dept_df, ['DeptId'], 'left')
result_df.select('EmpName', 'DeptName', 'Salary').show()

StatementMeta(, a509921b-37e1-463b-aff0-233cdebd7408, 8, Finished, Available, Finished)

+-------+--------+------+
|EmpName|DeptName|Salary|
+-------+--------+------+
|  Alice|      HR|  6300|
|    Eve|      HR|  6300|
|  David| Finance|  7200|
+-------+--------+------+

