# Window Functions in PySpark - Part 3

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
import pyspark.sql.functions as F

StatementMeta(, 358cf085-313e-449a-978e-c3d973eb18ba, 3, Finished, Available, Finished)

## Creating a Spark Session

In [2]:
spark = SparkSession.builder.appName('WindowFunctionsPart3').getOrCreate()

StatementMeta(, 358cf085-313e-449a-978e-c3d973eb18ba, 4, Finished, Available, Finished)

## Creating a Sample DataFrame

In [3]:
data = [
    ('Alice', 'Math', 90, 1), ('Alice', 'Science', 85, 1), ('Alice', 'History', 78, 1),
    ('Bob', 'Math', 80, 1), ('Bob', 'Science', 81, 1), ('Bob', 'History', 77, 1),
    ('Charlie', 'Math', 75, 1), ('Charlie', 'Science', 82, 1), ('Charlie', 'History', 79, 1),
    ('Alice', 'Physics', 86, 2), ('Alice', 'Chemistry', 92, 2), ('Alice', 'Biology', 80, 2),
    ('Bob', 'Physics', 94, 2), ('Bob', 'Chemistry', 91, 2), ('Bob', 'Biology', 96, 2),
    ('Charlie', 'Physics', 89, 2), ('Charlie', 'Chemistry', 88, 2), ('Charlie', 'Biology', 85, 2),
    ('Alice', 'Computer Science', 95, 3), ('Alice', 'Electronics', 91, 3), ('Alice', 'Geography', 97, 3),
    ('Bob', 'Computer Science', 88, 3), ('Bob', 'Electronics', 66, 3), ('Bob', 'Geography', 92, 3),
    ('Charlie', 'Computer Science', 92, 3), ('Charlie', 'Electronics', 97, 3), ('Charlie', 'Geography', 99, 3)
]

columns = ['First Name', 'Subject', 'Marks', 'Semester']
df = spark.createDataFrame(data, columns)
df.show()

StatementMeta(, 358cf085-313e-449a-978e-c3d973eb18ba, 5, Finished, Available, Finished)

+----------+----------------+-----+--------+
|First Name|         Subject|Marks|Semester|
+----------+----------------+-----+--------+
|     Alice|            Math|   90|       1|
|     Alice|         Science|   85|       1|
|     Alice|         History|   78|       1|
|       Bob|            Math|   80|       1|
|       Bob|         Science|   81|       1|
|       Bob|         History|   77|       1|
|   Charlie|            Math|   75|       1|
|   Charlie|         Science|   82|       1|
|   Charlie|         History|   79|       1|
|     Alice|         Physics|   86|       2|
|     Alice|       Chemistry|   92|       2|
|     Alice|         Biology|   80|       2|
|       Bob|         Physics|   94|       2|
|       Bob|       Chemistry|   91|       2|
|       Bob|         Biology|   96|       2|
|   Charlie|         Physics|   89|       2|
|   Charlie|       Chemistry|   88|       2|
|   Charlie|         Biology|   85|       2|
|     Alice|Computer Science|   95|       3|
|     Alic

## 1. Finding the Student Who Scored Maximum Marks in Each Semester

In [4]:
window_spec_max_marks = Window.partitionBy('Semester').orderBy(F.desc('Marks'))
max_marks_df = df.withColumn('Rank', F.rank().over(window_spec_max_marks))
top_scorer = max_marks_df.filter(max_marks_df['Rank'] == 1)
top_scorer.show()

StatementMeta(, 358cf085-313e-449a-978e-c3d973eb18ba, 6, Finished, Available, Finished)

+----------+---------+-----+--------+----+
|First Name|  Subject|Marks|Semester|Rank|
+----------+---------+-----+--------+----+
|     Alice|     Math|   90|       1|   1|
|       Bob|  Biology|   96|       2|   1|
|   Charlie|Geography|   99|       3|   1|
+----------+---------+-----+--------+----+



## 2. Calculating the Percentage of Each Student Considering All Subjects

In [5]:
window_spec_total_marks = Window.partitionBy('First Name', 'Semester')
df = df.withColumn('TotalMarks', F.sum('Marks').over(window_spec_total_marks))
df = df.withColumn('Percentage', (F.col('TotalMarks') / (3 * 100)).cast('decimal(5, 2)')*100)
df2 = df.groupBy('First Name', 'Semester').agg(F.max('TotalMarks').alias('TotalMarks'),
                                             F.max('Percentage').alias('Percentage'))
df2.show()

StatementMeta(, 358cf085-313e-449a-978e-c3d973eb18ba, 7, Finished, Available, Finished)

+----------+--------+----------+----------+
|First Name|Semester|TotalMarks|Percentage|
+----------+--------+----------+----------+
|     Alice|       1|       253|     84.00|
|     Alice|       2|       258|     86.00|
|     Alice|       3|       283|     94.00|
|       Bob|       1|       238|     79.00|
|       Bob|       2|       281|     94.00|
|       Bob|       3|       246|     82.00|
|   Charlie|       1|       236|     79.00|
|   Charlie|       2|       262|     87.00|
|   Charlie|       3|       288|     96.00|
+----------+--------+----------+----------+



## 3. Finding the Top Rank Holder in Each Semester

In [6]:
window_spec_rank = Window.partitionBy('Semester').orderBy(F.desc('Percentage'))
rank_df = df.withColumn('Rank', F.rank().over(window_spec_rank))
top_rank_holder = rank_df.filter(rank_df['Rank'] == 1).select('First Name','Semester', 'Rank', 'Percentage').distinct()
top_rank_holder.show()

StatementMeta(, 358cf085-313e-449a-978e-c3d973eb18ba, 8, Finished, Available, Finished)

+----------+--------+----+----------+
|First Name|Semester|Rank|Percentage|
+----------+--------+----+----------+
|     Alice|       1|   1|     84.00|
|       Bob|       2|   1|     94.00|
|   Charlie|       3|   1|     96.00|
+----------+--------+----+----------+



## 4. Finding the Student Who Scored Maximum Marks in Each Subject in Each Semester

In [7]:
window_spec_max_subject_marks = Window.partitionBy('Semester', 'Subject').orderBy(F.desc('Marks'))
max_subject_marks_df = df.withColumn('Rank', F.rank().over(window_spec_max_subject_marks))
max_subject_scorer = max_subject_marks_df.filter(max_subject_marks_df['Rank'] == 1)
max_subject_scorer.show()

StatementMeta(, 358cf085-313e-449a-978e-c3d973eb18ba, 9, Finished, Available, Finished)

+----------+----------------+-----+--------+----------+----------+----+
|First Name|         Subject|Marks|Semester|TotalMarks|Percentage|Rank|
+----------+----------------+-----+--------+----------+----------+----+
|   Charlie|         History|   79|       1|       236|     79.00|   1|
|     Alice|            Math|   90|       1|       253|     84.00|   1|
|     Alice|         Science|   85|       1|       253|     84.00|   1|
|       Bob|         Biology|   96|       2|       281|     94.00|   1|
|     Alice|       Chemistry|   92|       2|       258|     86.00|   1|
|       Bob|         Physics|   94|       2|       281|     94.00|   1|
|     Alice|Computer Science|   95|       3|       283|     94.00|   1|
|   Charlie|     Electronics|   97|       3|       288|     96.00|   1|
|   Charlie|       Geography|   99|       3|       288|     96.00|   1|
+----------+----------------+-----+--------+----------+----------+----+

