In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as FS
from pyspark.sql import Window as WN
from pyspark.sql import types as TP



In [2]:
# Start (or reuse) a Spark session named "Grade" on the local[2] master.
spark = (
    SparkSession.builder
    .appName("Grade")
    .master("local[2]")
    .getOrCreate()
)

In [3]:
# Sample rows of (student_id, course_id, grade).
# The values are kept as integers rather than strings so that every later
# sort/rank compares them numerically; with strings the comparison would be
# lexicographic (e.g. '100' < '99' and '10' < '2').
data = [
    (2, 2, 95),
    (2, 3, 95),
    (1, 1, 90),
    (1, 2, 99),
    (3, 1, 80),
    (3, 2, 75),
    (3, 3, 82),
]

# Column names, in the same order as the tuple fields above.
schema = ["student_id", "course_id", "grade"]

In [15]:
# Build the Spark DataFrame from the in-memory rows and their column names.
dataframe = spark.createDataFrame(data=data, schema=schema)

In [16]:
# Preview the raw input rows (display defaults spelled out explicitly).
dataframe.show(n=20, truncate=True)

+----------+---------+-----+
|student_id|course_id|grade|
+----------+---------+-----+
|         2|        2|   95|
|         2|        3|   95|
|         1|        1|   90|
|         1|        2|   99|
|         3|        1|   80|
|         3|        2|   75|
|         3|        3|   82|
+----------+---------+-----+



**Write a SQL query to find the highest grade with its corresponding course for
each student. In case of a tie, return the course with the smallest course_id.
The output must be sorted by student_id.**

#### Output
```
+----------+---------+-----+
|student_id|course_id|grade|
+----------+---------+-----+
|         1|        2|   99|
|         2|        2|   95|
|         3|        3|   82|
+----------+---------+-----+

```


In [18]:
# Rank every student's courses: highest grade first, ties broken by the
# smallest course_id. Both ordering keys are cast to int so the comparison is
# numeric even when the columns arrive as strings (a string sort would place
# '100' below '99' and '10' below '2').
best_grade_first = WN.partitionBy("student_id").orderBy(
    FS.col("grade").cast("int").desc(),
    FS.col("course_id").cast("int").asc(),
)

# RNK == 1 marks the row to keep for each student.
final_data = dataframe.select(
    "student_id",
    "course_id",
    "grade",
    FS.dense_rank().over(best_grade_first).alias("RNK"),
)

In [19]:
# Inspect the ranked rows, including the helper RNK column.
final_data.show(n=20, truncate=True)

+----------+---------+-----+---+
|student_id|course_id|grade|RNK|
+----------+---------+-----+---+
|         1|        2|   99|  1|
|         1|        1|   90|  2|
|         2|        2|   95|  1|
|         2|        3|   95|  2|
|         3|        3|   82|  1|
|         3|        1|   80|  2|
|         3|        2|   75|  3|
+----------+---------+-----+---+



In [20]:
# Keep each student's top-ranked row, drop the helper RNK column, and sort by
# student_id as the task requires. The filter runs before the projection so it
# does not reference a column that has already been dropped, and the explicit
# orderBy is needed because a window function does not guarantee output order.
# student_id is cast to int so the sort is numeric even if it is a string.
(
    final_data
    .filter(FS.col("RNK") == 1)
    .select("student_id", "course_id", "grade")
    .orderBy(FS.col("student_id").cast("int").asc())
    .show()
)

+----------+---------+-----+
|student_id|course_id|grade|
+----------+---------+-----+
|         1|        2|   99|
|         2|        2|   95|
|         3|        3|   82|
+----------+---------+-----+



## With SQL

In [22]:
# Expose the DataFrame to Spark SQL under the view name "table"
# (the SQL cells below query it by that name).
dataframe.createOrReplaceTempView(name="table")

In [24]:
# Sanity check: the temp view returns the same rows as the DataFrame.
spark.sql(
    "select * from table"
).show()

+----------+---------+-----+
|student_id|course_id|grade|
+----------+---------+-----+
|         2|        2|   95|
|         2|        3|   95|
|         1|        1|   90|
|         1|        2|   99|
|         3|        1|   80|
|         3|        2|   75|
|         3|        3|   82|
+----------+---------+-----+



In [32]:
# Same logic as the DataFrame version, expressed in Spark SQL:
#   1. rank each student's courses by grade (highest first), breaking ties
#      with the smallest course_id;
#   2. keep the rank-1 row per student;
#   3. sort the result by student_id.
# Every ordering key is CAST to INT so the comparison is numeric even when the
# columns are strings (a string sort would place '100' below '99').
sql_statement = """
WITH GRADE_DATA AS
(
SELECT student_id, course_id, grade,
       DENSE_RANK() OVER (
           PARTITION BY student_id
           ORDER BY CAST(grade AS INT) DESC, CAST(course_id AS INT) ASC
       ) AS RNK
FROM table
)
SELECT student_id, course_id, grade FROM GRADE_DATA
WHERE RNK = 1 ORDER BY CAST(student_id AS INT)
"""
spark.sql(sql_statement).show()

+----------+---------+-----+
|student_id|course_id|grade|
+----------+---------+-----+
|         1|        2|   99|
|         2|        2|   95|
|         3|        3|   82|
+----------+---------+-----+

