In [0]:
%run "../includes/configurations"

In [0]:
# Load the two processed parquet datasets used below.
# NOTE(review): `processed_folder_path` is presumably defined by the
# configurations notebook pulled in via %run — confirm.
candidates_df = spark.read.format("parquet").load(f"{processed_folder_path}/candidates")
candidates_marks_df = spark.read.format("parquet").load(f"{processed_folder_path}/candidates_marks")

#### Joining the dataframes

In [0]:
# Combine the candidates and candidates_marks datasets on `app_no`.
# Joining on the column *name* (instead of an equality expression) makes Spark
# emit a single `app_no` column, so it can be selected by name without having
# to disambiguate between the two inputs.
# Inner join: only app_no values present in both datasets are kept.
joined_df = (
    candidates_df
    .join(candidates_marks_df, on="app_no", how="inner")
    .select("app_no", "name", "dob", "highschool_gpa", "phy_marks", "chem_marks", "bio_marks")
)

In [0]:
# Preview the joined rows.
display(joined_df)

app_no,name,dob,highschool_gpa,phy_marks,chem_marks,bio_marks
1088,AREEBA MAHROOH GUL,1997-02-09,4.1,76,83,74
1025,HAFSA QAZI,1999-01-31,4.1,56,63,54
1016,ROHA FARHAN LODHI,1998-03-30,3.0,87,94,85
1259,TABINDA ZAIDI,1998-06-03,3.2,54,70,52
1296,Ayesha Hassan,1999-08-26,2.4,88,95,86
1489,FARHANA ISRAR,1996-12-08,2.4,54,61,52
1480,WARDHA,1996-03-29,2.9,33,40,31
1294,MUHAMMAD ASHAR NAEEM,1998-02-20,4.1,88,95,86
1418,PREETI AdVANI,1998-08-08,3.1,78,85,76
1290,SANAM SHAFI,1997-11-04,2.4,54,61,52


In [0]:
from pyspark.sql.functions import col

In [0]:
# Inspect the column names available on the joined DataFrame.
joined_df.columns

In [0]:
# total_marks is the row-wise sum of the three subject marks.
total_marks_expr = col("phy_marks") + col("chem_marks") + col("bio_marks")
sum_df = joined_df.withColumn("total_marks", total_marks_expr)

In [0]:
# Preview the rows together with the derived total_marks column.
display(sum_df)

app_no,name,dob,highschool_gpa,phy_marks,chem_marks,bio_marks,total_marks
1088,AREEBA MAHROOH GUL,1997-02-09,4.1,76,83,74,233
1025,HAFSA QAZI,1999-01-31,4.1,56,63,54,173
1016,ROHA FARHAN LODHI,1998-03-30,3.0,87,94,85,266
1259,TABINDA ZAIDI,1998-06-03,3.2,54,70,52,176
1296,Ayesha Hassan,1999-08-26,2.4,88,95,86,269
1489,FARHANA ISRAR,1996-12-08,2.4,54,61,52,167
1480,WARDHA,1996-03-29,2.9,33,40,31,104
1294,MUHAMMAD ASHAR NAEEM,1998-02-20,4.1,88,95,86,269
1418,PREETI AdVANI,1998-08-08,3.1,78,85,76,239
1290,SANAM SHAFI,1997-11-04,2.4,54,61,52,167


#### Arranging the candidates in descending order of total_marks (highest marks first)

In [0]:
# Highest total_marks first; the relative order of rows with equal totals
# is not pinned down by this sort.
display(sum_df.sort(col("total_marks").desc()))

app_no,name,dob,highschool_gpa,phy_marks,chem_marks,bio_marks,total_marks
1072,PIRHA MASOOD,1997-01-17,3.0,98,105,96,299
1298,Ammar khan soomro,2000-03-16,2.9,98,105,96,299
1296,Ayesha Hassan,1999-08-26,2.4,88,95,86,269
1294,MUHAMMAD ASHAR NAEEM,1998-02-20,4.1,88,95,86,269
1125,JAVIRIYA,1997-11-03,2.9,87,94,85,266
1491,Urooj,1997-08-18,3.0,87,94,85,266
1016,ROHA FARHAN LODHI,1998-03-30,3.0,87,94,85,266
1409,SAFFA MARIAM,1995-09-07,3.0,87,94,85,266
1467,IQRA ASLAM,1997-08-15,2.9,87,94,85,266
1366,AYISHA,1998-04-03,3.3,87,94,85,266


#### Window Functions

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc, rank

In [0]:
# Rank all candidates in one global window (no partitionBy): highest
# total_marks first, with highschool_gpa (also descending) as tie-breaker.
# rank() gives rows that tie on both keys the same rank and skips the
# positions they occupy (e.g. 6, 6, 6, 9).
# NOTE(review): an unpartitioned window is typically evaluated on a single
# partition — fine for small data, confirm for larger inputs.
candidate_rank_spec = Window.orderBy(
    col("total_marks").desc(),
    col("highschool_gpa").desc(),
)

final_df = sum_df.withColumn(
    "rank",
    rank().over(candidate_rank_spec),
)

In [0]:
# Show the candidates together with their computed rank.
display(final_df)

app_no,name,dob,highschool_gpa,phy_marks,chem_marks,bio_marks,total_marks,rank
1072,PIRHA MASOOD,1997-01-17,3.0,98,105,96,299,1
1298,Ammar khan soomro,2000-03-16,2.9,98,105,96,299,2
1294,MUHAMMAD ASHAR NAEEM,1998-02-20,4.1,88,95,86,269,3
1296,Ayesha Hassan,1999-08-26,2.4,88,95,86,269,4
1366,AYISHA,1998-04-03,3.3,87,94,85,266,5
1016,ROHA FARHAN LODHI,1998-03-30,3.0,87,94,85,266,6
1491,Urooj,1997-08-18,3.0,87,94,85,266,6
1409,SAFFA MARIAM,1995-09-07,3.0,87,94,85,266,6
1125,JAVIRIYA,1997-11-03,2.9,87,94,85,266,9
1467,IQRA ASLAM,1997-08-15,2.9,87,94,85,266,9
