In [0]:
%run "../includes/configurations"

In [0]:
%run "../includes/common_functions"

In [0]:
from pyspark.sql.types import IntegerType, DoubleType, StructField, StructType

In [0]:
# Explicit schema for candidates-marks.csv; field names match the CSV header.
# The three marks columns are declared nullable because the source data has
# gaps (e.g. a blank "Chem Marks" value) that are imputed further down with
# fill_with_mean. NOTE(review): Spark is documented to treat columns read from
# files as nullable regardless of this flag, so this only corrects the stated
# contract - confirm on the cluster's Spark version.
candidates_marks_schema = StructType(fields=[
    StructField("Application No", IntegerType(), False),
    StructField("HighSchool GPA", DoubleType(), True),
    StructField("Physics Marks", IntegerType(), True),
    StructField("Chem Marks", IntegerType(), True),
    StructField("Biology Marks", IntegerType(), True),
])

In [0]:
# Load the raw marks CSV with the explicit schema, switch the header names to
# snake_case, and keep a single row per application number.
candidates_marks_df = (
    spark.read
    .csv(
        f"{raw_folder_path}/candidates-marks.csv",
        schema=candidates_marks_schema,
        header=True,
    )
    .withColumnRenamed("Application No", "app_no")
    .withColumnRenamed("HighSchool GPA", "highschool_gpa")
    .withColumnRenamed("Physics Marks", "phy_marks")
    .withColumnRenamed("Chem Marks", "chem_marks")
    .withColumnRenamed("Biology Marks", "bio_marks")
    .dropDuplicates(["app_no"])
)

In [0]:
# Print the column names, types and nullability of the loaded frame.
candidates_marks_df.printSchema()

In [0]:
# Render the de-duplicated marks frame for inspection before any missing marks are filled.
display(candidates_marks_df)

app_no,highschool_gpa,phy_marks,chem_marks,bio_marks
1088,4.1,76.0,83.0,74.0
1025,4.1,56.0,63.0,54.0
1016,3.0,87.0,94.0,85.0
1259,3.2,54.0,,52.0
1296,2.4,88.0,95.0,86.0
1489,2.4,54.0,61.0,52.0
1480,2.9,33.0,40.0,31.0
1294,4.1,88.0,95.0,86.0
1418,3.1,78.0,85.0,76.0
1290,2.4,54.0,61.0,52.0


In [0]:
from pyspark.sql.functions import avg

In [0]:
def fill_with_mean(df, exclude=()):
    """Replace nulls in each non-excluded column with that column's mean.

    Args:
        df: DataFrame whose nulls should be filled.
        exclude: Iterable of column names to leave untouched (any iterable
            of strings works: list, tuple, set).

    Returns:
        A DataFrame with nulls filled, or ``df`` itself when there is
        nothing to fill (every column excluded, or no column has a mean).
    """
    # Build the lookup set once; the default is an immutable empty tuple so no
    # mutable object is shared between calls.
    excluded = set(exclude)
    targets = [c for c in df.columns if c not in excluded]
    if not targets:
        # agg() cannot be called with zero expressions.
        return df

    means = df.agg(*(avg(c).alias(c) for c in targets)).first().asDict()

    # A mean of None (no non-null values to average) cannot be used as a
    # replacement value, so such columns are left as they are.
    fill_values = {name: value for name, value in means.items() if value is not None}
    if not fill_values:
        return df

    # NOTE(review): the displayed result shows an integer fill for chem_marks,
    # so the mean looks to be coerced to the column's type - confirm that
    # truncation (rather than rounding) is acceptable.
    return df.na.fill(fill_values)

In [0]:
# Fill missing subject marks with the column mean; the application number and
# the high-school GPA are left untouched.
avg_marks_df = fill_with_mean(
    candidates_marks_df,
    exclude=["app_no", "highschool_gpa"],
)

In [0]:
# Render the frame returned by fill_with_mean to check the filled values.
display(avg_marks_df)

app_no,highschool_gpa,phy_marks,chem_marks,bio_marks
1088,4.1,76,83,74
1025,4.1,56,63,54
1016,3.0,87,94,85
1259,3.2,54,70,52
1296,2.4,88,95,86
1489,2.4,54,61,52
1480,2.9,33,40,31
1294,4.1,88,95,86
1418,3.1,78,85,76
1290,2.4,54,61,52


In [0]:
# Persist the filled marks as Parquet in the processed folder, replacing any
# output left by a previous run.
avg_marks_df.write.parquet(
    f"{processed_folder_path}/candidates_marks",
    mode="overwrite",
)