In [1]:
import os

from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for this notebook.
spark = SparkSession.builder.appName("MidMarksAnalysis").getOrCreate()

# Location of the marks CSV. The default is the original relative path; set the
# MIDMARKS_CSV environment variable to point at the file when the notebook is
# launched from a directory that does not contain it (the recorded run failed
# with PATH_NOT_FOUND for exactly that reason).
DATA_PATH = os.environ.get("MIDMARKS_CSV", "MIDMARKS.csv")

# Load CSV: the first row is the header and column types are inferred.
df = spark.read.csv(DATA_PATH, header=True, inferSchema=True)

# Show sample
df.show(5)


AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/C:/Users/adity/Downloads/MIDMARKS.csv. SQLSTATE: 42K03

In [None]:
from pyspark.sql.functions import expr

# Subject columns to coerce to integers. try_cast is applied per column, so a
# value that cannot be read as an int becomes NULL instead of raising.
subjects = ["DV","M-II","PP","BEEE","FL","FIMS"]
for subj in subjects:
    # Backticks keep the hyphenated name from being parsed as an expression.
    cast_sql = f"try_cast(`{subj}` as int)"
    df = df.withColumn(subj, expr(cast_sql))

# Describe only the columns whose reported Spark type is int or double.
numeric_cols = [name for name, dtype in df.dtypes if dtype in ('int', 'double')]
numeric_summary = df.select(numeric_cols).describe()
numeric_summary.show()


In [None]:
# Import Spark's aggregate under an alias so Python's builtin sum() is not
# shadowed for the rest of the kernel session.
from pyspark.sql.functions import col, sum as spark_sum

# One output column per input column, holding that column's NULL count:
# isNull() gives a boolean, cast to int makes it 0/1, and the sum totals it.
null_counts = df.select(
    [spark_sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]
)
null_counts.show()



In [None]:
# Show the describe() statistics for every column without truncating values.
basic_stats = df.describe()
basic_stats.show(truncate=False)

# Show the summary() statistics the same way.
extended_stats = df.summary()
extended_stats.show(truncate=False)

In [None]:
# Cleaning dataset

In [None]:
# Row count per section, before any section values are filled in.
section_counts = df.groupBy("Section").count()
section_counts.show()

In [None]:
# Rows with a missing section are assigned to "ZETA".
# NOTE(review): the choice of "ZETA" is not explained anywhere in view — confirm.
df = df.fillna({"Section": "ZETA"})

In [None]:
# Row count per section again, now that missing sections have been filled.
section_counts = df.groupBy("Section").count()
section_counts.show()

In [None]:
from pyspark.sql.functions import when, col

# Normalise misspelled section labels; any other value is kept unchanged.
# Both mappings go from typo to the correct spelling ("SGMA" -> "SIGMA",
# "GAMA" -> "GAMMA").
# NOTE(review): assumes "GAMMA" is the canonical label — confirm against the data.
section = col("SECTION")
df = df.withColumn(
    "SECTION",
    when(section == "SGMA", "SIGMA")
    .when(section == "GAMA", "GAMMA")
    .otherwise(section),
)

In [None]:
# Replace the non-numeric DV markers "A" and "o" with 0; other values pass
# through untouched.
# NOTE(review): an earlier cell already try_casts DV to int, which would turn
# these markers into NULL before this cell runs — confirm the intended order.
dv = col("DV")
is_placeholder = (dv == "A") | (dv == "o")
df = df.withColumn("DV", when(is_placeholder, 0).otherwise(dv))

In [None]:
# Missing DV marks are replaced with "0".
df = df.fillna({"DV": "0"})

In [None]:
# Row count per section after the label clean-up.
section_counts = df.groupBy("Section").count()
section_counts.show()

In [None]:
# Same per-section tally as the previous cell (kept as a re-check).
per_section = df.groupBy("Section").count()
per_section.show()

In [None]:
# Report the frame's size: df.count() for rows, len(df.columns) for columns.
row_total = df.count()
column_total = len(df.columns)
print("Rows:", row_total)
print("Columns:", column_total)

In [None]:
# Sections ranked by row count, largest first.
ranked_sections = df.groupBy("SECTION").count().sort("count", ascending=False)
ranked_sections.show()

In [None]:
from pyspark.sql.functions import expr

# Subject-mark columns. The maths column is spelled "M-II", matching the other
# cast cells in this notebook; "MII" would not resolve to a column if those
# cells' name is the real header.
subjects = ["DV", "M-II", "PP", "BEEE", "FL", "FIMS"]

# Try casting each subject column to int, invalid values become NULL.
# Backticks are needed so the hyphen in `M-II` is read as part of the column
# name rather than as a subtraction.
for subject in subjects:
    df = df.withColumn(subject, expr(f"try_cast(`{subject}` as int)"))


In [None]:
# Preview the first 10 rows of the subject columns after the cast.
subject_marks = df.select(*subjects)
subject_marks.show(n=10)


In [None]:
# Missing PP marks are replaced with "0".
df = df.fillna({"PP": "0"})

In [None]:
# Replace missing marks with 0 in every subject column using a single fill.
# The maths column is keyed as "M-II", the spelling used by the cast cells
# elsewhere in this notebook.
df = df.na.fill({
    "BEEE": 0,
    "PP": 0,
    "FL": 0,
    "FIMS": 0,
    "DV": 0,
    "M-II": 0,
})

In [None]:
# Preview the subject columns again, now that missing marks have been filled.
filled_marks = df.select(*subjects)
filled_marks.show(n=10)

In [None]:
from pyspark.sql.functions import coalesce, lit

# For each subject column keep the existing value and fall back to 0 where it
# is NULL.
zero = lit(0)
for subj in subjects:
    current = df[subj]
    df = df.withColumn(subj, coalesce(current, zero))

In [None]:
from pyspark.sql.functions import expr

# Add SNO_int: SNO cast to int, with try_cast applied to the SNO column.
# NOTE(review): the column name "SNO" is not confirmed by any other cell in
# view — verify it against the CSV header.
sno_as_int = expr("try_cast(SNO as int)")
df = df.withColumn("SNO_int", sno_as_int)

In [None]:
# Compare the original SNO values with their integer cast, first 10 rows.
sno_preview = df.select("SNO", "SNO_int")
sno_preview.show(n=10)


In [None]:
# Display the frame's dtypes as the cell output.
column_types = df.dtypes
column_types

In [None]:
# Done with data cleaning

In [None]:
from pyspark.sql.functions import expr

# Re-apply the integer cast to every subject column (same statements as the
# cast cell near the top of the notebook).
subjects = ["DV","M-II","PP","BEEE","FL","FIMS"]
for subj in subjects:
    int_cast = expr(f"try_cast(`{subj}` as int)")
    df = df.withColumn(subj, int_cast)

# Keep the columns typed int or double and print their describe() statistics.
numeric_cols = [
    col_name
    for col_name, col_type in df.dtypes
    if col_type in ('int', 'double')
]
df.select(numeric_cols).describe().show()
