In [1]:
# Echo the `sc` object as this cell's output.
# NOTE(review): `sc` is not defined in any visible cell -- presumably the SparkContext
# that the PySpark kernel/shell pre-creates; confirm it exists on a fresh kernel.
sc

In [3]:
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import col, avg, max, min, round, count
 # Students analytics: load students.csv and run ten small analytical operations,
 # printing a labelled result for each one.
 #
 # NOTE: `round`, `max` and `min` below are the pyspark.sql.functions versions pulled in
 # by the import line above; they shadow the Python builtins of the same name for the
 # rest of the kernel session, so do not call them on plain Python numbers.

 # Step 1: Initialize Spark Session (getOrCreate reuses a session that already exists).
 spark = SparkSession.builder.appName("StudentsAnalytics").getOrCreate()

 # Step 2: Read CSV file into DataFrame.
 # The same frame feeds every action below (show/count/describe/...); without cache()
 # each of those actions would read and parse the CSV file again.
 df = spark.read.csv("students.csv", header=True, inferSchema=True).cache()

 # === Analytical Operations (10 max) ===

 # 1. View first 5 rows
 print("=== First 5 rows ===")
 df.show(5)

 # 2. Print schema (column types come from inferSchema=True)
 print("=== Schema ===")
 df.printSchema()

 # 3. Count total rows
 print("Total rows:", df.count())

 # 4. Show summary statistics
 print("=== Summary Statistics ===")
 df.describe().show()

 # 5. Select students with math >= 80 (at most 10 rows are displayed)
 print("=== Students with math >= 80 ===")
 df.filter(col("math") >= 80).select("id", "name", "math").show(10)

 # 6. Calculate average marks per subject, rounded to 2 decimals
 print("=== Average marks per subject ===")
 df.select(
     round(avg("math"), 2).alias("avg_math"),
     round(avg("science"), 2).alias("avg_science"),
     round(avg("english"), 2).alias("avg_english"),
 ).show()

 # 7. Add new column: per-student average of the three subjects, rounded to 2 decimals
 df_with_avg = df.withColumn(
     "average",
     round((col("math") + col("science") + col("english")) / 3, 2),
 )
 print("=== Dataset with 'average' column ===")
 df_with_avg.show(5)

 # 8. Find topper(s): every student whose average equals the maximum average.
 # Sorting by average and taking limit(1) would return one arbitrary row whenever
 # several students tie for first place, so the maximum is computed first and all
 # matching rows are shown, ordered by id so the output is stable between runs.
 print("=== Topper ===")
 # A global aggregate always yields exactly one row; its value is None when there is
 # no non-null average, in which case the filter below simply matches nothing.
 top_average = df_with_avg.agg(max("average")).first()[0]
 df_with_avg.filter(col("average") == top_average).orderBy("id").show()

 # 9. Group by gender -> average marks per subject and overall
 print("=== Average marks by gender ===")
 df_with_avg.groupBy("gender").agg(
     round(avg("math"), 2).alias("avg_math"),
     round(avg("science"), 2).alias("avg_science"),
     round(avg("english"), 2).alias("avg_english"),
     round(avg("average"), 2).alias("overall_avg"),
 ).show()

 # 10. Find min and max of each subject
 print("=== Min & Max of each subject ===")
 df.select(
     min("math").alias("min_math"), max("math").alias("max_math"),
     min("science").alias("min_science"), max("science").alias("max_science"),
     min("english").alias("min_english"), max("english").alias("max_english"),
 ).show()

=== First 5 rows ===
+---+-------+---+------+----+-------+-------+
| id|   name|age|gender|math|science|english|
+---+-------+---+------+----+-------+-------+
|  1|  Alice| 20|     F|  66|     92|     44|
|  2|    Bob| 20|     M|  82|     52|     77|
|  3|Charlie| 22|     F|  43|     57|     76|
|  4|  David| 19|     M|  95|     69|     46|
|  5|    Eva| 19|     F|  62|     44|     96|
+---+-------+---+------+----+-------+-------+
only showing top 5 rows
=== Schema ===
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- math: integer (nullable = true)
 |-- science: integer (nullable = true)
 |-- english: integer (nullable = true)

Total rows: 50
=== Summary Statistics ===
+-------+------------------+-----+------------------+------+------------------+------------------+-----------------+
|summary|                id| name|               age|gender|              math|           science| 