In [1]:
# Bare expression: echoes the value of `sc` as this cell's result.
# NOTE(review): `sc` is never defined in the visible notebook — presumably a
# SparkContext pre-injected by a PySpark-enabled kernel; confirm, because on a
# plain kernel this cell raises NameError before `spark` is even created.
sc

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, max, min, round, count
# Step 1: obtain the SparkSession used by every later cell.
# getOrCreate() hands back an already-running session when there is one,
# so re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName("StudentsAnalytics")
    .getOrCreate()
)

In [3]:
# Load the student records. The first CSV line supplies the column names
# (header) and Spark derives each column's type from the data (inferSchema).
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("students.csv")
)

In [4]:
# Quick look at the data: render the first five rows as a text table.
print("=== First 5 rows ===")
df.show(n=5)

=== First 5 rows ===
+---+-------+---+------+----+-------+-------+
| id|   name|age|gender|math|science|english|
+---+-------+---+------+----+-------+-------+
|  1|  Alice| 20|     F|  66|     92|     44|
|  2|    Bob| 20|     M|  82|     52|     77|
|  3|Charlie| 22|     F|  43|     57|     76|
|  4|  David| 19|     M|  95|     69|     46|
|  5|    Eva| 19|     F|  62|     44|     96|
+---+-------+---+------+----+-------+-------+
only showing top 5 rows


In [5]:
# Print the column names, data types and nullability of `df` as a tree,
# to check what types were assigned when the CSV was read.
print("=== Schema ===")
df.printSchema()

=== Schema ===
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- math: integer (nullable = true)
 |-- science: integer (nullable = true)
 |-- english: integer (nullable = true)



In [6]:
# count() is an action: it runs a Spark job and returns the number of rows.
total_rows = df.count()
print("Total rows:", total_rows)

Total rows: 50


In [7]:
# describe() yields one row each for count, mean, stddev, min and max across
# all columns; string columns have no mean/stddev, so those cells are NULL.
print("=== Summary Statistics ===")
summary_stats = df.describe()
summary_stats.show()

=== Summary Statistics ===
+-------+------------------+-----+------------------+------+------------------+------------------+-----------------+
|summary|                id| name|               age|gender|              math|           science|          english|
+-------+------------------+-----+------------------+------+------------------+------------------+-----------------+
|  count|                50|   50|                50|    50|                50|                50|               50|
|   mean|              25.5| NULL|              21.5|  NULL|             68.94|             70.16|            69.36|
| stddev|14.577379737113251| NULL|2.2337851101588404|  NULL|17.609610085034216|14.636214521186957|18.74507826560544|
|    min|                 1|Aaron|                18|     F|                40|                44|               42|
|    max|                50| Zoey|                25|     M|               100|                99|              100|
+-------+------------------+-----+------------------+------+------------------+------------------+-----------------+

In [8]:
# Keep only students scoring 80 or more in math, project the identifying
# columns plus the score, and display at most ten of the matches.
print("=== Students with math >= 80 ===")
(
    df.where(col("math") >= 80)
    .select("id", "name", "math")
    .show(n=10)
)

=== Students with math >= 80 ===
+---+------+----+
| id|  name|math|
+---+------+----+
|  2|   Bob|  82|
|  4| David|  95|
| 11| Kathy|  85|
| 12|   Leo|  97|
| 15|Olivia|  87|
| 20|  Tina| 100|
| 21|   Uma|  89|
| 22|Victor|  96|
| 25|  Yara| 100|
| 27| Aaron|  81|
+---+------+----+
only showing top 10 rows


In [9]:
# One global aggregate row: the mean of each subject, rounded to 2 decimals.
# `round` and `avg` are the pyspark.sql.functions versions imported at the top
# of the notebook (that import shadows Python's built-in round), so they build
# Spark column expressions rather than operating on Python numbers.
print("=== Average marks per subject ===")
df.select(
    *[
        round(avg(column), 2).alias(label)
        for column, label in (
            ("math", "avg_math"),
            ("science", "avg_science"),
            ("english", "avg_english"),
        )
    ]
).show()

=== Average marks per subject ===
+--------+-----------+-----------+
|avg_math|avg_science|avg_english|
+--------+-----------+-----------+
|   68.94|      70.16|      69.36|
+--------+-----------+-----------+

