# 🔹 Set Up JSON Files (Student Example)
We create two small JSON files, **students.json** and **marks.json**, and load them into Spark DataFrames.


In [2]:
import json
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, sum, max

# Create (or reuse) the Spark session shared by every cell in this notebook.
spark = SparkSession.builder.appName("Student-Example").getOrCreate()

# Students JSON: one record per student; student_id is the key that
# marks_data refers to.
students_data = [
    {"student_id": 1, "name": "Rahul", "age": 20, "department": "CSE", "city": "Delhi"},
    {"student_id": 2, "name": "Priya", "age": 22, "department": "ECE", "city": "Mumbai"},
    {"student_id": 3, "name": "Arjun", "age": 21, "department": "CSE", "city": "Bangalore"},
    {"student_id": 4, "name": "Sneha", "age": 23, "department": "EEE", "city": "Chennai"},
    {"student_id": 5, "name": "Divya", "age": 20, "department": "CSE", "city": "Delhi"}
]
with open("students.json", "w") as f:
    json.dump(students_data, f)

# Marks JSON: one record per exam result. The last record deliberately points
# at a student_id that is absent from students_data.
marks_data = [
    {"exam_id": 101, "student_id": 1, "subject": "Math", "score": 85},
    {"exam_id": 102, "student_id": 2, "subject": "Math", "score": 78},
    {"exam_id": 103, "student_id": 1, "subject": "Physics", "score": 90},
    {"exam_id": 104, "student_id": 3, "subject": "Chemistry", "score": 70},
    {"exam_id": 105, "student_id": 4, "subject": "Math", "score": 88},
    {"exam_id": 106, "student_id": 5, "subject": "Physics", "score": 92},
    {"exam_id": 107, "student_id": 6, "subject": "Math", "score": 60}  # non-existent student
]
with open("marks.json", "w") as f:
    json.dump(marks_data, f)

# Load JSON into DataFrames.
# json.dump writes each file as a single top-level JSON array rather than as
# JSON Lines (one object per line), which is spark.read.json's default input
# format. multiLine=True makes Spark parse each file as a whole JSON document,
# so loading does not depend on the array happening to sit on one line
# (for example, it keeps working if the dump is later pretty-printed).
students_df = spark.read.json("students.json", multiLine=True)
marks_df = spark.read.json("marks.json", multiLine=True)


#  🔹  Basic Operations

In [3]:
# 1. Project just the name and department columns
students_df.select(col("name"), col("department")).show()

# 2. Keep only students whose age is strictly greater than 21
students_df.where(students_df["age"] > 21).show()

# 3. One row per department value
students_df.select("department").dropDuplicates().show()


+-----+----------+
| name|department|
+-----+----------+
|Rahul|       CSE|
|Priya|       ECE|
|Arjun|       CSE|
|Sneha|       EEE|
|Divya|       CSE|
+-----+----------+

+---+-------+----------+-----+----------+
|age|   city|department| name|student_id|
+---+-------+----------+-----+----------+
| 22| Mumbai|       ECE|Priya|         2|
| 23|Chennai|       EEE|Sneha|         4|
+---+-------+----------+-----+----------+

+----------+
|department|
+----------+
|       ECE|
|       CSE|
|       EEE|
+----------+



#  🔹  Aggregations

In [4]:
# NOTE: avg, max and sum below are the pyspark.sql.functions versions imported
# at the top of the notebook, not the Python builtins of the same name.

# 1. Mean age over all students (a global aggregate, so no grouping key)
students_df.select(avg("age")).show()

# 2. Highest score recorded in any exam
marks_df.select(max("score")).show()

# 3. Sum of scores for each student_id, reported as total_score
marks_df.groupBy(col("student_id")).agg(
    sum(col("score")).alias("total_score")
).show()


+--------+
|avg(age)|
+--------+
|    21.2|
+--------+

+----------+
|max(score)|
+----------+
|        92|
+----------+

+----------+-----------+
|student_id|total_score|
+----------+-----------+
|         6|         60|
|         5|         92|
|         1|        175|
|         3|         70|
|         2|         78|
|         4|         88|
+----------+-----------+



#  🔹  Join Example


In [5]:
# 1. Inner join: only students with at least one mark, one row per mark
students_df.join(marks_df, on="student_id", how="inner").show()

# 2. Left join: every student is kept; the mark columns are null for any
#    student that has no marks
students_df.join(marks_df, on="student_id", how="left").show()

# 3. Marks with non-existent students.
#    A left_anti join returns exactly the marks rows whose student_id has no
#    match in students_df. Filtering a left join on name IS NULL would also
#    flag marks of a real student whose name happens to be null, so the
#    anti join is the reliable test. Only the marks columns are shown.
marks_df.join(students_df, on="student_id", how="left_anti").show()


+----------+---+---------+----------+-----+-------+-----+---------+
|student_id|age|     city|department| name|exam_id|score|  subject|
+----------+---+---------+----------+-----+-------+-----+---------+
|         1| 20|    Delhi|       CSE|Rahul|    101|   85|     Math|
|         2| 22|   Mumbai|       ECE|Priya|    102|   78|     Math|
|         1| 20|    Delhi|       CSE|Rahul|    103|   90|  Physics|
|         3| 21|Bangalore|       CSE|Arjun|    104|   70|Chemistry|
|         4| 23|  Chennai|       EEE|Sneha|    105|   88|     Math|
|         5| 20|    Delhi|       CSE|Divya|    106|   92|  Physics|
+----------+---+---------+----------+-----+-------+-----+---------+

+----------+---+---------+----------+-----+-------+-----+---------+
|student_id|age|     city|department| name|exam_id|score|  subject|
+----------+---+---------+----------+-----+-------+-----+---------+
|         1| 20|    Delhi|       CSE|Rahul|    103|   90|  Physics|
|         1| 20|    Delhi|       CSE|Rahul|    

# 🔹  Sorting & Grouping

In [6]:
# 1. Students ordered by age (descending). Several students share an age, so
#    student_id is added as a tie-breaker to make the row order reproducible.
students_df.orderBy(col("age").desc(), col("student_id").asc()).show()

# 2. Top 3 highest scores. exam_id breaks ties so that limit(3) always keeps
#    the same rows when scores are equal.
marks_df.orderBy(col("score").desc(), col("exam_id").asc()).limit(3).show()

# 3. Average score per subject
marks_df.groupBy("subject").agg(avg("score")).show()


+---+---------+----------+-----+----------+
|age|     city|department| name|student_id|
+---+---------+----------+-----+----------+
| 23|  Chennai|       EEE|Sneha|         4|
| 22|   Mumbai|       ECE|Priya|         2|
| 21|Bangalore|       CSE|Arjun|         3|
| 20|    Delhi|       CSE|Rahul|         1|
| 20|    Delhi|       CSE|Divya|         5|
+---+---------+----------+-----+----------+

+-------+-----+----------+-------+
|exam_id|score|student_id|subject|
+-------+-----+----------+-------+
|    106|   92|         5|Physics|
|    103|   90|         1|Physics|
|    105|   88|         4|   Math|
+-------+-----+----------+-------+

+---------+----------+
|  subject|avg(score)|
+---------+----------+
|     Math|     77.75|
|Chemistry|      70.0|
|  Physics|      91.0|
+---------+----------+



#  🔹  SQL Operations


In [7]:
# Register both DataFrames as temporary views so they can be queried by name
# from Spark SQL.
students_df.createOrReplaceTempView("students")
marks_df.createOrReplaceTempView("marks")

# 1. Total score by city
spark.sql("""
SELECT s.city, SUM(m.score) as total_score
FROM students s
JOIN marks m ON s.student_id = m.student_id
GROUP BY s.city
""").show()

# 2. Top 2 students by total score.
#    Grouping includes student_id because names are not guaranteed unique:
#    grouping by name alone would merge the totals of two students who share
#    a name.
spark.sql("""
SELECT s.name, SUM(m.score) as total_score
FROM students s
JOIN marks m ON s.student_id = m.student_id
GROUP BY s.student_id, s.name
ORDER BY total_score DESC
LIMIT 2
""").show()

# 3. Students scoring >150 in total.
#    Same per-student grouping as above; HAVING repeats the aggregate
#    expression instead of relying on the select-list alias.
spark.sql("""
SELECT s.name, SUM(m.score) as total_score
FROM students s
JOIN marks m ON s.student_id = m.student_id
GROUP BY s.student_id, s.name
HAVING SUM(m.score) > 150
""").show()


+---------+-----------+
|     city|total_score|
+---------+-----------+
|Bangalore|         70|
|  Chennai|         88|
|   Mumbai|         78|
|    Delhi|        267|
+---------+-----------+

+-----+-----------+
| name|total_score|
+-----+-----------+
|Rahul|        175|
|Divya|         92|
+-----+-----------+

+-----+-----------+
| name|total_score|
+-----+-----------+
|Rahul|        175|
+-----+-----------+

