In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the Spark session; per the original note, this also
# creates the SparkContext internally.
spark = (
    SparkSession.builder
    .appName("MySparkApp")
    .getOrCreate()
)

# Handle to the session's SparkContext, used below for the RDD API.
sc = spark.sparkContext


In [3]:
# Bare last expression: lets the notebook display the SparkContext obtained above.
sc

In [4]:
# Read students.csv (relative path) as an RDD with one element per line of text.
data = sc.textFile("students.csv")

In [5]:
# The first line of the file is treated as the column header.
header = data.first()

# Drop the header line and any blank / whitespace-only lines. A blank line
# would otherwise survive the header check, be split into [''] and make the
# int() conversions applied to the fields in a later cell raise ValueError.
rows = data.filter(lambda line: line.strip() != "" and line != header)

In [6]:
# Turn each text record into its list of string fields using a plain comma
# split (no handling of quoted fields).
split_rdd = rows.map(
    lambda record: record.split(",")
)

In [7]:
# Number of parsed rows to preview. The heading is built from the same value
# so the label can no longer disagree with how many rows are actually shown
# (previously the heading said 10 while only 3 rows were taken).
preview_count = 3

print(f"=== Student Dataset (first {preview_count} rows) ===")
for row in split_rdd.take(preview_count):
    print(row)


=== Student Dataset (first 10 rows) ===
['1', 'Alice', '20', 'F', '66', '92', '44']
['2', 'Bob', '20', 'M', '82', '52', '77']
['3', 'Charlie', '22', 'F', '43', '57', '76']


In [8]:

# Convert the string fields into typed values: positions 0, 2, 4, 5 and 6 are
# parsed as ints, positions 1 and 3 are kept as strings.
students_rdd = split_rdd.map(
    lambda fields: (
        int(fields[0]),
        fields[1],
        int(fields[2]),
        fields[3],
        int(fields[4]),
        int(fields[5]),
        int(fields[6]),
    )
)


In [9]:
# Pair each student's name (index 1) with the mean of the three integer
# values stored at indices 4, 5 and 6.
avg_marks_rdd = students_rdd.map(
    lambda student: (student[1], sum(student[4:7]) / 3)
)

In [10]:
# Keep only the (name, average) pairs whose average is at least 75.
passed_rdd = avg_marks_rdd.filter(
    lambda pair: 75 <= pair[1]
)

In [11]:
# Order the passing (name, average) pairs by average, highest first.
sorted_passed_rdd = passed_rdd.sortBy(
    lambda pair: pair[1],
    ascending=False,
)

In [12]:
# Materialise the sorted (name, average) pairs as a Python list for printing.
results = sorted_passed_rdd.collect()

In [13]:
# The heading states the cut-off that was actually used to build the passing
# set (average >= 75); it previously claimed 95, which contradicted both the
# filter and the rows printed below it.
print("=== Students with Average >= 75 ===")
for name, average in results:
    print(f"Name: {name}, Avg Marks: {average:.2f}")


=== Students with Average >= 95 ===
Name: Leo, Avg Marks: 88.00
Name: Olivia, Avg Marks: 88.00
Name: Rita, Avg Marks: 86.67
Name: Kathy, Avg Marks: 81.67
Name: George, Avg Marks: 81.67
Name: Frank, Avg Marks: 80.67
Name: Oscar, Avg Marks: 80.00
Name: Uma, Avg Marks: 78.33
Name: Kyle, Avg Marks: 78.33
Name: Matt, Avg Marks: 78.33
Name: Tina, Avg Marks: 76.00
Name: Victor, Avg Marks: 75.67
Name: Grace, Avg Marks: 75.33
Name: Mona, Avg Marks: 75.00
Name: Will, Avg Marks: 75.00


In [14]:
# Count the (name, average) pairs that satisfied the >= 75 average filter.
count_passed = passed_rdd.count() 
print("\nNumber of students who passed:", count_passed)


Number of students who passed: 15


In [15]:
# Pick the (name, average) pair with the highest average.
# RDD.reduce expects a commutative and associative function. The previous
# comparison (`a if a[1] > b[1] else b`) returned whichever tied element came
# second, so with equal averages the winner depended on row/partition order.
# Comparing (average, name) gives a total order: ties on average are broken by
# name, making the result independent of how the data is partitioned.
# Note: reduce() raises ValueError if passed_rdd is empty.
topper = passed_rdd.reduce(
    lambda a, b: a if (a[1], a[0]) >= (b[1], b[0]) else b
)
print("Topper:", topper)

Topper: ('Olivia', 88.0)


In [16]:

print("\nFirst 3 Passed Students (via take):")
# take(3) returns up to three elements of the unsorted passing set as a list.
first_three_passed = passed_rdd.take(3)
print(first_three_passed)



First 3 Passed Students (via take):
[('Frank', 80.66666666666667), ('Grace', 75.33333333333333), ('Kathy', 81.66666666666667)]
