In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Create Spark Session (this will also create SparkContext internally)
spark = SparkSession.builder \
    .appName("MySparkApp") \
    .getOrCreate()

sc = spark.sparkContext   # get SparkContext


In [7]:
sc

In [8]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import col, avg, round, max

In [9]:
spark = SparkSession.builder.appName("StudentsDataFrameExample").getOrCreate()

In [13]:
df = spark.read.csv("C:\\Users\\SAI MANIKANTA\\OneDrive\\Desktop\\students.csv", header=True, inferSchema=True)


In [15]:
print("=== First 3 rows ===")
df.show(3)

=== First 3 rows ===
+---+-------+---+------+----+-------+-------+
| id|   name|age|gender|math|science|english|
+---+-------+---+------+----+-------+-------+
|  1|  Alice| 20|     F|  66|     92|     44|
|  2|    Bob| 20|     M|  82|     52|     77|
|  3|Charlie| 22|     F|  43|     57|     76|
+---+-------+---+------+----+-------+-------+
only showing top 3 rows


In [16]:
print("=== Schema ===")
df.printSchema()

=== Schema ===
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- math: integer (nullable = true)
 |-- science: integer (nullable = true)
 |-- english: integer (nullable = true)



In [17]:
print("=== Datatypes ===") 
print(df.dtypes)

=== Datatypes ===
[('id', 'int'), ('name', 'string'), ('age', 'int'), ('gender', 'string'), ('math', 'int'), ('science', 'int'), ('english', 'int')]


In [18]:
print("=== Summary statistics ===")
df.describe().show()

=== Summary statistics ===
+-------+------------------+-----+------------------+------+------------------+------------------+-----------------+
|summary|                id| name|               age|gender|              math|           science|          english|
+-------+------------------+-----+------------------+------+------------------+------------------+-----------------+
|  count|                50|   50|                50|    50|                50|                50|               50|
|   mean|              25.5| NULL|              21.5|  NULL|             68.94|             70.16|            69.36|
| stddev|14.577379737113251| NULL|2.2337851101588404|  NULL|17.609610085034216|14.636214521186957|18.74507826560544|
|    min|                 1|Aaron|                18|     F|                40|                44|               42|
|    max|                50| Zoey|                25|     M|               100|                99|              100|
+-------+------------------+-----+---

In [19]:
print("Total rows:", df.count())
print("Columns:", df.columns)

Total rows: 50
Columns: ['id', 'name', 'age', 'gender', 'math', 'science', 'english']


In [20]:
print("\n=== Select name, age, and math columns ===")
df.select("name", "age", "math").show(3)


=== Select name, age, and math columns ===
+-------+---+----+
|   name|age|math|
+-------+---+----+
|  Alice| 20|  66|
|    Bob| 20|  82|
|Charlie| 22|  43|
+-------+---+----+
only showing top 3 rows


In [21]:
# Step 4: Filter students (age >= 21 and math >= 70) 
print("\n=== Students with age >= 21 and math >= 70 ===")
df.filter((col("age") >= 21) & (col("math") >= 70)).show(3)



=== Students with age >= 21 and math >= 70 ===
+---+-----+---+------+----+-------+-------+
| id| name|age|gender|math|science|english|
+---+-----+---+------+----+-------+-------+
|  6|Frank| 22|     F|  70|     78|     94|
| 11|Kathy| 25|     M|  85|     71|     89|
| 12|  Leo| 24|     M|  97|     84|     83|
+---+-----+---+------+----+-------+-------+
only showing top 3 rows


In [24]:
# Step 5: Add a new column: average marks
df_with_avg = df.withColumn("average", round((col("math") + col("science") + col("english")) / 3, 2))
print("\n=== Dataset with new column 'average' ===") 
df_with_avg.show(3)



=== Dataset with new column 'average' ===
+---+-------+---+------+----+-------+-------+-------+
| id|   name|age|gender|math|science|english|average|
+---+-------+---+------+----+-------+-------+-------+
|  1|  Alice| 20|     F|  66|     92|     44|  67.33|
|  2|    Bob| 20|     M|  82|     52|     77|  70.33|
|  3|Charlie| 22|     F|  43|     57|     76|  58.67|
+---+-------+---+------+----+-------+-------+-------+
only showing top 3 rows


In [26]:
print("\n=== Students with average >= 75 (sorted) ===")
df_with_avg.filter(col("average") >= 75).orderBy(col("average").desc()).show(3)


=== Students with average >= 75 (sorted) ===
+---+------+---+------+----+-------+-------+-------+
| id|  name|age|gender|math|science|english|average|
+---+------+---+------+----+-------+-------+-------+
| 15|Olivia| 18|     M|  87|     90|     87|   88.0|
| 12|   Leo| 24|     M|  97|     84|     83|   88.0|
| 44|  Rita| 24|     M|  90|     82|     88|  86.67|
+---+------+---+------+----+-------+-------+-------+
only showing top 3 rows


In [27]:
print("\n=== Average marks by gender ===")
df_with_avg.groupBy("gender").agg(     round(avg("math"), 2).alias("avg_math"),     round(avg("science"), 2).alias("avg_science"), round(avg("english"), 2).alias("avg_english"),round(avg("average"), 2).alias("overall_avg") ).show()


=== Average marks by gender ===
+------+--------+-----------+-----------+-----------+
|gender|avg_math|avg_science|avg_english|overall_avg|
+------+--------+-----------+-----------+-----------+
|     F|   63.86|      68.55|      70.55|      67.66|
|     M|   75.95|      72.38|      67.71|      72.02|
+------+--------+-----------+-----------+-----------+

