In [1]:
from pyspark.sql import SparkSession

# --- Session setup ---
# getOrCreate() hands back the active session when one exists, so re-running
# this cell does not start a second session.
session_builder = SparkSession.builder.appName("BasicDataFrameOps")
spark = session_builder.getOrCreate()

# --- Load the data ---
# header=True: the first line of the file supplies the column names.
# inferSchema=True: column types are derived from the data in the file.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("students.csv")

# === Basic Operations ===

# 1. Preview the first five rows of the full table.
print("=== First 5 rows ===")
df.show(n=5)

# 2. Dump the column names and types as a tree.
print("=== Schema ===")
df.printSchema()

# 3. Projection: keep only the name and math columns.
print("=== Select name and math columns ===")
name_and_math = df.select(["name", "math"])
name_and_math.show(n=5)

# 4. Row filter: students whose math mark is at least 80.
print("=== Students with math >= 80 ===")
high_math = df.where(df["math"] >= 80)
high_math.show(n=5)

# 5. Ordering: highest science mark first.
print("=== Sorted by science (desc) ===")
by_science = df.sort(df["science"].desc())
by_science.show(n=5)

# 6. Row count over the whole dataset (not just the five rows shown above).
row_total = df.count()
print("Total rows in dataset:", row_total)

# 7. Column names, in DataFrame order.
column_names = df.columns
print("Columns:", column_names)

# The session is left running after this cell; uncomment the call below to
# shut it down once no further work needs `spark` or `df`.
# spark.stop()


=== First 5 rows ===
+---+-------+---+------+----+-------+-------+
| id|   name|age|gender|math|science|english|
+---+-------+---+------+----+-------+-------+
|  1|  Alice| 20|     F|  66|     92|     44|
|  2|    Bob| 20|     M|  82|     52|     77|
|  3|Charlie| 22|     F|  43|     57|     76|
|  4|  David| 19|     M|  95|     69|     46|
|  5|    Eva| 19|     F|  62|     44|     96|
+---+-------+---+------+----+-------+-------+
only showing top 5 rows
=== Schema ===
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- math: integer (nullable = true)
 |-- science: integer (nullable = true)
 |-- english: integer (nullable = true)

=== Select name and math columns ===
+-------+----+
|   name|math|
+-------+----+
|  Alice|  66|
|    Bob|  82|
|Charlie|  43|
|  David|  95|
|    Eva|  62|
+-------+----+
only showing top 5 rows
=== Students with math >= 80 ===
+---+------+---+------+----+-