In [1]:
from pyspark.sql import SparkSession

# Step 1: Obtain the Spark session for this example (an already-running
# session is reused if one exists)
spark = (
    SparkSession.builder
    .appName("SamplingExample")
    .getOrCreate()
)

# Step 2: Load students.csv into a DataFrame; the first line supplies the
# column names and the column types are inferred from the data
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("students.csv")
)

# === Sampling Demonstration ===

# 1. Preview the first five records
print("=== First 5 rows of dataset ===")
df.show(n=5)

# 2. Display column names and their inferred types
print("=== Schema of dataset ===")
df.printSchema()

# 3. Sample without replacement: each row is kept with probability 0.3, so
#    the result holds roughly (not exactly) 30% of the rows
print("=== Sample (30% without replacement) ===")
sample_without_replacement = df.sample(
    withReplacement=False,
    fraction=0.3,
    seed=42,
)
sample_without_replacement.show(n=10)

# 4. Sample with replacement: a row may appear more than once; 0.2 is the
#    expected number of times each row is drawn
print("=== Sample (20% with replacement) ===")
sample_with_replacement = df.sample(
    withReplacement=True,
    fraction=0.2,
    seed=42,
)
sample_with_replacement.show(n=10)

# 5. Fixed-size sample: pull 5 rows (no row drawn twice) back to the driver
#    as a Python list via the underlying RDD
print("=== takeSample: 5 rows (without replacement) ===")
sampled_rows = df.rdd.takeSample(
    withReplacement=False,
    num=5,
    seed=42,
)
for row in sampled_rows:
    print(row)

# 6. Fixed-size sample of 5 rows where the same row may be drawn repeatedly
print("=== takeSample: 5 rows (with replacement) ===")
sampled_rows_wr = df.rdd.takeSample(
    withReplacement=True,
    num=5,
    seed=42,
)
for row in sampled_rows_wr:
    print(row)

# 7. Report the full dataset size as a baseline for the sample sizes above
total_rows = df.count()
print("Total rows in dataset:", total_rows)

# Session shutdown is left disabled here (presumably so the session stays
# available for further interactive use); uncomment to release it
# spark.stop()


=== First 5 rows of dataset ===
+---+-------+---+------+----+-------+-------+
| id|   name|age|gender|math|science|english|
+---+-------+---+------+----+-------+-------+
|  1|  Alice| 20|     F|  66|     92|     44|
|  2|    Bob| 20|     M|  82|     52|     77|
|  3|Charlie| 22|     F|  43|     57|     76|
|  4|  David| 19|     M|  95|     69|     46|
|  5|    Eva| 19|     F|  62|     44|     96|
+---+-------+---+------+----+-------+-------+
only showing top 5 rows
=== Schema of dataset ===
root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- math: integer (nullable = true)
 |-- science: integer (nullable = true)
 |-- english: integer (nullable = true)

=== Sample (30% without replacement) ===
+---+------+---+------+----+-------+-------+
| id|  name|age|gender|math|science|english|
+---+------+---+------+----+-------+-------+
|  4| David| 19|     M|  95|     69|     46|
|  8| Henry| 21