In [1]:
from pyspark.sql import SparkSession

# 1. Create a SparkSession (the Driver connection)
spark = SparkSession.builder.appName("LazyEvaluationExample").getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# always stopped, even if one of the steps below raises. A session left running
# would be picked up by a later getOrCreate() call instead of a fresh one.
try:
    # 2. Create a DataFrame
    data = [("Alice", 34), ("Bob", 45), ("Charlie", 28), ("David", 52)]
    columns = ["name", "age"]
    df = spark.createDataFrame(data, columns)

    # 3. Apply Transformations (these are lazy and build the plan)
    print("Applying transformations...")
    df_filtered = df.filter(df.age > 30)
    df_final = df_filtered.select("name")

    # At this point, no actual data processing has happened.
    # Spark has just built an efficient plan (DAG) for the work.
    print("Plan created. No job has been executed yet.")

    # 4. Call an Action (this triggers the execution)
    print("\nCalling an action to trigger the job...")
    df_final.show()

    # Now, the Driver sends the plan to the executors,
    # which read the data, filter it, select the column, and return the result.
    print("Job finished.")
finally:
    # Stop the SparkSession (runs on success and on failure)
    spark.stop()

Applying transformations...
Plan created. No job has been executed yet.

Calling an action to trigger the job...
+-----+
| name|
+-----+
|Alice|
|  Bob|
|David|
+-----+

Job finished.


Code with RDD

In [2]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("RDDExample").getOrCreate()

# All work that needs the session lives inside try/finally so spark.stop()
# runs even when a step fails; otherwise the session would stay alive in the
# kernel and be reused by the next getOrCreate() call.
try:
    # The SparkContext (sc) is the entry point for RDDs
    sc = spark.sparkContext

    # --- Lesson 2.1: Creating and Manipulating RDDs ---
    print("--- Transformations ---")
    # Create an RDD from a Python list
    numbers_rdd = sc.parallelize([1, 2, 3, 4, 5, 6])

    # Use map() to square each number
    squared_rdd = numbers_rdd.map(lambda x: x * x)
    # Printing an RDD only shows its description, not its contents:
    # no action has been called on it yet, so nothing is computed here.
    print("Squared numbers (lazy):", squared_rdd)

    # Use filter() to get only the even numbers.
    # No action is called on even_rdd in this cell, so this filter is only
    # declared, never executed.
    even_rdd = numbers_rdd.filter(lambda x: x % 2 == 0)

    # Create an RDD of sentences
    sentences_rdd = sc.parallelize(["hello world", "spark is cool"])

    # Use flatMap() to split sentences into words
    words_rdd = sentences_rdd.flatMap(lambda sentence: sentence.split(" "))


    # --- Lesson 2.2: Aggregations and Actions ---
    print("\n--- Aggregations and Actions ---")
    # Create a key-value pair RDD for word counting
    word_pairs_rdd = words_rdd.map(lambda word: (word, 1))

    # Use reduceByKey() to count the occurrences of each word
    word_counts_rdd = word_pairs_rdd.reduceByKey(lambda a, b: a + b)

    # Call an ACTION to trigger all the above transformations and see the result
    print("\nWord Counts:")
    # collect() brings every element back to the driver: fine for this tiny
    # example, DANGEROUS on large data.
    word_counts_result = word_counts_rdd.collect()
    for word, count in word_counts_result:
        print(f"{word}: {count}")

    print("\nFirst 3 squared numbers:")
    # Use take() to safely get a few results
    first_three_squared = squared_rdd.take(3)
    print(first_three_squared)
finally:
    # Stop the SparkSession (runs on success and on failure)
    spark.stop()

--- Transformations ---
Squared numbers (lazy): PythonRDD[1] at RDD at PythonRDD.scala:53

--- Aggregations and Actions ---

Word Counts:
hello: 1
world: 1
spark: 1
is: 1
cool: 1

First 3 squared numbers:
[1, 4, 9]


🔹 Reduce

Runs a function to combine multiple values into one.

For each key group in reduceByKey, Spark repeatedly applies your function to pairs of values until only one remains.

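This is different from a plain "apply" (such as map), which transforms each value independently. A reduce takes two values, combines them into one, and repeats until a single value is left for that key. Because Spark may combine values in any order, the function should be associative and commutative (for example, addition).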