In [1]:
import random

from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

In [2]:
# Reuse the active SparkSession if one exists; otherwise start one named "chap3".
spark = SparkSession.builder.appName("chap3").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/23 12:04:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Spark Architecture

In [3]:
# Seed Python's `random` module so the generated sample rows are reproducible.
random.seed(0)

In [6]:
# Build 20 sample rows: a sequential id, x drawn from [0, 100), y drawn from [-50, 50].
# The RNG calls run in (x, y) order per row, so the values depend on that order.
data = [(i, random.randrange(100), random.randint(-50, 50))
        for i in range(20)]
df1 = spark.createDataFrame(data, schema=["id", "x", "y"])
# Peek at the first three rows.
df1.head(3)

                                                                                

[Row(id=0, x=49, y=47), Row(id=1, x=53, y=-45), Row(id=2, x=33, y=15)]

Number of partitions in the DataFrame's underlying resilient distributed dataset (RDD):

In [7]:
# `.rdd` exposes the DataFrame's underlying RDD, which reports its partition count.
df1.rdd.getNumPartitions()

8

In [8]:
def print_partitions(df):
    """Print the rows stored in each partition of ``df``.

    For every partition, prints a ``Partition #k`` header (numbered from 1),
    then the partition's rows separated by spaces on one line, then a blank
    line. An empty partition prints its header followed by two empty lines.

    Args:
        df: An object exposing ``.rdd.glom().collect()`` (e.g. a PySpark
            DataFrame), where ``collect()`` returns one list of rows per
            partition.

    Note:
        ``collect()`` gathers every row into the calling process, so this
        helper is only suitable for small, illustrative datasets.
    """
    # glom() groups the rows of each partition into a list; collect() returns
    # those per-partition lists in partition order.
    rows_by_partition = df.rdd.glom().collect()
    for number, rows in enumerate(rows_by_partition, start=1):
        print(f"Partition #{number}")
        print(*rows)
        print()

In [9]:
# Show which rows of the original DataFrame sit in each partition.
print_partitions(df1)

Partition #1
Row(id=0, x=49, y=47) Row(id=1, x=53, y=-45)

Partition #2
Row(id=2, x=33, y=15) Row(id=3, x=62, y=1)

Partition #3
Row(id=4, x=38, y=11) Row(id=5, x=45, y=24)

Partition #4
Row(id=6, x=27, y=14) Row(id=7, x=17, y=-14) Row(id=8, x=17, y=46) Row(id=9, x=12, y=29)

Partition #5
Row(id=10, x=32, y=18) Row(id=11, x=90, y=27)

Partition #6
Row(id=12, x=18, y=-11) Row(id=13, x=12, y=43)

Partition #7
Row(id=14, x=9, y=37) Row(id=15, x=42, y=10)

Partition #8
Row(id=16, x=71, y=-38) Row(id=17, x=45, y=5) Row(id=18, x=40, y=28) Row(id=19, x=81, y=-24)



Repartition without a full shuffle using `coalesce` (it merges existing partitions, so it is used to reduce partitioning, i.e., to decrease the number of partitions; it cannot increase it):

In [10]:
# NOTE: despite the `rdd` prefix, this is still a DataFrame (coalesce is called on df1).
rdd1a = df1.coalesce(5)
print_partitions(rdd1a)

Partition #1
Row(id=0, x=49, y=47) Row(id=1, x=53, y=-45)

Partition #2
Row(id=2, x=33, y=15) Row(id=3, x=62, y=1) Row(id=4, x=38, y=11) Row(id=5, x=45, y=24)

Partition #3
Row(id=6, x=27, y=14) Row(id=7, x=17, y=-14) Row(id=8, x=17, y=46) Row(id=9, x=12, y=29)

Partition #4
Row(id=10, x=32, y=18) Row(id=11, x=90, y=27) Row(id=12, x=18, y=-11) Row(id=13, x=12, y=43)

Partition #5
Row(id=14, x=9, y=37) Row(id=15, x=42, y=10) Row(id=16, x=71, y=-38) Row(id=17, x=45, y=5) Row(id=18, x=40, y=28) Row(id=19, x=81, y=-24)



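Repartition with a full shuffle using `repartition` (unlike `coalesce`, it can either increase or decrease the number of partitions):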

In [11]:
# NOTE: despite the `rdd` prefix, this is still a DataFrame (repartition is called on df1).
rdd1b = df1.repartition(5)
print_partitions(rdd1b)

Partition #1
Row(id=0, x=49, y=47) Row(id=2, x=33, y=15) Row(id=5, x=45, y=24) Row(id=7, x=17, y=-14) Row(id=11, x=90, y=27)

Partition #2
Row(id=8, x=17, y=46) Row(id=12, x=18, y=-11) Row(id=14, x=9, y=37) Row(id=19, x=81, y=-24)

Partition #3
Row(id=13, x=12, y=43) Row(id=15, x=42, y=10) Row(id=16, x=71, y=-38)

Partition #4
Row(id=6, x=27, y=14) Row(id=18, x=40, y=28)

Partition #5
Row(id=1, x=53, y=-45) Row(id=3, x=62, y=1) Row(id=4, x=38, y=11) Row(id=9, x=12, y=29) Row(id=10, x=32, y=18) Row(id=17, x=45, y=5)



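Repartition by columns (rows are hash-partitioned on `x` and `y`, so rows with equal values in both columns land in the same partition):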

In [12]:
# Repartition into 5 partitions keyed on the values of columns x and y, then show the layout.
print_partitions(df1.repartition(5, "x", "y"))

Partition #1
Row(id=0, x=49, y=47) Row(id=13, x=12, y=43) Row(id=14, x=9, y=37) Row(id=16, x=71, y=-38)

Partition #2
Row(id=3, x=62, y=1)

Partition #3
Row(id=4, x=38, y=11) Row(id=11, x=90, y=27) Row(id=15, x=42, y=10)

Partition #4
Row(id=2, x=33, y=15) Row(id=6, x=27, y=14) Row(id=7, x=17, y=-14) Row(id=8, x=17, y=46) Row(id=10, x=32, y=18) Row(id=12, x=18, y=-11) Row(id=17, x=45, y=5) Row(id=19, x=81, y=-24)

Partition #5
Row(id=1, x=53, y=-45) Row(id=5, x=45, y=24) Row(id=9, x=12, y=29) Row(id=18, x=40, y=28)

