In [1]:
import pandas as pd

from faker import Faker
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    broadcast, spark_partition_id, rand, udf, struct
)
from pyspark.sql.types import (
    StructType, StructField,
    IntegerType, StringType, ArrayType, MapType
)

In [2]:
# Build the SparkSession used by every cell below, or reuse one that already exists.
spark = (
    SparkSession.builder
        .appName("chap4")
        # Driver memory setting for this session: 2 GB.
        .config("spark.driver.memory", "2g")
        .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/18 17:15:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Batch Processing

Generate some fake data:

In [3]:
fake = Faker()
# Fixed seed so that the fake data generated below is reproducible across runs.
Faker.seed(0)

In [4]:
# 1000 (name, age, job) tuples: names are unique, ages are drawn from 18..25.
data = [
    (fake.unique.name(), fake.random_int(min=18, max=25), fake.job())
    for _ in range(1000)
]

In [5]:
# Only the column names are supplied; Spark infers the column types from the tuples.
df1 = spark.createDataFrame(data, schema=["name", "age", "job"])
df1.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-------------------+---+--------------------+
|               name|age|                 job|
+-------------------+---+--------------------+
|       Norma Fisher| 22|Sales promotion a...|
|Dr. Ronald Faulkner| 23|Television produc...|
|     Colleen Taylor| 20|      Chief of Staff|
|  Danielle Browning| 20|Insurance claims ...|
| Benjamin Jefferson| 23|Public house manager|
|    Heather Stewart| 21|                 Sub|
|         Sean Green| 18|Chief Financial O...|
|   Jennifer Summers| 18|  Veterinary surgeon|
|    Sean Sanchez MD| 19|Engineer, aeronau...|
|       Connie Pratt| 20|Speech and langua...|
|       Bobby Flores| 25|Clinical embryolo...|
|     Eddie Martinez| 23|Sound technician,...|
|       Robert Payne| 22|Producer, televis...|
|     Robert Stewart| 21|Horticultural the...|
|    Roberto Johnson| 22|     Publishing copy|
|   Michael Anderson| 20|     Arboriculturist|
|  Stephanie Leblanc| 24|Scientist, water ...|
|    Robert Atkinson| 24|        TEFL teacher|
| Johnathan D

                                                                                

Partition the data by a column:

In [6]:
# Despite the variable name, this is used as a *table* name: it is passed to
# saveAsTable below and names the table's directory under spark-warehouse/.
db = "fake_data"

In [7]:
# Remove the directory associated with the table (if it already exists), so that
# files left over from a previous run are gone before the table is written again.
# IPython expands "$db" to the value of the Python variable `db`.
%rm -rf spark-warehouse/"$db"

In [8]:
# Persist df1 as a table named by `db`, partitioned on disk by the `age` column.
# mode="overwrite" replaces a table registered by an earlier run of this cell,
# so the cell can be re-executed in the same session without failing with a
# "table already exists" error (the default mode errors if the table exists).
df1.write.saveAsTable(db, mode="overwrite", partitionBy="age")

                                                                                

## Data Skew

Data skew is when data is unevenly distributed across partitions, so a few tasks end up processing far more data than the rest. This slows down performance and needs handling. Most of the time, Spark's Adaptive Query Execution (AQE) is efficient at optimizing the data distribution. However, sometimes we need to fix the data skew problem manually. Here are some ways to do it:
- Configuring the number of partitions to use when shuffling data for joins or aggregations (i.e., the `spark.sql.shuffle.partitions` option). See the [Working With Partitions](http://localhost:8888/notebooks/chap3/Apache%20Spark%20deep%20dive.ipynb#Working-With-Partitions) section in the Chapter 3 notebook for more information.
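- Broadcast join: Send a full copy of the smaller dataset to every node, then join it locally with each node's portion of the larger dataset, so the larger dataset does not need to be shuffled. This is suitable when one side of the join is a small-to-medium-sized DataFrame.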

In [9]:
# Small lookup table: 5 names sampled from `data`, each paired with a unique
# catch phrase. Built in pandas first, then converted to a Spark DataFrame.
pd_df = pd.DataFrame({
    "name": fake.random_sample([name for name, _age, _job in data], length=5),
    "catchPhrase": [fake.unique.catch_phrase() for _ in range(5)],
})
df2 = spark.createDataFrame(pd_df)

In [10]:
# Join on the shared `name` column, marking the small df2 as the side to broadcast.
df3 = df1.join(broadcast(df2), on="name")
df3.show()

+----------------+---+--------------------+--------------------+
|            name|age|                 job|         catchPhrase|
+----------------+---+--------------------+--------------------+
|     Daniel Cruz| 19|Nurse, mental health|Public-key mobile...|
|Reginald Garrett| 24|     Arboriculturist|Visionary systema...|
|  Victoria Reese| 25|Health and safety...|Multi-layered hyb...|
| Brent Willis MD| 23|           Ecologist|Fundamental inter...|
| Dustin Mcdowell| 22|Senior tax profes...|Multi-lateral zer...|
+----------------+---+--------------------+--------------------+



- Salting (idea from cryptography): Add a random or unique identifier (the "salt") to each record as an extra column, so that the data can be repartitioned evenly by that column. This is useful if we are unsure what column we want to repartition by.

In [11]:
# Add an integer `salt` column with values 0-9, derived from a seeded random column.
(
    df1
    .withColumn("salt", (rand(seed=0) * 10).cast("int"))
    .show()
)

+-------------------+---+--------------------+----+
|               name|age|                 job|salt|
+-------------------+---+--------------------+----+
|       Norma Fisher| 22|Sales promotion a...|   7|
|Dr. Ronald Faulkner| 23|Television produc...|   5|
|     Colleen Taylor| 20|      Chief of Staff|   0|
|  Danielle Browning| 20|Insurance claims ...|   3|
| Benjamin Jefferson| 23|Public house manager|   7|
|    Heather Stewart| 21|                 Sub|   2|
|         Sean Green| 18|Chief Financial O...|   2|
|   Jennifer Summers| 18|  Veterinary surgeon|   5|
|    Sean Sanchez MD| 19|Engineer, aeronau...|   7|
|       Connie Pratt| 20|Speech and langua...|   0|
|       Bobby Flores| 25|Clinical embryolo...|   2|
|     Eddie Martinez| 23|Sound technician,...|   6|
|       Robert Payne| 22|Producer, televis...|   4|
|     Robert Stewart| 21|Horticultural the...|   5|
|    Roberto Johnson| 22|     Publishing copy|   3|
|   Michael Anderson| 20|     Arboriculturist|   2|
|  Stephanie

# Spark Schemas

To define a `StructField`, set three components: name, data type, and nullability (`True` by default).

In [12]:
# Nested schema: a required integer `id`, plus a nullable `user` struct that
# holds a required `name` and a nullable `age`.
schema1 = StructType([
    StructField("id", IntegerType(), nullable=False),
    StructField(
        "user",
        StructType([
            StructField("name", StringType(), nullable=False),
            StructField("age", IntegerType(), nullable=True),
        ]),
        nullable=True,
    ),
])

In [13]:
# Display the schema's top-level StructField objects.
schema1.fields

[StructField('id', IntegerType(), False),
 StructField('user', StructType([StructField('name', StringType(), False), StructField('age', IntegerType(), True)]), True)]

In [14]:
# Display only the names of the top-level fields.
schema1.fieldNames()

['id', 'user']

Schemas are objects so we can compare them:

In [15]:
# Same fields, types, nullability and field order as schema1,
# so the equality check below evaluates to True.
schema2 = StructType([
    StructField("id", IntegerType(), nullable=False),
    StructField(
        "user",
        StructType([
            StructField("name", StringType(), nullable=False),
            StructField("age", IntegerType(), nullable=True),
        ]),
        nullable=True,
    ),
])
schema1 == schema2

True

Schema comparison is order-sensitive: two schemas that contain the same fields in a different order are not equal. Make sure that the fields are in the same order in both schemas being compared:

In [16]:
# Same two fields as schema1 but with `user` listed before `id`;
# the differing field order makes the comparison below evaluate to False.
schema3 = StructType([
    StructField(
        "user",
        StructType([
            StructField("name", StringType(), nullable=False),
            StructField("age", IntegerType(), nullable=True),
        ]),
        nullable=True,
    ),
    StructField("id", IntegerType(), nullable=False),
])
schema1 == schema3

False

`ArrayType`: Specify the data type of the elements in the array (and, optionally, whether the array may contain null elements via `containsNull`, `True` by default).

In [17]:
# A nullable `skills` field holding an array of strings (null elements allowed).
array_struct = StructField(
    "skills",
    ArrayType(StringType(), containsNull=True),
    nullable=True,
)

`MapType`: Specify the data types of the keys and of the values in the map (and, optionally, whether the map's values may be null via `valueContainsNull`, `True` by default).

In [18]:
# Map with string keys and string values; values are allowed to be null.
map_type = MapType(
    keyType=StringType(),
    valueType=StringType(),
    valueContainsNull=True,
)

Add new fields to a schema: Provide either a `StructField` object or the name and data type of the new field. The data type can be either a `DataType` object or a string naming the type (e.g., `"float"`). Optionally, specify the nullability of the new field.

In [19]:
# Extend the schema with three fields, showing the three accepted call forms:
#   1. a ready-made StructField object,
#   2. a field name plus a DataType object,
#   3. a field name plus a type string, with nullable set to False.
# NOTE(review): per the PySpark docs, StructType.add appends to the schema it is
# called on and returns that same object, so `schema1` is extended in place here.
# Re-running this cell without first re-running the cell that defines `schema1`
# would append the three fields a second time — confirm before relying on re-execution.
schema1 = (
    schema1.add(array_struct)
        .add("info_string", map_type)
        .add("score", "float", False)
)
schema1.fieldNames()

['id', 'user', 'skills', 'info_string', 'score']

# User-defined Functions (UDFs)

UDFs are slower than Spark's built-in functions because Spark cannot optimize them. Only use UDFs when it's impossible to do something using the normal Spark API and the speed hit is not a concern.

In [20]:
# UDF that returns its integer input plus one.
# The null check lives inside the UDF: a SQL NULL reaches Python as None, and
# `None + 1` would raise a TypeError and fail the job. NULL in -> NULL out.
add_one_udf = udf(lambda x: None if x is None else x + 1, "integer")

In [21]:
# Show each age side by side with the value returned by the UDF.
df3.select("age", add_one_udf("age").alias("age_plus_one")).show()

+---+------------+
|age|age_plus_one|
+---+------------+
| 19|          20|
| 24|          25|
| 25|          26|
| 23|          24|
| 22|          23|
+---+------------+



Define a UDF using a decorator:

In [22]:
@udf("string")
def concat(s):
    """Join the fields of a struct into one string, separated by ": ".

    Args:
        s: The struct value (an iterable of its field values), or None when
            the struct itself is NULL.

    Returns:
        The non-null fields joined with ": ", or None if `s` is None.
        Null fields are skipped, and non-string fields are converted with
        ``str`` so they no longer make ``str.join`` raise a TypeError.
    """
    if s is None:
        return None
    return ": ".join(str(part) for part in s if part is not None)

In [23]:
# Pack `name` and `job` into a single struct column so the UDF receives both
# values as one argument; print the result without truncating long strings.
df3.select(concat(struct("name", "job")).alias("name_and_job")).show(truncate=False)

+------------------------------------------------------+
|name_and_job                                          |
+------------------------------------------------------+
|Daniel Cruz: Nurse, mental health                     |
|Reginald Garrett: Arboriculturist                     |
|Victoria Reese: Health and safety adviser             |
|Brent Willis MD: Ecologist                            |
|Dustin Mcdowell: Senior tax professional/tax inspector|
+------------------------------------------------------+



---

**Further reading**:

- [Deep Dive into Handling Apache Spark Data Skew](https://chengzhizhao.com/deep-dive-into-handling-apache-spark-data-skew/) (Zhao, 2022)