<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark/examples/10-misc_performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Miscellaneous performance tricks
- cache() & persist()
- broadcast join
- repartition & coalesce
- explain

# Setting up PySpark

In [None]:
# Install the pyspark package into the environment this notebook kernel runs in.
%pip install pyspark

In [None]:
from pyspark.sql import SparkSession

# Create the session used by the cells below, or reuse one that already exists.
# The 'local' master runs Spark locally instead of connecting to a cluster.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Spark Course')
    .getOrCreate()
)

# Preparing data

In [None]:
from pyspark import SparkFiles
from pyspark.sql.types import *

# Source CSV files (raw GitHub URLs)
squirrel_url = "https://raw.githubusercontent.com/lucprosa/dataeng-basic-course/main/data/squirrel-data/squirrel-data.csv"
park_url = "https://raw.githubusercontent.com/lucprosa/dataeng-basic-course/main/data/squirrel-data/park-data.csv"


# Schemas
# Every squirrel column is declared as a nullable string.
squirrel_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        'Area Name',
        'Area ID',
        'Park Name',
        'Park ID',
        'Squirrel ID',
        'Primary Fur Color',
        'Highlights in Fur Color',
        'Color Notes',
        'Location',
        'Above Ground (Height in Feet)',
        'Specific Location',
        'Activities',
        'Interactions with Humans',
        'Squirrel Latitude (DD.DDDDDD)',
        'Squirrel Longitude (-DD.DDDDDD)',
    )
])

# Park columns are nullable as well; only the two counters are integers.
park_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ('Area Name', StringType),
        ('Area ID', StringType),
        ('Park Name', StringType),
        ('Park ID', StringType),
        ('Date', StringType),
        ('Start Time', StringType),
        ('End Time', StringType),
        ('Total Time (in minutes, if available)', StringType),
        ('Park Conditions', StringType),
        ('Other Animal Sightings', StringType),
        ('Litter', StringType),
        ('Activities', StringType),
        ('Temperature & Weather', StringType),
        ('Number of Squirrels', IntegerType),
        ('Squirrel Sighter(s)', StringType),
        ('Number of Sighters', IntegerType),
    )
])

# Small in-memory lookup table describing each area.
area_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ('Area ID', 'Area Name', 'Area Description', 'City Name')
])

area_data = [
    ("A", "UPPER MANHATTAN", "Uptown Manhattan", "New York"),
    ("B", "CENTRAL MANHATTAN", "Midtown Manhattan", "New York"),
    ("C", "LOWER MANHATTAN", "Downtown Manhattan", "New York"),
    ("D", "BROOKLYN", "Brooklyn", "New York"),
]

# Register both remote files with Spark so SparkFiles.get() can resolve them by file name.
for source_url in (squirrel_url, park_url):
    spark.sparkContext.addFile(source_url)

# Creating dataframes: the CSVs are read with a header row and the explicit schemas above.
squirrel = spark.read.schema(squirrel_schema).csv(SparkFiles.get("squirrel-data.csv"), header=True)
park = spark.read.schema(park_schema).csv(SparkFiles.get("park-data.csv"), header=True)
area = spark.createDataFrame(area_data, area_schema)

In [None]:
# Preview the first rows of each DataFrame
for _preview in (squirrel, park, area):
    _preview.show()

# Caching & Persist

In [None]:
# Caching
# Default storage level of cache(): MEMORY_AND_DISK

import uuid
from pyspark.sql.functions import udf


def generate_uuid():
    """Return a new random UUID (version 4) as a string."""
    return str(uuid.uuid4())


# Every call returns a different value, so the UDF is flagged as
# non-deterministic. Spark treats UDFs as deterministic by default and may
# otherwise eliminate or repeat invocations while optimizing the plan.
generate_uuid = udf(generate_uuid).asNondeterministic()

# transformation 1
squirrel = squirrel.withColumn("hash_id", generate_uuid())

# transformation 2
# NOTE: hash_id is effectively unique per row, so dropDuplicates() is not
# expected to remove any row here; it serves as a second step in the DAG.
squirrel = squirrel.dropDuplicates()

# squirrel.cache().count() <--------------- force an action to run the cache

# transformations N
# squirrel = squirrel.join...
# squirrel = squirrel.groupBy...

# DAG
# T1 -> T2 -> T3...TN -> A1

# action 1
# squirrel.write.format("parquet").save("path")


In [None]:
# cache() only marks the DataFrame for caching; count() is the action that forces the cache to be filled.
squirrel.cache().count()

In [None]:
# Flag telling whether cache()/persist() has been called on this DataFrame
squirrel.is_cached

In [None]:
# Display the first rows; if the cache was filled by an earlier cell, Spark can read from it instead of recomputing.
squirrel.show()

In [None]:
# Mark the DataFrame as no longer cached and release the blocks stored for it
squirrel.unpersist()

- Logical plan: represents the user's code or query; it is independent of the underlying data sources and execution strategies.

- Physical Plan: The physical plan represents the actual execution steps that Spark will perform to execute the job on the cluster.

In [None]:
# Persist
# persist() lets you pick the storage level explicitly; here: MEMORY_ONLY.
# (Without an argument, DataFrame.persist() uses a memory-and-disk level, not MEMORY_ONLY.)
from pyspark.sql.functions import *
from pyspark import StorageLevel

# first execution plan
# explain() prints the plan itself and returns None, so it is not wrapped in
# print() (that would only add a stray "None" line to the output)
area.explain("cost")
print("----------------")

area = area.withColumn("City shortname", lit("NY"))
# second execution plan
area.explain("cost")
print("----------------")

area = area.persist(StorageLevel.MEMORY_ONLY)
area.count()  # action that materializes the persisted data

# third execution plan: built on top of the persisted DataFrame
area2 = area.withColumn("Teste", lit("test"))
area2.explain("cost")
print("----------------")

print(area.storageLevel)
print(area.is_cached)

In [None]:
# Persist
# Here the storage level is set explicitly to DISK_ONLY.
# (Without an argument, DataFrame.persist() uses a memory-and-disk level.)

from pyspark.sql.functions import *
from pyspark import StorageLevel

# NOTE(review): if `area` is still persisted from an earlier cell, that cache
# entry stays registered after `area` is rebound below; consider calling
# area.unpersist() first if it should be released.

# first execution plan
# explain() prints the plan itself and returns None, so it is not wrapped in
# print() (that would only add a stray "None" line to the output)
area.explain("cost")

area = area.withColumn("City shortname", lit("NY"))
# second execution plan
area.explain("cost")

area = area.persist(StorageLevel.DISK_ONLY)
area.count()  # action that materializes the persisted data

# third execution plan: built on top of the persisted DataFrame
area2 = area.withColumn("Teste", lit("test"))
area2.explain("cost")

print(area.storageLevel)
print(area.is_cached)

# Broadcast Join

In [None]:
# Broadcast join
# Candidates for broadcasting are the small tables: a copy of the small side is
# sent to every executor, so the large side does not need to be shuffled.
from pyspark.sql.functions import broadcast

# `area` holds only a handful of rows, so it is explicitly marked for broadcast
# instead of relying on Spark's automatic size-based decision.
join_df = (squirrel
           .join(park, on="Park ID", how="inner")
           .join(broadcast(area), on="Area ID", how="inner")
           .select(area["Area Description"], park["Park Name"], park["Date"], squirrel["Squirrel ID"])
           )

# Inspect the physical plan to see which join strategy was chosen for each join
join_df.explain()
join_df.show()

# Repartition & Coalesce

- coalesce is for reducing partitions without shuffling
- repartition is for distributing data evenly across the cluster for better parallelism

- if possible choose coalesce over repartition
- if you need more partitions to increase parallelism, use repartition, but keep the cost of the data shuffle in mind



In [None]:
# Two handles on the same DataFrame, one for each experiment in the cells below
squirrel_1 = squirrel_2 = squirrel

# RDD -> partitions among the workers
# Check how many partitions currently back the DataFrame
squirrel_1.rdd.getNumPartitions()

In [None]:
# repartition
# - evenly distributes data across partitions for better parallel processing efficiency
# - can increase AND reduce the number of partitions
# - performs a shuffle

partitions_before = squirrel_1.rdd.getNumPartitions()
print(f"before repartition: {partitions_before}")

squirrel_1 = squirrel_1.repartition(numPartitions=4)

partitions_after = squirrel_1.rdd.getNumPartitions()
print(f"after repartition: {partitions_after}")

In [None]:
# coalesce
# - reduces the number of partitions without shuffling
# - minimizes data movement across the cluster
# - can only reduce the partition count, never increase it

partitions_before = squirrel_2.rdd.getNumPartitions()
print(f"before coalesce: {partitions_before}")

squirrel_2 = squirrel_2.coalesce(numPartitions=5)

partitions_after = squirrel_2.rdd.getNumPartitions()
print(f"after coalesce: {partitions_after}")


In [None]:
# Same check on squirrel_1, this time asking coalesce for 2 partitions
partitions_before = squirrel_1.rdd.getNumPartitions()
print(f"before coalesce: {partitions_before}")

squirrel_1 = squirrel_1.coalesce(numPartitions=2)

partitions_after = squirrel_1.rdd.getNumPartitions()
print(f"after coalesce: {partitions_after}")

In [None]:
# repartition/coalesce and writing data
!rm -rf /content/files/area
!mkdir -p /content/files/area

# repartition "area" dataframe and write as parquet
area.repartition(3).write.format("parquet").mode("overwrite").save("/content/files/area")

In [None]:
# check files and their content
import glob
import os

folder = "/content/files/area/"

# Keep only entries whose name ends in ".parquet"; anything else in the folder
# is skipped. An exact suffix match replaces the shell `grep ".parquet"`, whose
# "." matches any character and whose pattern is not anchored to the end.
files = sorted(
    os.path.basename(path)
    for path in glob.glob(os.path.join(folder, "*.parquet"))
)

# Read every part file on its own and report how many rows it holds
for f in files:
    df = spark.read.parquet(os.path.join(folder, f))
    print(f"{f} - {df.count()} rows")

In [None]:
# Check file sizes
!ls /content/files/area

# Question

In [None]:
# Q1
# read data from /content/files/area (3 parquet files)
# write the data back into the same folder, making sure the output is a single file