<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark/examples/08-caching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href="https://colab.research.google.com/github/lucprosa/dataeng-basic-course/blob/main/spark/examples/08-caching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Performance tricks
- cache() & persist()
- broadcast join
- repartition & coalesce
- explain

# Setting up PySpark

In [None]:
# Install PySpark into the environment of the running kernel (needed on a fresh Colab runtime).
%pip install pyspark



In [47]:
from pyspark.sql import SparkSession
# Create the Spark session used by the rest of the notebook, or reuse the one
# that already exists in this kernel (getOrCreate).
spark = (
    SparkSession.builder
    .master('local')
    .appName('Spark Course')
    .getOrCreate()
)

# Preparing data

In [10]:
from pyspark import SparkFiles
from pyspark.sql.types import *

# Setting up URLs
# Raw GitHub locations of the two CSV files that are downloaded further down
# in this cell via spark.sparkContext.addFile().
squirrel_url = "https://raw.githubusercontent.com/lucprosa/dataeng-basic-course/main/data/squirrel-data/squirrel-data.csv"
park_url = "https://raw.githubusercontent.com/lucprosa/dataeng-basic-course/main/data/squirrel-data/park-data.csv"


# Defining schemas
# Squirrel CSV: every column is declared as a nullable string, in file order.
squirrel_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        'Area Name',
        'Area ID',
        'Park Name',
        'Park ID',
        'Squirrel ID',
        'Primary Fur Color',
        'Highlights in Fur Color',
        'Color Notes',
        'Location',
        'Above Ground (Height in Feet)',
        'Specific Location',
        'Activities',
        'Interactions with Humans',
        'Squirrel Latitude (DD.DDDDDD)',
        'Squirrel Longitude (-DD.DDDDDD)',
    )
])

# Park CSV: all columns are nullable; the two count columns are integers and
# everything else is kept as a string.
park_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ('Area Name', StringType),
        ('Area ID', StringType),
        ('Park Name', StringType),
        ('Park ID', StringType),
        ('Date', StringType),
        ('Start Time', StringType),
        ('End Time', StringType),
        ('Total Time (in minutes, if available)', StringType),
        ('Park Conditions', StringType),
        ('Other Animal Sightings', StringType),
        ('Litter', StringType),
        ('Activities', StringType),
        ('Temperature & Weather', StringType),
        ('Number of Squirrels', IntegerType),
        ('Squirrel Sighter(s)', StringType),
        ('Number of Sighters', IntegerType),
    )
])

# Area lookup table: four nullable string columns.
area_schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        'Area ID',
        'Area Name',
        'Area Description',
        'City Name',
    )
])

# Rows of the in-memory area lookup table. Each tuple follows the column order
# of area_schema: (Area ID, Area Name, Area Description, City Name).
area_data = [
    ("A", "UPPER MANHATTAN", "Uptown Manhattan", "New York"),
    ("B", "CENTRAL MANHATTAN", "Midtown Manhattan", "New York"),
    ("C", "LOWER MANHATTAN", "Downtown Manhattan", "New York"),
    ("D", "BROOKLYN", "Brooklyn", "New York")
    ]

# Register both remote CSV files with Spark; SparkFiles.get() below resolves
# the local path of each registered file by its file name.
spark.sparkContext.addFile(squirrel_url)
spark.sparkContext.addFile(park_url)

# creating dataframes
# The CSVs are read with their first line as header and the explicit schemas
# defined above; `area` is built directly from the in-memory rows.
squirrel = (
    spark.read
    .schema(squirrel_schema)
    .option("header", True)
    .csv(SparkFiles.get("squirrel-data.csv"))
)
park = (
    spark.read
    .schema(park_schema)
    .option("header", True)
    .csv(SparkFiles.get("park-data.csv"))
)
area = spark.createDataFrame(data=area_data, schema=area_schema)

In [11]:
# show data
# Print the first rows (show() default) of each of the three DataFrames.
for _frame in (squirrel, park, area):
    _frame.show()

+---------------+-------+-------------------+-------+-----------+-----------------+-----------------------+-----------+------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+
|      Area Name|Area ID|          Park Name|Park ID|Squirrel ID|Primary Fur Color|Highlights in Fur Color|Color Notes|    Location|Above Ground (Height in Feet)|Specific Location|          Activities|Interactions with Humans|Squirrel Latitude (DD.DDDDDD)|Squirrel Longitude (-DD.DDDDDD)|
+---------------+-------+-------------------+-------+-----------+-----------------+-----------------------+-----------+------------+-----------------------------+-----------------+--------------------+------------------------+-----------------------------+-------------------------------+
|UPPER MANHATTAN|      A|    Fort Tryon Park|     01|    A-01-01|             Gray|                  White|       NULL|Ground Plane| 

In [33]:
!pip install pytictoc

Collecting pytictoc
  Downloading pytictoc-1.5.3-py2.py3-none-any.whl.metadata (2.9 kB)
Downloading pytictoc-1.5.3-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: pytictoc
Successfully installed pytictoc-1.5.3


In [48]:
# scenario 1
# many transformations on the same dataframe

import time
from pytictoc import TicToc
import uuid
from pyspark.sql.functions import broadcast, udf

# Create a TicToc timer and start it; the t.toc() call at the end of this cell
# reports the time elapsed since this point.
t = TicToc()
t.tic()

@udf
def generate_uuid():
    """Return a newly generated random UUID (version 4) as a string."""
    return str(uuid.uuid4())


# Spark assumes user-defined functions are deterministic unless told otherwise,
# which allows the optimizer to duplicate or skip invocations. uuid4() yields a
# different value on every call, so the UDF is flagged as nondeterministic.
generate_uuid = generate_uuid.asNondeterministic()

# Drop exact duplicate rows, then add a "hash_id" column filled by the
# generate_uuid UDF. `squirrel` is rebound to the transformed DataFrame.
squirrel = (
    squirrel
    .dropDuplicates()
    .withColumn("hash_id", generate_uuid())
)

# Join squirrels to their park and area and keep four descriptive columns.
# This only defines the DataFrame; no action is run on join_df in this cell.
join_df = (
    squirrel
    .join(park, on="Park ID", how="inner")
    .join(area, on="Area ID", how="inner")
    .select(
        area["Area Description"],
        park["Park Name"],
        park["Date"],
        squirrel["Squirrel ID"],
    )
)

# join_df.explain("cost")

# count() is the action that makes Spark execute the transformations on `squirrel`.
squirrel.count()

# Report the time elapsed since t.tic().
t.toc()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [21]:
# Uncomment to release `squirrel` from Spark's cache if it was cached/persisted earlier:
# squirrel.unpersist()

DataFrame[Area Name: string, Area ID: string, Park Name: string, Park ID: string, Squirrel ID: string, Primary Fur Color: string, Highlights in Fur Color: string, Color Notes: string, Location: string, Above Ground (Height in Feet): string, Specific Location: string, Activities: string, Interactions with Humans: string, Squirrel Latitude (DD.DDDDDD): string, Squirrel Longitude (-DD.DDDDDD): string]

In [22]:

# Baseline: plain inner joins with no join hint, so Spark picks the join
# strategy on its own.
join_df = (
    squirrel
    .join(park, "Park ID", "inner")
    .join(area, "Area ID", "inner")
    .select(
        area["Area Description"],
        park["Park Name"],
        park["Date"],
        squirrel["Squirrel ID"],
    )
)

# Print the optimized logical plan together with its size statistics.
join_df.explain(mode="cost")

== Optimized Logical Plan ==
Project [Area Description#490, Park Name#458, Date#460, Squirrel ID#430], Statistics(sizeInBytes=2.97E+25 B)
+- Join Inner, (Area ID#427 = Area ID#488), Statistics(sizeInBytes=4.32E+25 B)
   :- Project [Area ID#427, Squirrel ID#430, Park Name#458, Date#460], Statistics(sizeInBytes=8.2 MiB)
   :  +- Join Inner, (Park ID#429 = Park ID#459), Statistics(sizeInBytes=11.9 MiB)
   :     :- Project [Area ID#427, Park ID#429, Squirrel ID#430], Statistics(sizeInBytes=12.7 KiB)
   :     :  +- Filter (isnotnull(Park ID#429) AND isnotnull(Area ID#427)), Statistics(sizeInBytes=57.7 KiB)
   :     :     +- Relation [Area Name#426,Area ID#427,Park Name#428,Park ID#429,Squirrel ID#430,Primary Fur Color#431,Highlights in Fur Color#432,Color Notes#433,Location#434,Above Ground (Height in Feet)#435,Specific Location#436,Activities#437,Interactions with Humans#438,Squirrel Latitude (DD.DDDDDD)#439,Squirrel Longitude (-DD.DDDDDD)#440] csv, Statistics(sizeInBytes=57.7 KiB)
   :   

In [24]:
from pyspark.sql.functions import broadcast

# Same joins as before, but `area` is wrapped in broadcast() so the plan
# carries a broadcast hint for the right side of the second join.
join_df = (
    squirrel
    .join(park, "Park ID", "inner")
    .join(broadcast(area), "Area ID", "inner")
    .select(
        area["Area Description"],
        park["Park Name"],
        park["Date"],
        squirrel["Squirrel ID"],
    )
)

# Print the optimized logical plan together with its size statistics.
join_df.explain(mode="cost")

== Optimized Logical Plan ==
Project [Area Description#490, Park Name#458, Date#460, Squirrel ID#430], Statistics(sizeInBytes=2.97E+25 B)
+- Join Inner, (Area ID#427 = Area ID#488), rightHint=(strategy=broadcast), Statistics(sizeInBytes=4.32E+25 B)
   :- Project [Area ID#427, Squirrel ID#430, Park Name#458, Date#460], Statistics(sizeInBytes=8.2 MiB)
   :  +- Join Inner, (Park ID#429 = Park ID#459), Statistics(sizeInBytes=11.9 MiB)
   :     :- Project [Area ID#427, Park ID#429, Squirrel ID#430], Statistics(sizeInBytes=12.7 KiB)
   :     :  +- Filter (isnotnull(Park ID#429) AND isnotnull(Area ID#427)), Statistics(sizeInBytes=57.7 KiB)
   :     :     +- Relation [Area Name#426,Area ID#427,Park Name#428,Park ID#429,Squirrel ID#430,Primary Fur Color#431,Highlights in Fur Color#432,Color Notes#433,Location#434,Above Ground (Height in Feet)#435,Specific Location#436,Activities#437,Interactions with Humans#438,Squirrel Latitude (DD.DDDDDD)#439,Squirrel Longitude (-DD.DDDDDD)#440] csv, Statisti