In [1]:
# Notebook bootstrap: run findspark before anything imports pyspark.
# NOTE(review): findspark.init() is expected to make the local Spark
# installation importable as `pyspark` - confirm for this environment.
import findspark
findspark.init()
# The return value of find() is not used here (it is not the cell's last expression).
findspark.find()

# NOTE(review): wildcard import; this cell itself only uses `display` and `HTML`.
from IPython.display import *
# Inject CSS so <pre> output keeps its whitespace and does not wrap
# (keeps wide `.show()` tables readable).
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Settings applied to the session builder, in this order, before the session is created.
sessionSettings = {
    "spark.dynamicAllocation.enabled": "false",
    "spark.sql.adaptive.enabled": "false",
    # Disable Dynamic Partition Pruning
    "spark.sql.optimizer.dynamicPartitionPruning.enabled": "false",
}

# Local session with 4 worker threads.
sessionBuilder = SparkSession.builder.appName("DynamicPartitionPruningApp").master("local[4]")
for settingName, settingValue in sessionSettings.items():
    sessionBuilder = sessionBuilder.config(settingName, settingValue)

spark = sessionBuilder.getOrCreate()
sc = spark.sparkContext

# Last expression of the cell: display the session summary.
spark

### Save large DataFrame as a Partitioned Table

In [3]:
# Schema for the Yellow Taxi data: every column is declared nullable.
# Columns are listed as (name, Spark type) pairs in file order.
yellowTaxiSchema = StructType([
    StructField(columnName, columnType, True)
    for columnName, columnType in [
        ("VendorId",             IntegerType()),
        ("PickupTime",           TimestampType()),
        ("DropTime",             TimestampType()),
        ("PassengerCount",       DoubleType()),
        ("TripDistance",         DoubleType()),
        ("RateCodeId",           DoubleType()),
        ("StoreAndFwdFlag",      StringType()),
        ("PickupLocationId",     IntegerType()),
        ("DropLocationId",       IntegerType()),
        ("PaymentType",          IntegerType()),
        ("FareAmount",           DoubleType()),
        ("Extra",                DoubleType()),
        ("MtaTax",               DoubleType()),
        ("TipAmount",            DoubleType()),
        ("TollsAmount",          DoubleType()),
        ("ImprovementSurcharge", DoubleType()),
        ("TotalAmount",          DoubleType()),
        ("CongestionSurcharge",  DoubleType()),
        ("AirportFee",           DoubleType()),
    ]
])



# Read Yellow Taxis file
# The CSV has a header row; column names and types come from yellowTaxiSchema.
# The path is a raw string so the Windows backslashes are taken literally
# (sequences such as "\S" or "\D" are invalid escapes in a normal string literal).
yellowTaxiDF = (
                  spark
                    .read
                    .option("header", "true")
                    .schema(yellowTaxiSchema)
                    .csv(r"C:\SparkCourse\DataFiles\Raw\YellowTaxis_202210.csv")
               )


# Save Yellow Taxis as a partitioned table
# The data is written as CSV, partitioned by PickupLocationId, and registered
# in the catalog under the name "YellowTaxis" (existing data is overwritten).
# NOTE(review): `dateFormat` is documented for DateType columns; the time columns
# in this DataFrame are TimestampType - confirm whether `timestampFormat` was intended.
(
    yellowTaxiDF
            .write

            .partitionBy("PickupLocationId")

            .option("header", "true")
            .option("dateFormat", "yyyy-MM-dd HH:mm:ss.S")
            .mode("overwrite")
            .format("csv")
            # Raw string: backslashes in the Windows path are kept literally.
            .option("path", r"C:\SparkCourse\DataFiles\Output\YellowTaxisPartitioned.csv")

            .saveAsTable("YellowTaxis")
)

### Save small DataFrame as a non-partitioned table

In [4]:
# Schema for the Taxi Zones data, given as a DDL string.
taxiZonesSchema = "PickupLocationId INT, Borough STRING, Zone STRING, ServiceZone STRING"

# Read Taxi Zones file
# NOTE(review): no `header` option is set here (unlike the Yellow Taxis read) -
# confirm that TaxiZones.csv has no header row.
# The path is a raw string so the Windows backslashes are taken literally.
taxiZonesDF = (
                  spark
                    .read
                    .schema(taxiZonesSchema)
                    .csv(r"C:\SparkCourse\DataFiles\Raw\TaxiZones.csv")
              )

# Save Taxi Zones as a non-partitioned table
# The data is written as CSV and registered in the catalog under the name
# "TaxiZones" (existing data is overwritten).
# NOTE(review): the `dateFormat` option is set although this DataFrame is written
# without any partitioning or date handling visible here - confirm it is needed.
(
    taxiZonesDF
            .write
            .option("header", "true")
            .option("dateFormat", "yyyy-MM-dd HH:mm:ss.S")
            .mode("overwrite")
            .format("csv")
            # Raw string: backslashes in the Windows path are kept literally.
            .option("path", r"C:\SparkCourse\DataFiles\Output\TaxiZones.csv")

            .saveAsTable("TaxiZones")
)

### Join Yellow Taxis and Taxi Zones with filter on Yellow Taxis table

The filter is on the same column (PickupLocationId) by which the Yellow Taxis data is partitioned, so Spark can apply static partition pruning.

In [5]:
# Join the two tables, filtering directly on the partition column of YellowTaxis.
# The column name in the WHERE clause uses the same casing as the schema
# (PickupLocationId) so the query does not depend on case-insensitive resolution.
spark.sql("""

SELECT *

FROM YellowTaxis yt

    JOIN TaxiZones tz ON yt.PickupLocationId = tz.PickupLocationId
    
    WHERE yt.PickupLocationId = 1

""").show()

+--------+-------------------+-------------------+--------------+------------+----------+---------------+--------------+-----------+----------+-----+------+---------+-----------+--------------------+-----------+-------------------+----------+----------------+----------------+-------+--------------+-----------+
|VendorId|         PickupTime|           DropTime|PassengerCount|TripDistance|RateCodeId|StoreAndFwdFlag|DropLocationId|PaymentType|FareAmount|Extra|MtaTax|TipAmount|TollsAmount|ImprovementSurcharge|TotalAmount|CongestionSurcharge|AirportFee|PickupLocationId|PickupLocationId|Borough|          Zone|ServiceZone|
+--------+-------------------+-------------------+--------------+------------+----------+---------------+--------------+-----------+----------+-----+------+---------+-----------+--------------------+-----------+-------------------+----------+----------------+----------------+-------+--------------+-----------+
|       2|2022-10-09 18:42:58|2022-10-09 18:43:02|           1.0

### Check Pickup Location Ids in one Borough

The 'EWR' borough has only one Pickup Location Id: Id = 1.

In [6]:
# List the TaxiZones rows whose Borough is 'EWR'.
(
    spark
        .sql("""

SELECT *

FROM TaxiZones

WHERE Borough = 'EWR'

""")
        .show()
)

+----------------+-------+--------------+-----------+
|PickupLocationId|Borough|          Zone|ServiceZone|
+----------------+-------+--------------+-----------+
|               1|    EWR|Newark Airport|        EWR|
+----------------+-------+--------------+-----------+



### DPP disabled: Join Yellow Taxis and Taxi Zones with filter on Taxi Zones table

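The output is the same as that of the previous query. <br/>
But since the filter is on Taxi Zones (not on the partition column of Yellow Taxis) and Dynamic Partition Pruning is disabled, partition pruning will not be applied.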

In [7]:
# Same join, but the filter is on the TaxiZones side (Borough) instead of
# on the YellowTaxis partition column.
(
    spark
        .sql("""

SELECT * 

FROM YellowTaxis yt

    JOIN TaxiZones tz ON yt.PickupLocationId = tz.PickupLocationId
        
    WHERE tz.Borough = 'EWR'     --WHERE yt.PickupLocationid = 1 (Both will yield same output)

""")
        .show()
)

+--------+-------------------+-------------------+--------------+------------+----------+---------------+--------------+-----------+----------+-----+------+---------+-----------+--------------------+-----------+-------------------+----------+----------------+----------------+-------+--------------+-----------+
|VendorId|         PickupTime|           DropTime|PassengerCount|TripDistance|RateCodeId|StoreAndFwdFlag|DropLocationId|PaymentType|FareAmount|Extra|MtaTax|TipAmount|TollsAmount|ImprovementSurcharge|TotalAmount|CongestionSurcharge|AirportFee|PickupLocationId|PickupLocationId|Borough|          Zone|ServiceZone|
+--------+-------------------+-------------------+--------------+------------+----------+---------------+--------------+-----------+----------+-----+------+---------+-----------+--------------------+-----------+-------------------+----------+----------------+----------------+-------+--------------+-----------+
|       2|2022-10-09 18:42:58|2022-10-09 18:43:02|           1.0

### Enable Dynamic Partition Pruning

In [8]:
# Switch Dynamic Partition Pruning on for the running session.
spark.conf.set(
    "spark.sql.optimizer.dynamicPartitionPruning.enabled",
    "true",
)

### DPP enabled: Join Yellow Taxis and Taxi Zones with filter on Taxi Zones table

With Dynamic Partition Pruning enabled, Spark can use the filter on Taxi Zones to prune Yellow Taxis partitions at runtime.

In [9]:
# Join filtered on the TaxiZones side (Borough), run after the
# dynamicPartitionPruning setting has been changed.
(
    spark
        .sql("""

SELECT * 

FROM YellowTaxis yt

    JOIN TaxiZones tz ON yt.PickupLocationId = tz.PickupLocationId
    
    WHERE tz.Borough = 'EWR'

""")
        .show()
)

+--------+-------------------+-------------------+--------------+------------+----------+---------------+--------------+-----------+----------+-----+------+---------+-----------+--------------------+-----------+-------------------+----------+----------------+----------------+-------+--------------+-----------+
|VendorId|         PickupTime|           DropTime|PassengerCount|TripDistance|RateCodeId|StoreAndFwdFlag|DropLocationId|PaymentType|FareAmount|Extra|MtaTax|TipAmount|TollsAmount|ImprovementSurcharge|TotalAmount|CongestionSurcharge|AirportFee|PickupLocationId|PickupLocationId|Borough|          Zone|ServiceZone|
+--------+-------------------+-------------------+--------------+------------+----------+---------------+--------------+-----------+----------+-----+------+---------+-----------+--------------------+-----------+-------------------+----------+----------------+----------------+-------+--------------+-----------+
|       2|2022-10-09 18:42:58|2022-10-09 18:43:02|           1.0