In [1]:
import findspark
# Locate the local Spark installation so that pyspark can be imported in the cells below.
findspark.init()
# Last expression of the cell: returns the Spark home path that was found (shown as the cell output).
findspark.find()

'C:\\Installation\\spark-3.3.2-bin-hadoop3'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Build (or reuse) the SparkSession for this notebook.
spark = (
    SparkSession.builder
    .appName("AqeDynamicCoalescingApp")
    .master("local[4]")
    .config("spark.dynamicAllocation.enabled", "false")
    # Adaptive Query Execution starts out disabled; a later cell turns it on for comparison
    .config("spark.sql.adaptive.enabled", "false")
    .getOrCreate()
)

# Handle to the session's underlying SparkContext
sc = spark.sparkContext

# Last expression of the cell: display the session summary
spark

### Create method to calculate DataFrame statistics

Tags each row with the ID of the partition it belongs to <br/>
Calculates the record count and the min & max values of a column within each partition

In [3]:
def getDataFrameStats(dataFrame, columnName):
    """Summarise how the rows of a DataFrame are spread across its partitions.

    Each row is tagged with the ID of the partition that holds it, then the
    rows are grouped by that ID and aggregated.

    Args:
        dataFrame: Spark DataFrame to inspect.
        columnName: Name of the column whose minimum and maximum values are
            reported for each partition.

    Returns:
        A DataFrame ordered by "Partition Number" with the columns
        "Partition Number", "Record Count", "Min Column Value" and
        "Max Column Value". Partitions that hold no rows produce no group,
        so they do not appear in the result.
    """
    # Qualified access instead of the bare names count/min/max: the bare names
    # only resolve to the Spark aggregates while a wildcard import of
    # pyspark.sql.functions is shadowing the Python builtins min and max.
    from pyspark.sql import functions as F

    partitionColumn = "Partition Number"

    taggedDF = dataFrame.withColumn(partitionColumn, F.spark_partition_id())

    outputDF = (
        taggedDF
            .groupBy(partitionColumn)
            .agg(
                F.count("*").alias("Record Count"),
                F.min(columnName).alias("Min Column Value"),
                F.max(columnName).alias("Max Column Value"),
            )
            .orderBy(partitionColumn)
    )

    return outputDF

In [4]:
# Read Yellow Taxis data
yellowTaxiDF = (
    spark
        .read
        .option("header", "true")
        .option("inferSchema", "true")
        # Raw string: in a normal literal, backslash pairs such as "\S" or "\D" are
        # invalid escape sequences (deprecated by Python); the path value is unchanged.
        .csv(r"C:\SparkCourse\DataFiles\Raw\YellowTaxis_202210.csv")
)


# Check number of partitions
print("Partitions = " + str(yellowTaxiDF.rdd.getNumPartitions()))


Partitions = 4


### Change default shuffle partitions

In [5]:
# Set the number of shuffle partitions to 20 for this session.
spark.conf.set( "spark.sql.shuffle.partitions", 20 )

### Apply a wide transformation

In [6]:
# Wide transformation: sum of total_amount for every (VendorId, payment_type) pair.
# `sum` here is the Spark aggregate brought in by the wildcard import of
# pyspark.sql.functions, not the Python builtin.
yellowTaxiGroupedDF = yellowTaxiDF.groupBy("VendorId", "payment_type").agg(
    sum("total_amount")
)

# Trigger the job and print the aggregated rows
yellowTaxiGroupedDF.show()

+--------+------------+--------------------+
|VendorId|payment_type|   sum(total_amount)|
+--------+------------+--------------------+
|       1|           2|   3407067.990002185|
|       2|           4|   1103.950000000002|
|       1|           0|   704915.6500000019|
|       1|           4|   73231.73999999865|
|       1|           1|1.7903327639982704E7|
|       6|           0|   279048.8899999997|
|       2|           2|    9225232.37000579|
|       1|           3|  186607.24000000514|
|       2|           0|  2899635.7899999204|
|       2|           1|4.7088665280070014E7|
|       2|           3|   29.71999999999995|
+--------+------------+--------------------+



### Check DataFrame partitions after shuffle

In [7]:
# Number of partitions of the aggregated DataFrame after the shuffle
print("Partitions = " + str(yellowTaxiGroupedDF.rdd.getNumPartitions()))

# Per-partition record count and VendorId range
getDataFrameStats(yellowTaxiGroupedDF, "VendorId").show()


Partitions = 20
+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Max Column Value|
+----------------+------------+----------------+----------------+
|               1|           2|               1|               2|
|               4|           1|               1|               1|
|               5|           1|               1|               1|
|               7|           1|               1|               1|
|               8|           1|               6|               6|
|               9|           3|               1|               2|
|              17|           1|               2|               2|
|              18|           1|               2|               2|
+----------------+------------+----------------+----------------+



In [8]:
# EXERCISE - NOT SHOWN IN VIDEO
# Write the aggregated result as CSV, replacing any existing output at the target path.
(
    yellowTaxiGroupedDF
        .write
        .option("header", "true")
        .option("dateFormat", "yyyy-MM-dd HH:mm:ss.S")
        .mode("overwrite")
        # Raw string: in a normal literal, backslash pairs such as "\S", "\D", "\O" and "\A"
        # are invalid escape sequences (deprecated by Python); the path value is unchanged.
        .csv(r"C:\SparkCourse\DataFiles\Output\AqeTest.csv")
)

### Enable Adaptive Query Execution - Dynamic Coalescing of Shuffle Partitions

In [9]:
# Turn Adaptive Query Execution on for this session (it was disabled when the session was built).
spark.conf.set("spark.sql.adaptive.enabled", "true")

# Also enable the AQE option that coalesces shuffle partitions.
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")

### Apply a wide transformation and check DataFrame stats

In [10]:
# Same aggregation as before, rebuilt now that AQE and partition coalescing are enabled.
# `sum` is the Spark aggregate from the wildcard import, not the Python builtin.
yellowTaxiGroupedDF = yellowTaxiDF.groupBy("VendorId", "payment_type").agg(
    sum("total_amount")
)

# Check number of partitions
print("Partitions = " + str(yellowTaxiGroupedDF.rdd.getNumPartitions()))

# Get DataFrame stats: per-partition record count and VendorId range
getDataFrameStats(yellowTaxiGroupedDF, "VendorId").show()

Partitions = 1
+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Max Column Value|
+----------------+------------+----------------+----------------+
|               0|          11|               1|               6|
+----------------+------------+----------------+----------------+

