In [1]:
# Make the local Spark installation importable before any pyspark import runs
import findspark
findspark.init()
# Last expression of the cell: displays the Spark home directory that was located
findspark.find()

'C:\\Installation\\spark-3.3.2-bin-hadoop3'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Build (or reuse) the SparkSession used throughout this notebook
spark = (
    SparkSession.builder
    .appName("SparkPartitionsApp")

    # Local mode with 4 worker threads
    .master("local[4]")

    # Standalone/YARN alternatives (kept disabled)
    #.config("spark.cores.max",            "6")
    #.config("spark.executor.memory",      "2g")
    #.config("spark.executor.cores",       "2")

    # Dynamic allocation and adaptive query execution are switched off
    # (presumably so the partition counts shown below are not adjusted at runtime)
    .config("spark.dynamicAllocation.enabled", "false")
    .config("spark.sql.adaptive.enabled", "false")

    .getOrCreate()
)

# Handle to the underlying SparkContext
sc = spark.sparkContext

### Check default parallelism

In [3]:
# Display the SparkContext's default parallelism (the session runs on master "local[4]")
sc.defaultParallelism


4

### 1. Partition settings while reading data

### Check partitions for a small dataset

Spark finds the optimal number of partitions

In [4]:
# Read Taxi Zones data
# The Windows path is a raw string so its backslashes are never parsed as
# escape sequences (\S, \D, \R and \T are invalid escapes in a normal literal)
taxiZonesDF = (
    spark.read
    .option("inferSchema", "true")
    .csv(r"C:\SparkCourse\DataFiles\Raw\TaxiZones.csv")
)

# Check number of partitions
print("Partitions = "    + str( taxiZonesDF.rdd.getNumPartitions() ))

# Check number of records
print("Record Count = "  + str( taxiZonesDF.count() ))


Partitions = 1
Record Count = 265


### Check partitions for a large dataset

In [5]:
# Read Yellow Taxis data
# The Windows path is a raw string so its backslashes are never parsed as
# escape sequences (\S, \D, \R and \Y are invalid escapes in a normal literal)
yellowTaxiDF = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(r"C:\SparkCourse\DataFiles\Raw\YellowTaxis_202210.csv")
)

# Check number of partitions
print("Partitions = "    + str( yellowTaxiDF.rdd.getNumPartitions() ))

# Check number of records
print("Record Count = "  + str( yellowTaxiDF.count() ))


Partitions = 4
Record Count = 3675412


### Change maximum partition size

In [6]:
# Limit the bytes packed into a single partition when reading files to 64 MB;
# the file is read again in the next cell to observe the effect
spark.conf.set( "spark.sql.files.maxPartitionBytes", "64m" )


### Check partitions for a large dataset

With smaller max partition size (64 MB)

In [7]:
# Read Yellow Taxis data
# The Windows path is a raw string so its backslashes are never parsed as
# escape sequences (\S, \D, \R and \Y are invalid escapes in a normal literal)
yellowTaxiDF = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(r"C:\SparkCourse\DataFiles\Raw\YellowTaxis_202210.csv")
)

# Check default parallelism
print("Default Parallelism = "  + str( sc.defaultParallelism ))

# Check number of partitions
print("Partitions = "           + str( yellowTaxiDF.rdd.getNumPartitions() ))


Default Parallelism = 4
Partitions = 7


### Create method to calculate DataFrame statistics

Finds the data held in each partition <br/>
Calculates the record count, and the min & max values of a column, for each partition

In [8]:
def getDataFrameStats(dataFrame, columnName):
    """Summarise how the rows of a DataFrame are spread across its partitions.

    Parameters
    ----------
    dataFrame : pyspark.sql.DataFrame
        DataFrame whose partitions are inspected.
    columnName : str
        Column whose minimum and maximum are computed per partition.

    Returns
    -------
    pyspark.sql.DataFrame
        One row per partition that holds at least one record, ordered by
        "Partition Number", with the columns "Partition Number",
        "Record Count", "Min Column Value" and "Max Column Value".
    """
    # Qualified import: the aggregates below must be the Spark SQL functions,
    # not Python's built-in min/max that share the same names. Without this the
    # function only works while a star import of pyspark.sql.functions is active.
    from pyspark.sql import functions as F

    outputDF = (
        dataFrame

        # Tag every record with the id of the partition it lives in
        .withColumn("Partition Number", F.spark_partition_id())

        # One group per partition, with stats for the requested column
        .groupBy("Partition Number")
        .agg(
            F.count("*").alias("Record Count"),
            F.min(columnName).alias("Min Column Value"),
            F.max(columnName).alias("Max Column Value"),
        )

        .orderBy("Partition Number")
    )

    return outputDF

### Check stats for Yellow Taxis DataFrame

In [9]:
# Per-partition record count and PULocationID range of the DataFrame as read from the file
getDataFrameStats( yellowTaxiDF, "PULocationID" ).show()

+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Max Column Value|
+----------------+------------+----------------+----------------+
|               0|      531991|               1|             265|
|               1|      531721|               1|             265|
|               2|      531579|               1|             265|
|               3|      531536|               1|             265|
|               4|      531728|               1|             265|
|               5|      531528|               1|             265|
|               6|      485329|               1|             265|
+----------------+------------+----------------+----------------+



### 2. Partition settings while shuffling data

### Check default number of shuffle partitions

In [10]:

# Display the current value of spark.sql.shuffle.partitions (returned as a string)
spark.conf.get( "spark.sql.shuffle.partitions" )


'200'

### Apply a shuffle operation and check DataFrame stats

In [11]:
# Total amount per pickup location; the groupBy introduces a shuffle.
# NOTE: sum here is the Spark SQL function brought in by the star import, not the Python built-in.
yellowTaxiGroupedDF = yellowTaxiDF.groupBy("PULocationID").agg(sum("total_amount"))

# Report how many partitions the shuffled result has
print(f"Partitions = {yellowTaxiGroupedDF.rdd.getNumPartitions()}")

# Per-partition record counts and PULocationID ranges after the shuffle
getDataFrameStats(yellowTaxiGroupedDF, "PULocationID").show()


Partitions = 200
+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Max Column Value|
+----------------+------------+----------------+----------------+
|               0|           1|             148|             148|
|               1|           1|             243|             243|
|               2|           1|              31|              31|
|               3|           3|              85|             251|
|               4|           1|              65|              65|
|               5|           2|              53|             255|
|               6|           1|             133|             133|
|               7|           1|              78|              78|
|              10|           2|             108|             155|
|              11|           3|              34|             211|
|              12|           3|             101|             126|
|              13|           1|              81|           

### Change default number of shuffle partitions

In [12]:
# Lower the number of partitions produced by shuffle operations to 3
spark.conf.set( "spark.sql.shuffle.partitions", 3 )


### Apply a shuffle operation and check DataFrame stats

After changing default shuffle partitions

In [13]:
# Same aggregation as before, re-run now that spark.sql.shuffle.partitions has been changed.
# NOTE: sum here is the Spark SQL function brought in by the star import, not the Python built-in.
yellowTaxiGroupedDF = yellowTaxiDF.groupBy("PULocationID").agg(sum("total_amount"))

# Report how many partitions the shuffled result has
print(f"Partitions = {yellowTaxiGroupedDF.rdd.getNumPartitions()}")

# Per-partition record counts and PULocationID ranges after the shuffle
getDataFrameStats(yellowTaxiGroupedDF, "PULocationID").show()


Partitions = 3
+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Max Column Value|
+----------------+------------+----------------+----------------+
|               0|          88|               3|             263|
|               1|          85|               1|             265|
|               2|          87|              11|             264|
+----------------+------------+----------------+----------------+



### Check stats for Yellow Taxis DataFrame

In [14]:
# Stats of the source DataFrame again, for comparison with the repartitioning examples that follow
getDataFrameStats( yellowTaxiDF, "PULocationID" ).show()

+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Max Column Value|
+----------------+------------+----------------+----------------+
|               0|      531991|               1|             265|
|               1|      531721|               1|             265|
|               2|      531579|               1|             265|
|               3|      531536|               1|             265|
|               4|      531728|               1|             265|
|               5|      531528|               1|             265|
|               6|      485329|               1|             265|
+----------------+------------+----------------+----------------+



### 1. Repartition DataFrame: Round-Robin partitioning

Create equal-sized partitions. Data is not co-located.

In [15]:

# Only a partition count is given (no column), so rows are spread evenly over 14 partitions
repartionedDF1 = yellowTaxiDF.repartition( 14 )


# Get DataFrame stats
getDataFrameStats( repartionedDF1, "PULocationID" ).show()

+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Max Column Value|
+----------------+------------+----------------+----------------+
|               0|      262529|               1|             265|
|               1|      262529|               1|             265|
|               2|      262530|               1|             265|
|               3|      262531|               1|             265|
|               4|      262531|               1|             265|
|               5|      262531|               1|             265|
|               6|      262531|               1|             265|
|               7|      262530|               1|             265|
|               8|      262528|               1|             265|
|               9|      262529|               1|             265|
|              10|      262528|               1|             265|
|              11|      262529|               1|             265|
|         

### 2. Repartition DataFrame: Hash partitioning

Co-locates the data. Partition sizes may not be the same.

In [16]:
# No partition count is passed here, so the number of partitions comes from
# spark.sql.shuffle.partitions, which was set to 3 in an earlier cell


# Repartition by column value: rows with the same PULocationID end up in the same partition
repartionedDF1 = yellowTaxiDF.repartition( "PULocationID" )


# Get DataFrame stats
getDataFrameStats( repartionedDF1, "PULocationID" ).show()

+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Max Column Value|
+----------------+------------+----------------+----------------+
|               0|     1246579|               3|             263|
|               1|     1426282|               1|             265|
|               2|     1002551|              11|             264|
+----------------+------------+----------------+----------------+



### 2. Repartition DataFrame: Hash partitioning & define number of partitions

Co-locates the data. Partition sizes may not be the same.

In [17]:

# Repartition by PULocationID into an explicit 14 partitions instead of the shuffle-partitions setting
repartionedDF1 = yellowTaxiDF.repartition( 14, "PULocationID" )


# Get DataFrame stats
getDataFrameStats( repartionedDF1, "PULocationID" ).show()

+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Max Column Value|
+----------------+------------+----------------+----------------+
|               0|      405748|              12|             261|
|               1|      493312|               8|             250|
|               2|      168505|              18|             263|
|               3|      207849|               9|             260|
|               4|      191119|              52|             257|
|               5|      202251|              43|             258|
|               6|      259735|              13|             265|
|               7|      386736|              24|             255|
|               8|      142987|               4|             254|
|               9|      406158|               1|             251|
|              10|      108394|              28|             234|
|              11|      231246|               3|             241|
|         

### 3. Repartition DataFrame: Range partitioning

Sorts and co-locates the data. Partition sizes may not be the same.

In [18]:

# Range-partition by PULocationID into 14 partitions: each partition receives a contiguous range of values
repartionedDF1 = yellowTaxiDF.repartitionByRange( 14, "PULocationID" )


# Get DataFrame stats
getDataFrameStats( repartionedDF1, "PULocationID" ).show()

+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Max Column Value|
+----------------+------------+----------------+----------------+
|               0|      266634|               1|              50|
|               1|      238923|              51|              79|
|               2|      290834|              80|             113|
|               3|      258270|             114|             132|
|               4|      218033|             133|             140|
|               5|      261945|             141|             143|
|               6|      286326|             144|             161|
|               7|      304033|             162|             164|
|               8|      243993|             165|             186|
|               9|      244184|             187|             230|
|              10|      359447|             231|             236|
|              11|      176660|             237|             237|
|         

### Coalesce DataFrame

Reduces the number of partitions

In [19]:

# Merge the existing partitions down to 2; whole partitions are combined as they are,
# which is why the two record counts in the output are uneven
coalescedDF = yellowTaxiDF.coalesce( 2 )


# Get DataFrame stats
getDataFrameStats( coalescedDF, "PULocationID" ).show()

+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Max Column Value|
+----------------+------------+----------------+----------------+
|               0|     1595291|               1|             265|
|               1|     2080121|               1|             265|
+----------------+------------+----------------+----------------+

