In [2]:
# Locate the local Spark installation and make `pyspark` importable;
# this has to run before the pyspark imports in the next cell.
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Build (or reuse) a SparkSession that runs Spark locally under the app name "SparkPartition".
spark = (
    SparkSession.builder
    .master("local")
    .appName("SparkPartition")
    .getOrCreate()
)
# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [4]:
# Adaptive Query Execution (AQE) helps run an optimised Spark application: it uses
# runtime statistics to choose the most efficient query execution plan.

"""
Topics covered in this notebook:
- Automatic coalescing of shuffle partitions, controlled by spark.sql.adaptive.enabled
  and spark.sql.adaptive.coalescePartitions.enabled.
- Difference between DataFrame.repartition and DataFrameWriter.partitionBy.
repartition: takes a target number and/or column names; when only column names are given,
the Spark engine chooses the number of partitions itself.
partitionBy: has to be given on the DataFrame writer; it cannot take a number, it always takes column names.
"""

# Load the BankChurners CSV (header option enabled) and redistribute it into exactly 3 partitions.
df = (
    spark.read
    .format("csv")
    .option("header", "True")
    .load("file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/BankChurners.csv")
    .repartition(3)
)

# Preview the data.
df.show()

# Last expression of the cell: the partition count of the DataFrame.
df.rdd.getNumPartitions()

+---------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+
|CLIENTNUM|   Attrition_Flag|Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|
+---------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------

3

In [5]:
# distinct() is a wide transformation: it shuffles rows so duplicates can be removed.
df_widetrans = (
    df
    .select("Card_Category")
    .distinct()
)

df_widetrans.show()

# Partition count of the shuffled result.
df_widetrans.rdd.getNumPartitions()

+-------------+
|Card_Category|
+-------------+
|     Platinum|
|       Silver|
|         Blue|
|         Gold|
+-------------+



1

In [44]:
# Automatic coalescing of shuffle partitions needs both AQE (spark.sql.adaptive.enabled)
# and spark.sql.adaptive.coalescePartitions.enabled. Read the current AQE setting.
spark.conf.get("spark.sql.adaptive.enabled")

'true'

In [7]:
# Disable Adaptive Query Execution for this session.
spark.conf.set("spark.sql.adaptive.enabled","false")

In [45]:
# Read the setting that lets AQE coalesce (merge) shuffle partitions.
spark.conf.get("spark.sql.adaptive.coalescePartitions.enabled")

'true'

In [8]:
# Disable AQE's coalescing of shuffle partitions for this session.
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled","false")

In [11]:
# Difference between repartition and partitionBy
# Source data for the comparison: the personal-transactions CSV, read with the header option enabled.
df1 = (
    spark.read
    .format("csv")
    .option("header", "True")
    .load("file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/IntPersonal_transactions.csv")
)

In [13]:
# Preview the loaded transactions.
df1.show()

+-----------+-------------+---------+-------------------+----------------+-------+
|Customer_No|    Card_type|     Date|           Category|Transaction Type| Amount|
+-----------+-------------+---------+-------------------+----------------+-------+
|    1000501|Platinum Card| 1/1/2018|           Shopping|           debit|  11.11|
|    1000501|     Checking| 1/2/2018|    Mortgage & Rent|           debit|1247.44|
|    1000501|  Silver Card| 1/2/2018|        Restaurants|           debit|  24.22|
|    1000501|Platinum Card| 1/3/2018|Credit Card Payment|          credit|2298.09|
|    1000501|Platinum Card| 1/4/2018|      Movies & DVDs|           debit|  11.76|
|    1000501|  Silver Card| 1/5/2018|        Restaurants|           debit|  25.85|
|    1000501|  Silver Card| 1/6/2018|   Home Improvement|           debit|  18.45|
|    1000501|     Checking| 1/8/2018|          Utilities|           debit|     45|
|    1000501|  Silver Card| 1/8/2018|   Home Improvement|           debit|  15.38|
|   

In [23]:
# Distinct customer numbers; distinct() requires a shuffle.
df2 = df1.select("customer_no").distinct()

In [25]:
# Partition count of the distinct() result.
df2.rdd.getNumPartitions()

200

In [26]:
# 1. Basic syntax - repartition accepts a number, column name(s), or both.
# Given a number, it increases or decreases the partition count to exactly that number.
df1.repartition(3).rdd.getNumPartitions()

3

In [30]:
# There need not be exactly seven partitions for the seven unique customer numbers:
# when only a column is given, Spark picks the partition count itself
# (its default shuffle partition setting).
df1.repartition("customer_no").rdd.getNumPartitions()

200

In [31]:
# Repartition into 2 partitions keyed on customer_no: Spark ensures that all rows
# with the same customer_no reside in the same partition.
df1.repartition(2,"customer_no").rdd.getNumPartitions()

2

In [32]:
# partitionBy lives on the DataFrameWriter and takes column names. This expression only
# returns a configured writer; no data is written here because no save call follows.
df1.write.partitionBy("customer_no")

<pyspark.sql.readwriter.DataFrameWriter at 0x1d36c3b5610>

In [38]:
# Partitions can also end up empty when the requested partition count is very large.
from pyspark.sql.functions import spark_partition_id

# Tag every row with the id of the partition it landed in, then count rows per partition.
df_part = (
    df1
    .repartition(3, "customer_no")
    .withColumn("partitionId", spark_partition_id())
    .groupBy("partitionId")
    .count()
)

In [40]:
# Display the row count for each partition id.
df_part.show()

+-----------+-----+
|partitionId|count|
+-----------+-----+
|          1|   14|
|          2|   60|
|          0|   31|
+-----------+-----+



In [44]:
# The functional difference shows up when the data is saved:
#   - repartition redistributes rows in memory and involves a shuffle before the write;
#   - partitionBy segregates the data on disk under the target path and does not
#     shuffle to push the data there.
# A filter on the partitionBy column of the written data returns results faster.

filepath = "file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/"

# Variant 1: repartition by Customer_No, then write CSV (with header), replacing any existing output.
(
    df1
    .repartition("Customer_No")
    .write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .save(filepath + "Online/repart/customer")
)

In [46]:
# Variant 2: no repartition; the writer splits the CSV output (with header) by Customer_No,
# replacing any existing output at the target path.
(
    df1.write
    .format("csv")
    .partitionBy("Customer_No")
    .option("header", True)
    .mode("overwrite")
    .save(filepath + "Online/partBy/customer")
)

In [47]:
# Performance analysis
# Read back the partitionBy output directory and keep a single customer's rows.
in_part = (
    spark.read
    .csv(filepath + "Online/partBy/customer/", header=True)
    .filter("Customer_No=1002324")
)

In [49]:
# Show the rows for customer 1002324 read from the partitioned output.
in_part.show()

+-------------+---------+-------------------+----------------+------+-----------+
|    Card_type|     Date|           Category|Transaction Type|Amount|Customer_No|
+-------------+---------+-------------------+----------------+------+-----------+
|Platinum Card| 3/5/2018|       Coffee Shops|           debit|     3|    1002324|
|  Silver Card| 3/5/2018|Credit Card Payment|          credit|761.59|    1002324|
|     Checking| 3/5/2018|Credit Card Payment|           debit|761.59|    1002324|
|Platinum Card| 3/7/2018|       Coffee Shops|           debit|   3.5|    1002324|
|Platinum Card| 3/8/2018|         Gas & Fuel|           debit|  34.9|    1002324|
|     Checking| 3/8/2018|          Utilities|           debit|    52|    1002324|
|Platinum Card| 3/9/2018|          Groceries|           debit| 20.72|    1002324|
|Platinum Card| 3/9/2018|          Groceries|           debit|  5.09|    1002324|
|Platinum Card| 3/9/2018|              Music|           debit| 10.69|    1002324|
|Platinum Card|3

In [50]:
# Apply the same customer filter directly to the originally loaded DataFrame.
df1.filter("Customer_No=1002324").show()

+-----------+-------------+---------+-------------------+----------------+------+
|Customer_No|    Card_type|     Date|           Category|Transaction Type|Amount|
+-----------+-------------+---------+-------------------+----------------+------+
|    1002324|Platinum Card| 3/5/2018|       Coffee Shops|           debit|     3|
|    1002324|  Silver Card| 3/5/2018|Credit Card Payment|          credit|761.59|
|    1002324|     Checking| 3/5/2018|Credit Card Payment|           debit|761.59|
|    1002324|Platinum Card| 3/7/2018|       Coffee Shops|           debit|   3.5|
|    1002324|Platinum Card| 3/8/2018|         Gas & Fuel|           debit|  34.9|
|    1002324|     Checking| 3/8/2018|          Utilities|           debit|    52|
|    1002324|Platinum Card| 3/9/2018|          Groceries|           debit| 20.72|
|    1002324|Platinum Card| 3/9/2018|          Groceries|           debit|  5.09|
|    1002324|Platinum Card| 3/9/2018|              Music|           debit| 10.69|
|    1002324|Pla