In [1]:
# Bootstrap step: findspark.init() is called before any pyspark import in this
# notebook -- presumably so the local Spark installation can be located and
# pyspark becomes importable (confirm against the findspark documentation).
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.streaming import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import os

# Build (or reuse) a session called "DataSkew" on the "local" master, and keep
# a handle on its SparkContext for context-level calls.
spark = (
    SparkSession.builder
    .master("local")
    .appName("DataSkew")
    .getOrCreate()
)
sc = spark.sparkContext

In [9]:
# --- Session configuration ---------------------------------------------------
# Adaptive query execution, and its coalescePartitions feature, make the number
# of shuffle partitions dynamic. Both are switched off so the partition count
# stays at the fixed value set here.
sc.setLogLevel("Error")
spark.conf.set("spark.sql.shuffle.partitions", 3)
spark.conf.set("spark.sql.adaptive.enabled", "false")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "false")
# Spark 3.x names for the Arrow options. The un-prefixed keys
# ("spark.sql.execution.arrow.enabled" / "...arrow.fallback.enabled") are
# deprecated in favour of these.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", True)

# Goal of this notebook: record count for each partition, obtained with
# spark_partition_id (from pyspark.sql.functions).

# --- Load the dataset --------------------------------------------------------
# header=True      : the first row supplies the column names.
# inferSchema=True : column types are inferred from the data instead of all
#                    being read as strings (this needs an extra pass over the file).
# The dataset directory can be overridden with the BANK_CHURNERS_DATA_DIR
# environment variable; it must end with "/" because the file name is appended
# by plain string concatenation. The default is the original local path.
filepath = os.environ.get(
    "BANK_CHURNERS_DATA_DIR",
    "file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/",
)
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("delimiter", ",")
    .csv(filepath + "BankChurners.csv")
)

In [10]:
# Print a preview of the loaded frame to check the header and column layout.
df.show()

+---------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+
|CLIENTNUM|   Attrition_Flag|Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|
+---------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------

In [12]:
# Total number of rows in the loaded frame (displayed as the cell result).
df.count()

10127

In [13]:
# Repartition into a fixed number of partitions; no partitioning column is
# given, only the target count.
repart = df.repartition(numPartitions=4)

In [23]:
# Confirm how many partitions the repartitioned frame actually has.
repart.rdd.getNumPartitions()

4

In [27]:
# Preview the repartitioned frame; same columns as df, rows redistributed.
repart.show()

+---------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+
|CLIENTNUM|   Attrition_Flag|Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|
+---------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------

In [20]:
# spark_partition_id: tag every row with the id of the partition that holds it,
# then count the rows per partition id.

from pyspark.sql.functions import spark_partition_id

df_part = (
    repart
    .withColumn("partitionId", spark_partition_id())
    .groupBy("partitionId")
    .count()
)

In [28]:
# Row count per partition for the 4-way repartitioned frame.
df_part.show()

+-----------+-----+
|partitionId|count|
+-----------+-----+
|          0| 2531|
|          2| 2532|
|          3| 2532|
|          1| 2532|
+-----------+-----+



In [30]:
# Bare expression: displays the DataFrame's repr (column names and types), not its rows.
df_part

DataFrame[partitionId: int, count: bigint]

In [32]:
# Repartition by the Card_Category column. The name is spelled exactly as in
# the DataFrame schema so the lookup does not rely on case-insensitive column
# resolution (it would fail with spark.sql.caseSensitive=true otherwise).
repart2 = df.repartition("Card_Category")

In [34]:
# Partition count after repartitioning by column; the recorded result (3)
# equals the spark.sql.shuffle.partitions value set earlier in the notebook.
repart2.rdd.getNumPartitions()

3

In [36]:
from pyspark.sql.functions import spark_partition_id

# Rows per partition for the frame that was repartitioned by column.
df_part2 = (
    repart2
    .withColumn("partitionId", spark_partition_id())
    .groupBy("partitionId")
    .count()
)

In [37]:
df_part2.show()
# The per-partition counts are heavily unbalanced (recorded output: 10107 rows
# in one partition vs. 20 in another). If this DataFrame is used in a join,
# that imbalance shows up as data skew.

+-----------+-----+
|partitionId|count|
+-----------+-----+
|          0|10107|
|          2|   20|
+-----------+-----+

