In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [None]:
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("data_skew")
    .getOrCreate()
)

# Switch off adaptive query execution and automatic broadcast joins
# (a threshold of -1 disables broadcasting) -- presumably so the skew
# demonstrated later is not optimised away by Spark.
spark.conf.set(key="spark.sql.adaptive.enabled", value="false")
spark.conf.set(key="spark.sql.autoBroadcastJoinThreshold", value=-1)

# Uniform dataset 

In [None]:
# Single-column (`id`) frame holding the consecutive integers 0..999,999.
df_uniform = spark.range(1_000_000)

# Peek at the first three rows without truncating cell values.
df_uniform.show(n=3, truncate=False)

In [None]:
# Row count per Spark partition of the uniform frame: tag every row with the
# id of the partition it lives in, then count rows for each partition id.
(
    df_uniform
    .select("*", F.spark_partition_id().alias("partition"))
    .groupBy(F.col("partition"))
    .count()
    .orderBy(F.col("partition").asc())
    .show()
)

# Skewed dataset 

In [None]:
# One large frame (1,000,000 ids) and two tiny ones (10 ids each), each
# collapsed into a single partition.
df0 = spark.range(1_000_000).repartition(numPartitions=1)
df1 = spark.range(10).repartition(numPartitions=1)
df2 = spark.range(10).repartition(numPartitions=1)

# Stack the three frames; the result is expected to keep one partition per
# input, i.e. one huge partition next to two very small ones.
df_skew = (
    df0
    .union(df1)
    .union(df2)
)
df_skew.show(n=3, truncate=False)

In [None]:
# Row count per Spark partition of the skewed frame: tag every row with the
# id of the partition it lives in, then count rows for each partition id.
(
    df_skew
    .select("*", F.spark_partition_id().alias("partition"))
    .groupBy(F.col("partition"))
    .count()
    .orderBy(F.col("partition").asc())
    .show()
)

# Skewed dataset with a join

In [None]:
# Load the two parquet datasets that are joined further down.
df_transactions = spark.read.parquet("/opt/bitnami/spark/custom_data/chapter7/transactions/")
df_customers = spark.read.parquet("/opt/bitnami/spark/custom_data/chapter7/customers/")

In [None]:
# Distinct transaction count per customer, largest first; the top five rows
# show how unevenly transactions are spread over the `cust_id` join key.
(
    df_transactions
    .groupBy(F.col("cust_id"))
    .agg(F.countDistinct(F.col("txn_id")).alias("ct"))
    .orderBy(F.col("ct").desc())
    .show(n=5, truncate=False)
)

In [None]:
# Inner equi-join of transactions with customers on the shared `cust_id`
# column. This only defines the joined frame; no Spark job runs until an
# action is called on it.
df_txn_details = df_transactions.join(df_customers, "cust_id", "inner")

In [None]:
# count() is an action: it triggers execution of the transactions/customers
# join and returns the number of joined rows as this cell's output.
df_txn_details.count()

In [None]:
# Stop the Spark session (and its underlying SparkContext) now that the
# notebook is finished; cells above must be re-run from the top afterwards.
spark.stop()