In [1]:
# Locate the local Spark installation so that `pyspark` can be imported by
# the cells that follow. The path returned by find() is not displayed.
import findspark
findspark.init()
findspark.find()

# Inject a CSS rule that stops <pre> blocks from wrapping, so wide
# DataFrame.show() tables keep one table row per line.
from IPython.display import *
display(HTML(data="<style>pre { white-space: pre !important; }</style>"))

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Build (or reuse) a Spark session running in local mode with 4 worker threads.
# Adaptive Query Execution itself is switched ON; only its skew-join handling
# starts out switched OFF.
spark = (
    SparkSession.builder
    .appName("AqeHandlingSkewApp")
    .master("local[4]")
    .config("spark.dynamicAllocation.enabled", "false")
    # Adaptive Query Execution: enabled
    .config("spark.sql.adaptive.enabled", "true")
    # AQE skew-join handling: disabled
    .config("spark.sql.adaptive.skewJoin.enabled", "false")
    .getOrCreate()
)

# Handle to the underlying SparkContext.
sc = spark.sparkContext

# Last expression of the cell: renders the session summary.
spark

### Create Products DataFrame

With 2 million unique Product Ids

In [3]:
# Products table: one row per ProductId in 1..2000000 (the range end is
# exclusive), each with a random Price between 0 and 100 rounded to 2 decimals.
productsDF = spark.range(1, 2000001).selectExpr(
    "id AS ProductId",
    "ROUND(RAND() * 100, 2) AS Price",
)

# Preview the first 20 rows.
productsDF.show()

+---------+-----+
|ProductId|Price|
+---------+-----+
|        1|33.67|
|        2|31.56|
|        3|20.02|
|        4|37.07|
|        5|88.98|
|        6|93.15|
|        7| 5.74|
|        8|85.46|
|        9|96.78|
|       10|13.52|
|       11| 9.62|
|       12|69.39|
|       13|58.52|
|       14| 46.0|
|       15|41.26|
|       16|88.41|
|       17|41.81|
|       18|43.68|
|       19|52.19|
|       20|61.34|
+---------+-----+
only showing top 20 rows



### Create Sales DataFrame

1. 100 million sales records referring to Products
2. Close to 70% of the sales are for ProductId=1

In [4]:
# Sales table: 100 million rows, deliberately skewed so that roughly 70% of
# them reference ProductId = 1.
salesDF = (
    spark
    .range(1, 100000001)   # gives an 'id' column (the end is exclusive)
    .select(
        col("id").alias("SalesId"),

        # ProductId - about 70% of rows get Id 1, the rest are spread over
        # 1..2000000. CAST(... AS INT) truncates, so RAND() * 2000000 on its
        # own yields 0..1999999; the "+ 1" shifts that onto the Id range that
        # actually exists in Products. Without it, sales with ProductId 0
        # match no product (and vanish from an inner join) while
        # ProductId 2000000 never receives a sale.
        expr("""
                CASE
                    WHEN RAND() < 0.7
                        THEN 1
                    ELSE
                        CAST(RAND() * 2000000 AS INT) + 1
                END
             """).alias("ProductId"),

        # Quantity - random integer in 0..9
        expr("CAST(RAND() * 10 AS INTEGER)").alias("QuantitySold"),

        # Sales Date - random day within the 365 days ending today
        expr("DATE_ADD(CURRENT_DATE(), - CAST(RAND() * 365 AS INT))").alias("SalesDate")
    )
)

# Preview the first 20 rows.
salesDF.show()

+-------+---------+------------+----------+
|SalesId|ProductId|QuantitySold| SalesDate|
+-------+---------+------------+----------+
|      1|        1|           6|2022-10-19|
|      2|        1|           3|2022-07-16|
|      3|    94849|           0|2022-11-20|
|      4|        1|           2|2023-02-13|
|      5|        1|           0|2022-08-19|
|      6|  1134608|           6|2022-11-04|
|      7|   205731|           5|2022-09-26|
|      8|        1|           4|2022-03-13|
|      9|        1|           3|2023-01-14|
|     10|        1|           5|2023-02-25|
|     11|        1|           8|2022-07-09|
|     12|  1977349|           6|2022-12-25|
|     13|   286223|           7|2022-10-02|
|     14|        1|           9|2022-04-10|
|     15|  1499828|           0|2022-08-07|
|     16|        1|           2|2023-01-16|
|     17|        1|           6|2022-09-08|
|     18|        1|           6|2023-01-28|
|     19|        1|           2|2022-12-25|
|     20|        1|           2|

### Create views on Products & Sales

In [5]:
# Register both DataFrames as temporary views so they can be queried by name
# ("Sales", "Products") from spark.sql().
salesDF.createOrReplaceTempView("Sales")

productsDF.createOrReplaceTempView("Products")

### Check sales of each product

Data is highly skewed in favor of ProductId=1

In [6]:
# Count the sales rows per ProductId and list the most frequent Ids first.
# show() prints only the top 20 rows of the result.
spark.sql("""

SELECT ProductId, COUNT(*) AS ProductCount

FROM Sales

GROUP BY ProductId

ORDER BY ProductCount DESC

""").show()

+---------+------------+
|ProductId|ProductCount|
+---------+------------+
|        1|    70004064|
|  1977815|          38|
|   652857|          37|
|  1323805|          36|
|   376760|          36|
|  1679899|          35|
|  1829808|          35|
|   165319|          35|
|  1105029|          35|
|    17836|          35|
|   101456|          35|
|   640114|          35|
|  1519083|          35|
|  1941062|          35|
|   377511|          35|
|  1563812|          34|
|  1424152|          34|
|   520889|          34|
|  1038946|          34|
|   560680|          34|
+---------+------------+
only showing top 20 rows



### Find total sales amount per day

With Adaptive Query Execution: Skew Join disabled

In [7]:
# Daily sales amount: join every sale to its product to pick up the Price,
# sum Price * QuantitySold per SalesDate, and sort the days by that total,
# highest first. show() prints only the top 20 rows.
spark.sql("""

SELECT s.SalesDate, SUM(Price * QuantitySold) AS SalesAmount

FROM Sales s

    JOIN Products p ON p.ProductId = s.ProductId

GROUP BY s.SalesDate

ORDER BY SalesAmount DESC

""").show()

+----------+--------------------+
| SalesDate|         SalesAmount|
+----------+--------------------+
|2022-09-19| 4.793803739002777E7|
|2022-06-10| 4.785602659002823E7|
|2023-01-17| 4.785248331002798E7|
|2022-07-14| 4.784608933002815E7|
|2022-09-28| 4.780843401002773E7|
|2022-04-19|4.7782788870028116E7|
|2022-07-21| 4.778059243002849E7|
|2022-04-05| 4.777592552002846E7|
|2023-01-30| 4.775811895002778E7|
|2022-11-16| 4.775654313002841E7|
|2023-02-01|  4.77474750900281E7|
|2022-10-24| 4.773835376002826E7|
|2023-01-08| 4.773803484002813E7|
|2022-07-07|  4.77343092200274E7|
|2022-09-21| 4.773031625002805E7|
|2022-12-04|4.7730253010027975E7|
|2023-02-02| 4.772624556002751E7|
|2022-12-12|4.7726188580027685E7|
|2023-01-09|4.7717497330027826E7|
|2022-08-20| 4.771574968002813E7|
+----------+--------------------+
only showing top 20 rows



### Enable Adaptive Query Execution - Handling Data Skew in Joins

In [8]:
# Switch AQE's skew-join handling on at runtime; this setting was "false"
# when the session was built.
spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")

### Find total sales amount per day

With Adaptive Query Execution: Skew Join enabled

In [9]:
# Daily sales amount: join every sale to its product to pick up the Price,
# sum Price * QuantitySold per SalesDate, and sort the days by that total,
# highest first. show() prints only the top 20 rows.
# NOTE: SalesAmount is a floating-point sum, so its trailing digits can vary
# with the order in which rows are added up.
spark.sql("""

SELECT s.SalesDate, SUM(Price * QuantitySold) AS SalesAmount

FROM Sales s

    JOIN Products p ON p.ProductId = s.ProductId

GROUP BY s.SalesDate

ORDER BY SalesAmount DESC

""").show()

+----------+--------------------+
| SalesDate|         SalesAmount|
+----------+--------------------+
|2022-09-19| 4.793803738999283E7|
|2022-06-10| 4.785602658999281E7|
|2023-01-17| 4.785248330999274E7|
|2022-07-14|  4.78460893299928E7|
|2022-09-28|4.7808434009992935E7|
|2022-04-19|4.7782788869992815E7|
|2022-07-21| 4.778059242999279E7|
|2022-04-05|4.7775925519992694E7|
|2023-01-30|4.7758118949992664E7|
|2022-11-16| 4.775654312999285E7|
|2023-02-01| 4.774747508999284E7|
|2022-10-24| 4.773835375999287E7|
|2023-01-08|  4.77380348399927E7|
|2022-07-07| 4.773430921999268E7|
|2022-09-21| 4.773031624999287E7|
|2022-12-04| 4.773025300999274E7|
|2023-02-02|4.7726245559992775E7|
|2022-12-12|4.7726188579992846E7|
|2023-01-09| 4.771749732999276E7|
|2022-08-20| 4.771574967999276E7|
+----------+--------------------+
only showing top 20 rows

