<a href="https://colab.research.google.com/github/visshal2301/AdvanceSpark_GoogleColab/blob/main/6_1_salting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [2]:
!pip install -q pyngrok

In [3]:
spark = SparkSession.builder \
       .master("local[*]") \
       .appName("Colab") \
       .config('spark.ui.port', '4050') \
       .getOrCreate()
spark

In [4]:
# prompt: need to see spark UI using colab
from pyngrok import ngrok
from google.colab import userdata

# Get the ngrok authtoken from Colab secrets
# You need to add your ngrok authtoken to Colab's secrets with the name 'NGROK_AUTH_TOKEN'
ngrok_auth_token = userdata.get('NGROK_AUTH_TOKEN')
ngrok.set_auth_token(ngrok_auth_token)

public_url = ngrok.connect(4050).public_url
print(f"Spark UI: {public_url}")

Spark UI: https://fe9214fe12dd.ngrok-free.app


In [5]:
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
spark.conf.set("spark.sql.shuffle.partitions",3)

In [6]:
transactions_file = "/content/drive/MyDrive/data/data_skew/transactions.parquet"
df_transactions = spark.read.parquet(transactions_file)

In [7]:
df_transactions.show(5, False)

+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-----------+
|cust_id   |start_date|end_date  |txn_id         |date      |year|month|day|expense_type |amt   |city       |
+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-----------+
|C0YDPQWPBJ|2010-07-01|2018-12-01|TZ5SMKZY9S03OQJ|2018-10-07|2018|10   |7  |Entertainment|10.42 |boston     |
|C0YDPQWPBJ|2010-07-01|2018-12-01|TYIAPPNU066CJ5R|2016-03-27|2016|3    |27 |Motor/Travel |44.34 |portland   |
|C0YDPQWPBJ|2010-07-01|2018-12-01|TETSXIK4BLXHJ6W|2011-04-11|2011|4    |11 |Entertainment|3.18  |chicago    |
|C0YDPQWPBJ|2010-07-01|2018-12-01|TQKL1QFJY3EM8LO|2018-02-22|2018|2    |22 |Groceries    |268.97|los_angeles|
|C0YDPQWPBJ|2010-07-01|2018-12-01|TYL6DFP09PPXMVB|2010-10-16|2010|10   |16 |Entertainment|2.66  |chicago    |
+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-----------+
only showi

In [8]:
customers_file = "/content/drive/MyDrive/data/data_skew/customers.parquet"
df_customers = spark.read.parquet(customers_file)

In [9]:
df_transactions.printSchema()
df_transactions.show(3, False)

root
 |-- cust_id: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- end_date: string (nullable = true)
 |-- txn_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- expense_type: string (nullable = true)
 |-- amt: string (nullable = true)
 |-- city: string (nullable = true)

+----------+----------+----------+---------------+----------+----+-----+---+-------------+-----+--------+
|cust_id   |start_date|end_date  |txn_id         |date      |year|month|day|expense_type |amt  |city    |
+----------+----------+----------+---------------+----------+----+-----+---+-------------+-----+--------+
|C0YDPQWPBJ|2010-07-01|2018-12-01|TZ5SMKZY9S03OQJ|2018-10-07|2018|10   |7  |Entertainment|10.42|boston  |
|C0YDPQWPBJ|2010-07-01|2018-12-01|TYIAPPNU066CJ5R|2016-03-27|2016|3    |27 |Motor/Travel |44.34|portland|
|C0YDPQWPBJ|2010-07-01|2018-12-01|TETSXIK4BLXH

In [10]:
df_customers.printSchema()
df_customers.show(3, False)

root
 |-- cust_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- birthday: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- city: string (nullable = true)

+----------+------------+---+------+----------+-----+-------+
|cust_id   |name        |age|gender|birthday  |zip  |city   |
+----------+------------+---+------+----------+-----+-------+
|C007YEYTX9|Aaron Abbott|34 |Female|7/13/1991 |97823|boston |
|C00B971T1J|Aaron Austin|37 |Female|12/16/2004|30332|chicago|
|C00WRSJF1Q|Aaron Barnes|29 |Female|3/11/1977 |23451|denver |
+----------+------------+---+------+----------+-----+-------+
only showing top 3 rows



In [27]:
(
    df_transactions
    .groupBy("cust_id")
    .agg(F.countDistinct("txn_id").alias("ct"))
    .orderBy(F.desc("ct"))
    .show(5, False)
)

+----------+--------+
|cust_id   |ct      |
+----------+--------+
|C0YDPQWPBJ|17539732|
|CBW3FMEAU7|7999    |
|C3KUDEN3KO|7999    |
|C89FCEGPJP|7999    |
|CHNFNR89ZV|7998    |
+----------+--------+
only showing top 5 rows



In [11]:
df_txn_details = (
    df_transactions.join(
        df_customers,
        on="cust_id",
        how="inner"
    )
)

In [23]:
(
    df_txn_details
    .withColumn("partition", F.spark_partition_id())
    .where("cust_id='C0YDPQWPBJ'") # Add your where clause here
    .groupBy("cust_id", "partition")
    .count()
    .orderBy("cust_id", "partition")
    .show()
)

+----------+---------+--------+
|   cust_id|partition|   count|
+----------+---------+--------+
|C0YDPQWPBJ|        1|17539732|
+----------+---------+--------+



In [12]:
import time
start_time = time.time()
df_txn_details.count()
print(f"time taken: {time.time() - start_time}")

time taken: 63.674864768981934


In [13]:

SALT_NUMBER = int(spark.conf.get("spark.sql.shuffle.partitions"))
SALT_NUMBER

3

In [14]:
df_txn_details_skew = df_txn_details.withColumn("salt", (F.rand() * SALT_NUMBER).cast("int"))

In [15]:
df_txn_details_skew.show(5)

+----------+----------+----------+---------------+----------+----+-----+---+-------------+-----+------------+-------------+---+------+--------+-----+-----------+----+
|   cust_id|start_date|  end_date|         txn_id|      date|year|month|day| expense_type|  amt|        city|         name|age|gender|birthday|  zip|       city|salt|
+----------+----------+----------+---------------+----------+----+-----+---+-------------+-----+------------+-------------+---+------+--------+-----+-----------+----+
|C01AZWQMF3|2010-10-01|2019-03-01|T0KD2W40RJK4WPF|2017-09-22|2017|    9| 22|Entertainment| 3.64|      denver|Aaron Barrett| 31|  Male|7/9/1998|46613|los_angeles|   0|
|C01AZWQMF3|2010-10-01|2019-03-01|TO2WCFYSTV3F80D|2016-03-20|2016|    3| 20| Motor/Travel|13.49|   san_diego|Aaron Barrett| 31|  Male|7/9/1998|46613|los_angeles|   0|
|C01AZWQMF3|2010-10-01|2019-03-01|T00GZEM2XP0PEY3|2013-05-04|2013|    5|  4|    Groceries| 9.64|   san_diego|Aaron Barrett| 31|  Male|7/9/1998|46613|los_angeles|   2

In [16]:
df_customers_skew = (
    df_customers
    .withColumn("salt_values", F.array([F.lit(i) for i in range(SALT_NUMBER)]))
    .withColumn("salt", F.explode(F.col("salt_values")))
)

In [17]:
df_customers_skew.show(5)

+----------+------------+---+------+----------+-----+-------+-----------+----+
|   cust_id|        name|age|gender|  birthday|  zip|   city|salt_values|salt|
+----------+------------+---+------+----------+-----+-------+-----------+----+
|C007YEYTX9|Aaron Abbott| 34|Female| 7/13/1991|97823| boston|  [0, 1, 2]|   0|
|C007YEYTX9|Aaron Abbott| 34|Female| 7/13/1991|97823| boston|  [0, 1, 2]|   1|
|C007YEYTX9|Aaron Abbott| 34|Female| 7/13/1991|97823| boston|  [0, 1, 2]|   2|
|C00B971T1J|Aaron Austin| 37|Female|12/16/2004|30332|chicago|  [0, 1, 2]|   0|
|C00B971T1J|Aaron Austin| 37|Female|12/16/2004|30332|chicago|  [0, 1, 2]|   1|
+----------+------------+---+------+----------+-----+-------+-----------+----+
only showing top 5 rows



In [18]:
df_txn_details_skew = (
    df_txn_details_skew.join(
        df_customers_skew,
        on=["cust_id", "salt"],
        how="inner"
    )
)

In [19]:
import time
start_time = time.time()
df_txn_details_skew.count()
print(f"time taken: {time.time() - start_time}")

time taken: 113.88618326187134


In [24]:
(
    df_txn_details_skew
    .withColumn("partition", F.spark_partition_id())
    .where("cust_id='C0YDPQWPBJ'")
    .groupBy("cust_id", "partition")
    .count()
    .orderBy("cust_id", "partition")
    .show()
)

+----------+---------+--------+
|   cust_id|partition|   count|
+----------+---------+--------+
|C0YDPQWPBJ|        0| 5843085|
|C0YDPQWPBJ|        1|11696647|
+----------+---------+--------+



In [1]:
spark.stop()

NameError: name 'spark' is not defined