In [0]:
from pyspark.sql.functions import *

In [0]:
# Turn off Adaptive Query Execution (AQE) for this Spark session.
# NOTE(review): presumably so the join strategies demonstrated below are not
# re-planned at runtime and stay visible in the query plan — confirm.
spark.conf.set("spark.sql.adaptive.enabled", "false")


| Transformation Type | Definition | Data Movement | Examples |
|---------------------|------------|----------------|----------|
| **Narrow**          | Each output partition depends on **a single input partition** | ❌ No shuffle | `map`, `filter`, `flatMap`, `union` |
| **Wide**            | Each output partition depends on **multiple input partitions** | ✅ Causes shuffle | `join`, `groupBy`, `reduceByKey`, `distinct` |


**Goal:** avoid shuffles wherever possible, because a shuffle moves data between partitions.


## OPTIMIZE JOIN



| Table Size | Recommendation |
|------------|----------------|
| **Small table ≤ 10–25 MB** | Broadcast it. |
| **Small table > 100 MB**   | Do **not** broadcast — may cause memory pressure. |
| **Tuning** | Use the Spark config `spark.sql.autoBroadcastJoinThreshold` to control the automatic broadcast size limit (default: 10 MB). |

In [0]:
# "Big" side of the join (a tiny sample standing in for a large table).
df_transactions = spark.createDataFrame(
    [(1, "US", 100), (2, "IN", 200), (3, "UK", 150), (4, "US", 80)],
    schema=["id", "country_code", "amount"],
)

# "Small" side of the join: country-code -> country-name lookup.
df_countries = spark.createDataFrame(
    [("US", "United States"), ("IN", "India"), ("UK", "United Kingdom")],
    schema=["country_code", "country_name"],
)


In [0]:
# Plain inner join on country_code, with no broadcast hint.
# Joining on a column expression (rather than the column name) keeps the
# country_code column from both sides in the result.
df_join = df_transactions.join(
    df_countries,
    on=df_transactions.country_code == df_countries.country_code,
    how="inner",
)
df_join.show()

+---+------------+------+------------+--------------+
| id|country_code|amount|country_code|  country_name|
+---+------------+------+------------+--------------+
|  2|          IN|   200|          IN|         India|
|  3|          UK|   150|          UK|United Kingdom|
|  1|          US|   100|          US| United States|
|  4|          US|    80|          US| United States|
+---+------------+------+------------+--------------+



In the query plan (for example via `df_join.explain()` or the Spark UI) you should see a Sort-Merge Join (SMJ). SMJ is the default join strategy Spark uses when both tables are large and neither side is broadcast.


## Use broadcast

In [0]:
# Same inner join as before, but df_countries is wrapped in broadcast() to
# mark it as the side to broadcast for the join.
df_join_opt = df_transactions.join(
    broadcast(df_countries),
    on=df_transactions.country_code == df_countries.country_code,
    how="inner",
)


**You can also broadcast a variable**, such as a dictionary.


In [0]:
# Product IDs to enrich; note that "1004" has no entry in the lookup below.
df = spark.createDataFrame(
    [("1001",), ("1002",), ("1004",)],
    schema=["product_id"],
)

# Small in-memory lookup: product ID -> product name.
product_dict = {"1001": "iPhone", "1002": "Samsung", "1003": "Pixel"}

In [0]:
# Wrap the lookup dictionary in a Spark broadcast variable; the dictionary is
# read back through `broad_vr.value`.
broad_vr = spark.sparkContext.broadcast(product_dict)

In [0]:
# Sanity check: read one key back from the broadcast dictionary.
broad_vr.value.get('1001')

'iPhone'

In [0]:
def mymap(x):
    """Return the name stored for product ID ``x`` in the broadcast lookup.

    The lookup is the dictionary held by the ``broad_vr`` broadcast variable.
    Returns ``None`` when ``x`` has no entry in it.
    """
    lookup = broad_vr.value
    return lookup.get(x)

In [0]:
# Register mymap as a Spark UDF; the return type is spelled out explicitly
# (string is also what udf() uses when no return type is given).
mymapudf = udf(mymap, returnType="string")


In [0]:
# Add a product_name column by applying the lookup UDF to each product_id.
df_with_names = df.withColumn(
    "product_name",
    mymapudf("product_id"),
)

In [0]:
# Product "1004" is missing from product_dict, so mymap returns None for it
# and the row shows NULL in product_name.
df_with_names.show()

+----------+------------+
|product_id|product_name|
+----------+------------+
|      1001|      iPhone|
|      1002|     Samsung|
|      1004|        NULL|
+----------+------------+

