In [1]:

// In python use: from pyspark.sql.functions import broadcast, split, lit
import org.apache.spark.sql.functions.{broadcast, split, lit}


val matchesBucketed = spark.read.option("header", "true")
                        .option("inferSchema", "true")
                        .csv("/home/iceberg/data/matches.csv")
val matchDetailsBucketed =  spark.read.option("header", "true")
                        .option("inferSchema", "true")
                        .csv("/home/iceberg/data/match_details.csv")


spark.sql("""DROP TABLE IF EXISTS bootcamp.matches_bucketed""")
val bucketedDDL = """
CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
     match_id STRING,
     is_team_game BOOLEAN,
     playlist_id STRING,
     completion_date TIMESTAMP
 )
 USING iceberg
 PARTITIONED BY (completion_date, bucket(16, match_id));
 """

# Partitions by completion_date (for time-based filtering).

# Buckets by match_id into 16 buckets.
 
 spark.sql(bucketedDDL)

// matchesBucketed.select(
//     $"match_id", $"is_team_game", $"playlist_id", $"completion_date"
//     )
//     .write.mode("append")
//     .partitionBy("completion_date")
//   .bucketBy(16, "match_id").saveAsTable("bootcamp.matches_bucketed")


// val bucketedDetailsDDL = """
// CREATE TABLE IF NOT EXISTS bootcamp.match_details_bucketed (
//     match_id STRING,
//     player_gamertag STRING,
//     player_total_kills INTEGER,
//     player_total_deaths INTEGER
// )
// USING iceberg
// PARTITIONED BY (bucket(16, match_id));
// """
// spark.sql(bucketedDetailsDDL)

// matchDetailsBucketed.select(
//     $"match_id", $"player_gamertag", $"player_total_kills", $"player_total_deaths")
//     .write.mode("append")
//   .bucketBy(16, "match_id").saveAsTable("bootcamp.match_details_bucketed")

spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

matchesBucketed.createOrReplaceTempView("matches")
matchDetailsBucketed.createOrReplaceTempView("match_details")


# Since match_id is bucketed (16 partitions), Spark should avoid full shuffle joins.

# Instead, Spark will use bucket joins, which are faster for large datasets.

spark.sql("""
    SELECT * FROM bootcamp.match_details_bucketed mdb JOIN bootcamp.matches_bucketed md 
    ON mdb.match_id = md.match_id
    AND md.completion_date = DATE('2016-01-01')
        
""").explain()


# Joins without bucketing, forcing a full shuffle join.

# This is slower and less efficient than a bucket join.

spark.sql("""
    SELECT * FROM match_details mdb JOIN matches md ON mdb.match_id = md.match_id
        
""").explain()

// spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "1000000000000")

// val broadcastFromThreshold = matches.as("m").join(matchDetails.as("md"), $"m.match_id" === $"md.match_id")
//   .select($"m.completion_date", $"md.player_gamertag",  $"md.player_total_kills")
//   .take(5)


# Forces a broadcast join using broadcast(matchDetails).

# Broadcasts matchDetails across all worker nodes.

# Works well only for small tables.

# 💡 When to Use Broadcast Joins?

# When the right-side table is small (<10MB).

# When avoiding shuffle joins improves performance.


// val explicitBroadcast = matches.as("m").join(broadcast(matchDetails).as("md"), $"m.match_id" === $"md.match_id")
//   .select($"md.*", split($"completion_date", " ").getItem(0).as("ds"))

// val bucketedValues = matchDetailsBucketed.as("mdb").join(matchesBucketed.as("mb"), $"mb.match_id" === $"mdb.match_id").explain()
// // .take(5)

// val values = matchDetailsBucketed.as("m").join(matchesBucketed.as("md"), $"m.match_id" === $"md.match_id").explain()

// explicitBroadcast.write.mode("overwrite").insertInto("match_details_bucketed")

// matches.withColumn("ds", split($"completion_date", " ").getItem(0)).write.mode("overwrite").insertInto("matches_bucketed")

// spark.sql(bucketedSQL)



Intitializing Scala interpreter ...

Spark Web UI available at http://5669d5fc56f3:4048
SparkContext available as 'sc' (version = 3.5.5, master = local[*], app id = local-1743398383846)
SparkSession available as 'spark'


<console>: 26: error: ';' expected but '#' found.

In [None]:
# Join Strategy	How It Works	When to Use?
# Bucket Join	Pre-sorts data by bucketing	✅ Best for large datasets with frequent joins
# Shuffle Join	Re-distributes data across nodes	❌ Inefficient for large joins
# Broadcast Join	Sends a small table to all nodes	✅ Best for small tables (<10MB)