In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast, split, lit

# Reuse the already-running Spark session if there is one, otherwise start it.
spark = SparkSession.builder.getOrCreate()

# Both CSV files are read with the same reader settings: first row is the header,
# and column types are inferred from the data.
csv_reader_options = {"header": "true", "inferSchema": "true"}

# NOTE: despite the `_bucketed` suffix, these two names are plain CSV-backed
# DataFrames; nothing in this cell buckets them.
matches_bucketed = (
    spark.read
    .options(**csv_reader_options)
    .csv("/home/iceberg/data/matches.csv")
)

match_details_bucketed = (
    spark.read
    .options(**csv_reader_options)
    .csv("/home/iceberg/data/match_details.csv")
)

25/08/07 00:38:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

In [7]:
# Count the rows of the matches DataFrame that was read from CSV.
matches_bucketed.count()

10000

In [8]:
# Count the rows of the match-details DataFrame that was read from CSV.
match_details_bucketed.count()

10000

In [2]:
# Start from a clean slate: drop BOTH target tables. The tables are created with
# IF NOT EXISTS and later filled with append(), so a table that survives a re-run
# would receive the same rows a second time.
spark.sql("DROP TABLE IF EXISTS bootcamp.matches_bucketed")
spark.sql("DROP TABLE IF EXISTS bootcamp.match_details_bucketed")

# Iceberg supports hash bucketing as a partition transform: bucket(N, column).
# Both tables are bucketed into 16 buckets on match_id so that a given match_id
# lands in the same bucket number in each table, which is what lets the two sides
# of a join on match_id line up bucket-by-bucket.
# The course advice quoted here is to always use a power of 2 for the bucket count.

# matches: partitioned by completion_date and by the match_id bucket, so the number
# of partitions is at most (distinct completion_date values) x 16.
bucketed_matches_ddl = """
CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
    match_id STRING,
    is_team_game BOOLEAN,
    playlist_id STRING,
    completion_date TIMESTAMP
)
USING iceberg
PARTITIONED BY (completion_date, bucket(16, match_id))
"""

spark.sql(bucketed_matches_ddl)

# match_details: partitioned only by the match_id bucket, i.e. at most 16 partitions.
bucketed_details_ddl = """
CREATE TABLE IF NOT EXISTS bootcamp.match_details_bucketed (
    match_id STRING,
    player_gamertag STRING,
    player_total_kills INT,
    player_total_deaths INT
)
USING iceberg
PARTITIONED BY (bucket(16, match_id))
"""

spark.sql(bucketed_details_ddl)

DataFrame[]

In [4]:
# History: an earlier attempt used the V1 writer
# (`.write.mode("append").bucketBy(16, "match_id") ... .saveAsTable(...)`) and ran
# out of memory in this environment, so it was abandoned.
#
# The tables are written with DataFrameWriterV2 instead: `writeTo(...).append()`
# inserts into the already-created Iceberg table, and the physical layout comes from
# the PARTITIONED BY clause of that table's DDL rather than from writer options.
#
# NOTE: append() is not idempotent - running this cell again without recreating the
# tables inserts the same rows a second time.
matches_columns = ["match_id", "is_team_game", "playlist_id", "completion_date"]
match_details_columns = [
    "match_id",
    "player_gamertag",
    "player_total_kills",
    "player_total_deaths",
]

# Only the columns declared in each table's DDL are selected before writing.
matches_bucketed.select(*matches_columns) \
  .writeTo("bootcamp.matches_bucketed").append()

match_details_bucketed.select(*match_details_columns) \
  .writeTo("bootcamp.match_details_bucketed").append()

                                                                                

In [5]:
# Turn off Spark's automatic broadcast joins (a threshold of -1 disables them), so the
# join plans inspected later in the notebook are not replaced by a broadcast join.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

In [6]:
# Expose the two CSV-backed DataFrames to Spark SQL under short view names.
# Re-running is safe: an existing view with the same name is simply replaced.
for view_name, source_frame in (
    ("matches", matches_bucketed),
    ("match_details", match_details_bucketed),
):
    source_frame.createOrReplaceTempView(view_name)

25/08/06 20:30:22 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [8]:
# Author's notes:
# - Because the data insert ran out of memory, the effect of bucketing on the join
#   cannot be seen clearly here; even so, the first join's plan is reported as the
#   smaller of the two.
# - In Zach's run the insert succeeded, and his first join showed no shuffle at all.
#
# NOTE(review): the plan recorded for the first query shows an Exchange
# (hashpartitioning) on both sides, i.e. a shuffle. Avoiding it presumably needs
# bucket-partitioned tables plus Spark/Iceberg storage-partitioned-join settings
# (e.g. spark.sql.sources.v2.bucketing.enabled) - confirm against the docs.

# Query 1: join between the two Iceberg tables.
iceberg_tables_join_sql = """
    SELECT * 
    FROM bootcamp.match_details_bucketed mdb 
    JOIN bootcamp.matches_bucketed md 
    ON mdb.match_id = md.match_id
"""

# Query 2: the same join over the temp views built on the CSV-backed DataFrames
# (no cache() call is visible in this notebook, so these views are not cached).
temp_views_join_sql = """
    SELECT * 
    FROM match_details mdb 
    JOIN matches md 
    ON mdb.match_id = md.match_id
"""

# Print the physical plan of each query, Iceberg tables first.
spark.sql(iceberg_tables_join_sql).explain()
spark.sql(temp_views_join_sql).explain()


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [match_id#245], [match_id#249], Inner
   :- Sort [match_id#245 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(match_id#245, 200), ENSURE_REQUIREMENTS, [plan_id=159]
   :     +- BatchScan demo.bootcamp.match_details_bucketed[match_id#245, player_gamertag#246, player_total_kills#247, player_total_deaths#248] demo.bootcamp.match_details_bucketed (branch=null) [filters=match_id IS NOT NULL, groupedBy=] RuntimeFilters: []
   +- Sort [match_id#249 ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(match_id#249, 200), ENSURE_REQUIREMENTS, [plan_id=160]
         +- Filter isnotnull(match_id#249)
            +- BatchScan demo.bootcamp.matches_bucketed[match_id#249, is_team_game#250, playlist_id#251, completion_date#252] demo.bootcamp.matches_bucketed (branch=null) [filters=match_id IS NOT NULL, groupedBy=] RuntimeFilters: []


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoi

In [10]:
# Shut down the Spark session; any later cell that uses `spark` needs a new session.
spark.stop()