In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast, split, lit

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Disable automatic broadcast joins: a threshold of "-1" turns the feature
# off, so a broadcast join only happens where the notebook requests one
# explicitly with broadcast().
AUTO_BROADCAST_JOIN_THRESHOLD = "-1"
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", AUTO_BROADCAST_JOIN_THRESHOLD)

In [2]:
def _read_csv_with_header(path):
    """Read one CSV file into a DataFrame.

    The first row is used as the column names and Spark infers the column
    types from the data.
    """
    reader = spark.read.option("header", "true").option("inferSchema", "true")
    return reader.csv(path)


# Source tables, one DataFrame per CSV file.
matches = _read_csv_with_header("/home/iceberg/data/matches.csv")

match_details = _read_csv_with_header("/home/iceberg/data/match_details.csv")

medals_matches_players = _read_csv_with_header("/home/iceberg/data/medals_matches_players.csv")

medals = _read_csv_with_header("/home/iceberg/data/medals.csv")

maps = _read_csv_with_header("/home/iceberg/data/maps.csv")

                                                                                

In [26]:
from pyspark.sql.functions import col

# Join 1: attach the map attributes to every match.
# Automatic broadcast joins are disabled in this notebook, so `maps` is
# hinted explicitly with broadcast(); the joined rows are the same as
# without the hint, only the physical join strategy changes.
maps_matches = broadcast(maps).join(matches, on="mapid")

# Join 2: attach the per-player medal rows to each match.
# The key column is renamed first so the joined result carries two distinctly
# named columns (match_id and match_id_2), which keeps the join condition
# below unambiguous.
# The DataFrame loaded from medals_matches_players.csv is named
# `medals_matches_players`; referencing it under any other name fails with a
# NameError on a fresh kernel.
medal_matches_players_renamed = medals_matches_players.withColumnRenamed("match_id", "match_id_2")

maps_with_medal_id = maps_matches.join(
    medal_matches_players_renamed,
    col("match_id") == col("match_id_2")
)

# Key columns available after both joins: mapid, match_id, match_id_2, medal_id
# maps_with_medal_id.show(3)

In [27]:
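# Row counts checked during development (calls left commented out):
# maps_with_medal_id.count()  -> 755,229 rows

# medals.count()              -> 183 rows

# Join key shared with the medals table: medal_id
# medals.show(3)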

                                                                                

755229

In [25]:
# Print the column names and types of the joined DataFrame.
maps_with_medal_id.printSchema()

root
 |-- mapid: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- match_id: string (nullable = true)
 |-- is_team_game: boolean (nullable = true)
 |-- playlist_id: string (nullable = true)
 |-- game_variant_id: string (nullable = true)
 |-- is_match_over: boolean (nullable = true)
 |-- completion_date: timestamp (nullable = true)
 |-- match_duration: string (nullable = true)
 |-- game_mode: string (nullable = true)
 |-- map_variant_id: string (nullable = true)
 |-- match_id_2: string (nullable = true)
 |-- player_gamertag: string (nullable = true)
 |-- medal_id: long (nullable = true)
 |-- count: integer (nullable = true)



In [30]:
spark.sql("""
    CREATE DATABASE IF NOT EXISTS bootcamp
""")

# Staging table holding the result of joining maps, matches and
# medals_matches_players. The column list mirrors the schema of
# `maps_with_medal_id`.
spark.sql("""
    CREATE TABLE IF NOT EXISTS bootcamp.stg_maps_medals (
        mapid STRING,
        name STRING,
        description STRING,
        match_id STRING,
        is_team_game BOOLEAN,
        playlist_id STRING,
        game_variant_id STRING,
        is_match_over BOOLEAN,
        completion_date TIMESTAMP,
        match_duration STRING,
        game_mode STRING,
        map_variant_id STRING,
        match_id_2 STRING,
        player_gamertag STRING,
        medal_id BIGINT,
        count INT
    )
    USING ICEBERG
""")

# Replace the table contents instead of appending. With append(), every
# re-run of this cell would add another full copy of the joined rows to the
# staging table. The table is declared without partitioning, so
# overwritePartitions() swaps out all existing data for the new rows, which
# makes this cell safe to run repeatedly.
maps_with_medal_id.writeTo("bootcamp.stg_maps_medals").overwritePartitions()

                                                                                

In [31]:
# Task: "Explicitly broadcast JOINs `medals` and `maps`"
#
# Several joins were needed to bring the 'maps' table together with the
# 'medal_id' key. The result of all those joins was persisted in the staging
# table 'bootcamp.stg_maps_medals', which is read back here.
stg_maps_medals_matches = spark.read.table("bootcamp.stg_maps_medals")

# Compare the execution plans with and without the explicit broadcast hint.
explicit_broadcast_join = stg_maps_medals_matches.join(broadcast(medals), on="medal_id")
explicit_broadcast_join.explain()

unhinted_join = stg_maps_medals_matches.join(medals, on="medal_id")
unhinted_join.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [medal_id#1238L, mapid#1224, name#1225, description#1226, match_id#1227, is_team_game#1228, playlist_id#1229, game_variant_id#1230, is_match_over#1231, completion_date#1232, match_duration#1233, game_mode#1234, map_variant_id#1235, match_id_2#1236, player_gamertag#1237, count#1239, sprite_uri#169, sprite_left#170, sprite_top#171, sprite_sheet_width#172, sprite_sheet_height#173, sprite_width#174, sprite_height#175, classification#176, ... 3 more fields]
   +- BroadcastHashJoin [medal_id#1238L], [medal_id#168L], Inner, BuildRight, false
      :- Filter isnotnull(medal_id#1238L)
      :  +- BatchScan demo.bootcamp.stg_maps_medals[mapid#1224, name#1225, description#1226, match_id#1227, is_team_game#1228, playlist_id#1229, game_variant_id#1230, is_match_over#1231, completion_date#1232, match_duration#1233, game_mode#1234, map_variant_id#1235, match_id_2#1236, player_gamertag#1237, medal_id#1238L, count#1239] demo.bootcamp.st

In [32]:
# Stop the Spark session.
spark.stop()