In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast, split, lit, expr

spark = SparkSession.builder.getOrCreate()

# desabilitando broadcast join automático
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

'''
CONFIGURAÇÕES DE STORAGE PARTITION JOIN (SPJ)

SPJ é uma otimização de join no Spark SQL que evita o shuffle (embaralhamento de dados entre nós) 
ao aproveitar o layout físico dos dados já particionados no armazenamento — como S3, HDFS ou MinIO.

SPJ é uma generalização do Bucket Join, suportando muito mais casos, especialmente em formatos 
modernos como Iceberg (fontes de dados V2).
'''

# tenta eliminar shuffle usando o particionamento da própria fonte de dados
spark.conf.set("spark.sql.sources.v2.bucketing.enabled", "true")

# força o planner do Spark a preservar o agrupamento físico dos dados ao ler 
# tabelas Iceberg — principalmente quando elas são bucketed ou partitioned
spark.conf.set("spark.sql.iceberg.planning.preserve-data-grouping", "true")

# tenta eliminar shuffle quando um lado do join não possui valores das partições do outro lado
# requer que 'spark.sql.sources.v2.bucketing.enabled' seja true 
spark.conf.set("spark.sql.sources.v2.bucketing.pushPartValues.enabled", "true")

# requer que as chaves do join ou merge sejam as mesmas e estejam na mesma ordem para eliminar shuffle
# setando para false eu flexibilizo isso
spark.conf.set("spark.sql.requireAllClusterKeysForCoPartition", "false")

# isso aqui aparentemente otimiza o join  quando há desbalanceamento na distribuição dos dados
# requer que 'spark.sql.sources.v2.bucketing.enabled' seja true
# requer que 'spark.sql.sources.v2.bucketing.pushPartValues.enabled' seja true
spark.conf.set("spark.sql.sources.v2.bucketing.partiallyClusteredDistribution.enable", "true")

25/08/07 15:11:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/08/07 15:11:17 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/08/07 15:11:17 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [20]:
matches = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("/home/iceberg/data/matches.csv")

match_details = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("/home/iceberg/data/match_details.csv")

medals_matches_players = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("/home/iceberg/data/medals_matches_players.csv")

                                                                                

In [None]:
# "Bucket join `match_details`, `matches`, and `medal_matches_players` on `match_id` with `16` buckets"

In [21]:
# Convertendo os csv's para parquet's "bucketizados"

# Cria o database caso não exista
spark.sql("CREATE DATABASE IF NOT EXISTS bootcamp")

# o correto aqui seria eu escrever tabelas iceberg no lugar de parquet's
# mas por algum motivo ainda assim ele faz bucket join/storage partition join

matches.write \
    .format("parquet") \
    .bucketBy(16, "match_id") \
    .mode("overwrite") \
    .saveAsTable("bootcamp.matches_bucket")

match_details.write \
    .format("parquet") \
    .bucketBy(16, "match_id") \
    .mode("overwrite") \
    .saveAsTable("bootcamp.match_details_bucket")

medals_matches_players.write \
    .format("parquet") \
    .bucketBy(16, "match_id") \
    .mode("overwrite") \
    .saveAsTable("bootcamp.medals_matches_players_bucket")

'''
matches.writeTo("bootcamp.matches_bucket_table") \
  .using("iceberg") \
  .partitionedBy(expr("bucket(16, match_id)")) \
  .createOrReplace()

match_details.writeTo("bootcamp.match_details_bucket_table") \
  .using("iceberg") \
  .partitionedBy(expr("bucket(16, match_id)")) \
  .createOrReplace()

medals_matches_players.writeTo("bootcamp.medals_matches_players_bucket_table") \
  .using("iceberg") \
  .partitionedBy(expr("bucket(16, match_id)")) \
  .createOrReplace()
'''

                                                                                

'\nmatches.writeTo("bootcamp.matches_bucket_table")   .using("iceberg")   .partitionedBy(expr("bucket(16, match_id)"))   .createOrReplace()\n\nmatch_details.writeTo("bootcamp.match_details_bucket_table")   .using("iceberg")   .partitionedBy(expr("bucket(16, match_id)"))   .createOrReplace()\n\nmedals_matches_players.writeTo("bootcamp.medals_matches_players_bucket_table")   .using("iceberg")   .partitionedBy(expr("bucket(16, match_id)"))   .createOrReplace()\n'

In [22]:
# lendo os parquet's 'bucketizados'
matches_bucketed = spark.table("bootcamp.matches_bucket")
match_details_bucketed = spark.table("bootcamp.match_details_bucket")
medals_matches_players_bucketed = spark.table("bootcamp.medals_matches_players_bucket")

In [23]:
# sem shuffle!! =)
matches_bucketed.join(match_details_bucketed, on="match_id") \
                .join(medals_matches_players_bucketed, on="match_id") \
                .explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [match_id#1894, mapid#1895, is_team_game#1896, playlist_id#1897, game_variant_id#1898, is_match_over#1899, completion_date#1900, match_duration#1901, game_mode#1902, map_variant_id#1903, player_gamertag#1915, previous_spartan_rank#1916, spartan_rank#1917, previous_total_xp#1918, total_xp#1919, previous_csr_tier#1920, previous_csr_designation#1921, previous_csr#1922, previous_csr_percent_to_next_tier#1923, previous_csr_rank#1924, current_csr_tier#1925, current_csr_designation#1926, current_csr#1927, current_csr_percent_to_next_tier#1928, ... 24 more fields]
   +- SortMergeJoin [match_id#1894], [match_id#1986], Inner
      :- Project [match_id#1894, mapid#1895, is_team_game#1896, playlist_id#1897, game_variant_id#1898, is_match_over#1899, completion_date#1900, match_duration#1901, game_mode#1902, map_variant_id#1903, player_gamertag#1915, previous_spartan_rank#1916, spartan_rank#1917, previous_total_xp#1918, total_xp#1919

In [46]:
final_df = matches_bucketed.join(match_details_bucketed, on="match_id") \
                .join(medals_matches_players_bucketed, on="match_id")

In [47]:
final_df.printSchema()

root
 |-- match_id: string (nullable = true)
 |-- mapid: string (nullable = true)
 |-- is_team_game: boolean (nullable = true)
 |-- playlist_id: string (nullable = true)
 |-- game_variant_id: string (nullable = true)
 |-- is_match_over: boolean (nullable = true)
 |-- completion_date: timestamp (nullable = true)
 |-- match_duration: string (nullable = true)
 |-- game_mode: string (nullable = true)
 |-- map_variant_id: string (nullable = true)
 |-- player_gamertag: string (nullable = true)
 |-- previous_spartan_rank: integer (nullable = true)
 |-- spartan_rank: integer (nullable = true)
 |-- previous_total_xp: integer (nullable = true)
 |-- total_xp: integer (nullable = true)
 |-- previous_csr_tier: integer (nullable = true)
 |-- previous_csr_designation: integer (nullable = true)
 |-- previous_csr: integer (nullable = true)
 |-- previous_csr_percent_to_next_tier: integer (nullable = true)
 |-- previous_csr_rank: integer (nullable = true)
 |-- current_csr_tier: integer (nullable = true)


In [50]:
# Which player averages the most kills per game?

final_df.select("match_details_bucket.match_id", 
                "match_details_bucket.player_gamertag", 
                "match_details_bucket.player_total_kills") \
        .createOrReplaceTempView("final_df_1")

sql_exp = '''
-- essa cte garante apenas 1 linha de jogador por partida
WITH dedup AS (
  SELECT *,
         ROW_NUMBER() OVER (PARTITION BY match_id, player_gamertag ORDER BY match_id) AS rn
  FROM final_df_1
)
SELECT player_gamertag, AVG(player_total_kills) AS avg_kills
FROM dedup
WHERE rn = 1
GROUP BY player_gamertag
ORDER BY 2 DESC
'''

res = spark.sql(sql_exp)

res.show(1)



+---------------+---------+
|player_gamertag|avg_kills|
+---------------+---------+
|   gimpinator14|    109.0|
+---------------+---------+
only showing top 1 row



                                                                                

In [54]:
# Which playlist gets played the most?

final_df.select("matches_bucket.match_id", 
                "matches_bucket.playlist_id") \
        .createOrReplaceTempView("final_df_2")

sql_exp = '''
-- essa cte garante apenas 1 linha de playlist por partida
WITH dedup AS (
  SELECT *,
         ROW_NUMBER() OVER (PARTITION BY match_id, playlist_id ORDER BY match_id) AS rn
  FROM final_df_2
)
SELECT playlist_id, COUNT(1) AS sum_playlist_played -- COUNT ou SUM, nesse caso tanto faz
FROM dedup
WHERE rn = 1
GROUP BY playlist_id
ORDER BY 2 DESC
'''

res = spark.sql(sql_exp)

res.show(1)



+--------------------+-------------------+
|         playlist_id|sum_playlist_played|
+--------------------+-------------------+
|f72e0ef0-7c4a-430...|               7640|
+--------------------+-------------------+
only showing top 1 row



                                                                                

In [67]:
spark.stop()