In [None]:
# Fetch both high-water-mark timestamps published by the "Extract_Max_ts" task.
# The same debugValue is supplied for each key (presumably the fallback used
# when the notebook is run interactively rather than as a job task — confirm
# against the taskValues API docs).
max_match_ts, max_delivery_ts = (
    dbutils.jobs.taskValues.get(
        taskKey="Extract_Max_ts",
        key=ts_key,
        debugValue="2026-01-01 00:00:00",
    )
    for ts_key in ("max_match_ts", "max_delivery_ts")
)

print(max_match_ts, max_delivery_ts)

In [None]:
# Read every bronze match row whose ingestion_ts is at or before the
# high-water mark obtained from the upstream task.
#
# The bound is passed as a literal through the DataFrame API instead of being
# spliced into SQL text with an f-string: a quote (or any other unexpected
# character) in the task value can then neither break nor alter the query.
# The comparison built here is the same one the SQL text produced
# (column <= string literal).
from pyspark.sql import functions as F

df = spark.table("bronze.match_loading").where(
    F.col("ingestion_ts") <= F.lit(max_match_ts)
)

In [None]:
# Narrow the match rows down to the two team-name columns.
selected_team_df = df.select("team1", "team2")

In [None]:
from pyspark.sql.functions import col, lit, current_timestamp

# Stack team1 and team2 into a single "team_name" column, drop NULL names,
# and de-duplicate so the same team is never offered twice from one batch.
team1_names_df = selected_team_df.select(col("team1").alias("team_name"))
team2_names_df = selected_team_df.select(col("team2").alias("team_name"))

source_team_df = (
    team1_names_df
    .union(team2_names_df)
    .where(col("team_name").isNotNull())
    .distinct()
)

In [None]:
from pyspark.sql.functions import split, col
from pyspark.sql.functions import transform, col
from pyspark.sql.functions import concat_ws, upper


# Derive an upper-case abbreviation ("short_name") from each team name by
# joining the first character of every space-separated word.

# Step 1: tokenize team_name on single spaces into an array column "words".
words_col = split(col("team_name"), " ")
teams_split_df = source_team_df.withColumn("words", words_col)

# Step 2: map each word to its first character (empty tokens stay empty).
initials_col = transform(col("words"), lambda word: word.substr(1, 1))
teams_letters_df = teams_split_df.withColumn("letters", initials_col)

# Step 3: glue the initials together with no separator and upper-case them.
short_name_col = upper(concat_ws("", col("letters")))
teams_short_df = teams_letters_df.withColumn("short_name", short_name_col)

In [None]:
# Project away the helper columns; only team_name and short_name go forward.
teams_short_df_final = teams_short_df.select(col("team_name"), col("short_name"))

In [None]:
from pyspark.sql.functions import lit, current_timestamp

# 1. Prepare data: append the two extra columns the MERGE below inserts.
#    - active_flag: constant True (newly seen teams are assumed active)
#    - created_at:  timestamp taken when the query is evaluated
active_flag_col = lit(True)
created_at_col = current_timestamp()

final_team_df = (
    teams_short_df_final
    .withColumn("active_flag", active_flag_col)
    .withColumn("created_at", created_at_col)
)


In [None]:
# 2. Register the prepared DataFrame as the temp view "source_teams" so the
#    SQL MERGE statement can reference it by name. createOrReplaceTempView
#    overwrites any existing view with that name, so re-running this cell
#    is safe.
final_team_df.createOrReplaceTempView("source_teams")


In [None]:
# 3. Insert-only merge into silver.dim_team, keyed on team_name.
#    The statement has a single WHEN NOT MATCHED clause: rows whose team_name
#    already exists in the target are left untouched; unseen names are inserted
#    with their short_name, active_flag and created_at.
merge_sql = """
    MERGE INTO silver.dim_team AS target
    USING source_teams AS source
    ON target.team_name = source.team_name
    
    WHEN NOT MATCHED THEN
      INSERT (
        team_name, 
        short_name, 
        active_flag, 
        created_at
      )
      VALUES (
        source.team_name, 
        source.short_name, 
        source.active_flag, 
        source.created_at
      )
"""

# Kept as the cell's last expression so the merge-metrics DataFrame is shown.
spark.sql(merge_sql)

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]