In [None]:
%sql
-- Gold fact table: one row per (season, team_sk, opponent_sk) holding the
-- head-to-head record and per-match run averages for that directed pairing.
-- IF NOT EXISTS makes this a no-op when the table is already present, so the
-- cell is safe to re-run.
--
-- Column names must match the DataFrame written by the load step of this
-- notebook. That step emits the opponent key as `opponent_sk`, so the DDL uses
-- the same name; a different name here would simply be replaced on the first
-- load because the write uses overwriteSchema.
CREATE TABLE IF NOT EXISTS gold.ipl_team_vs_team_season (
  -- Grain
  season INT,
  team_sk BIGINT,
  opponent_sk BIGINT,

  -- Head-to-head outcome counts
  matches_played INT,
  wins INT,
  losses INT,

  -- Total runs divided by matches_played, rounded to 2 decimals by the load
  avg_runs_scored DOUBLE,
  avg_runs_conceded DOUBLE,

  -- Time the row was produced by the load
  ingestion_ts TIMESTAMP
)
USING DELTA
-- PARTITIONED BY (season)  -- optional: uncomment for better performance with many seasons
TBLPROPERTIES (
  delta.autoOptimize.optimizeWrite = true,
  delta.autoOptimize.autoCompact = true
);

In [None]:
from pyspark.sql import functions as F

# Load the two silver-layer inputs used throughout this notebook:
#   fact_matches    - read for season, team keys and winner per match
#   fact_deliveries - read for per-delivery runs and batting/bowling team keys
fact_matches, fact_deliveries = (
    spark.table("silver.fact_matches"),
    spark.table("silver.fact_deliveries"),
)

In [None]:
# Step 1: Directed match pairs.
# Every match is emitted twice, once from each side's point of view, so that
# later aggregations can group by (team_sk, opponent_sk) without special-casing
# which side was listed as team1.
def _directed_matches(team_col, opponent_col):
    """Project fact_matches so `team_col` is the team and `opponent_col` the opponent."""
    return fact_matches.select(
        "season",
        "match_id",
        F.col(team_col).alias("team_sk"),
        F.col(opponent_col).alias("opponent_sk"),
        "winner_sk",
    )


team1_df = _directed_matches("team1_sk", "team2_sk")
team2_df = _directed_matches("team2_sk", "team1_sk")

# union() matches columns by position; both frames come from the same helper,
# so their column order is identical.
match_teams = team1_df.union(team2_df)

In [None]:
# Step 2: Head-to-head outcomes per (season, team, opponent).
# A row counts as a win when winner_sk equals team_sk and as a loss when it
# equals opponent_sk. Rows where winner_sk is NULL or matches neither side fall
# into the otherwise(0) branch of both counters, so they add to matches_played
# only.
team_won = F.col("winner_sk") == F.col("team_sk")
team_lost = F.col("winner_sk") == F.col("opponent_sk")

outcomes = (
    match_teams
    .groupBy("season", "team_sk", "opponent_sk")
    .agg(
        F.count("*").cast("int").alias("matches_played"),
        F.sum(F.when(team_won, 1).otherwise(0)).cast("int").alias("wins"),
        F.sum(F.when(team_lost, 1).otherwise(0)).cast("int").alias("losses"),
    )
)

In [None]:
# Step 3: Total runs scored by each team against each opponent, per season.
# Season is looked up from fact_matches via match_id. Only deliveries with
# is_super_over == 0 are kept, so super-over runs do not count. The batting
# side is the "team" and the bowling side is the "opponent".
runs_scored_df = (
    fact_deliveries
    .join(fact_matches.select("match_id", "season"), on="match_id")
    .where(F.col("is_super_over") == 0)
    .groupBy(
        "season",
        F.col("batting_team_sk").alias("team_sk"),
        F.col("bowling_team_sk").alias("opponent_sk"),
    )
    .agg(F.sum("total_runs").alias("runs_scored_total"))
)

In [None]:
# Step 4: Total runs conceded by each team to each opponent, per season.
# Same aggregation as the runs-scored step with the roles swapped: the bowling
# side is the "team" and the batting side is the "opponent". Only deliveries
# with is_super_over == 0 are kept.
runs_conceded_df = (
    fact_deliveries
    .join(fact_matches.select("match_id", "season"), on="match_id")
    .where(F.col("is_super_over") == 0)
    .groupBy(
        "season",
        F.col("bowling_team_sk").alias("team_sk"),
        F.col("batting_team_sk").alias("opponent_sk"),
    )
    .agg(F.sum("total_runs").alias("runs_conceded_total"))
)

In [None]:
# Step 5: Assemble the final DataFrame.
# Outcomes drive the result; run totals are left-joined so a pairing with no
# matching delivery rows is kept and its totals are treated as 0.
def _per_match_avg(total_col, out_name):
    """Return `total_col / matches_played` rounded to 2 decimals, aliased as `out_name`.

    A NULL total (no matching rows in the left join) is treated as 0.0, and the
    result is 0.0 whenever matches_played is not greater than zero.
    """
    return (
        F.when(
            F.col("matches_played") > 0,
            F.round(F.coalesce(F.col(total_col), F.lit(0.0)) / F.col("matches_played"), 2),
        )
        .otherwise(F.lit(0.0))
        .alias(out_name)
    )


_grain_keys = ["season", "team_sk", "opponent_sk"]

gold_df = (
    outcomes
    .join(runs_scored_df, _grain_keys, "left")
    .join(runs_conceded_df, _grain_keys, "left")
    .select(
        "season",
        "team_sk",
        "opponent_sk",
        "matches_played",
        "wins",
        "losses",
        _per_match_avg("runs_scored_total", "avg_runs_scored"),
        _per_match_avg("runs_conceded_total", "avg_runs_conceded"),
        F.current_timestamp().alias("ingestion_ts"),
    )
)

In [None]:
# Persist gold_df as the gold Delta table.
# mode("overwrite") replaces the table contents on every run, so re-running the
# notebook does not accumulate duplicate (season, team_sk, opponent_sk) rows.
# overwriteSchema=true additionally lets the DataFrame's schema replace the
# table's existing schema.
(
    gold_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("gold.ipl_team_vs_team_season")
)

In [None]:
# Compact the freshly written table and Z-order its files by (season, team_sk),
# the columns expected to be used most often in query filters.
# The call is the cell's last expression, so its result DataFrame is displayed.
spark.sql(
    "OPTIMIZE gold.ipl_team_vs_team_season "
    "ZORDER BY (season, team_sk)"
)

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBins:bigint,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,