In [None]:
%sql
-- Gold fact table holding one row per player per season.
-- The statement is idempotent: it is a no-op when the table already exists.
CREATE TABLE IF NOT EXISTS gold.ipl_players_season (
  -- Identity
  player_sk        BIGINT,
  player_name      STRING,
  season           INT,

  -- Appearances
  matches          INT,
  innings_batted   INT,
  innings_bowled   INT,

  -- Batting
  runs             INT,
  balls_faced      INT,
  not_outs         INT,
  highest_score    INT,
  batting_average  DOUBLE,
  strike_rate      DOUBLE,
  fours            INT,
  sixes            INT,

  -- Bowling
  overs_bowled     DOUBLE,
  runs_conceded    INT,
  wickets          INT,
  bowling_average  DOUBLE,
  economy          DOUBLE,

  -- Fielding
  catches          INT,
  stumpings        INT,
  run_outs         INT,

  -- Load audit
  ingestion_ts     TIMESTAMP
)
USING DELTA
TBLPROPERTIES (
  'delta.autoOptimize.optimizeWrite' = 'true',
  'delta.autoOptimize.autoCompact' = 'true'
);
-- Optional: add PARTITIONED BY (season) between USING DELTA and TBLPROPERTIES
-- for many seasons / better performance.
-- NOTE(review): Delta does not allow Z-ordering on a partition column, so the
-- later OPTIMIZE ... ZORDER BY (season, player_sk) would need season removed
-- if the table is partitioned by it.

In [None]:
from pyspark.sql import functions as F

# Load the three silver-layer inputs this notebook builds on:
# ball-by-ball deliveries, match-level facts, and the player dimension.
fact_deliveries, fact_matches, dim_player = [
    spark.table(table_name)
    for table_name in (
        "silver.fact_deliveries",
        "silver.fact_matches",
        "silver.dim_player",
    )
]

In [None]:
# Tag every delivery with its match's season (inner join on match_id),
# then keep only regular-play deliveries by dropping super-over rows.
match_seasons = fact_matches.select("match_id", "season")
df = (
    fact_deliveries
    .join(match_seasons, on="match_id", how="inner")
    .where(F.col("is_super_over") == 0)
)


In [None]:
# Per-match batting aggregates, keyed by the batter on strike for each delivery.
#
# Balls faced = every delivery except wides. A no-ball still counts as a ball
# faced by the striker, so noball_runs is intentionally NOT part of the filter
# (excluding no-balls under-counts balls_faced and inflates strike_rate).
#
# NOTE(review): fours/sixes are inferred from batsman_runs being exactly 4 or 6;
# the columns used here carry no boundary flag, so an all-run four would be
# counted as a boundary.
per_match_batting = df.groupBy("match_id", "season", F.col("batsman_sk").alias("player_sk")).agg(
    F.sum("batsman_runs").alias("match_runs"),
    F.sum(F.when(F.col("wide_runs") == 0, 1).otherwise(0)).alias("match_balls"),
    F.sum(F.when(F.col("batsman_runs") == 4, 1).otherwise(0)).alias("match_fours"),
    F.sum(F.when(F.col("batsman_runs") == 6, 1).otherwise(0)).alias("match_sixes")
)

In [None]:
# Per-match bowling aggregates, keyed by the bowler of each delivery.
#
# Runs conceded: only runs charged to the bowler, i.e. runs off the bat plus
# wides and no-balls. Summing total_runs would also include byes and leg byes,
# which are not debited to the bowler and would inflate economy and
# bowling_average.
#
# Overs: legal deliveries (neither a wide nor a no-ball) divided by 6. This is a
# decimal fraction of overs (3 overs + 3 balls = 3.5), not "overs.balls" notation,
# which makes it directly usable as the economy denominator.
#
# Wickets: dismissals credited to the bowler. Run outs, retirements and
# obstructing the field are excluded because they are not the bowler's wicket.
per_match_bowling = df.groupBy("match_id", "season", F.col("bowler_sk").alias("player_sk")).agg(
    F.sum(
        F.col("batsman_runs") + F.col("wide_runs") + F.col("noball_runs")
    ).alias("match_runs_conceded"),
    (F.sum(F.when((F.col("wide_runs") == 0) & (F.col("noball_runs") == 0), 1).otherwise(0)) / 6.0).alias("match_overs"),
    F.sum(
        F.when(
            F.col("player_dismissed_sk").isNotNull() &
            ~F.col("dismissal_kind").isin("run out", "retired hurt", "retired out", "obstructing the field"),
            1
        ).otherwise(0)
    ).alias("match_wickets")
)

In [None]:
# Per-match fielding tallies, keyed by the fielder credited on a dismissal.
# Only deliveries that have both a dismissed player and a fielder are considered.
fielding_outputs = {
    "caught": "match_catches",
    "stumped": "match_stumpings",
    "run out": "match_run_outs",
}
fielding_aggs = [
    F.sum(F.when(F.col("dismissal_kind") == kind, 1).otherwise(0)).alias(output_name)
    for kind, output_name in fielding_outputs.items()
]
fielded_dismissals = df.filter(
    F.col("player_dismissed_sk").isNotNull() & F.col("fielder_sk").isNotNull()
)
per_match_fielding = (
    fielded_dismissals
    .groupBy("match_id", "season", F.col("fielder_sk").alias("player_sk"))
    .agg(*fielding_aggs)
)

In [None]:
# Player participation (any role) -> matches played.
# A player counts as having played a match when they appear on at least one
# delivery as striker, non-striker, bowler, fielder or dismissed player.
role_columns = [
    "batsman_sk",
    "non_striker_sk",
    "bowler_sk",
    "fielder_sk",
    "player_dismissed_sk",
]

appearances = None
for role_column in role_columns:
    role_rows = (
        df.select("match_id", "season", F.col(role_column).alias("player_sk"))
        .filter(F.col("player_sk").isNotNull())
    )
    appearances = role_rows if appearances is None else appearances.union(role_rows)

# One row per (match, season, player) regardless of how many roles/deliveries
player_participation = appearances.distinct()

matches_played = (
    player_participation
    .groupBy("season", "player_sk")
    .agg(F.count("*").alias("matches"))
)

In [None]:
# Innings batted / bowled
#
# Innings batted: one per match in which the player was at the crease at either
# end. Deriving this from per_match_batting alone (striker rows only) misses a
# batter who never faced a ball -- e.g. run out at the non-striker's end -- which
# under-counts innings and can make not_outs (innings_batted - times_out)
# negative, because that dismissal is still counted in times_out.
batting_appearances = (
    df.select("match_id", "season", F.col("batsman_sk").alias("player_sk"))
    .union(df.select("match_id", "season", F.col("non_striker_sk").alias("player_sk")))
    .filter(F.col("player_sk").isNotNull())
    .distinct()
)
innings_batted = batting_appearances \
    .groupBy("season", "player_sk").agg(F.count("*").alias("innings_batted"))

# Innings bowled: one per match in which the player bowled at least one delivery.
innings_bowled = per_match_bowling.select("match_id", "season", "player_sk").distinct() \
    .groupBy("season", "player_sk").agg(F.count("*").alias("innings_bowled"))

In [None]:
# Times out per (season, player) -- the denominator of the batting average.
# Every dismissal counts except "retired hurt", which is a not-out retirement.
# "retired out" and "obstructing the field" ARE dismissals of the batter (they
# are only excluded from the bowler's wicket tally), so they must be counted
# here; leaving them out overstates both the batting average and not_outs.
# distinct() collapses to at most one dismissal per player per match.
times_out_df = df \
    .filter(F.col("player_dismissed_sk").isNotNull() &
            (F.col("dismissal_kind") != "retired hurt")) \
    .select("match_id", "season", F.col("player_dismissed_sk").alias("player_sk")).distinct() \
    .groupBy("season", "player_sk").agg(F.count("*").alias("times_out"))

In [None]:
# Not outs per (season, player) = innings batted minus times dismissed.
# Players without any recorded dismissal have no row in times_out_df, so the
# left join leaves times_out NULL and it is treated as 0.
dismissals_or_zero = F.coalesce(F.col("times_out"), F.lit(0))
not_outs = (
    innings_batted
    .join(times_out_df, on=["season", "player_sk"], how="left")
    .select(
        "season",
        "player_sk",
        (F.col("innings_batted") - dismissals_or_zero).alias("not_outs"),
    )
)

In [None]:
# Roll the per-match frames up to one row per (season, player).
# Output columns carry a _raw suffix; they are coalesced/cast when the gold
# frame is assembled.
season_player_keys = ["season", "player_sk"]

# Batting: season totals plus the best single-match score
season_batting = (
    per_match_batting
    .groupBy(*season_player_keys)
    .agg(
        F.sum("match_runs").alias("runs_raw"),
        F.sum("match_balls").alias("balls_faced_raw"),
        F.sum("match_fours").alias("fours_raw"),
        F.sum("match_sixes").alias("sixes_raw"),
        F.max("match_runs").alias("highest_score_raw"),
    )
)

# Bowling: season totals
season_bowling = (
    per_match_bowling
    .groupBy(*season_player_keys)
    .agg(
        F.sum("match_runs_conceded").alias("runs_conceded_raw"),
        F.sum("match_overs").alias("overs_bowled_raw"),
        F.sum("match_wickets").alias("wickets_raw"),
    )
)

# Fielding: season totals
season_fielding = (
    per_match_fielding
    .groupBy(*season_player_keys)
    .agg(
        F.sum("match_catches").alias("catches_raw"),
        F.sum("match_stumpings").alias("stumpings_raw"),
        F.sum("match_run_outs").alias("run_outs_raw"),
    )
)

In [None]:
# Assemble final gold DataFrame (base = players with at least 1 match in season).
#
# - dim_player is INNER-joined to attach player_name, so a player_sk missing
#   from the dimension is dropped from the output.
# - Every stat frame is LEFT-joined on (season, player_sk); a player with no
#   batting/bowling/fielding rows gets NULLs, which are coalesced to 0 for counts.
# - Ratios:
#     batting_average = runs / times_out       -> NULL when never dismissed
#     bowling_average = runs_conceded / wickets -> NULL when no wickets
#     strike_rate     = runs * 100 / balls      -> 0.0 when no balls faced
#     economy         = runs_conceded / overs   -> 0.0 when no legal ball bowled
#   NOTE(review): 0.0 for strike_rate/economy is kept as-is for compatibility,
#   but it is indistinguishable from a genuine zero; consumers should filter on
#   balls_faced / overs_bowled rather than on the ratio itself.
gold_df = matches_played \
    .join(dim_player.select("player_sk", "player_name"), "player_sk") \
    .join(innings_batted, ["season", "player_sk"], "left") \
    .join(innings_bowled, ["season", "player_sk"], "left") \
    .join(not_outs, ["season", "player_sk"], "left") \
    .join(times_out_df, ["season", "player_sk"], "left") \
    .join(season_batting, ["season", "player_sk"], "left") \
    .join(season_bowling, ["season", "player_sk"], "left") \
    .join(season_fielding, ["season", "player_sk"], "left") \
    .select(
        "player_sk",
        "player_name",
        "season",
        F.coalesce(F.col("matches"), F.lit(0)).cast("int").alias("matches"),
        F.coalesce(F.col("innings_batted"), F.lit(0)).cast("int").alias("innings_batted"),
        F.coalesce(F.col("innings_bowled"), F.lit(0)).cast("int").alias("innings_bowled"),
        F.coalesce(F.col("runs_raw"), F.lit(0)).cast("int").alias("runs"),
        F.coalesce(F.col("balls_faced_raw"), F.lit(0)).cast("int").alias("balls_faced"),
        F.coalesce(F.col("not_outs"), F.lit(0)).cast("int").alias("not_outs"),
        F.coalesce(F.col("highest_score_raw"), F.lit(0)).cast("int").alias("highest_score"),
        # runs_raw is NULL for a player who was dismissed without ever being on
        # strike (no season_batting row); treat that as 0 runs so the average is
        # 0.0 rather than NULL, which is reserved for "never dismissed".
        F.when(
            F.col("times_out") > 0,
            F.round(F.coalesce(F.col("runs_raw"), F.lit(0)) / F.col("times_out"), 2)
        ).otherwise(F.lit(None)).alias("batting_average"),
        F.when(F.col("balls_faced_raw") > 0, F.round(F.col("runs_raw") * 100.0 / F.col("balls_faced_raw"), 2)).otherwise(F.lit(0.0)).alias("strike_rate"),
        F.coalesce(F.col("fours_raw"), F.lit(0)).cast("int").alias("fours"),
        F.coalesce(F.col("sixes_raw"), F.lit(0)).cast("int").alias("sixes"),
        F.coalesce(F.col("overs_bowled_raw"), F.lit(0.0)).alias("overs_bowled"),
        F.coalesce(F.col("runs_conceded_raw"), F.lit(0)).cast("int").alias("runs_conceded"),
        F.coalesce(F.col("wickets_raw"), F.lit(0)).cast("int").alias("wickets"),
        F.when(F.col("wickets_raw") > 0, F.round(F.col("runs_conceded_raw") / F.col("wickets_raw"), 2)).otherwise(F.lit(None)).alias("bowling_average"),
        F.when(F.col("overs_bowled_raw") > 0, F.round(F.col("runs_conceded_raw") / F.col("overs_bowled_raw"), 2)).otherwise(F.lit(0.0)).alias("economy"),
        F.coalesce(F.col("catches_raw"), F.lit(0)).cast("int").alias("catches"),
        F.coalesce(F.col("stumpings_raw"), F.lit(0)).cast("int").alias("stumpings"),
        F.coalesce(F.col("run_outs_raw"), F.lit(0)).cast("int").alias("run_outs"),
        # Load timestamp, evaluated when the write executes
        F.current_timestamp().alias("ingestion_ts")
    )

In [None]:
# Persist the gold table as a full overwrite: every run replaces all rows, so
# re-running cannot create duplicates at the (player_sk, season) grain.
# NOTE(review): overwriteSchema=true lets this write replace the table schema
# with gold_df's schema, so the CREATE TABLE column types are not enforced here.
gold_writer = (
    gold_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
gold_writer.saveAsTable("gold.ipl_players_season")

In [None]:
# Compact the freshly written table and Z-order it on the columns used to look
# up a player's season row. Left as the cell's last expression so the OPTIMIZE
# metrics DataFrame is shown as the cell output.
optimize_statement = "OPTIMIZE gold.ipl_players_season ZORDER BY (season, player_sk)"
spark.sql(optimize_statement)

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,clusteringStats:struct<inputZCubeFiles:struct<numFiles:bigint,size:bigint>,inputOtherFiles:struct<numFiles:bigint,size:bigint>,inputNumZCubes:bigint,mergedFiles:struct<numFiles:bigint,size:bigint>,numOutputZCubes:bigint>,numBins:bigint,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,