In [None]:
%sql
-- Create the gold fact table that the PySpark cells below populate with one
-- row per (team_sk, season).
-- Safe to re-run: IF NOT EXISTS makes this statement a no-op when the table
-- already exists, so editing the column list here does NOT alter an existing
-- table.
CREATE TABLE IF NOT EXISTS gold.ipl_team_season (
  -- Grain / identifying columns
  team_sk BIGINT,
  team_name STRING,
  season INT,

  -- Match outcome counts
  matches_played INT,
  wins INT,
  losses INT,
  no_results INT,

  -- Season totals
  runs_scored INT,
  runs_conceded INT,
  wickets_taken INT,
  wickets_lost INT,

  -- Per-match averages
  avg_runs_scored DOUBLE,
  avg_runs_conceded DOUBLE,

  -- Ranking metrics: net_run_rate is runs-per-over scored minus runs-per-over
  -- conceded; points are 2 per win and 1 per match without a winner.
  net_run_rate DOUBLE,
  points INT,

  -- Timestamp of the refresh that produced the row
  ingestion_ts TIMESTAMP
)
USING DELTA
-- Optional: uncomment for better performance with many seasons. The clause
-- must stay here, before the terminating semicolon, and only takes effect
-- when the table is first created.
-- PARTITIONED BY (season)
TBLPROPERTIES (
  delta.autoOptimize.optimizeWrite = true,
  delta.autoOptimize.autoCompact = true
);

In [None]:
# PySpark cell: inputs for the full refresh of gold.ipl_team_season
from pyspark.sql import functions as F

# Bind the three silver-layer tables read by the rest of this notebook.
# `spark` is not defined in the visible cells; presumably the notebook runtime
# supplies the session.
fact_matches, fact_deliveries, dim_team = (
    spark.table("silver.fact_matches"),
    spark.table("silver.fact_deliveries"),
    spark.table("silver.dim_team"),
)


Failed to fetch the result.

In [None]:
# Unpivot matches: produce one row per (match, participating team) so that
# outcomes can later be aggregated per team. Both frames expose the columns
# season, team_sk, opponent_sk, winner_sk in that order, which the positional
# union at the end relies on.
_season_col, _winner_col = F.col("season"), F.col("winner_sk")
_first_side, _second_side = F.col("team1_sk"), F.col("team2_sk")

# Perspective of the team listed first in the match row.
team1_df = fact_matches.select(
    _season_col,
    _first_side.alias("team_sk"),
    _second_side.alias("opponent_sk"),
    _winner_col,
)

# Perspective of the team listed second: same columns, roles swapped.
team2_df = fact_matches.select(
    _season_col,
    _second_side.alias("team_sk"),
    _first_side.alias("opponent_sk"),
    _winner_col,
)

match_teams = team1_df.union(team2_df)

In [None]:
# Match outcomes per (season, team). Every aggregate is cast to INT explicitly
# to avoid LongType results.
_won = F.col("winner_sk") == F.col("team_sk")
_lost = F.col("winner_sk") == F.col("opponent_sk")
_no_result = F.col("winner_sk").isNull()


def _tally(condition):
    """Return an INT aggregate counting the grouped rows where `condition` holds."""
    return F.sum(F.when(condition, 1).otherwise(0)).cast("int")


# Points: 2 for a win, 1 for a match with no recorded winner, 0 otherwise.
_points = F.sum(
    F.when(_won, 2).when(_no_result, 1).otherwise(0)
).cast("int")

team_match_outcomes = match_teams.groupBy("season", "team_sk").agg(
    F.count("*").cast("int").alias("matches_played"),
    _tally(_won).alias("wins"),
    _tally(_lost).alias("losses"),
    _tally(_no_result).alias("no_results"),
    _points.alias("points"),
)

In [None]:
# Batting stats per (season, batting team), excluding super-over deliveries.
#
# Output columns:
#   runs_scored_raw  - sum of total_runs
#   wickets_lost_raw - deliveries with a dismissed player whose dismissal_kind
#                      is not "retired hurt" / "retired out"
#   overs_faced      - legal deliveries (no wide, no no-ball) divided by 6
#
# NULL handling: a Spark comparison against NULL yields NULL, and F.when treats
# a NULL condition as not matched. The extras columns are therefore coalesced
# to 0 so that a delivery with a NULL wide/no-ball value still counts as a
# legal ball instead of silently shrinking overs_faced. With no NULLs in those
# columns the result is unchanged.
# For the same reason, a row with a dismissed player but a NULL dismissal_kind
# is NOT counted as a wicket here (kept as-is).
_batting_dismissal = (
    (F.col("player_dismissed_sk").isNotNull()) &
    (~F.col("dismissal_kind").isin("retired hurt", "retired out"))
)
_batting_legal_ball = (
    (F.coalesce(F.col("wide_runs"), F.lit(0)) == 0) &
    (F.coalesce(F.col("noball_runs"), F.lit(0)) == 0)
)

batting_stats = (
    fact_deliveries.join(fact_matches, "match_id")
    .filter(F.col("is_super_over") == 0)
    .groupBy(fact_matches.season, fact_deliveries.batting_team_sk.alias("team_sk"))
    .agg(
        F.sum("total_runs").alias("runs_scored_raw"),
        F.sum(F.when(_batting_dismissal, 1).otherwise(0)).alias("wickets_lost_raw"),
        (F.sum(F.when(_batting_legal_ball, 1).otherwise(0)) / 6.0).alias("overs_faced"),
    )
)



In [None]:
# Bowling stats per (season, bowling team), excluding super-over deliveries.
#
# Output columns:
#   runs_conceded_raw - sum of total_runs
#   wickets_taken_raw - deliveries with a dismissed player whose dismissal_kind
#                       is not "retired hurt" / "retired out"
#   overs_bowled      - legal deliveries (no wide, no no-ball) divided by 6
#
# NULL handling: a Spark comparison against NULL yields NULL, and F.when treats
# a NULL condition as not matched. The extras columns are therefore coalesced
# to 0 so that a delivery with a NULL wide/no-ball value still counts as a
# legal ball instead of silently shrinking overs_bowled. With no NULLs in those
# columns the result is unchanged.
# For the same reason, a row with a dismissed player but a NULL dismissal_kind
# is NOT counted as a wicket here (kept as-is).
_bowling_dismissal = (
    (F.col("player_dismissed_sk").isNotNull()) &
    (~F.col("dismissal_kind").isin("retired hurt", "retired out"))
)
_bowling_legal_ball = (
    (F.coalesce(F.col("wide_runs"), F.lit(0)) == 0) &
    (F.coalesce(F.col("noball_runs"), F.lit(0)) == 0)
)

bowling_stats = (
    fact_deliveries.join(fact_matches, "match_id")
    .filter(F.col("is_super_over") == 0)
    .groupBy(fact_matches.season, fact_deliveries.bowling_team_sk.alias("team_sk"))
    .agg(
        F.sum("total_runs").alias("runs_conceded_raw"),
        F.sum(F.when(_bowling_dismissal, 1).otherwise(0)).alias("wickets_taken_raw"),
        (F.sum(F.when(_bowling_legal_ball, 1).otherwise(0)) / 6.0).alias("overs_bowled"),
    )
)

In [None]:
# Final assembly: attach team names and the batting/bowling aggregates to the
# per-team match outcomes, then project the gold columns in table order.
# The join to dim_team is an inner join, so outcome rows without a matching
# team_sk in dim_team are dropped; batting/bowling are left joins, so their
# columns may be NULL and are defaulted below.
# NOTE(review): run rates use overs_faced / overs_bowled as computed upstream
# from balls actually delivered; tournament NRR rules may treat a bowled-out
# side as having used its full quota - confirm this is the intended definition.


def _zero_to_null(name):
    """Return column `name` with 0 mapped to NULL, for use as a divisor."""
    column = F.col(name)
    return F.when(column == 0, None).otherwise(column)


def _int_total(raw_name, out_name):
    """Return `raw_name` as an INT column named `out_name`, with NULL replaced by 0."""
    return F.coalesce(F.col(raw_name), F.lit(0)).cast("int").alias(out_name)


def _per_match_avg(raw_name, out_name):
    """Return `raw_name` (NULL -> 0.0) divided by matches_played, rounded to 2 places."""
    per_match = F.coalesce(F.col(raw_name), F.lit(0.0)) / _zero_to_null("matches_played")
    return F.round(per_match, 2).alias(out_name)


def _run_rate(runs_name, overs_name):
    """Return runs per over, or 0.0 when the ratio is NULL (missing stats or zero overs)."""
    return F.coalesce(F.col(runs_name) / _zero_to_null(overs_name), F.lit(0.0))


_team_names = dim_team.select("team_sk", "team_name")
_team_season_keys = ["team_sk", "season"]
_net_run_rate = F.round(
    _run_rate("runs_scored_raw", "overs_faced") - _run_rate("runs_conceded_raw", "overs_bowled"),
    3,
)

gold_df = (
    team_match_outcomes
    .join(_team_names, "team_sk")
    .join(batting_stats, _team_season_keys, "left")
    .join(bowling_stats, _team_season_keys, "left")
    .select(
        "team_sk",
        "team_name",
        "season",
        "matches_played",
        "wins",
        "losses",
        "no_results",
        _int_total("runs_scored_raw", "runs_scored"),
        _int_total("runs_conceded_raw", "runs_conceded"),
        _int_total("wickets_taken_raw", "wickets_taken"),
        _int_total("wickets_lost_raw", "wickets_lost"),
        _per_match_avg("runs_scored_raw", "avg_runs_scored"),
        _per_match_avg("runs_conceded_raw", "avg_runs_conceded"),
        _net_run_rate.alias("net_run_rate"),
        "points",
        F.current_timestamp().alias("ingestion_ts"),
    )
)

In [None]:
# Write: full refresh - replace the contents of gold.ipl_team_season with
# gold_df. overwriteSchema=true additionally lets gold_df's schema replace the
# table's stored schema rather than the write being rejected on a mismatch.
_gold_writer = (
    gold_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
_gold_writer.saveAsTable("gold.ipl_team_season")

In [None]:
# Optimize: run Delta OPTIMIZE on the gold table, Z-ordering by season and
# points.
# NOTE(review): if the table is ever created with PARTITIONED BY (season), as
# the DDL cell offers as an option, Z-ordering on that partition column is
# expected to be rejected - confirm before enabling partitioning.
_optimize_stmt = "OPTIMIZE gold.ipl_team_season ZORDER BY (season, points)"
spark.sql(_optimize_stmt)