In [0]:
import dlt
from pyspark.sql import DataFrame
from pyspark.sql.window import Window
from pyspark.sql.functions import (
    col, when, trim, current_timestamp, lit, expr, md5, 
    coalesce, concat_ws, row_number
)
from pyspark.sql.types import IntegerType

In [0]:
# ============================================================
# 1. CONFIG
# ============================================================

# Location of the IMDb title.episode TSV file. It is read by the bronze table
# and also recorded verbatim in its bronze_source_file audit column.
RAW_DATA_PATH = "/Volumes/workspace/imdb/imdb/title.episode.tsv"

In [0]:
# ============================================================
# 2. BRONZE LAYER
# ============================================================

@dlt.table(
    name="bronze_title_episode",
    comment="Bronze table for IMDb title.episode - raw data plus audit columns",
    table_properties={
        "quality": "bronze",
        "pipelines.autoOptimize.zOrderCols": "tconst"
    }
)
def bronze_title_episode():
    """
    Bronze layer: load title.episode.tsv and append audit columns.

    The file is read as a tab-separated CSV with a header row, treating the
    literal ``\\N`` as NULL. Values that are blank after trimming are also
    turned into NULL. Five audit columns are then added: ingestion
    timestamp/date, source system, source file and an MD5 hash of the four
    source columns joined with ``|`` (NULLs hashed as empty strings).

    Returns:
        DataFrame with the four source columns followed by the audit columns.
    """
    source_columns = ["tconst", "parentTconst", "seasonNumber", "episodeNumber"]

    reader = spark.read
    reader = reader.option("header", "true")
    reader = reader.option("sep", "\t")
    reader = reader.option("nullValue", "\\N")
    staged = reader.csv(RAW_DATA_PATH)

    # Blank (whitespace-only) values carry no information: store them as NULL.
    for name in source_columns:
        is_blank = trim(col(name)) == ""
        staged = staged.withColumn(name, when(is_blank, None).otherwise(col(name)))

    # NULLs become "" so that concat_ws keeps every field position in the hash input.
    hash_fields = [coalesce(col(name), lit("")) for name in source_columns]
    record_hash = md5(concat_ws("|", *hash_fields))

    staged = staged.withColumn("bronze_ingestion_timestamp", current_timestamp())
    staged = staged.withColumn("bronze_ingestion_date", current_timestamp().cast("date"))
    staged = staged.withColumn("bronze_source_system", lit("imdb"))
    staged = staged.withColumn("bronze_source_file", lit(RAW_DATA_PATH))
    staged = staged.withColumn("bronze_record_hash", record_hash)
    return staged

In [0]:
# ============================================================
# 3. CLEANING HELPERS
# ============================================================

def validate_tconst(df: DataFrame) -> DataFrame:
    """Add boolean column ``tconst_valid``: True when ``tconst`` is non-NULL and matches ``^tt[0-9]+$``."""
    tconst = col("tconst")
    well_formed = tconst.isNotNull() & tconst.rlike("^tt[0-9]+$")
    flag = when(well_formed, True).otherwise(False)
    return df.withColumn("tconst_valid", flag)

def validate_parent_tconst(df: DataFrame) -> DataFrame:
    """Add boolean column ``parent_tconst_valid``: True when ``parentTconst`` is non-NULL and matches ``^tt[0-9]+$``."""
    parent = col("parentTconst")
    well_formed = parent.isNotNull() & parent.rlike("^tt[0-9]+$")
    flag = when(well_formed, True).otherwise(False)
    return df.withColumn("parent_tconst_valid", flag)

def clean_tconst(df: DataFrame) -> DataFrame:
    """Add column ``tconst_clean``: the trimmed ``tconst``, or NULL when ``tconst`` is NULL."""
    tconst = col("tconst")
    trimmed = when(tconst.isNotNull(), trim(tconst)).otherwise(None)
    return df.withColumn("tconst_clean", trimmed)

def clean_parent_tconst(df: DataFrame) -> DataFrame:
    """Add column ``parent_tconst_clean``: the trimmed ``parentTconst``, or NULL when ``parentTconst`` is NULL."""
    parent = col("parentTconst")
    trimmed = when(parent.isNotNull(), trim(parent)).otherwise(None)
    return df.withColumn("parent_tconst_clean", trimmed)

def clean_season_number(df: DataFrame) -> DataFrame:
    """
    Add integer column ``season_number_clean``.

    The value is ``seasonNumber`` cast to an integer when it is non-NULL,
    consists only of digits, and the cast result is >= 0; otherwise NULL.
    """
    season = col("seasonNumber")
    season_as_int = season.cast(IntegerType())
    usable = season.isNotNull() & season.rlike("^[0-9]+$") & (season_as_int >= 0)
    return df.withColumn("season_number_clean", when(usable, season_as_int).otherwise(None))

def clean_episode_number(df: DataFrame) -> DataFrame:
    """
    Add integer column ``episode_number_clean``.

    The value is ``episodeNumber`` cast to an integer when it is non-NULL,
    consists only of digits, and the cast result is >= 1; otherwise NULL.
    """
    episode = col("episodeNumber")
    episode_as_int = episode.cast(IntegerType())
    usable = episode.isNotNull() & episode.rlike("^[0-9]+$") & (episode_as_int >= 1)
    return df.withColumn("episode_number_clean", when(usable, episode_as_int).otherwise(None))

In [0]:
# ============================================================
# 4. SILVER ALL (CLEANED + QUALITY FLAGS - ALL RECORDS)
# ============================================================

@dlt.table(
    name="silver_title_episode",
    comment="Silver cleaned table with quality scoring - includes ALL records with quality flags",
    table_properties={
        "quality": "silver",
        "pipelines.autoOptimize.zOrderCols": "tconst"
    }
)
def silver_title_episode():
    """
    Silver layer: clean and validate bronze rows and attach quality scoring.

    Steps, in order:
      1. Read ``bronze_title_episode`` and apply the validate_*/clean_* helpers.
      2. Derive presence flags (``has_valid_*``) and range flags
         (``has_anomalous_*``).
      3. Compute ``data_completeness_score`` (25 points for each of: valid
         tconst, valid parent tconst, valid season, valid episode) and map it
         to ``quality_tier`` (>= 75 HIGH, >= 50 MEDIUM, otherwise LOW).
      4. Stamp silver processing metadata.
      5. Derive ``silver_quality_check`` (PASSED/FAILED) and
         ``quality_issue_reason`` (first failing rule, NULL when none fail).
      6. Keep one row per ``tconst``.

    Rows are flagged rather than filtered on quality; NULLs are preserved here
    and replaced downstream in ``silver_title_episode_clean``.

    Returns:
        DataFrame with the bronze columns plus the cleaned, flag, score and
        metadata columns described above.
    """
    df = dlt.read("bronze_title_episode")

    df = validate_tconst(df)
    df = validate_parent_tconst(df)
    df = clean_tconst(df)
    df = clean_parent_tconst(df)
    df = clean_season_number(df)
    df = clean_episode_number(df)

    # Quality flags
    df = df.withColumn(
        "has_valid_season",
        col("season_number_clean").isNotNull()
    )

    df = df.withColumn(
        "has_valid_episode",
        col("episode_number_clean").isNotNull()
    )

    # These comparisons evaluate to NULL (not False) when the cleaned value is
    # NULL; the when() chains below treat NULL as "condition not met".
    df = df.withColumn(
        "has_anomalous_season",
        (col("season_number_clean") < 0) | (col("season_number_clean") > 100)
    )

    df = df.withColumn(
        "has_anomalous_episode",
        (col("episode_number_clean") < 1) | (col("episode_number_clean") > 1000)
    )

    # Quality scoring
    df = df.withColumn(
        "data_completeness_score",
        (
            when(col("tconst_valid"), 25).otherwise(0)
            + when(col("parent_tconst_valid"), 25).otherwise(0)
            + when(col("has_valid_season"), 25).otherwise(0)
            + when(col("has_valid_episode"), 25).otherwise(0)
        )
    )

    df = df.withColumn(
        "quality_tier",
        when(col("data_completeness_score") >= 75, "HIGH")
        .when(col("data_completeness_score") >= 50, "MEDIUM")
        .otherwise("LOW")
    )

    # Metadata
    df = df.withColumn("silver_processing_timestamp", current_timestamp())
    df = df.withColumn("silver_processing_date", current_timestamp().cast("date"))
    df = df.withColumn("silver_version", lit("1.0"))

    # Quality check flag (but don't filter on it - rows are only labelled)
    df = df.withColumn(
        "silver_quality_check",
        when(~col("tconst_valid"), "FAILED")
        .when(~col("parent_tconst_valid"), "FAILED")
        .when(col("has_anomalous_season"), "FAILED")
        .when(col("has_anomalous_episode"), "FAILED")
        .when(col("quality_tier") == "LOW", "FAILED")
        .otherwise("PASSED")
    )

    # Add failure reason for tracking (same rule order as the check above)
    df = df.withColumn(
        "quality_issue_reason",
        when(~col("tconst_valid"), "Invalid TCONST format")
        .when(~col("parent_tconst_valid"), "Invalid Parent TCONST format")
        .when(col("has_anomalous_season"), "Anomalous season number")
        .when(col("has_anomalous_episode"), "Anomalous episode number")
        .when(col("quality_tier") == "LOW", "Low data completeness score")
        .otherwise(None)
    )

    # Deduplicate by tconst, keeping the most recently ingested row.
    # bronze_ingestion_timestamp comes from current_timestamp(), which is the
    # same for every row produced by one bronze refresh, so on its own it
    # cannot rank duplicates and row_number() would pick an arbitrary row.
    # bronze_record_hash is added as a tie-breaker so the surviving row is the
    # same on every run; rows that tie on both keys have identical source
    # values, so which of them survives does not matter.
    # NOTE(review): rows whose tconst is NULL all share one window partition,
    # so at most one of them survives this step - confirm that is intended.
    w = Window.partitionBy("tconst").orderBy(
        col("bronze_ingestion_timestamp").desc(),
        col("bronze_record_hash").asc()
    )
    df = df.withColumn("rn", row_number().over(w)).filter(col("rn") == 1).drop("rn")

    return df


In [0]:
# ============================================================
# 5. SILVER CLEAN (NO NULLS - ALL RECORDS, DEFAULTS APPLIED)
# ============================================================

@dlt.table(
    name="silver_title_episode_clean",
    comment="Silver with NULLs replaced by meaningful defaults - includes ALL records",
    table_properties={
        "quality": "silver_clean",
        "pipelines.autoOptimize.zOrderCols": "TCONST"
    }
)
def silver_title_episode_clean():
    """
    Silver Clean layer: project silver columns with NULLs replaced by defaults.

    Reads every row of ``silver_title_episode`` (PASSED and FAILED alike) and
    selects a fixed, renamed set of columns. Each output column is a
    ``coalesce`` of the silver value and a fallback: "UNKNOWN" for
    identifiers and tiers, -1 for season/episode numbers, 0 for the score,
    False for flags, the current timestamp/date for timestamps, and literal
    defaults for the remaining metadata.
    """
    silver = dlt.read("silver_title_episode")  # No filter - keep ALL records

    now = current_timestamp()
    today = current_timestamp().cast("date")

    identifiers = [
        coalesce(col("tconst_clean"), col("tconst"), lit("UNKNOWN")).alias("TCONST"),
        coalesce(col("parent_tconst_clean"), col("parentTconst"), lit("UNKNOWN")).alias("Parent_TCONST"),
    ]

    # Replace NULL season/episode numbers with the -1 sentinel
    numbers = [
        coalesce(col("season_number_clean"), lit(-1)).alias("Season_Number"),
        coalesce(col("episode_number_clean"), lit(-1)).alias("Episode_Number"),
    ]

    quality_metrics = [
        coalesce(col("data_completeness_score"), lit(0)).alias("Data_Completeness_Score"),
        coalesce(col("quality_tier"), lit("UNKNOWN")).alias("Quality_Tier"),
        coalesce(col("silver_quality_check"), lit("UNKNOWN")).alias("Quality_Check"),
        coalesce(col("quality_issue_reason"), lit("None")).alias("Quality_Issue_Reason"),
    ]

    presence_flags = [
        coalesce(col("has_valid_season"), lit(False)).alias("Has_Valid_Season"),
        coalesce(col("has_valid_episode"), lit(False)).alias("Has_Valid_Episode"),
        coalesce(col("tconst_valid"), lit(False)).alias("TCONST_Valid"),
        coalesce(col("parent_tconst_valid"), lit(False)).alias("Parent_TCONST_Valid"),
    ]

    silver_metadata = [
        coalesce(col("silver_processing_timestamp"), now).alias("Silver_Processing_Timestamp"),
        coalesce(col("silver_processing_date"), today).alias("Silver_Processing_Date"),
        coalesce(col("silver_version"), lit("1.0")).alias("Silver_Version"),
    ]

    bronze_metadata = [
        coalesce(col("bronze_ingestion_timestamp"), now).alias("Bronze_Ingestion_Timestamp"),
        coalesce(col("bronze_ingestion_date"), today).alias("Bronze_Ingestion_Date"),
        coalesce(col("bronze_source_system"), lit("imdb")).alias("Bronze_Source_System"),
        coalesce(col("bronze_source_file"), lit("UNKNOWN_FILE")).alias("Bronze_Source_File"),
        coalesce(col("bronze_record_hash"), lit("")).alias("Bronze_Record_Hash"),
    ]

    projection = (
        identifiers
        + numbers
        + quality_metrics
        + presence_flags
        + silver_metadata
        + bronze_metadata
    )
    return silver.select(*projection)