In [0]:
import dlt
from pyspark.sql import DataFrame
from pyspark.sql.window import Window
from pyspark.sql.functions import (
    col, when, trim, current_timestamp, lit, expr, md5, 
    coalesce, concat_ws, row_number, split, array_distinct, 
    array_remove, size
)

In [0]:
# ============================================================
# 1. CONFIG
# ============================================================

# Path to the IMDb title.crew extract (a gzip-compressed TSV, per the file
# extension). bronze_title_crew() reads this file and also records the path
# verbatim in the bronze_source_file audit column.
RAW_DATA_PATH = "/Volumes/workspace/damg7370/datastore/IMDB/title.crew.tsv.gz"

In [0]:
# ============================================================
# 2. BRONZE LAYER
# ============================================================

@dlt.table(
    name="bronze_title_crew",
    comment="Bronze table for IMDb title.crew - raw data plus audit columns",
    table_properties={
        "quality": "bronze",
        "pipelines.autoOptimize.zOrderCols": "tconst"
    }
)
def bronze_title_crew():
    """
    Bronze layer: load the raw title.crew TSV and attach audit columns.

    The file at RAW_DATA_PATH is read as a tab-separated CSV with a header
    row; the literal ``\\N`` is treated as NULL. Blank ``directors`` /
    ``writers`` values are also normalised to NULL.

    Returns:
        DataFrame: the source columns plus ``bronze_ingestion_timestamp``,
        ``bronze_ingestion_date``, ``bronze_source_system``,
        ``bronze_source_file`` and ``bronze_record_hash``.
    """
    crew = (
        spark.read
        .options(header="true", sep="\t", nullValue="\\N")
        .csv(RAW_DATA_PATH)
    )

    # Whitespace-only values carry no information; store them as NULL.
    for crew_col in ("directors", "writers"):
        is_blank = trim(col(crew_col)) == ""
        crew = crew.withColumn(crew_col, when(is_blank, None).otherwise(col(crew_col)))

    ingested_at = current_timestamp()

    # concat_ws skips NULL arguments, so each field is coalesced to "" first
    # to keep the three positions fixed in the hashed string.
    hash_fields = [coalesce(col(name), lit("")) for name in ("tconst", "directors", "writers")]
    record_hash = md5(concat_ws("|", *hash_fields))

    return (
        crew
        .withColumn("bronze_ingestion_timestamp", ingested_at)
        .withColumn("bronze_ingestion_date", ingested_at.cast("date"))
        .withColumn("bronze_source_system", lit("imdb"))
        .withColumn("bronze_source_file", lit(RAW_DATA_PATH))
        .withColumn("bronze_record_hash", record_hash)
    )

In [0]:
# ============================================================
# 3. CLEANING HELPERS
# ============================================================

def validate_tconst(df: DataFrame) -> DataFrame:
    """Add a boolean ``tconst_valid`` column.

    The flag is True when ``tconst`` matches ``^tt[0-9]+$`` and False
    otherwise, including when ``tconst`` is NULL.
    """
    matches_pattern = col("tconst").rlike("^tt[0-9]+$")
    # rlike yields NULL for a NULL tconst; coalesce folds that case to False.
    return df.withColumn("tconst_valid", coalesce(matches_pattern, lit(False)))

def clean_tconst(df: DataFrame) -> DataFrame:
    """Add ``tconst_clean``: ``tconst`` with surrounding whitespace removed.

    A NULL ``tconst`` stays NULL.
    """
    # trim() propagates NULL, so no explicit NULL branch is needed.
    trimmed = trim(col("tconst"))
    return df.withColumn("tconst_clean", trimmed)

def parse_directors(df: DataFrame) -> DataFrame:
    """Add ``directors_array`` and ``directors_count``.

    ``directors`` is split on commas, empty tokens are removed and
    duplicates dropped. A NULL ``directors`` becomes an empty array, so the
    count is 0 for such rows.
    """
    raw = col("directors")
    unique_ids = array_distinct(array_remove(split(raw, ","), ""))
    ids_or_empty = when(raw.isNotNull(), unique_ids).otherwise(expr("array()"))

    return (
        df
        .withColumn("directors_array", ids_or_empty)
        .withColumn("directors_count", size(col("directors_array")))
    )

def parse_writers(df: DataFrame) -> DataFrame:
    """Add ``writers_array`` and ``writers_count``.

    ``writers`` is split on commas, empty tokens are removed and duplicates
    dropped. A NULL ``writers`` becomes an empty array, so the count is 0
    for such rows.
    """
    raw = col("writers")
    unique_ids = array_distinct(array_remove(split(raw, ","), ""))
    ids_or_empty = when(raw.isNotNull(), unique_ids).otherwise(expr("array()"))

    return (
        df
        .withColumn("writers_array", ids_or_empty)
        .withColumn("writers_count", size(col("writers_array")))
    )

In [0]:
# ============================================================
# 4. SILVER ALL (CLEANED + QUALITY FLAGS)
# ============================================================

@dlt.table(
    name="silver_title_crew",
    comment="Silver cleaned table with quality scoring",
    table_properties={
        "quality": "silver",
        "pipelines.autoOptimize.zOrderCols": "tconst"
    }
)
@dlt.expect("valid_tconst_not_null", "tconst IS NOT NULL")
@dlt.expect("valid_tconst_format", "tconst RLIKE '^tt[0-9]+$'")
def silver_title_crew():
    """
    Silver layer: clean, validate, score and deduplicate the bronze crew rows.

    Rows are flagged, not dropped. The two tconst expectations use
    ``dlt.expect`` so violations are recorded as data-quality metrics while
    the offending rows stay in the table with
    ``silver_quality_check = 'FAILED'``. ``silver_title_crew_clean`` and
    ``silver_title_crew_quarantine`` both split this table on that flag.
    Dropping invalid rows here would leave no FAILED rows at all (a valid
    tconst alone scores 40, i.e. MEDIUM), so the quarantine table could
    never receive a record.

    NULLs are kept as-is here; they are replaced in silver_title_crew_clean.

    Returns:
        DataFrame: bronze columns plus ``tconst_valid``, ``tconst_clean``,
        the parsed director/writer arrays and counts,
        ``data_completeness_score``, ``quality_tier``, the ``silver_*``
        metadata columns and ``silver_quality_check``.
    """
    df = dlt.read("bronze_title_crew")

    df = validate_tconst(df)
    df = clean_tconst(df)
    df = parse_directors(df)
    df = parse_writers(df)

    # Quality scoring: 40 for a valid tconst, 30 each for having at least
    # one director / one writer.
    df = df.withColumn(
        "data_completeness_score",
        (
            when(col("tconst_valid"), 40).otherwise(0)
            + when(col("directors_count") > 0, 30).otherwise(0)
            + when(col("writers_count") > 0, 30).otherwise(0)
        )
    )

    df = df.withColumn(
        "quality_tier",
        when(col("data_completeness_score") >= 70, "HIGH")
        .when(col("data_completeness_score") >= 40, "MEDIUM")
        .otherwise("LOW")
    )

    # Metadata
    df = df.withColumn("silver_processing_timestamp", current_timestamp())
    df = df.withColumn("silver_processing_date", current_timestamp().cast("date"))
    df = df.withColumn("silver_version", lit("1.0"))

    # Quality check
    df = df.withColumn(
        "silver_quality_check",
        when(~col("tconst_valid"), "FAILED")
        .when(col("quality_tier") == "LOW", "FAILED")
        .otherwise("PASSED")
    )

    # Deduplicate. Valid rows are keyed on tconst. Invalid rows (NULL or
    # malformed tconst) are keyed on the full-record hash instead, so
    # distinct bad records are not collapsed into a single row and only
    # exact duplicates are removed.
    dedup_key = when(col("tconst_valid"), col("tconst")).otherwise(col("bronze_record_hash"))

    # Rows from the same bronze run share one ingestion timestamp, so the
    # record hash is added as a tie-break to make the surviving row
    # deterministic.
    w = Window.partitionBy(dedup_key).orderBy(
        col("bronze_ingestion_timestamp").desc(),
        col("bronze_record_hash").asc(),
    )
    df = df.withColumn("rn", row_number().over(w)).filter(col("rn") == 1).drop("rn")

    return df


In [0]:
# ============================================================
# 5. SILVER CLEAN (NO NULLS)
# ============================================================

@dlt.table(
    name="silver_title_crew_clean",
    comment="Silver high-quality rows with NULLs replaced by meaningful defaults",
    table_properties={
        "quality": "silver_clean",
        "pipelines.autoOptimize.zOrderCols": "TCONST"
    }
)
def silver_title_crew_clean():
    """
    Silver Clean layer: PASSED rows only, with every NULL replaced by a default.

    Reads ``silver_title_crew``, keeps rows whose ``silver_quality_check``
    is ``PASSED`` and projects them into a renamed column set. Each source
    column is coalesced with a fallback, and three boolean presence flags
    are derived from whether the raw ``directors`` / ``writers`` strings
    are non-NULL.
    """
    passed_rows = dlt.read("silver_title_crew").filter(
        col("silver_quality_check") == "PASSED"
    )

    def with_default(source, fallback, alias):
        # Source column if present, otherwise the fallback, under the output name.
        return coalesce(col(source), fallback).alias(alias)

    has_directors = col("directors").isNotNull()
    has_writers = col("writers").isNotNull()

    crew_columns = [
        with_default("tconst_clean", lit("UNKNOWN"), "TCONST"),
        # Raw id strings, with a readable placeholder when absent
        with_default("directors", lit("No Director Information"), "Directors"),
        with_default("writers", lit("No Writer Information"), "Writers"),
        # Parsed arrays and their sizes
        with_default("directors_array", expr("array()"), "Directors_Array"),
        with_default("writers_array", expr("array()"), "Writers_Array"),
        with_default("directors_count", lit(0), "Directors_Count"),
        with_default("writers_count", lit(0), "Writers_Count"),
    ]

    quality_columns = [
        with_default("data_completeness_score", lit(0), "Data_Completeness_Score"),
        with_default("quality_tier", lit("UNKNOWN"), "Quality_Tier"),
    ]

    presence_flags = [
        has_directors.alias("Has_Directors"),
        has_writers.alias("Has_Writers"),
        (has_directors & has_writers).alias("Has_Complete_Crew"),
    ]

    metadata_columns = [
        with_default("silver_processing_timestamp", current_timestamp(), "Silver_Processing_Timestamp"),
        with_default("silver_processing_date", current_timestamp().cast("date"), "Silver_Processing_Date"),
        with_default("silver_version", lit("1.0"), "Silver_Version"),
        with_default("bronze_ingestion_timestamp", current_timestamp(), "Bronze_Ingestion_Timestamp"),
        with_default("bronze_ingestion_date", current_timestamp().cast("date"), "Bronze_Ingestion_Date"),
        with_default("bronze_source_system", lit("imdb"), "Bronze_Source_System"),
        with_default("bronze_source_file", lit("UNKNOWN_FILE"), "Bronze_Source_File"),
        with_default("bronze_record_hash", lit(""), "Bronze_Record_Hash"),
    ]

    return passed_rows.select(
        *crew_columns,
        *quality_columns,
        *presence_flags,
        *metadata_columns,
    )


In [0]:
# ============================================================
# 6. QUARANTINE
# ============================================================

@dlt.table(
    name="silver_title_crew_quarantine",
    comment="Records that failed Silver quality checks",
    table_properties={
        "quality": "quarantine"
    }
)
def silver_title_crew_quarantine():
    """
    Quarantine layer: rows of silver_title_crew flagged as FAILED.

    Each row is annotated with ``quarantine_reason`` (invalid tconst takes
    precedence over a LOW quality tier; anything else is reported as an
    unknown failure) and ``quarantine_timestamp``.
    """
    is_failed = col("silver_quality_check") == "FAILED"

    # Branch order matters: the first matching condition wins.
    reason = (
        when(~col("tconst_valid"), "Invalid TCONST format")
        .when(col("quality_tier") == "LOW", "Low data completeness score")
        .otherwise("Unknown validation failure")
    )

    return (
        dlt.read("silver_title_crew")
        .filter(is_failed)
        .withColumn("quarantine_reason", reason)
        .withColumn("quarantine_timestamp", current_timestamp())
    )