In [0]:
import dlt
from pyspark.sql import DataFrame
from pyspark.sql.window import Window
from pyspark.sql.functions import (
    col, when, regexp_replace, trim, split, array_distinct,
    array_remove, size, concat_ws, year, current_timestamp,
    lit, expr, md5, coalesce, lower, row_number
)
from pyspark.sql.types import IntegerType

# ============================================================
# 1. CONFIG
# ============================================================

# Path of the tab-separated title.basics source file; the bronze table reads
# it and also records it as the source-file audit value.
RAW_DATA_PATH = "/Volumes/workspace/imdb/imdb/title.basics.tsv"

# ============================================================
# 2. BRONZE LAYER
# ============================================================

@dlt.table(
    name="title_basics_bronze",
    comment="Bronze table for IMDb title.basics – raw data plus audit columns"
)
def title_basics_bronze():
    """Load the raw title.basics TSV and attach bronze audit columns.

    The file at ``RAW_DATA_PATH`` is read as a headered, tab-separated file
    in which ``\\N`` denotes NULL. Blank (whitespace-only) values in the
    descriptive columns are normalized to NULL, then the audit columns
    ``bronze_ingestion_timestamp``, ``bronze_ingestion_date``,
    ``bronze_source_system``, ``bronze_source_file`` and
    ``bronze_record_hash`` are appended.

    ``spark`` is not defined in this file; it is presumably injected by the
    pipeline runtime.

    Returns:
        DataFrame with the source columns (all strings, no schema inference)
        plus the audit columns listed above.
    """
    # Source columns whose blank values become NULL. Together with tconst
    # (prepended below) they also define the field order of the record hash.
    descriptive_columns = [
        "titleType",
        "primaryTitle",
        "originalTitle",
        "isAdult",
        "startYear",
        "endYear",
        "runtimeMinutes",
        "genres",
    ]

    df_raw = (
        spark.read
        .option("header", "true")
        .option("sep", "\t")
        # Disable quote handling: with the default quote character a title
        # that starts with a literal double quote is parsed as a quoted
        # field and can swallow the following tabs/columns.
        # NOTE(review): assumes the IMDb TSV never uses quoting - confirm
        # against the dataset documentation.
        .option("quote", "")
        .option("nullValue", "\\N")
        .csv(RAW_DATA_PATH)
    )

    # Normalize empties to nulls. Each replacement keeps the column in its
    # original position, so the order of this loop does not affect the schema.
    for name in descriptive_columns:
        df_raw = df_raw.withColumn(
            name,
            when(trim(col(name)) == "", None).otherwise(col(name)),
        )

    # NULLs are hashed as empty strings so that concat_ws does not skip them
    # and shift the remaining fields.
    hash_inputs = [
        coalesce(col(name), lit(""))
        for name in ["tconst", *descriptive_columns]
    ]

    df_bronze = (
        df_raw
        .withColumn("bronze_ingestion_timestamp", current_timestamp())
        .withColumn("bronze_ingestion_date", current_timestamp().cast("date"))
        .withColumn("bronze_source_system", lit("imdb"))
        .withColumn("bronze_source_file", lit(RAW_DATA_PATH))
        .withColumn("bronze_record_hash", md5(concat_ws("|", *hash_inputs)))
    )

    return df_bronze

# ============================================================
# 3. CLEANING HELPERS
# ============================================================

def validate_title_id(df: DataFrame) -> DataFrame:
    """Add a boolean ``tconst_valid`` column to ``df``.

    The flag is True when ``tconst`` consists of ``tt`` followed by one or
    more digits, and False otherwise (including when ``tconst`` is NULL).
    """
    matches_id_pattern = col("tconst").rlike("^tt[0-9]+$")
    # rlike yields NULL for a NULL tconst; coalesce maps that case to False.
    return df.withColumn("tconst_valid", coalesce(matches_id_pattern, lit(False)))

def clean_title_strings(df: DataFrame) -> DataFrame:
    """Add whitespace-normalized copies of the two title columns.

    For ``primaryTitle`` and ``originalTitle`` a ``<name>_clean`` column is
    added in which every run of whitespace is collapsed to a single space
    and the result is trimmed. A NULL title stays NULL.
    """
    result = df
    for source_name in ("primaryTitle", "originalTitle"):
        raw_title = col(source_name)
        collapsed = regexp_replace(raw_title, "\\s+", " ")
        result = result.withColumn(
            f"{source_name}_clean",
            # No otherwise-branch: rows with a NULL title get NULL.
            when(raw_title.isNotNull(), trim(collapsed)),
        )
    return result

def clean_year(df: DataFrame, column: str) -> DataFrame:
    """Add ``<column>_clean`` holding the year parsed as an integer.

    A value is parsed only when it is non-NULL and consists of exactly four
    digits; anything else yields NULL. No range check is applied here.

    Args:
        df: Input DataFrame containing ``column``.
        column: Name of the string column that holds the year.

    Returns:
        ``df`` with the additional integer column ``<column>_clean``.
    """
    raw_year = col(column)
    is_four_digits = raw_year.isNotNull() & raw_year.rlike("^[0-9]{4}$")
    # A when() without an otherwise-branch evaluates to NULL when unmatched.
    parsed_year = when(is_four_digits, raw_year.cast(IntegerType()))
    return df.withColumn(f"{column}_clean", parsed_year)

def clean_runtime(df: DataFrame) -> DataFrame:
    """Add ``runtime_minutes_clean`` holding the runtime as an integer.

    ``runtimeMinutes`` is accepted when it is non-NULL, made up of digits
    only, and its integer value lies between 1 and 10_000 (inclusive).
    Every other value yields NULL.
    """
    raw_runtime = col("runtimeMinutes")
    runtime_int = raw_runtime.cast(IntegerType())

    is_digits_only = raw_runtime.isNotNull() & raw_runtime.rlike("^[0-9]+$")
    is_in_range = runtime_int.between(1, 10000)

    return df.withColumn(
        "runtime_minutes_clean",
        # No otherwise-branch: rejected values become NULL.
        when(is_digits_only & is_in_range, runtime_int),
    )

def parse_genres(df: DataFrame) -> DataFrame:
    """Add ``genres_array`` and ``genre_count`` derived from ``genres``.

    ``genres`` is split on commas, empty entries are removed and duplicates
    are dropped. A NULL ``genres`` value produces an empty array.
    ``genre_count`` is the size of the resulting array.
    """
    raw_genres = col("genres")
    tokens = split(raw_genres, ",")
    unique_tokens = array_distinct(array_remove(tokens, ""))
    genres_array = when(raw_genres.isNotNull(), unique_tokens).otherwise(expr("array()"))

    return (
        df.withColumn("genres_array", genres_array)
        .withColumn("genre_count", size(col("genres_array")))
    )

def clean_is_adult(df: DataFrame) -> DataFrame:
    """Add the boolean ``is_adult_flag`` column derived from ``isAdult``.

    The flag is True only when ``isAdult`` equals the string "1"; any other
    value, including NULL, yields False.
    """
    equals_one = col("isAdult") == "1"
    # The comparison is NULL for a NULL isAdult; coalesce maps that to False.
    return df.withColumn("is_adult_flag", coalesce(equals_one, lit(False)))

# ============================================================
# 4. SILVER ALL (CLEANED + QUALITY FLAGS; NO QUALITY FILTERING, DEDUPLICATED BY tconst)
# ============================================================

@dlt.table(
    name="title_basics_silver_all",
    comment="Silver cleaned table for title.basics with quality flags (no rows dropped)"
)
def title_basics_silver_all():
    """Build the cleaned silver table with quality flags from bronze.

    Reads ``title_basics_bronze``, applies the cleaning helpers, derives
    validity and completeness flags, stamps the silver audit columns and
    keeps a single row per ``tconst``. No row is removed on quality grounds;
    ``silver_quality_check`` only records ``PASSED`` / ``FAILED``.

    Returns:
        DataFrame with the bronze columns plus the cleaned columns, the
        boolean flags (never NULL), ``data_completeness_score``,
        ``quality_tier``, the silver audit columns and
        ``silver_quality_check``.
    """
    df = dlt.read("title_basics_bronze")

    # Cleaning
    df = validate_title_id(df)
    df = clean_title_strings(df)
    df = clean_year(df, "startYear")
    df = clean_year(df, "endYear")
    df = clean_runtime(df)
    df = parse_genres(df)
    df = clean_is_adult(df)

    # Derived flags
    # isin() is NULL for a NULL titleType; coalesce keeps the flag a strict
    # True/False like the other flags in this table.
    df = df.withColumn(
        "is_series",
        coalesce(
            col("titleType").isin("tvSeries", "tvMiniSeries", "tvEpisode"),
            lit(False)
        )
    )

    current_year = year(current_timestamp())

    # Start year validity: a missing year is acceptable, a present one must
    # parse and fall within 1..current year.
    df = df.withColumn(
        "has_valid_start_year",
        when(col("startYear").isNull(), True)
        .when(
            col("startYear_clean").isNotNull() &
            (col("startYear_clean") >= 1) &
            (col("startYear_clean") <= current_year),
            True
        )
        .otherwise(False)
    )

    # End year validity: same rules as the start year, and additionally the
    # end year must not precede a known start year.
    df = df.withColumn(
        "has_valid_end_year",
        when(col("endYear").isNull(), True)
        .when(
            col("endYear_clean").isNotNull() &
            (col("endYear_clean") >= 1) &
            (col("endYear_clean") <= current_year) &
            (
                col("startYear_clean").isNull() |
                (col("endYear_clean") >= col("startYear_clean"))
            ),
            True
        )
        .otherwise(False)
    )

    # Runtime validity: missing is acceptable; a present value must have
    # survived clean_runtime.
    df = df.withColumn(
        "has_valid_runtime",
        when(col("runtimeMinutes").isNull(), True)
        .when(col("runtime_minutes_clean").isNotNull(), True)
        .otherwise(False)
    )

    # Runtime anomaly flag (optional, informational). The comparison is NULL
    # when no clean runtime exists; report that case as "not anomalous"
    # rather than leaving a NULL flag.
    df = df.withColumn(
        "has_anomalous_runtime",
        coalesce(col("runtime_minutes_clean") > 300, lit(False))
    )

    # Completeness score (info only): 20 points per populated key attribute.
    df = df.withColumn(
        "data_completeness_score",
        (
            when(col("tconst_valid"), 20).otherwise(0)
            + when(col("primaryTitle_clean").isNotNull(), 20).otherwise(0)
            + when(col("startYear_clean").isNotNull(), 20).otherwise(0)
            + when(col("runtime_minutes_clean").isNotNull(), 20).otherwise(0)
            + when(col("genre_count") > 0, 20).otherwise(0)
        )
    )

    df = df.withColumn(
        "quality_tier",
        when(col("data_completeness_score") >= 80, "HIGH")
        .when(col("data_completeness_score") >= 60, "MEDIUM")
        .when(col("data_completeness_score") >= 40, "LOW")
        .otherwise("POOR")
    )

    df = df.withColumn("silver_processing_timestamp", current_timestamp())
    df = df.withColumn("silver_processing_date", current_timestamp().cast("date"))
    df = df.withColumn("silver_version", lit("1.0"))

    # Quality check flag (for analysis only, NOT used to drop rows)
    df = df.withColumn(
        "silver_quality_check",
        when(~col("tconst_valid"), "FAILED")
        .when(~col("has_valid_start_year"), "FAILED")
        .when(~col("has_valid_end_year"), "FAILED")
        .when(~col("has_valid_runtime"), "FAILED")
        .otherwise("PASSED")
    )

    # Deduplicate by tconst, keep last ingested version. The ingestion
    # timestamp is typically identical for every row of one bronze load, so
    # the record hash is added as a tie-breaker to make the surviving row
    # deterministic instead of depending on partition order.
    # NOTE(review): rows with a NULL tconst share a single partition, so only
    # one of them survives this step - confirm that is intended.
    w = Window.partitionBy("tconst").orderBy(
        col("bronze_ingestion_timestamp").desc(),
        col("bronze_record_hash").desc()
    )
    df = df.withColumn("rn", row_number().over(w)).filter(col("rn") == 1).drop("rn")

    return df

# ============================================================
# 5. SILVER CLEAN (NO NULLS, ALL ROWS, READY FOR GOLD)
# ============================================================

@dlt.table(
    name="title_basics_silver_clean",
    comment="Silver table for title.basics with null-handling and defaults for all rows (no quarantine)"
)
def title_basics_silver_clean():
    """Project ``title_basics_silver_all`` into a null-free, renamed layout.

    Every row of the upstream table is kept (there is no filter on
    ``silver_quality_check``). Each selected column is given a default for
    NULL values, and an invalid ``tconst`` is replaced by ``'UNKNOWN'``.
    """
    silver_all = dlt.read("title_basics_silver_all")

    def with_default(source_name, fallback, output_name=None):
        """Return ``source_name`` with NULLs replaced by ``fallback``."""
        return coalesce(col(source_name), fallback).alias(output_name or source_name)

    now = current_timestamp()
    today = current_timestamp().cast("date")

    projection = [
        # Identifier: anything that failed validation is reported as 'UNKNOWN'.
        when(col("tconst_valid"), col("tconst")).otherwise(lit("UNKNOWN")).alias("title_imdb_id"),

        # Descriptive attributes
        with_default("titleType", lit("unknown"), "title_type"),
        with_default("primaryTitle_clean", lit("Unknown"), "primary_title"),
        with_default("originalTitle_clean", lit("Unknown"), "original_title"),

        # Boolean flags
        with_default("is_adult_flag", lit(False), "is_adult"),
        with_default("is_series", lit(False)),

        # Numeric attributes: -1 marks a missing value.
        with_default("startYear_clean", lit(-1), "start_year"),
        with_default("endYear_clean", lit(-1), "end_year"),
        with_default("runtime_minutes_clean", lit(-1), "runtime_minutes"),

        # Genres
        with_default("genre_count", lit(0)),
        with_default("genres_array", expr("array()")),

        # Quality metrics
        with_default("data_completeness_score", lit(0)),
        with_default("quality_tier", lit("UNKNOWN")),

        # Silver audit columns
        with_default("silver_processing_timestamp", now),
        with_default("silver_processing_date", today),
        with_default("silver_version", lit("1.0")),

        # Bronze audit columns
        with_default("bronze_ingestion_timestamp", now),
        with_default("bronze_ingestion_date", today),
        with_default("bronze_source_system", lit("imdb")),
        with_default("bronze_source_file", lit("UNKNOWN_FILE")),
        with_default("bronze_record_hash", lit("")),

        # Quality verdict is passed through unchanged for analysis.
        col("silver_quality_check"),
    ]

    return silver_all.select(*projection)
