# In [0]:
import dlt
from pyspark.sql import DataFrame
from pyspark.sql.window import Window
from pyspark.sql.functions import (
    col, when, regexp_replace, trim, split, array_distinct, 
    array_remove, size, concat_ws, year, current_timestamp,
    lit, expr, md5, coalesce, lower, row_number
)
from pyspark.sql.types import IntegerType

# ============================================================
# 1. CONFIG
# ============================================================

# Path of the raw IMDb name.basics TSV file that feeds the bronze layer.
RAW_DATA_PATH = "/Volumes/workspace/damg7370/datastore/imdb/raw/name.basics.tsv"

# ============================================================
# 2. BRONZE LAYER
# ============================================================

@dlt.table(
    name="name_basics_bronze",
    comment="Bronze table for IMDb name.basics – raw data plus audit columns"
)
def name_basics_bronze():
    """Load the raw name.basics TSV and attach bronze audit columns.

    The file at ``RAW_DATA_PATH`` is read as tab-separated text with a
    header row; the IMDb null marker (backslash followed by ``N``) is
    mapped to NULL. No schema is supplied and schema inference is not
    enabled, so source columns are expected to arrive as strings.

    Whitespace-only values of ``primaryName``, ``primaryProfession`` and
    ``knownForTitles`` are replaced by NULL.

    Added columns:
        bronze_ingestion_timestamp: ``current_timestamp()`` of the run.
        bronze_ingestion_date: the same timestamp cast to a date.
        bronze_source_system: the literal ``"imdb"``.
        bronze_source_file: the literal ``RAW_DATA_PATH``.
        bronze_record_hash: MD5 of the six source columns joined with
            ``"|"``, NULLs being replaced by empty strings first.

    Returns:
        DataFrame: the raw rows plus the audit columns listed above.
    """
    df_raw = (
        spark.read
        .option("header", "true")
        .option("sep", "\t")
        # IMDb TSV fields are never quoted, so a double quote inside a name is
        # plain data. Spark's CSV reader treats '"' as a quote character by
        # default, which lets a field that starts with '"' swallow following
        # tabs/newlines; an empty string switches quote handling off.
        .option("quote", "")
        .option("nullValue", "\\N")
        .csv(RAW_DATA_PATH)
    )

    # Blank (whitespace-only) text values carry no information: store NULL.
    for text_column in ("primaryName", "primaryProfession", "knownForTitles"):
        df_raw = df_raw.withColumn(
            text_column,
            when(trim(col(text_column)) == "", None).otherwise(col(text_column))
        )

    # Columns that take part in the record hash, in hashing order.
    hashed_columns = (
        "nconst",
        "primaryName",
        "birthYear",
        "deathYear",
        "primaryProfession",
        "knownForTitles",
    )
    # coalesce(..., "") keeps the position of NULL fields in the hashed string;
    # concat_ws on its own would skip NULL inputs entirely.
    hash_inputs = [coalesce(col(name), lit("")) for name in hashed_columns]

    df_bronze = (
        df_raw
        .withColumn("bronze_ingestion_timestamp", current_timestamp())
        .withColumn("bronze_ingestion_date", current_timestamp().cast("date"))
        .withColumn("bronze_source_system", lit("imdb"))
        .withColumn("bronze_source_file", lit(RAW_DATA_PATH))
        .withColumn("bronze_record_hash", md5(concat_ws("|", *hash_inputs)))
    )

    return df_bronze


# ============================================================
# 3. CLEANING HELPERS
# ============================================================

def validate_person_id(df: DataFrame) -> DataFrame:
    """Add a boolean ``nconst_valid`` column to ``df``.

    The flag is True when ``nconst`` is non-NULL and matches the pattern
    ``nm`` followed by one or more digits; it is False otherwise.
    """
    person_id = col("nconst")
    looks_like_person_id = person_id.isNotNull() & person_id.rlike("^nm[0-9]+$")
    flag = when(looks_like_person_id, True).otherwise(False)
    return df.withColumn("nconst_valid", flag)

def clean_name(df: DataFrame) -> DataFrame:
    """Add ``primaryName_clean``: the name with whitespace normalised.

    Every run of whitespace in ``primaryName`` is collapsed to a single
    space and the result is trimmed. NULL names stay NULL.
    """
    raw_name = col("primaryName")
    single_spaced = regexp_replace(raw_name, "\\s+", " ")
    cleaned = when(raw_name.isNotNull(), trim(single_spaced)).otherwise(None)
    return df.withColumn("primaryName_clean", cleaned)

def clean_year(df: DataFrame, column: str) -> DataFrame:
    """Add ``<column>_clean``: the year in ``column`` as an integer, or NULL.

    A value is kept only when it is non-NULL, consists of exactly four
    digits, and lies between 1 and the current year (inclusive). Anything
    else becomes NULL.

    Args:
        df: input DataFrame containing ``column``.
        column: name of the column holding the raw year value.

    Returns:
        DataFrame: ``df`` with the extra ``<column>_clean`` integer column.
    """
    raw_value = col(column)
    as_int = raw_value.cast(IntegerType())
    this_year = year(current_timestamp())

    is_plausible_year = (
        raw_value.isNotNull()
        & raw_value.rlike("^[0-9]{4}$")
        & (as_int >= 1)
        & (as_int <= this_year)
    )

    return df.withColumn(
        f"{column}_clean",
        when(is_plausible_year, as_int).otherwise(None)
    )

def parse_professions(df: DataFrame) -> DataFrame:
    """Derive profession columns from the comma-separated ``primaryProfession``.

    Added columns, in order:
        primaryProfession_norm: trimmed, lower-cased ``primaryProfession``
            (NULL when the source is NULL).
        primaryProfession_array: distinct non-empty entries obtained by
            splitting the normalised string on commas; an empty array when
            the normalised string is NULL.
        profession_count: number of entries in the array.
        is_actor: array contains ``actor`` or ``actress``.
        is_director / is_writer / is_producer: array contains that entry.
    """
    source = col("primaryProfession")
    normalised = when(source.isNotNull(), lower(trim(source)))

    # These expressions refer to columns created earlier in the chain below.
    norm_column = col("primaryProfession_norm")
    distinct_entries = array_distinct(array_remove(split(norm_column, ","), ""))
    profession_array = (
        when(norm_column.isNotNull(), distinct_entries)
        .otherwise(expr("array()"))
    )

    acts = (
        expr("array_contains(primaryProfession_array, 'actor')")
        | expr("array_contains(primaryProfession_array, 'actress')")
    )

    result = (
        df
        .withColumn("primaryProfession_norm", normalised)
        .withColumn("primaryProfession_array", profession_array)
        .withColumn("profession_count", size(col("primaryProfession_array")))
        .withColumn("is_actor", acts)
    )

    # One flag per profession that maps to exactly one array entry.
    single_entry_flags = (
        ("is_director", "director"),
        ("is_writer", "writer"),
        ("is_producer", "producer"),
    )
    for flag_name, profession in single_entry_flags:
        result = result.withColumn(
            flag_name,
            expr(f"array_contains(primaryProfession_array, '{profession}')")
        )

    return result

def parse_titles(df: DataFrame) -> DataFrame:
    """Derive title columns from the comma-separated ``knownForTitles``.

    Added columns, in order:
        knownForTitles_array_raw: distinct non-empty entries obtained by
            splitting ``knownForTitles`` on commas; an empty array when the
            source is NULL.
        knownForTitles_array: the raw entries that match ``tt`` followed by
            one or more digits.
        titles_count: number of entries in ``knownForTitles_array``.
    """
    source = col("knownForTitles")
    distinct_entries = array_distinct(array_remove(split(source, ","), ""))
    raw_array = when(source.isNotNull(), distinct_entries).otherwise(expr("array()"))

    # Keep only entries shaped like a title identifier.
    well_formed = expr("filter(knownForTitles_array_raw, x -> x rlike '^tt[0-9]+$')")

    return (
        df
        .withColumn("knownForTitles_array_raw", raw_array)
        .withColumn("knownForTitles_array", well_formed)
        .withColumn("titles_count", size(col("knownForTitles_array")))
    )


# ============================================================
# 4. SILVER ALL (CLEANED + QUALITY FLAGS)
# ============================================================

@dlt.table(
    name="name_basics_silver_all",
    comment="Silver cleaned table with quality scoring"
)
@dlt.expect_or_drop("valid_nconst_not_null", "nconst IS NOT NULL")
@dlt.expect_or_drop("valid_nconst_format", "nconst RLIKE '^nm[0-9]+$'")
def name_basics_silver_all():
    """Build the cleaned silver table with quality flags, one row per ``nconst``.

    Reads ``name_basics_bronze``, applies the cleaning helpers, then derives:

    * ``is_deceased``, ``age_at_death``, ``current_age``
    * ``has_valid_birth_year``, ``has_valid_death_year``, ``has_anomalous_age``
    * ``data_completeness_score`` (0-100, 20 points per populated attribute)
      and the ``quality_tier`` derived from it
    * ``silver_processing_timestamp`` / ``silver_processing_date`` /
      ``silver_version``
    * ``silver_quality_check``: ``"FAILED"`` when the age is anomalous, the
      birth or death year is not valid, or the tier is ``"POOR"``; otherwise
      ``"PASSED"``

    Finally only one row per ``nconst`` is kept (latest
    ``bronze_ingestion_timestamp`` first). The two ``expect_or_drop``
    expectations declared on the table require a non-NULL ``nconst`` of the
    form ``nm<digits>``.

    Returns:
        DataFrame: bronze columns plus the cleaned and derived columns.
    """
    df = dlt.read("name_basics_bronze")

    df = validate_person_id(df)
    df = clean_name(df)
    df = clean_year(df, "birthYear")
    df = clean_year(df, "deathYear")
    df = parse_professions(df)
    df = parse_titles(df)

    this_year = year(current_timestamp())

    df = df.withColumn("is_deceased", col("deathYear_clean").isNotNull())

    # NULL unless both years survived cleaning.
    df = df.withColumn(
        "age_at_death",
        when(col("deathYear_clean").isNotNull() & col("birthYear_clean").isNotNull(),
             col("deathYear_clean") - col("birthYear_clean"))
    )

    # NULL for deceased people and for people without a usable birth year.
    df = df.withColumn(
        "current_age",
        when(~col("is_deceased") & col("birthYear_clean").isNotNull(),
             this_year - col("birthYear_clean"))
    )

    # Any birth year from 1 up to the current year is accepted.
    df = df.withColumn(
        "has_valid_birth_year",
        col("birthYear_clean").isNotNull() &
        (col("birthYear_clean") >= 1) &
        (col("birthYear_clean") <= this_year)
    )

    # A missing death year is fine; a present one needs a birth year that is
    # not later than it, and must not lie in the future.
    df = df.withColumn(
        "has_valid_death_year",
        col("deathYear_clean").isNull() |
        (
            col("deathYear_clean").isNotNull()
            & col("birthYear_clean").isNotNull()
            & (col("deathYear_clean") >= col("birthYear_clean"))
            & (col("deathYear_clean") <= this_year)
        )
    )

    # age_at_death and current_age are frequently NULL, and a comparison with
    # NULL yields NULL. Without the coalesce the flag would be NULL (not False)
    # for every row where no comparison is true, which breaks plain
    # `has_anomalous_age = false` filters downstream.
    df = df.withColumn(
        "has_anomalous_age",
        coalesce(
            (col("age_at_death") < 0)
            | (col("age_at_death") > 120)
            | (col("current_age") < 0)
            | (col("current_age") > 120),
            lit(False)
        )
    )

    df = df.withColumn(
        "data_completeness_score",
        (
            when(col("nconst_valid"), 20).otherwise(0)
            + when(col("primaryName_clean").isNotNull(), 20).otherwise(0)
            + when(col("birthYear_clean").isNotNull(), 20).otherwise(0)
            + when(col("profession_count") > 0, 20).otherwise(0)
            + when(col("titles_count") > 0, 20).otherwise(0)
        )
    )

    df = df.withColumn(
        "quality_tier",
        when(col("data_completeness_score") >= 80, "HIGH")
        .when(col("data_completeness_score") >= 60, "MEDIUM")
        .when(col("data_completeness_score") >= 40, "LOW")
        .otherwise("POOR")
    )

    df = df.withColumn("silver_processing_timestamp", current_timestamp())
    df = df.withColumn("silver_processing_date", current_timestamp().cast("date"))
    df = df.withColumn("silver_version", lit("1.0"))

    df = df.withColumn(
        "silver_quality_check",
        when(col("has_anomalous_age"), "FAILED")
        .when(~coalesce(col("has_valid_birth_year"), lit(False)), "FAILED")
        .when(~coalesce(col("has_valid_death_year"), lit(False)), "FAILED")
        .when(col("quality_tier") == "POOR", "FAILED")
        .otherwise("PASSED")
    )

    # Keep one row per person. Rows loaded by the same bronze run share one
    # ingestion timestamp, so the record hash is used as a tie-breaker to make
    # the surviving row deterministic across runs.
    w = Window.partitionBy("nconst").orderBy(
        col("bronze_ingestion_timestamp").desc(),
        col("bronze_record_hash").asc()
    )
    df = df.withColumn("rn", row_number().over(w)).filter(col("rn") == 1).drop("rn")

    return df


# ============================================================
# 5. SILVER CLEAN (NO NULLS)
# ============================================================

@dlt.table(
    name="name_basics_silver_clean",
    comment="Silver high-quality rows (no NULLs)"
)
def name_basics_silver_clean():
    """Project the rows that passed the silver quality check, without NULLs.

    Reads ``name_basics_silver_all``, keeps rows whose
    ``silver_quality_check`` equals ``"PASSED"``, and selects a fixed list of
    columns. Each selected column is wrapped in ``coalesce`` with a default
    value and given its output name.
    """
    passed_rows = dlt.read("name_basics_silver_all").filter(
        col("silver_quality_check") == "PASSED"
    )

    # (source column, default used when the value is NULL, output column name)
    projection = [
        # identity
        ("nconst", lit("UNKNOWN"), "person_imdb_id"),
        ("primaryName_clean", lit("Unknown"), "primary_name"),
        ("quality_tier", lit("UNKNOWN"), "quality_tier"),
        # years and ages; -1 is the default for a missing value
        ("birthYear_clean", lit(-1), "birth_year"),
        ("deathYear_clean", lit(-1), "death_year"),
        ("age_at_death", lit(-1), "age_at_death"),
        ("current_age", lit(-1), "current_age"),
        # counts and score
        ("profession_count", lit(0), "profession_count"),
        ("titles_count", lit(0), "titles_count"),
        ("data_completeness_score", lit(0), "data_completeness_score"),
        # boolean flags
        ("is_deceased", lit(False), "is_deceased"),
        ("is_actor", lit(False), "is_actor"),
        ("is_director", lit(False), "is_director"),
        ("is_writer", lit(False), "is_writer"),
        ("is_producer", lit(False), "is_producer"),
        # arrays
        ("primaryProfession_array", expr("array()"), "primary_professions"),
        ("knownForTitles_array", expr("array()"), "known_for_titles"),
        # silver audit columns
        ("silver_processing_timestamp", current_timestamp(), "silver_processing_timestamp"),
        ("silver_processing_date", current_timestamp().cast("date"), "silver_processing_date"),
        ("silver_version", lit("1.0"), "silver_version"),
        # bronze audit columns
        ("bronze_ingestion_timestamp", current_timestamp(), "bronze_ingestion_timestamp"),
        ("bronze_ingestion_date", current_timestamp().cast("date"), "bronze_ingestion_date"),
        ("bronze_source_system", lit("imdb"), "bronze_source_system"),
        ("bronze_source_file", lit("UNKNOWN_FILE"), "bronze_source_file"),
        ("bronze_record_hash", lit(""), "bronze_record_hash"),
    ]

    selected = [
        coalesce(col(source_name), default_value).alias(output_name)
        for source_name, default_value, output_name in projection
    ]
    return passed_rows.select(*selected)


# ============================================================
# 6. QUARANTINE
# ============================================================

@dlt.table(
    name="name_basics_quarantine",
    comment="Records that failed Silver quality checks"
)
def name_basics_quarantine():
    """Return the ``name_basics_silver_all`` rows whose quality check is ``"FAILED"``."""
    silver_rows = dlt.read("name_basics_silver_all")
    failed_check = col("silver_quality_check") == "FAILED"
    return silver_rows.filter(failed_check)
