# In [0]:
import dlt
from pyspark.sql.functions import col, trim, coalesce, lit

# Base directory holding the raw IMDb TSV files; the bronze loaders below
# append the file name (e.g. "/title.principals.tsv") to this prefix.
RAW_BASE_PATH = "/Volumes/workspace/imdb/imdb"

# ---------------------------
# BRONZE – PRINCIPALS
# ---------------------------

@dlt.table(
    name="imdb_title_principals_bronze",
    comment="Raw IMDb title.principals.tsv from Volumes"
)
def imdb_title_principals_bronze():
    """Load title.principals.tsv from the raw volume without any cleaning.

    The file is parsed as tab-separated text with a header row. No schema
    and no schema inference are requested here, so typing is left to the
    downstream silver table.
    """
    source_path = f"{RAW_BASE_PATH}/title.principals.tsv"
    reader_options = {
        "header": "true",    # first line carries the column names
        "sep": "\t",         # tab-delimited
        "nullValue": "\\N",  # the literal \N token is read as NULL
    }
    # NOTE(review): Spark's default quote character (") stays active; if any
    # field can begin with a double quote, consider .option("quote", "") --
    # confirm against the data before changing.
    return spark.read.options(**reader_options).csv(source_path)

# ---------------------------
# BRONZE – RATINGS
# ---------------------------

@dlt.table(
    name="imdb_title_ratings_bronze",
    comment="Raw IMDb title.ratings.tsv from Volumes"
)
def imdb_title_ratings_bronze():
    """Load title.ratings.tsv from the raw volume without any cleaning.

    The file is parsed as tab-separated text with a header row. No schema
    and no schema inference are requested here, so typing is left to the
    downstream silver table.
    """
    source_path = f"{RAW_BASE_PATH}/title.ratings.tsv"
    reader_options = {
        "header": "true",    # first line carries the column names
        "sep": "\t",         # tab-delimited
        "nullValue": "\\N",  # the literal \N token is read as NULL
    }
    return spark.read.options(**reader_options).csv(source_path)

# ---------------------------
# SILVER – PRINCIPALS
# ---------------------------

@dlt.table(
    name="imdb_title_principals_silver",
    comment="Cleaned principals – renamed, typed, filtered, no nulls"
)
def imdb_title_principals_silver():
    """Build the cleaned principals table from the bronze principals table.

    Steps, in order:
      1. Rename the source columns to the silver naming scheme.
      2. Cast Ordering to int; trim Category and Job; default a NULL Job to
         "Unknown" and a NULL Characters to "[]".
      3. Keep only rows with a ``tt<digits>`` TCONST, an ``nm<digits>``
         NCONST, a positive Ordering and a non-empty Category.
      4. Drop any row that still contains a NULL in any column.
    """
    column_renames = {
        "tconst": "TCONST",
        "ordering": "Ordering",
        "nconst": "NCONST",
        "category": "Category",
        "job": "Job",
        "characters": "Characters",
    }

    principals = dlt.read("imdb_title_principals_bronze")
    for source_name, silver_name in column_renames.items():
        principals = principals.withColumnRenamed(source_name, silver_name)

    # Column-level normalisation. Job and Characters are not referenced by
    # any row filter below, so defaulting them before filtering is safe.
    normalised = (
        principals
        .withColumn("Ordering", col("Ordering").cast("int"))
        .withColumn("Category", trim(col("Category")))
        .withColumn("Job", coalesce(trim(col("Job")), lit("Unknown")))
        .withColumn("Characters", coalesce(col("Characters"), lit("[]")))
    )

    # Row-level validity predicates; a row must satisfy all of them.
    valid_title_id = (
        col("TCONST").isNotNull() & col("TCONST").rlike("^tt[0-9]+$")
    )
    valid_person_id = (
        col("NCONST").isNotNull() & col("NCONST").rlike("^nm[0-9]+$")
    )
    # A failed int cast yields NULL, which the isNotNull check rejects.
    positive_ordering = col("Ordering").isNotNull() & (col("Ordering") > 0)
    non_empty_category = (
        col("Category").isNotNull() & (col("Category") != "")
    )

    valid_rows = normalised.where(
        valid_title_id
        & valid_person_id
        & positive_ordering
        & non_empty_category
    )

    # Final guard: remove rows with a NULL in any remaining column.
    return valid_rows.dropna(how="any")

# ---------------------------
# SILVER – RATINGS
# ---------------------------

@dlt.table(
    name="imdb_title_ratings_silver",
    comment="Cleaned ratings – valid TCONST, rating 0–10, non-negative votes"
)
def imdb_title_ratings_silver():
    """Build the cleaned ratings table from the bronze ratings table.

    Steps, in order:
      1. Rename the source columns to the silver naming scheme.
      2. Cast Average_Rating to double and Num_Votes to int.
      3. Keep only rows with a ``tt<digits>`` TCONST, a rating within
         [0, 10] and a non-negative vote count.
      4. Drop any row that still contains a NULL in any column.
    """
    column_renames = {
        "tconst": "TCONST",
        "averageRating": "Average_Rating",
        "numVotes": "Num_Votes",
    }

    ratings = dlt.read("imdb_title_ratings_bronze")
    for source_name, silver_name in column_renames.items():
        ratings = ratings.withColumnRenamed(source_name, silver_name)

    # Values that cannot be cast become NULL and are rejected by the
    # isNotNull checks below.
    typed = (
        ratings
        .withColumn("Average_Rating", col("Average_Rating").cast("double"))
        .withColumn("Num_Votes", col("Num_Votes").cast("int"))
    )

    # Row-level validity predicates; a row must satisfy all of them.
    valid_title_id = (
        col("TCONST").isNotNull() & col("TCONST").rlike("^tt[0-9]+$")
    )
    # between() is inclusive on both ends: 0.0 <= rating <= 10.0.
    rating_in_range = (
        col("Average_Rating").isNotNull()
        & col("Average_Rating").between(0.0, 10.0)
    )
    non_negative_votes = (
        col("Num_Votes").isNotNull() & (col("Num_Votes") >= 0)
    )

    valid_rows = typed.where(
        valid_title_id & rating_in_range & non_negative_votes
    )

    # Final guard: remove rows with a NULL in any remaining column.
    return valid_rows.dropna(how="any")