In [0]:
# Cell 1: Load title.crew.tsv data
df_title_crew = spark.read.option("header", "true") \
    .option("sep", "\t") \
    .option("nullValue", "\\N") \
    .csv("/Volumes/workspace/damg7370/datastore/IMDB/title.crew.tsv.gz")
        




In [0]:
# Cell 2: DataFrame describe
df_title_crew.describe()


In [0]:
# Cell 3: DataFrame display
display(df_title_crew)

In [0]:
# Cell 4: Trim and rename columns
# Drop summary column if exists
df_titlecrew_trimmed = df_title_crew.drop("summary") if "summary" in df_title_crew.columns else df_title_crew

In [0]:
# Rename columns for title.crew
df_titlecrew_trimmed = (
    df_titlecrew_trimmed
        .withColumnRenamed("tconst", "TCONST")
        .withColumnRenamed("directors", "Directors")
        .withColumnRenamed("writers", "Writers")
)

display(df_titlecrew_trimmed)

In [0]:
# Cell 5: Install ydata-profiling
%pip install ydata-profiling


In [0]:
# Cell 6: Restart Python
%restart_python

In [0]:
# Cell 7: Install databricks-labs-dqx
%pip install databricks-labs-dqx

In [0]:
from databricks.labs.dqx.engine import DQEngine
from databricks.labs.dqx.profiler.generator import DQGenerator
from databricks.labs.dqx.profiler.profiler import DQProfiler
from databricks.sdk import WorkspaceClient

# Cell 8: Profile the renamed title.crew DataFrame with DQX
ws = WorkspaceClient()
profiler = DQProfiler(ws)

# Options handed to the profiler; only the sample fraction is overridden here.
profile_options = dict(sample_fraction=0.3)

# profile() returns a pair: summary statistics and the per-column profiles.
summary_stats, profiles = profiler.profile(
    df_titlecrew_trimmed,
    options=profile_options,
)

print(profiles)

In [0]:
# Cell 9: Generate DQX quality rules
# Build candidate checks from the profiles produced by the profiler,
# then print them one per line followed by a separator.
generator = DQGenerator(ws)
checks = generator.generate_dq_rules(profiles)

for rule in checks:
    print(rule)

print("-----------------------------------------------------------------------------------------------")

In [0]:
# Cell 10: Apply checks and validate
# Run the profile-generated checks (`checks`) against the renamed title.crew
# DataFrame and render the DataFrame the engine returns.
dqengine = DQEngine(ws)
results = dqengine.apply_checks_by_metadata(df_titlecrew_trimmed, checks)
display(results)

In [0]:
# Cell 11: User-defined DQX checks for title.crew
import yaml
from pyspark.sql.functions import col, when, count

# Step 1: user-defined DQX check validating the TCONST format.
# The rule is written as YAML metadata and parsed into the list passed to the engine.
tconst_format_yaml = """
- criticality: error
  check:
    function: sql_expression
    arguments:
      expression: "TCONST RLIKE '^tt[0-9]+$'"
      msg: "Invalid TCONST format - must start with 'tt' followed by digits"
"""
udChecks = yaml.safe_load(tconst_format_yaml)

dqengine = DQEngine(ws)

# Step 2: apply the check and split the input into valid rows and quarantined rows.
# globals() is passed as the third argument, exactly as before.
valid, quarantine = dqengine.apply_checks_by_metadata_and_split(
    df_titlecrew_trimmed, udChecks, globals()
)

display(quarantine)


In [0]:
# Cell 12: Preview quarantined rows
# Console preview of the quarantined rows; show() prints only its default
# number of rows, so this is a sample rather than the full set.
quarantine.show()

In [0]:
# Cell 13: Aggregation by Directors
# Number of quarantined rows for each Directors value.
aggresult = quarantine.groupBy("Directors").agg(count("*").alias("count"))
display(aggresult)

In [0]:
# Cell 14: Aggregation by Writers
# Number of quarantined rows for each Writers value (reuses the aggresult name).
aggresult = quarantine.groupBy("Writers").agg(count("*").alias("count"))
display(aggresult)

In [0]:
# Cell 15: Non-null counts for selected columns
# count() over a column skips nulls, so each aggregate is that column's non-null total.
quarantine_columns = ["Directors", "Writers"]
non_null_counts = [count(col(name)).alias(name) for name in quarantine_columns]
quarantine.select(*non_null_counts).show()

In [0]:
# Cell 16: Null counts for selected columns
# For each column in quarantine_columns, count the rows where it is null:
# when() yields a non-null marker only for null rows, and count() tallies those.
null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in quarantine_columns
]
quarantine.select(*null_counts).show()


In [0]:
# Cell 17: Distinct invalid TCONST values
# De-duplicate the TCONST values found in quarantine, preview them,
# then print how many distinct values there are.
quarantine_dist = quarantine.select("TCONST").dropDuplicates()
quarantine_dist.show()
print(quarantine_dist.count())