In [0]:
df_title_akas = spark.read.option("header", "true") \
    .option("sep", "\t") \
    .option("nullValue", "\\N") \
    .csv("/Volumes/workspace/damg7370/datastore/imdb/raw/title.akas.tsv")

In [0]:
#DataFrame describe
df_title_akas.describe()

In [0]:
#DataFrame display
display(df_title_akas)

In [0]:
# Current schema:
# summary, titleId, ordering, title, region, language, types, attributes, isOriginalTitle

# 1️⃣ Drop the summary column
df_titleakas_trimmed = df_title_akas.drop("summary")

# 2️⃣ Rename columns (same style as your other tables)
df_titleakas_trimmed = (
    df_titleakas_trimmed
        .withColumnRenamed("titleId", "Title_ID")
        .withColumnRenamed("ordering", "Ordering")
        .withColumnRenamed("title", "Title")
        .withColumnRenamed("region", "Region")
        .withColumnRenamed("language", "Language")
        .withColumnRenamed("types", "Types")
        .withColumnRenamed("attributes", "Attributes")
        .withColumnRenamed("isOriginalTitle", "Is_Original_Title")
)

display(df_titleakas_trimmed)


In [0]:
%pip install ydata-profiling

In [0]:
%restart_python

In [0]:
pip install databricks-labs-dqx

In [0]:
from databricks.labs.dqx.profiler.profiler import DQProfiler
from databricks.labs.dqx.profiler.generator import DQGenerator
from databricks.labs.dqx.profiler.dlt_generator import DQDltGenerator
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient
import json
import yaml
from pyspark.sql.functions import col, when, count

# Options handed to DQProfiler.profile() below.
# NOTE(review): key meanings follow the DQX profiler documentation — confirm against the
# installed DQX version before tuning them.
profile_options = {
    "round": True,
    "max_in_count": 10,
    "distinct_ratio": 0.05,
    "max_null_ratio": 0.01,
    "remove_outliers": True,
    "outlier_columns": [],
    "num_sigmas": 3,
    "trim_strings": True,
    "max_empty_ratio": 0.01,
    "sample_fraction": 0.3,
    # Fixed seed so the sampled profile (and the rules generated from it) is
    # reproducible across runs instead of changing on every execution.
    "sample_seed": 42,
    "limit": 100,
}

# ── Workspace client ─────────────────────────────────────────────────────────
ws = WorkspaceClient()

# Diagnostic output only: list the cluster names visible to this workspace client
for c in ws.clusters.list():
    print(c.cluster_name)
print("-----------------------------------------------------------------------------------------------")

# Diagnostic output only: list the entries at the filesystem root
db_fs = ws.dbutils.fs.ls('/')
for f in db_fs:
    print(f.path)
print("-----------------------------------------------------------------------------------------------")

# ── 1. Profile the data (title.akas trimmed dataframe) ───────────────────────
profiler = DQProfiler(ws)

# profile() returns a (summary_stats, profiles) pair; `profiles` feeds the rule generator later
summary_stats, profiles = profiler.profile(
    df_titleakas_trimmed,        # << use title.akas DF
    options=profile_options
)

# Print the data profile / generated rules metadata
for pf in profiles:
    print(pf)

print("-----------------------------------------------------------------------------------------------")

# Pretty-print summary stats.
# default=str keeps the dump from raising TypeError if the stats contain values that
# json cannot encode natively (e.g. Decimal or datetime objects).
json_formatted = json.dumps(summary_stats, indent=4, default=str)
print(json_formatted)
print("-----------------------------------------------------------------------------------------------")

In [0]:
# ── 2. Derive DQX quality rules from the column profiles ─────────────────────
generator = DQGenerator(ws)
checks = generator.generate_dq_rules(profiles)

# Echo every generated rule so it can be reviewed before it is applied
for rule in checks:
    print(rule)
print("-----------------------------------------------------------------------------------------------")

In [0]:
# Apply the generated checks
# `dqengine` is created here and reused by the user-defined check cell further down.
dqengine = DQEngine(ws)
# Run the profiler-generated rules (`checks`) against the renamed title.akas DataFrame
results = dqengine.apply_checks_by_metadata(df_titleakas_trimmed, checks)
display(results)

In [0]:
# ── 3. User-defined DQX check: Ordering must be a non-negative integer ───────
# The rule is declared as YAML metadata: an "error"-criticality sql_expression check
# with a custom failure message.
udChecks = yaml.safe_load("""
- criticality: error
  check:
    function: sql_expression
    arguments:
      expression: "Ordering RLIKE '^[0-9]+$' AND CAST(Ordering AS INT) >= 0"
      msg: "Invalid Ordering value (must be a non-negative integer)"
""")

# Run the user-defined check and split the outcome into `valid` and `quarantine` DataFrames
valid, quarantine = dqengine.apply_checks_by_metadata_and_split(
    df_titleakas_trimmed, udChecks, globals()
)

display(quarantine)

In [0]:
# Print a console preview of the quarantined rows.
# show() prints only the first 20 rows (long values truncated), not the whole DataFrame.
quarantine.show()

In [0]:
# 4.1 Number of quarantined (failed-check) rows for each Ordering value
aggresult = quarantine.groupBy("Ordering").count()
display(aggresult)

In [0]:
# 4.2 Number of quarantined rows for each Region value
# Note: this rebinds `aggresult`, replacing the per-Ordering result from 4.1.
aggresult = quarantine.groupBy("Region").count()
display(aggresult)

In [0]:
# 4.3 For each selected column, count the quarantined rows where it is populated
quarantine_columns = ["Ordering", "Region"]

quarantine.select(
    *(
        count(when(col(name).isNotNull(), name)).alias(name)
        for name in quarantine_columns
    )
).show()

In [0]:
# 4.4 For each column in quarantine_columns, count the quarantined rows where it is null
quarantine.select(
    *(
        count(when(col(name).isNull(), name)).alias(name)
        for name in quarantine_columns
    )
).show()

In [0]:
# 4.5 Distinct invalid Ordering values (from quarantine)
quarantine_dist = quarantine.select("Ordering").distinct()
# Preview the distinct values (show() prints at most the first 20) ...
quarantine_dist.show()
# ... then print how many distinct invalid values there are in total
print(quarantine_dist.count())