In [0]:
df_title_basics = spark.read.option("header", "true") \
    .option("sep", "\t") \
    .option("nullValue", "\\N") \
    .csv("/Volumes/workspace/damg7370/datastore/imdb/raw/title.basics.tsv")

In [0]:
#DataFrame describe
df_title_basics.describe()

In [0]:
#DataFrame display
display(df_title_basics)

In [0]:
# Trim the summary column
df_titlebasics_trimmed = df_title_basics.drop("summary")

# Rename only the required columns
df_titlebasics_trimmed = (
    df_titlebasics_trimmed
        .withColumnRenamed("tconst", "TCONST")
        .withColumnRenamed("titleType", "Title_Type")
        .withColumnRenamed("primaryTitle", "Primary_Title")
        .withColumnRenamed("originalTitle", "Original_Title")
        .withColumnRenamed("isAdult", "Is_Adult")
        .withColumnRenamed("startYear", "Start_Year")
        .withColumnRenamed("endYear", "End_Year")
        .withColumnRenamed("runtimeMinutes", "Runtime_Minutes")
        .withColumnRenamed("genres", "Genres")
)

display(df_titlebasics_trimmed)


In [0]:
%pip install ydata-profiling

In [0]:
%restart_python

In [0]:
pip install databricks-labs-dqx

In [0]:
from databricks.labs.dqx.profiler.profiler import DQProfiler
from databricks.labs.dqx.profiler.generator import DQGenerator
from databricks.labs.dqx.profiler.dlt_generator import DQDltGenerator
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient
import json

# Options handed unchanged to DQProfiler.profile().
# NOTE(review): the meaning of each key is defined by DQX, not by this
# notebook -- confirm against the DQX profiler documentation before tuning.
profile_options = {
    "round": True,
    "max_in_count": 10,
    "distinct_ratio": 0.05,
    "max_null_ratio": 0.01,
    "remove_outliers": True,
    "outlier_columns": [],
    "num_sigmas": 3,
    "trim_strings": True,
    "max_empty_ratio": 0.01,
    "sample_fraction": 0.3,
    # Fixed seed so the sampled rows, and therefore the generated profile and
    # rules, are the same on every run of the notebook.
    "sample_seed": 42,
    "limit": 100,
}

separator = "-----------------------------------------------------------------------------------------------"

# The engine requires a Databricks workspace client for authentication
ws = WorkspaceClient()

# NOTE(review): the two listings below are not used by the profiling itself;
# they look like a connectivity/permission check -- confirm they are wanted.
for cluster in ws.clusters.list():
    print(cluster.cluster_name)
print(separator)

db_fs = ws.dbutils.fs.ls('/')
for entry in db_fs:
    print(entry.path)
print(separator)

# Profile the data
profiler = DQProfiler(ws)

# Profile ALL columns of the trimmed/renamed title.basics dataframe
summary_stats, profiles = profiler.profile(
    df_titlebasics_trimmed,
    options=profile_options
)

# Print the data profile / generated rules
for column_profile in profiles:
    print(column_profile)

print(separator)

# Pretty-print summary stats. default=str keeps the dump from raising
# TypeError if a statistic is not natively JSON-serializable (for example a
# Decimal or a datetime); JSON-native values are printed exactly as before.
json_formatted = json.dumps(summary_stats, indent=4, default=str)
print(json_formatted)
print(separator)


In [0]:
# Turn the column profiles into candidate DQX quality rules and list them,
# one rule per line.
generator = DQGenerator(ws)
checks = generator.generate_dq_rules(profiles)

for rule in checks:
    print(rule)

print("-----------------------------------------------------------------------------------------------")

In [0]:
# Validate the renamed DataFrame against the rules generated from its profile
# and render the outcome. No columns are dropped here.
# NOTE(review): presumably the result is the input rows annotated with the
# check outcomes -- confirm against the DQX engine documentation.
dqengine = DQEngine(ws)
results = dqengine.apply_checks_by_metadata(
    df_titlebasics_trimmed,
    checks,
)
display(results)

In [0]:
import yaml
from pyspark.sql.functions import col, when, count

# Step 1: user-defined DQX check for Start_Year, declared as YAML metadata.
# The SQL expression describes a passing row: a four-digit value whose integer
# form lies between 1800 and 2100.
start_year_check_yaml = """
- criticality: error
  check:
    function: sql_expression
    arguments:
      expression: "Start_Year RLIKE '^[0-9]{4}$' AND CAST(Start_Year AS INT) BETWEEN 1800 AND 2100"
      msg: "Invalid or out-of-range Start_Year"
"""
udChecks = yaml.safe_load(start_year_check_yaml)

dqengine = DQEngine(ws)

# Step 2: apply the check and split the rows into valid vs quarantine.
# globals() is passed through as the third argument, as in the original call.
split_result = dqengine.apply_checks_by_metadata_and_split(
    df_titlebasics_trimmed,
    udChecks,
    globals(),
)
valid, quarantine = split_result

display(quarantine)


In [0]:
# Print a console preview of the quarantined rows. show() with no arguments
# prints only the first 20 rows and truncates long cell values.
quarantine.show()

In [0]:
# -- 2. Aggregation: number of quarantined rows per Start_Year value --
aggresult = (
    quarantine
    .groupBy("Start_Year")
    .count()
)
display(aggresult)

In [0]:
# Same aggregation on a different column: quarantined rows per Title_Type.
# Note that this rebinds aggresult, replacing the Start_Year breakdown.
aggresult = (
    quarantine
    .groupBy("Title_Type")
    .count()
)
display(aggresult)


In [0]:

# -- 3. Non-null counts for selected columns --
# quarantine_columns is also read by the null-count cell that follows.
quarantine_columns = ["Start_Year", "End_Year"]

# count() over a column tallies only its non-null values, giving one total
# per column under the column's own name.
non_null_counts = [
    count(col(column_name)).alias(column_name)
    for column_name in quarantine_columns
]
quarantine.select(non_null_counts).show()

In [0]:
# -- 4. Null counts for the same columns --
# when() yields a non-null marker only for rows where the column is null, so
# count() ends up tallying exactly those rows.
null_counts = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in quarantine_columns
]
quarantine.select(null_counts).show()


In [0]:
# -- 5. Distinct invalid Start_Year values (from quarantine) --
# Preview the unique values, then print how many there are.
quarantine_dist = quarantine.select("Start_Year").dropDuplicates()
quarantine_dist.show()

distinct_total = quarantine_dist.count()
print(distinct_total)