In [0]:
# Load the raw name.basics TSV: header row present, tab-separated,
# and the literal token \N is read as null.
name_basics_path = "/Volumes/workspace/damg7370/datastore/imdb/raw/name.basics.tsv"
name_basics_reader_options = {
    "header": "true",
    "sep": "\t",
    "nullValue": "\\N",
}
df_name_basics = spark.read.options(**name_basics_reader_options).csv(name_basics_path)

In [0]:
# Summary statistics for the raw DataFrame.
# describe() only builds a new (lazy) DataFrame; it has to be passed to
# display() for the statistics to actually be computed and rendered.
display(df_name_basics.describe())

In [0]:
# Render the raw name.basics DataFrame with the notebook's display() helper for a quick visual check.
display(df_name_basics)

In [0]:
%pip install ydata-profiling

In [0]:
%restart_python

In [0]:
pip install databricks-labs-dqx

In [0]:
# Start from the raw frame without a "summary" column.
df_namebasics_trimmed = df_name_basics.drop("summary")

# Original column name -> presentation name, applied in this order.
column_renames = {
    "nconst": "NCONST",
    "primaryName": "Primary_Name",
    "birthYear": "Birth_Year",
    "deathYear": "Death_Year",
    "primaryProfession": "Primary_Profession",
    "knownForTitles": "Known_For_Titles",
}
for old_name, new_name in column_renames.items():
    df_namebasics_trimmed = df_namebasics_trimmed.withColumnRenamed(old_name, new_name)

# Show the renamed DataFrame.
display(df_namebasics_trimmed)

In [0]:
from databricks.labs.dqx.profiler.profiler import DQProfiler
from databricks.labs.dqx.profiler.generator import DQGenerator
from databricks.labs.dqx.profiler.dlt_generator import DQDltGenerator
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient
import json

# Options passed to DQProfiler.profile() to tune how profiles/rule candidates are derived.
profile_options = {
    "round": True,           # round the min/max values
    "max_in_count": 10,      # cap on distinct values for an is_in rule to be suggested -- TODO confirm against DQX docs
    "distinct_ratio": 0.05,  # distinct-value ratio threshold (5%, not 1%) -- TODO confirm exact semantics in DQX docs
    "max_null_ratio": 0.01,  # generate is_null if we have less than 1 percent of nulls
    "remove_outliers": True, # remove outliers
    "outlier_columns": [],   # remove outliers in the columns
    "num_sigmas": 3,         # number of sigmas to use when remove_outliers is True
    "trim_strings": True,    # trim whitespace from strings
    "max_empty_ratio": 0.01, # generate is_empty if we have less than 1 percent of empty strings
    "sample_fraction": 0.3,  # fraction of data to sample (30%)
    "sample_seed": None,     # seed for sampling (None => sampling is not reproducible between runs)
    "limit": 100,            # limit the number of samples
}

# (Optional) if you ever want to restrict to specific columns, the names must
# match the renamed columns of df_namebasics_trimmed:
# columns_to_profile = [
#     "NCONST",
#     "Primary_Name",
#     "Birth_Year",
#     "Death_Year",
#     "Primary_Profession",
#     "Known_For_Titles"
# ]

# The engine requires a Databricks workspace client for authentication
ws = WorkspaceClient()

# Sanity check that the workspace client works: list the cluster names ...
for cluster in ws.clusters.list():
    print(cluster.cluster_name)
print("-----------------------------------------------------------------------------------------------")

# ... and the entries at the root of the file system.
db_fs = ws.dbutils.fs.ls('/')
for f in db_fs:
    print(f.path)
print("-----------------------------------------------------------------------------------------------")

# Profile the data
profiler = DQProfiler(ws)

# Profile ALL columns of df_namebasics_trimmed (the renamed DataFrame built earlier;
# the previously referenced df_namebasics_renamed is never defined in this notebook).
summary_stats, profiles = profiler.profile(
    df_namebasics_trimmed,
    options=profile_options
    # or, if you want specific columns only (check the parameter name for the
    # installed DQX version):
    # cols=columns_to_profile
)

# Print the data profile / generated rules
for pf in profiles:  # print list
    print(pf)

print("-----------------------------------------------------------------------------------------------")

# Pretty-print summary stats; default=str makes json fall back to str() for any
# value it cannot encode natively instead of raising TypeError.
json_formatted = json.dumps(summary_stats, indent=4, default=str)
print(json_formatted)
print("-----------------------------------------------------------------------------------------------")


In [0]:
# Derive candidate DQX quality rules from the column profiles.
generator = DQGenerator(ws)
checks = generator.generate_dq_rules(profiles)

# Echo every generated rule on its own line so it can be reviewed before use.
for rule in checks:
    print(rule)
print("-----------------------------------------------------------------------------------------------")

In [0]:
# Apply the profile-generated checks to the renamed DataFrame and show the outcome.
# df_namebasics_trimmed is the frame the notebook actually builds; the previously
# referenced df_namebasics_renamed is never defined.
dqengine = DQEngine(ws)
results = dqengine.apply_checks_by_metadata(df_namebasics_trimmed, checks)
display(results)

In [0]:
import yaml
from pyspark.sql.functions import col, when, count

# -- 1. User-defined DQX check: Birth_Year must be a 4-digit year between 1800 and 2025 --
birth_year_check_yaml = """
- criticality: error
  check:
    function: sql_expression
    arguments:
      expression: "Birth_Year RLIKE '^[0-9]{4}$' AND CAST(Birth_Year AS INT) BETWEEN 1800 AND 2025"
      msg: "Invalid or out-of-range Birth_Year"
"""
udChecks = yaml.safe_load(birth_year_check_yaml)
dqengine = DQEngine(ws)

# Run the check and separate the rows into a valid set and a quarantine set.
# The notebook namespace is passed as the third positional argument --
# presumably so DQX can resolve custom check functions; confirm against the
# installed version's signature.
split_frames = dqengine.apply_checks_by_metadata_and_split(
    df_namebasics_trimmed, udChecks, globals()
)
valid, quarantine = split_frames
display(quarantine)

In [0]:
# Console preview of the quarantined rows (show() prints only its default
# number of rows, not the whole DataFrame).
quarantine.show()

In [0]:

# -- 2. Aggregation: number of quarantined rows for each Birth_Year value --
rows_by_birth_year = quarantine.groupBy("Birth_Year")
aggresult = rows_by_birth_year.count()
display(aggresult)

In [0]:
# Same aggregation over a different key: quarantined rows per Primary_Profession.
rows_by_profession = quarantine.groupBy("Primary_Profession")
aggresult = rows_by_profession.count()
display(aggresult)

In [0]:
# -- 3. Non-null counts for selected columns --
quarantine_columns = ["Birth_Year", "Death_Year"]

# One aggregate per column: count() only counts rows where when() produced a
# value, i.e. rows where the column is not null.
non_null_counts = []
for column_name in quarantine_columns:
    is_present = col(column_name).isNotNull()
    non_null_counts.append(count(when(is_present, column_name)).alias(column_name))

quarantine.select(non_null_counts).show()

In [0]:
# -- 4. Null counts for the columns listed in quarantine_columns --
# One aggregate per column: count() only counts rows where when() produced a
# value, i.e. rows where the column is null.
null_counts = []
for column_name in quarantine_columns:
    is_missing = col(column_name).isNull()
    null_counts.append(count(when(is_missing, column_name)).alias(column_name))

quarantine.select(null_counts).show()

In [0]:
# -- 5. Distinct Birth_Year values among the quarantined rows --
quarantined_birth_years = quarantine.select("Birth_Year")
quarantine_dist = quarantined_birth_years.distinct()

# Print the distinct values, then how many of them there are.
quarantine_dist.show()
distinct_value_total = quarantine_dist.count()
print(distinct_value_total)