In [0]:
# Cell 1: Read the gzipped, tab-separated IMDB title.episode file into a Spark DataFrame.
# The first row is used as the header, and the literal string \N is read as null.
title_episode_path = "/Volumes/workspace/damg7370/datastore/IMDB/title.episode.tsv.gz"
df_title_episode = (
    spark.read
    .options(header="true", sep="\t", nullValue="\\N")
    .csv(title_episode_path)
)

In [0]:
# Cell 2: Summary statistics for the raw DataFrame.
# describe() only builds a new (lazy) DataFrame; it has to be passed to display()
# for the statistics to be computed and rendered. A bare `describe()` as the last
# expression shows just the schema repr, not the numbers.
display(df_title_episode.describe())

In [0]:

# Cell 3: DataFrame display
# Renders the raw title.episode DataFrame loaded in Cell 1 for visual inspection.
display(df_title_episode)


In [0]:
# Cell 4: Build the working DataFrame that the rename step and the DQX cells operate on.
# A "summary" column is dropped when present; otherwise the loaded DataFrame is used as-is.
if "summary" in df_title_episode.columns:
    df_titleepisode_trimmed = df_title_episode.drop("summary")
else:
    df_titleepisode_trimmed = df_title_episode

In [0]:
# Rename the title.episode columns to the naming used by the later quality checks.
# The renames are applied one at a time, in the order listed in the mapping.
column_renames = {
    "tconst": "TCONST",
    "parentTconst": "Parent_TCONST",
    "seasonNumber": "Season_Number",
    "episodeNumber": "Episode_Number",
}
for old_name, new_name in column_renames.items():
    df_titleepisode_trimmed = df_titleepisode_trimmed.withColumnRenamed(old_name, new_name)

display(df_titleepisode_trimmed)


In [0]:
# Cell 5: Install ydata-profiling
# NOTE(review): no other cell visible in this notebook imports or uses ydata-profiling —
# confirm it is still needed. The install is unpinned, so the version can change between runs.
%pip install ydata-profiling


In [0]:

# Cell 6: Restart Python
# NOTE(review): restarting the Python process is expected to clear variables defined in
# earlier cells (e.g. df_titleepisode_trimmed from Cell 4, which Cell 8 reads). Confirm the
# notebook still works with a fresh "Run All", or move the install/restart cells ahead of Cell 1.
%restart_python

In [0]:
# Cell 7: Install databricks-labs-dqx
# Provides the databricks.labs.dqx modules imported in Cell 8.
# NOTE(review): the install is unpinned and runs after the restart in Cell 6 — consider pinning
# a version and grouping all %pip cells at the top of the notebook, before any data is loaded.
%pip install databricks-labs-dqx

In [0]:
# Cell 8: Profile the data using DQX
# Builds a workspace client, prints some workspace context (cluster names, root file listing),
# profiles df_titleepisode_trimmed with DQProfiler, and prints the resulting profiles and
# summary statistics.
from databricks.labs.dqx.profiler.profiler import DQProfiler
from databricks.labs.dqx.profiler.generator import DQGenerator
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient
import json

# Options handed to DQProfiler.profile(). Only sample_seed differs from the previous revision:
# it is now a fixed integer so that the sampled rows (sample_fraction=0.3), and therefore the
# profiles and the rules generated from them, are reproducible from run to run.
profile_options = {
    "round": True,
    "max_in_count": 10,
    "distinct_ratio": 0.05,
    "max_null_ratio": 0.01,
    "remove_outliers": True,
    "outlier_columns": [],
    "num_sigmas": 3,
    "trim_strings": True,
    "max_empty_ratio": 0.01,
    "sample_fraction": 0.3,
    "sample_seed": 42,
    "limit": 100,
}

ws = WorkspaceClient()

# Workspace context: names of the clusters visible to this client.
for c in ws.clusters.list():
    print(c.cluster_name)
print("-----------------------------------------------------------------------------------------------")

# Workspace context: entries at the root of the file system.
db_fs = ws.dbutils.fs.ls('/')
for f in db_fs:
    print(f.path)
print("-----------------------------------------------------------------------------------------------")

profiler = DQProfiler(ws)

# profile() returns a pair: summary statistics and the list of column profiles.
summary_stats, profiles = profiler.profile(df_titleepisode_trimmed, options=profile_options)

for pf in profiles:
    print(pf)

print("-----------------------------------------------------------------------------------------------")

# default=str keeps the dump from raising TypeError if the statistics contain values that
# json cannot serialize natively (e.g. Decimal or datetime); such values are printed via str().
json_formatted = json.dumps(summary_stats, indent=4, default=str)
print(json_formatted)
print("-----------------------------------------------------------------------------------------------")

In [0]:
# Cell 9: Turn the column profiles from Cell 8 into DQX quality rules and list them.
generator = DQGenerator(ws)
checks = generator.generate_dq_rules(profiles)

# One rule per output line, followed by a separator.
for rule in checks:
    print(rule)
print("-----------------------------------------------------------------------------------------------")

In [0]:
# Cell 10: Run the generated rules against the trimmed DataFrame and render the outcome.
dqengine = DQEngine(ws)
results = dqengine.apply_checks_by_metadata(
    df_titleepisode_trimmed,
    checks,
)
display(results)

In [0]:
# Cell 11: Apply user-defined DQX checks and split the data into valid / quarantine sets.
import yaml
# col, when and count are used by the null / non-null count cells further down (Cells 15-16).
from pyspark.sql.functions import col, when, count

# User-defined DQX checks for TCONST and Parent_TCONST format validation
# Three sql_expression rules are declared in YAML:
#   1. (error) TCONST and Parent_TCONST must both match 'tt' followed by digits.
#   2. (warn)  Season_Number must be null or a non-negative integer string.
#   3. (warn)  Episode_Number must be null or a positive integer string.
# NOTE(review): the expressions are written as validity conditions (true = good row). Confirm
# this matches the pass/fail semantics of sql_expression in the installed DQX version — the
# package is installed unpinned in Cell 7, so the behaviour can differ between runs.
udChecks = yaml.safe_load("""
- criticality: error
  check:
    function: sql_expression
    arguments:
      expression: "TCONST RLIKE '^tt[0-9]+$' AND Parent_TCONST RLIKE '^tt[0-9]+$'"
      msg: "Invalid TCONST or Parent_TCONST format - must start with 'tt' followed by digits"
- criticality: warn
  check:
    function: sql_expression
    arguments:
      expression: "Season_Number IS NULL OR (Season_Number RLIKE '^[0-9]+$' AND CAST(Season_Number AS INT) >= 0)"
      msg: "Invalid Season_Number - must be a non-negative integer"
- criticality: warn
  check:
    function: sql_expression
    arguments:
      expression: "Episode_Number IS NULL OR (Episode_Number RLIKE '^[0-9]+$' AND CAST(Episode_Number AS INT) >= 1)"
      msg: "Invalid Episode_Number - must be a positive integer"
""")

# A fresh engine is created here so this cell does not depend on Cell 10 having run.
dqengine = DQEngine(ws)

# The call returns two DataFrames, unpacked as `valid` and `quarantine`; both are used by the
# cells below. globals() is passed as the third positional argument.
# NOTE(review): presumably this is the lookup for custom check functions — confirm against the DQX API.
valid, quarantine = dqengine.apply_checks_by_metadata_and_split(
    df_titleepisode_trimmed,
    udChecks,
    globals()
)

display(quarantine)

In [0]:
# Cell 12: Print a plain-text preview of the quarantined rows.
# show() with no arguments prints only the first rows of the DataFrame, not every row.
quarantine.show()

In [0]:
# Cell 12a: Verify quarantine DataFrame
# Each count() triggers a Spark job, so the counts are computed once and reused below
# instead of re-counting the quarantine DataFrame for the emptiness check.
quarantine_row_count = quarantine.count()
valid_row_count = valid.count()

print(f"Quarantine row count: {quarantine_row_count}")
print(f"Valid row count: {valid_row_count}")
print("\nQuarantine columns:")
print(quarantine.columns)
print("\nQuarantine schema:")
quarantine.printSchema()

# Show sample if there are quarantined rows
if quarantine_row_count > 0:
    print("\nSample quarantine data:")
    quarantine.show(5)
else:
    print("\nNo rows in quarantine - all data passed validation!")

In [0]:
# Cell 13: Inspect the quarantine DataFrame, then count quarantined rows per season.
print(f"Quarantine row count: {quarantine.count()}")
print("Quarantine columns:")
quarantine.printSchema()

# A five-row preview of the quarantined data.
print("Sample quarantine data:")
quarantine.show(5)

# Aggregate only when the renamed season column is actually present.
if "Season_Number" not in quarantine.columns:
    print("Season_Number column not found in quarantine DataFrame")
else:
    aggresult = quarantine.groupBy("Season_Number").count()
    display(aggresult)

In [0]:
# Cell 14: Count quarantined rows per Episode_Number.
# Skipped with a message when the quarantine set is empty or the column is missing.
if quarantine.count() == 0 or "Episode_Number" not in quarantine.columns:
    print("No quarantine data to aggregate or Episode_Number column not found")
else:
    aggresult = quarantine.groupBy("Episode_Number").count()
    display(aggresult)

In [0]:
# Cell 15: Non-null counts for selected columns
# The quarantine DataFrame is derived from df_titleepisode_trimmed, whose columns were renamed
# in Cell 4 (seasonNumber -> Season_Number, episodeNumber -> Episode_Number,
# parentTconst -> Parent_TCONST). The original names no longer exist, so the renamed names
# must be used here; Cell 16 reuses this list.
quarantine_columns = ["Season_Number", "Episode_Number", "Parent_TCONST"]

# First check what columns actually exist
print("Available columns in quarantine:")
print(quarantine.columns)
print()

# Then do the count: for each column, count the rows where the value is not null.
quarantine.select(
    [count(when(col(c).isNotNull(), c)).alias(c) for c in quarantine_columns]
).show()

In [0]:
# Cell 16: For each column listed in quarantine_columns (defined in Cell 15),
# count the quarantined rows whose value is null.
null_count_exprs = []
for column_name in quarantine_columns:
    is_null_marker = when(col(column_name).isNull(), column_name)
    null_count_exprs.append(count(is_null_marker).alias(column_name))

quarantine.select(null_count_exprs).show()

In [0]:

# Cell 17: Distinct TCONST values among the quarantined rows.
# Prints a preview of the distinct values, then how many there are.
quarantine_dist = (
    quarantine
    .select("TCONST")
    .distinct()
)
quarantine_dist.show()
print(quarantine_dist.count())


In [0]:
# Cell 18: Distinct Parent_TCONST values among the quarantined rows
# The column was renamed from parentTconst to Parent_TCONST in Cell 4, so the renamed
# name is selected here; the original name no longer exists in the quarantine DataFrame.
quarantine_dist_parent = quarantine.select("Parent_TCONST").distinct()
quarantine_dist_parent.show()
print(quarantine_dist_parent.count())