In [0]:
# Load the IMDb title ratings TSV. No schema is supplied and schema inference
# is not enabled, so Spark reads every column as a string; the literal "\N"
# marker is mapped to null via the nullValue option.
TITLE_RATINGS_PATH = "/Volumes/workspace/imdb/imdb/title.ratings.tsv"

df_title_ratings = (
    spark.read
    .option("header", "true")
    .option("sep", "\t")
    .option("nullValue", "\\N")
    .csv(TITLE_RATINGS_PATH)
)

# describe() is lazy and only returns a new DataFrame; it has to be displayed
# explicitly, otherwise the summary statistics are silently discarded.
display(df_title_ratings.describe())
display(df_title_ratings)


In [0]:
from pyspark.sql.functions import col

# Rename the source columns one pair at a time; withColumnRenamed is applied
# in the order listed, starting from the unmodified ratings frame.
_ratings_column_renames = (
    ("tconst", "TCONST"),
    ("averageRating", "Average_Rating"),
    ("numVotes", "Num_Votes"),
)

df_title_ratings_renamed = df_title_ratings
for _old_name, _new_name in _ratings_column_renames:
    df_title_ratings_renamed = df_title_ratings_renamed.withColumnRenamed(
        _old_name, _new_name
    )

# Show the renamed frame first, then its schema.
display(df_title_ratings_renamed)
df_title_ratings_renamed.printSchema()


In [0]:
from pyspark.sql.functions import col

# Step 1: cast the rating and vote-count columns to numeric types.
_ratings_typed = (
    df_title_ratings_renamed
    .withColumn("Average_Rating", col("Average_Rating").cast("double"))
    .withColumn("Num_Votes", col("Num_Votes").cast("int"))
)

# Step 2: row-level validity predicates.
# IDs must be present and look like "tt" followed by digits only.
_has_valid_id = col("TCONST").isNotNull() & col("TCONST").rlike("^tt[0-9]+$")
# Ratings must be present and lie in the inclusive range [0.0, 10.0].
_has_valid_rating = (
    col("Average_Rating").isNotNull()
    & col("Average_Rating").between(0.0, 10.0)
)
# Vote counts must be present and non-negative.
_has_valid_votes = col("Num_Votes").isNotNull() & (col("Num_Votes") >= 0)

# Step 3: keep only rows passing every predicate, then drop any row that
# still contains a null in any column.
df_title_ratings_clean = (
    _ratings_typed
    .filter(_has_valid_id)
    .filter(_has_valid_rating)
    .filter(_has_valid_votes)
    .na.drop("any")
)

# Report how many rows survived the cleaning, then preview the first 20.
print("Original ratings rows:", df_title_ratings.count())
print("After cleaning (modelling ready ratings):", df_title_ratings_clean.count())
display(df_title_ratings_clean.limit(20))


In [0]:
%pip install ydata-profiling
# NOTE(review): this install is unpinned, so the resolved version can change
# between runs; consider pinning it. ydata-profiling is not referenced in the
# cells visible here — confirm it is still needed.

In [0]:
# Smoke check for the DQX library: the import raises if the package cannot be
# found, so the message below is printed only when the import succeeded.
import databricks.labs.dqx
print("DQX is available here")


In [0]:
from databricks.labs.dqx.profiler.profiler import DQProfiler
from databricks.labs.dqx.profiler.generator import DQGenerator
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient
import json, yaml

# Profile the cleaned ratings data with the DQX profiler and print the
# resulting profiles plus a truncated summary.

# Note: this always constructs a new client; if one was created in an earlier
# cell it could be reused instead.
ws = WorkspaceClient()

profiler_r = DQProfiler(ws)

# Fixed seed so the sampled subset (and therefore the generated profiles) is
# the same on every run; with a seed of None the sample differs between runs.
PROFILE_SAMPLE_SEED = 42

profile_options_r = {
    "round": True,
    "max_in_count": 10,
    "distinct_ratio": 0.05,
    "max_null_ratio": 0.01,
    "remove_outliers": True,
    "outlier_columns": [],
    "num_sigmas": 3,
    "trim_strings": True,
    "max_empty_ratio": 0.01,
    "sample_fraction": 0.3,
    "sample_seed": PROFILE_SAMPLE_SEED,
    "limit": 1000,
}

# Profile only the three ratings columns.
df_ratings_to_profile = df_title_ratings_clean.select(
    "TCONST", "Average_Rating", "Num_Votes"
)

ratings_summary, ratings_profiles = profiler_r.profile(
    df_ratings_to_profile,
    options=profile_options_r
)

print("── RATINGS DQX PROFILES ──")
for p in ratings_profiles:
    print(p)

print("\n── RATINGS DQX SUMMARY (truncated) ──")
# default=str keeps json.dumps from raising TypeError if the summary contains
# values that are not JSON-native (e.g. Decimal or datetime); output is capped
# at the first 3000 characters.
print(json.dumps(ratings_summary, indent=4, default=str)[:3000])
