In [0]:
from pyspark.sql.functions import col

def check_null_values(df, df_name):
    """Print the number of null values found in each column of a DataFrame.

    All columns are counted in a single aggregation, so the DataFrame is
    scanned once instead of once per column. Only columns that contain at
    least one null are reported.

    Args:
        df: Spark DataFrame to inspect.
        df_name: Label used for the DataFrame in the printed messages.
    """
    # Function-scope import so this cell does not depend on imports made by
    # other cells.
    from pyspark.sql import functions as F

    print(f"Checking null values for DataFrame: {df_name}")

    column_names = df.columns
    # agg() needs at least one expression, so skip it for a column-less frame.
    if column_names:
        # when() without otherwise() yields NULL for non-matching rows and
        # count() skips NULLs, so each expression counts only the null rows.
        # Positional aliases keep the result names unique and free of any
        # special characters present in the source column names.
        null_counts = df.agg(
            *[
                F.count(F.when(F.col(name).isNull(), 1)).alias(f"null_count_{idx}")
                for idx, name in enumerate(column_names)
            ]
        ).first()

        for col_name, null_count in zip(column_names, null_counts):
            if null_count > 0:
                print(f" DataFrame '{df_name}': Column '{col_name}' has {null_count} null values")

    print(f"Null Value check for DataFrame '{df_name}' is completed!\n")


In [0]:
from pyspark.sql.functions import countDistinct
def check_primary_key_uniqueness(df, df_name, primary_key):
    """Check that a primary-key column contains unique, non-null values.

    The total row count, the non-null key count and the distinct key count
    are gathered in one aggregation. Distinct counting ignores nulls, so
    null keys are reported separately rather than being mistaken for
    duplicates.

    Args:
        df: Spark DataFrame to inspect.
        df_name: Label used for the DataFrame in the printed messages.
        primary_key: Name of the primary-key column.
    """
    # Function-scope import so this cell does not depend on imports made by
    # other cells.
    from pyspark.sql import functions as F

    print(f"Checking uniqueness for primary key '{primary_key}' in DataFrame: {df_name}")

    key = F.col(primary_key)
    total_count, non_null_count, unique_count = df.agg(
        F.count(F.lit(1)).alias("total_rows"),        # every row
        F.count(key).alias("non_null_keys"),          # rows with a non-null key
        F.countDistinct(key).alias("distinct_keys"),  # distinct non-null keys
    ).first()

    null_count = total_count - non_null_count
    # Duplicates are judged among non-null keys only; nulls are reported on
    # their own line below.
    has_duplicates = non_null_count != unique_count

    if null_count > 0:
        print(f"Primary key '{primary_key}' has {null_count} null values in '{df_name}'.")
    if has_duplicates:
        print(f"Primary key '{primary_key}' has duplicate values in '{df_name}'.")
    if null_count == 0 and not has_duplicates:
        print(f"Primary key '{primary_key}' is unique in '{df_name}'.")

    print(f"Primary Key Uniqueness check for '{df_name}' is completed!\n")

In [0]:
def check_data_types(df, df_name, expected_schema):
    """Compare a DataFrame's column data types against an expected schema.

    Prints one line per expected column: whether its type matches, differs,
    or the column is absent from the DataFrame altogether. Columns present
    in the DataFrame but not listed in ``expected_schema`` are not checked.

    Args:
        df: Object exposing ``dtypes`` as (column name, type string) pairs.
        df_name: Label used for the DataFrame in the printed messages.
        expected_schema: Mapping of column name to expected type string.
    """
    print(f"Checking data types for DataFrame: {df_name}")
    actual_schema = dict(df.dtypes)

    for col_name, expected_dtype in expected_schema.items():
        # A missing column is reported as such rather than as a type
        # mismatch against None.
        if col_name not in actual_schema:
            print(f" Column '{col_name}' is missing from '{df_name}'. Expected data type: {expected_dtype}")
            continue

        actual_dtype = actual_schema[col_name]
        if actual_dtype != expected_dtype:
            print(f" Column '{col_name}' in '{df_name}' has incorrect data type. Expected: {expected_dtype}, Found: {actual_dtype}")
        else:
            print(f"Column '{col_name}' in '{df_name}' has the correct data type: {expected_dtype}.")

    print(f"Data Type check for '{df_name}' is completed!\n")

In [0]:
from pyspark.sql.functions import col

def check_foreign_key_constraint(fact_df, fact_name, dim_df, dim_name, fact_fk, dim_pk):
    """Report fact rows whose foreign key has no match in the dimension table.

    Args:
        fact_df: Fact DataFrame holding the foreign-key column.
        fact_name: Label used for the fact table in the printed messages.
        dim_df: Dimension DataFrame holding the referenced key column.
        dim_name: Label used for the dimension table in the printed messages.
        fact_fk: Name of the foreign-key column in ``fact_df``.
        dim_pk: Name of the key column in ``dim_df``.
    """
    print(f"🔍 Checking Foreign Key Constraint between '{fact_name}' (fact) and '{dim_name}' (dimension) on '{fact_fk}' → '{dim_pk}'")

    # A left-anti join keeps only the fact rows for which the join condition
    # finds no dimension row; fact rows with a null key never satisfy the
    # equality and are therefore counted as well.
    keys_match = fact_df[fact_fk] == dim_df[dim_pk]
    orphan_rows = fact_df.join(dim_df, keys_match, "left_anti")
    orphan_total = orphan_rows.count()

    if orphan_total <= 0:
        outcome = f"Foreign Key Constraint '{fact_fk} → {dim_pk}' is satisfied."
    else:
        outcome = f"{orphan_total} records in '{fact_name}' have missing foreign key values that do not exist in '{dim_name}'."
    print(outcome)

    print(f" Foreign Key Constraint check for '{fact_name}' → '{dim_name}' is completed!\n")
