In [0]:
from pyspark.sql.functions import col

# Fully-qualified names of the shared source table and the bronze target table.
SOURCE_TABLE = "deep_sync_us_zip_code_metadata_populations_geo_centroid_lat_lng_city_names_state_dma_demographics.default.zip_code_metadata"
BRONZE_TABLE = "tabular.dataexpert.population_data_bronze"

# Read the shared data set as-is into a DataFrame.
df = spark.read.table(SOURCE_TABLE)

# Persist it as a Delta table; "overwrite" replaces any existing contents,
# so re-running this cell refreshes the bronze table.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(BRONZE_TABLE)
)

In [0]:
from pyspark.sql.functions import col

from pyspark.sql import functions as F

# Load the data
df = spark.table("tabular.dataexpert.population_data_bronze")

# Define Quality Checks
# Each entry maps a check's display name to a boolean Column that is TRUE for
# a row that violates the check. A row whose condition evaluates to NULL
# (e.g. a comparison involving a NULL value) is not counted as a violation,
# which is the same behavior as DataFrame.filter().
check_conditions = {
    "Check for Null ZIP Codes": col("ZIP").isNull(),
    "Check for Null GEOPOINTs": col("GEOPOINT").isNull(),
    "Check for Negative Population Values": (
        (col("TOTAL_POPULATION") < 0)
        | (col("TOTAL_MALE_POPULATION") < 0)
        | (col("TOTAL_FEMALE_POPULATION") < 0)
    ),
    "Check for Total Population Mismatch": (
        col("TOTAL_POPULATION")
        != (col("TOTAL_MALE_POPULATION") + col("TOTAL_FEMALE_POPULATION"))
    ),
    "Check for Invalid Latitude/Longitude": (
        (col("LATITUDE") < -90) | (col("LATITUDE") > 90)
        | (col("LONGITUDE") < -180) | (col("LONGITUDE") > 180)
    ),
    "Check for Median Age Range (0-120)": (
        (col("MEDIAN_AGE") < 0) | (col("MEDIAN_AGE") > 120)
    ),
}

# Evaluate every check in a single aggregation instead of issuing one
# filter().count() action per check. when() without otherwise() yields NULL
# unless the condition is TRUE, and count() ignores NULLs, so each aggregate
# equals df.filter(condition).count() -- including 0 for an empty table.
# Positional aliases are used so the check names never have to be valid
# column identifiers.
counts_row = df.agg(
    *[
        F.count(F.when(condition, 1)).alias(f"check_{position}")
        for position, condition in enumerate(check_conditions.values())
    ]
).first()

# Same shape as before: {check name: number of violating rows}, in
# definition order.
quality_checks = dict(zip(check_conditions.keys(), counts_row))

# Display Results
for check, count in quality_checks.items():
    print(f"{check}: {count}")

# Generate a Summary Report
summary_df = spark.createDataFrame(list(quality_checks.items()), ["Check", "Count"])
display(summary_df)