In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, round

# Load the bronze population table.
df = spark.table("tabular.dataexpert.population_data_bronze")

# Round latitude and longitude to 2 decimal places.
# `round` here is pyspark.sql.functions.round (imported at the top of this cell),
# which shadows Python's builtin `round`.
df = df.withColumn("latitude", round(col("latitude"), 2)).withColumn("longitude", round(col("longitude"), 2))

# Each quality check maps a report label to the predicate that flags a row as a violation.
# NOTE(review): a comparison against NULL is never true, so rows holding NULL in the
# numeric columns below are not counted by the comparison-based checks -- confirm
# that this is the intended behavior.
violation_predicates = {
    "Check for Null ZIP Codes": col("ZIP").isNull(),
    "Check for Null GEOPOINTs": col("GEOPOINT").isNull(),
    "Check for Negative Population Values": (
        (col("TOTAL_POPULATION") < 0)
        | (col("TOTAL_MALE_POPULATION") < 0)
        | (col("TOTAL_FEMALE_POPULATION") < 0)
    ),
    "Check for Total Population Mismatch": (
        col("TOTAL_POPULATION")
        != (col("TOTAL_MALE_POPULATION") + col("TOTAL_FEMALE_POPULATION"))
    ),
    "Check for Invalid Latitude/Longitude": (
        (col("LATITUDE") < -90) | (col("LATITUDE") > 90)
        | (col("LONGITUDE") < -180) | (col("LONGITUDE") > 180)
    ),
    "Check for Median Age Range (0-120)": (col("MEDIAN_AGE") < 0) | (col("MEDIAN_AGE") > 120),
}

# Count the violations of every check in ONE aggregation (a single Spark job) rather
# than one filter().count() job per check. F.when(...) without otherwise() yields NULL
# for rows that do not match, and F.count() skips NULLs, so each aggregate equals
# df.filter(predicate).count(). A global aggregation always returns exactly one row,
# with 0 for every check when the table is empty.
violation_row = df.agg(
    *[
        F.count(F.when(predicate, 1)).alias(f"check_{position}")
        for position, predicate in enumerate(violation_predicates.values())
    ]
).first()

# Label -> number of violating rows, in the same order as the predicates above.
quality_checks = dict(zip(violation_predicates, violation_row))

# Display Results
for check, violations in quality_checks.items():
    print(f"{check}: {violations}")

# Generate a Summary Report
summary_df = spark.createDataFrame(list(quality_checks.items()), ["Check", "Count"])
display(summary_df)

In [0]:
# Gate the silver write on the quality report: every check must report zero violations.
checks_failed = any(violations != 0 for violations in quality_checks.values())

if checks_failed:
    print("Data quality checks failed. Data not written to silver table.")
else:
    # All checks passed: overwrite the silver Delta table with the checked data.
    (
        df.write
        .format("delta")
        .mode("overwrite")
        .saveAsTable("tabular.dataexpert.population_data_silver")
    )