In [0]:
from pyspark.sql.functions import min, max, col

# Load the silver-layer population table
df = spark.table("tabular.dataexpert.population_data_silver")

# The bounds below no longer use a window; the import is kept so that any
# other cell relying on `Window` keeps working.
from pyspark.sql.window import Window

# Compute the latitude/longitude bounding box of each ZIP code.
# A grouped aggregation produces exactly one row per ZIP in a single pass,
# rather than computing four window aggregates on every input row and then
# collapsing the repeated rows with dropDuplicates().
# NOTE: `min` / `max` here are the pyspark.sql.functions aggregates imported
# at the top of this cell, not the Python builtins.
df_bounds = df.groupBy("ZIP").agg(
    min(col("LATITUDE")).alias("min_lat"),
    max(col("LATITUDE")).alias("max_lat"),
    min(col("LONGITUDE")).alias("min_lon"),
    max(col("LONGITUDE")).alias("max_lon"),
)
df_bounds.show()


In [0]:
from pyspark.sql.functions import col, when, udf
import math
from pyspark.sql.types import DoubleType

# Approximate radius of Earth in kilometers
R = 6371.0


def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometers between two points.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        The haversine distance in kilometers (using the module-level radius
        ``R``), or ``None`` when any coordinate is ``None``.
    """
    # Spark hands SQL NULLs to a Python UDF as None; math.radians(None) would
    # raise TypeError and fail the whole job, so propagate the null instead.
    if lat1 is None or lon1 is None or lat2 is None or lon2 is None:
        return None

    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1], which
    # would make the square roots below raise ValueError. Plain comparisons
    # are used on purpose: this notebook rebinds `min` / `max` to the Spark
    # column functions, so the builtins are not safe to call here.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

# Expose the haversine helper to Spark as a UDF that returns doubles
haversine_udf = udf(
    lambda lat_a, lon_a, lat_b, lon_b: haversine_distance(lat_a, lon_a, lat_b, lon_b),
    DoubleType(),
)

df_area = (
    df_bounds
    # Height of the bounding box: (min_lat, min_lon) -> (max_lat, min_lon)
    .withColumn(
        "lat_distance_km",
        haversine_udf(col("min_lat"), col("min_lon"), col("max_lat"), col("min_lon")),
    )
    # Width of the bounding box: (min_lat, min_lon) -> (min_lat, max_lon)
    .withColumn(
        "lon_distance_km",
        haversine_udf(col("min_lat"), col("min_lon"), col("min_lat"), col("max_lon")),
    )
    # Approximate the area as height times width of the bounding box
    .withColumn("land_area_sq_km", col("lat_distance_km") * col("lon_distance_km"))
    # An area of exactly zero is replaced with 1; other values pass through
    .withColumn(
        "land_area_sq_km",
        when(col("land_area_sq_km") == 0, 1).otherwise(col("land_area_sq_km")),
    )
)

In [0]:
from pyspark.sql.functions import col, expr

# Attach each ZIP's approximate land area to the population rows (left join,
# so every population row is kept), then derive the population density.
# try_divide is used for the division instead of the `/` operator.
df_final = (
    df.join(df_area.select("ZIP", "land_area_sq_km"), "ZIP", "left")
      .withColumn(
          "population_density",
          expr("try_divide(TOTAL_POPULATION, land_area_sq_km)"),
      )
)


In [0]:
# Save the enriched dataset as the gold table in Delta format, using overwrite mode
(
    df_final.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("tabular.dataexpert.population_data_gold")
)