In [None]:
# Overview of geospatial data, coordinate systems and geometry types.  

In [2]:
from sedona.spark import *
from sedona.sql.functions import st_isvalid, st_isvalidreason, st_makevalid
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, when


# Obtain the Sedona builder's configuration object via getOrCreate(), then wrap it
# with SedonaContext.create(); the resulting `sedona` handle is what every later
# cell uses for reads and SQL.
config = SedonaContext.builder().getOrCreate()
sedona = SedonaContext.create(config)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

In [19]:
from sedona.sql.st_functions import ST_IsValid, ST_IsValidReason, ST_MakeValid
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, when


In [None]:
# Read and create basic data frames  - Pranav

# Placeholder: this only references the `sedona.read` attribute; no dataset is loaded here yet.
sedona.read

In [None]:
# Vector drivers  - Pranav

# GeoJSON
# CSV
# Shapefile

# Show others in comments

In [None]:
# Raster - Pranav

# Raster

# Show others

In [None]:
# Intro to cloud native formats - Pranav

# Demo the speed on this 
# COG from LANDSAT - STAC

In [None]:
# Intro to Iceberg - Pranav

In [None]:
# Transforming data with non-native readers - in slides

In [None]:
# Wherobots Fundamentals - Constructing Geometries - Furqaan

In [None]:
# Wherobots Fundamentals - Spatial predicates  - Furqaan

In [None]:
# Wherobots Fundamentals - Range joins - Furqaan

In [None]:
# create and manage Havasu (Iceberg) tables for vector and raster data  - Furqaan

# Data validity checks

Two of the most common issues with geospatial data include managing projections or Coordinate Reference Systems (CRS) and ensuring geometries are valid.

- A geometry is invalid if it violates spatial rules like self-intersections, unclosed rings, misaligned holes, or overlapping parts—making it topologically incorrect.
- Spatial files generally contain a Coordinate Reference System or CRS that is defined by a Spatial Reference ID or SRID. This tells us how the data is projected from the round spheroid of the earth onto a flat surface.

To fix these issues and ensure our data is valid and in the correct format, we follow three steps:

1. Check the geometries for any invalidities and, if there are any, attempt to fix them using `ST_IsValid`, `ST_IsValidReason`, and `ST_MakeValid`
2. Remove or log out any geometries that cannot be fixed
3. Standardize our geometries in a single CRS, in this case [EPSG:4326](https://epsg.io/4326) (WGS 84), a geographic coordinate reference system that expresses positions as latitude and longitude in degrees

## Validating geometries

In [None]:
# Data validity checks - Matt

## Transforming CRS

In [None]:
# Handling and transforming CRS - Matt

In [None]:
# Dataset loading - aka load all datasets to tables - Matt

# Loading datasets into WherobotsDB

In [3]:
# S3 prefix that every raw dataset path in the load cells below is appended to.
prefix = 's3://wherobots-examples/gdea-course-data/raw-data/'
# Database name (qualified as `wherobots.<database>` in later cells) that receives every table written below.
database = 'gde_bronze'

In [12]:
# Create the target database before any table is written; IF NOT EXISTS makes the cell safe to re-run.
sedona.sql(f'CREATE DATABASE IF NOT EXISTS wherobots.{database}')

DataFrame[]

In [26]:
def check_invalid_geometries(df: DataFrame, geom_col: str = "geom", reason_col: str = "why_invalid") -> int:
    """Count the rows of ``df`` whose geometry fails ``ST_IsValid`` and print the total.

    Args:
        df: DataFrame that contains the geometry column to check.
        geom_col: Name of the geometry column.
        reason_col: Name given to the ``ST_IsValidReason`` column on the
            intermediate frame used for counting. That frame is local to this
            function; ``df`` itself is returned to the caller unchanged.

    Returns:
        The number of rows for which ``ST_IsValid`` is false.
    """
    df_with_reason = df.withColumn(reason_col, ST_IsValidReason(col(geom_col)))
    # Deliberately no .cache() here: the annotated frame never leaves this
    # function, so caching it would pin a copy of the dataset in executor
    # storage with no handle left to unpersist it (and re-running the cell only
    # produced "Asked to cache already cached data" warnings).
    invalid_count = df_with_reason.filter(~ST_IsValid(col(geom_col))).count()
    print(f"✅ Checked geometries — found {invalid_count} invalid geometries.")
    return invalid_count

def fix_invalid_geometries(df: DataFrame, invalid_count: int, geom_col: str = "geom") -> DataFrame:
    """Repair invalid geometries in ``geom_col`` with ``ST_MakeValid``.

    Only rows whose geometry fails ``ST_IsValid`` are rewritten; every other
    row keeps its original geometry value.

    Args:
        df: DataFrame that contains the geometry column.
        invalid_count: Number of invalid geometries previously counted in
            ``df``. Any positive value triggers the repair, including exactly one.
        geom_col: Name of the geometry column to repair in place.

    Returns:
        A DataFrame with ``geom_col`` repaired where needed, or ``df`` itself
        when ``invalid_count`` is zero or negative.
    """
    # Guard on "nothing to fix". The earlier `> 1` check silently skipped the
    # repair when exactly one geometry was invalid.
    if invalid_count <= 0:
        print("⚡ No invalid geometries. Skipping automated fix.")
        return df

    print(f"🔧 Attempting to fix {invalid_count} invalid geometries...")
    geom = col(geom_col)
    return df.withColumn(
        geom_col,
        when(~ST_IsValid(geom), ST_MakeValid(geom)).otherwise(geom)
    )

# --- driver program ---
def process_geometries(
    df: DataFrame,
    geom_col: str = "geom",
    attempt_fix: bool = True,
    split_on_fail: bool = True
):
    """
    Validate ``geom_col``, optionally repair it, and optionally split the rows.

    Steps: count invalid geometries -> repair (when ``attempt_fix``) -> split
    into valid / invalid rows (when ``split_on_fail`` and invalid rows remain).

    Returns a dict shaped as one of:
      - {"df": frame}  when no invalid geometries exist, when the repair
        cleared all of them, or when invalid rows remain but no split was
        requested (``frame`` is then the repaired frame if a repair was
        attempted, otherwise the input frame)
      - {"valid_df": ..., "invalid_df": ...}  when invalid rows remain and
        ``split_on_fail`` is True
    """
    def _split(frame, banner):
        # Announce the split, partition rows by ST_IsValid, and report both sizes.
        print(banner)
        good_rows = frame.filter(ST_IsValid(col(geom_col)))
        bad_rows = frame.filter(~ST_IsValid(col(geom_col)))
        print(f"✅ Split complete: {good_rows.count()} valid / {bad_rows.count()} invalid.")
        return {"valid_df": good_rows, "invalid_df": bad_rows}

    # Step 1: how many geometries are invalid to begin with?
    n_invalid = check_invalid_geometries(df, geom_col=geom_col)
    if n_invalid == 0:
        print("✅ All geometries are valid.")
        return {"df": df}

    # Step 2a: caller opted out of repairing, so split or hand the input back.
    if not attempt_fix:
        if split_on_fail:
            return _split(df, "⚠️ Skipping fix — splitting into valid and invalid.")
        print("⚠️ Invalid geometries found but no fix or split requested. Returning original DataFrame.")
        return {"df": df}

    # Step 2b: repair, then re-count what is still invalid.
    repaired = fix_invalid_geometries(df, n_invalid, geom_col=geom_col)
    n_remaining = repaired.filter(~ST_IsValid(col(geom_col))).count()
    print(f"🔎 After fixing, {n_remaining} invalid geometries remain.")

    if n_remaining == 0:
        print("✅ All geometries are valid after fixing.")
        return {"df": repaired}
    if split_on_fail:
        return _split(repaired, "⚠️ Some invalid geometries remain — splitting dataset.")
    print("⚠️ Some invalid geometries remain, returning best-effort fixed DataFrame.")
    return {"df": repaired}

In [27]:
# FEMA Flood Hazard Areas
# Read the S_FLD_HAZ_AR shapefile from under the shared raw-data prefix with Sedona's shapefile reader.
fld_hazard_area = sedona.read.format('shapefile').load(f'{prefix}' + '53033C_20250330/S_FLD_HAZ_AR.shp')

                                                                                

In [40]:
# Validate the flood hazard geometries, repairing them where possible.
result = process_geometries(fld_hazard_area, geom_col="geometry", attempt_fix=True, split_on_fail=True)

if "df" in result:
    df_final = result["df"]  # all valid (either already valid or successfully repaired)
else:
    valid_df = result["valid_df"]
    invalid_df = result["invalid_df"]
    # Bind df_final on this path too, so the write cell that follows stores this
    # dataset's valid rows rather than failing or reusing a frame left over from
    # another dataset.
    df_final = valid_df
    # invalid_df holds the rows that could not be repaired (e.g., export for manual review).

25/09/30 18:43:28 WARN CacheManager: Asked to cache already cached data.


✅ Checked geometries — found 15 invalid geometries.
🔧 Attempting to fix 15 invalid geometries...


[Stage 37:>                                                         (0 + 1) / 1]

🔎 After fixing, 0 invalid geometries remain.
✅ All geometries are valid after fixing.


                                                                                

In [41]:
# Store the validated flood hazard areas as a table; createOrReplace() replaces any existing table of the same name.
df_final.writeTo(f"wherobots.{database}.fema_flood_zones_bronze").createOrReplace()

                                                                                

In [42]:
# King County Generalized Land Use Data
# Read the General_Land_Use_Final_Dataset shapefile from under the shared raw-data prefix.
gen_land_use = sedona.read.format('shapefile').load(f'{prefix}' + 'General_Land_Use_Final_Dataset/General_Land_Use_Final_Dataset.shp')

                                                                                

In [43]:
# Validate the land use geometries, repairing them where possible.
result = process_geometries(gen_land_use, geom_col="geometry", attempt_fix=True, split_on_fail=True)

if "df" in result:
    df_final = result["df"]  # all valid (either already valid or successfully repaired)
else:
    valid_df = result["valid_df"]
    invalid_df = result["invalid_df"]
    # Bind df_final on this path too, so it never keeps pointing at the frame
    # produced for a previously processed dataset.
    df_final = valid_df
    # invalid_df holds the rows that could not be repaired (e.g., export for manual review).

                                                                                

✅ Checked geometries — found 2987 invalid geometries.
🔧 Attempting to fix 2987 invalid geometries...


[Stage 46:>                                                         (0 + 1) / 1]

🔎 After fixing, 0 invalid geometries remain.
✅ All geometries are valid after fixing.


                                                                                

In [44]:
# Write df_final, not valid_df: valid_df is only bound when process_geometries
# returns a valid/invalid split, whereas the land use validation cell binds
# df_final when a single frame comes back (the path shown in its output).
df_final.writeTo(f"wherobots.{database}.gen_land_use_bronze").createOrReplace()

                                                                                

In [45]:
# King County Sheriff Patrol Districts
# Read the patrol districts shapefile from under the shared raw-data prefix.
sherrif_districts = sedona.read.format('shapefile').load(f'{prefix}' + 'King_County_Sheriff_Patrol_Districts___patrol_districts_area/King_County_Sheriff_Patrol_Districts___patrol_districts_area.shp')

                                                                                

In [46]:
# Validate the patrol district geometries, repairing them where possible.
result = process_geometries(sherrif_districts, geom_col="geometry", attempt_fix=True, split_on_fail=True)

if "df" in result:
    df_final = result["df"]  # all valid (either already valid or successfully repaired)
else:
    valid_df = result["valid_df"]
    invalid_df = result["invalid_df"]
    # Bind df_final on this path too, so the write cell that follows stores this
    # dataset's valid rows instead of the frame left over from the previous dataset.
    df_final = valid_df
    # invalid_df holds the rows that could not be repaired (e.g., export for manual review).

[Stage 51:>                                                         (0 + 1) / 1]

✅ Checked geometries — found 0 invalid geometries.
✅ All geometries are valid.


                                                                                

In [47]:
# Store the validated patrol districts as a table; createOrReplace() replaces any existing table of the same name.
df_final.writeTo(f"wherobots.{database}.sherrif_districts_bronze").createOrReplace()

                                                                                

In [48]:
# KCSO offense reports (CSV), read with the reader's default options.
# NOTE(review): no `header` option is set, so if this file starts with a header row
# it will be loaded as data and the columns will get generated names — confirm against the file.
offense_reports = sedona.read.format('csv').load(f'{prefix}' + 'KCSO_Offense_Reports__2020_to_Present_20250923.csv')

In [49]:
# Store the offense reports as loaded (no geometry validation is applied to this frame in the notebook).
offense_reports.writeTo(f"wherobots.{database}.offense_reports_bronze").createOrReplace()

                                                                                

In [50]:
# King County Bike Lanes
# Read the bicycle-mode transportation network shapefile from under the shared raw-data prefix.
bike_lanes = sedona.read.format('shapefile').load(f'{prefix}' + 'Metro_Transportation_Network_(TNET)_in_King_County_for_Bicycle_Mode___trans_network_bike_line/Metro_Transportation_Network_(TNET)_in_King_County_for_Bicycle_Mode___trans_network_bike_line.shp')

In [51]:
# Validate the bike lane geometries, repairing them where possible.
result = process_geometries(bike_lanes, geom_col="geometry", attempt_fix=True, split_on_fail=True)

if "df" in result:
    df_final = result["df"]  # all valid (either already valid or successfully repaired)
else:
    valid_df = result["valid_df"]
    invalid_df = result["invalid_df"]
    # Bind df_final on this path too, so the write cell that follows stores this
    # dataset's valid rows instead of the frame left over from the previous dataset.
    df_final = valid_df
    # invalid_df holds the rows that could not be repaired (e.g., export for manual review).

[Stage 59:>                                                         (0 + 1) / 1]

✅ Checked geometries — found 0 invalid geometries.
✅ All geometries are valid.


                                                                                

In [52]:
# Store the validated bike lanes as a table; createOrReplace() replaces any existing table of the same name.
df_final.writeTo(f"wherobots.{database}.bike_lanes_bronze").createOrReplace()

                                                                                

In [53]:
# FEMA National Risk Index
# Read the NRI census-tract shapefile from under the shared raw-data prefix.
fema_nri = sedona.read.format('shapefile').load(f'{prefix}' + 'NRI_Shapefile_CensusTracts/NRI_Shapefile_CensusTracts.shp')

In [56]:
# Validate the National Risk Index geometries, repairing them where possible.
result = process_geometries(fema_nri, geom_col="geometry", attempt_fix=True, split_on_fail=True)

if "df" in result:
    df_final = result["df"]  # all valid (either already valid or successfully repaired)
else:
    valid_df = result["valid_df"]
    invalid_df = result["invalid_df"]
    # Bind df_final on this path too, so the write cell that follows stores this
    # dataset's valid rows instead of the frame left over from the previous dataset.
    df_final = valid_df
    # invalid_df holds the rows that could not be repaired (e.g., export for manual review).

25/09/30 19:01:47 WARN CacheManager: Asked to cache already cached data.
                                                                                

✅ Checked geometries — found 82 invalid geometries.
🔧 Attempting to fix 82 invalid geometries...


[Stage 75:>                                                         (0 + 1) / 1]

🔎 After fixing, 0 invalid geometries remain.
✅ All geometries are valid after fixing.


                                                                                

In [57]:
# Store the validated National Risk Index tracts as a table; createOrReplace() replaces any existing table of the same name.
df_final.writeTo(f"wherobots.{database}.fema_nri_bronze").createOrReplace()

25/09/30 19:19:03 WARN ExecutorPodsWatchSnapshotSource: Kubernetes client has been closed.


In [1]:
# King County School Sites
# Read the school sites shapefile from under the shared raw-data prefix.
# Depends on `sedona` and `prefix` from the setup cells earlier in the notebook, so those must run first.
school_sites = sedona.read.format('shapefile').load(f'{prefix}' + 'School_Sites_in_King_County___schsite_point/School_Sites_in_King_County___schsite_point.shp')

NameError: name 'sedona' is not defined

In [2]:
# Schools Report Card

In [3]:
# Seismic Hazards

In [4]:
# Census Block Groups

In [5]:
# Census CSVs

In [6]:
# Transit Routes

In [7]:
# Transit Stops

In [8]:
# Water Bodies

In [10]:
# Wildfire Polygons

In [11]:
# Wildfire Rasters

In [12]:
# Elevation