# Geopandas Shapefile Extraction

Utility notebook for loading a shapefile with **geopandas**, performing a simple inspection, and exporting data to a convenient format (e.g., CSV). This can be used as a starting point for any geospatial data preprocessing within the ETL pipeline.

In [None]:
# Install geopandas in the current environment if needed
# !pixi add --feature gis --pypi geopandas
# (Uncomment the line above and run the cell in the appropriate environment)


In [1]:
import geopandas as gpd
import matplotlib.pyplot as plt

# Path to your shapefile (update this path to point at the desired .shp file)
#shapefile_path = 'data/example_shapefile.shp'

# This is the path to the landiq dataset saved locally on Peter's machine
shapefile_path = '/Users/pjsmitty301/BioCirV/i15_crop_mapping_2023_provisional'

# Load only the first 100 features. Passing `rows` lets the reader stop after
# 100 records instead of parsing the entire shapefile and then throwing away
# everything but the head.
gdf = gpd.read_file(shapefile_path, rows=100)

# Quick inspection: first few records and the coordinate reference system
print(gdf.head())
print(f'CRS: {gdf.crs}')


    DataStatus UniqueID DWR_REVISE SYMB_CLASS MULTIUSE CLASS1 SUBCLASS1  \
0  Provisional  4900289       None          G        S     **        **   
1  Provisional  4900338       None          G        S     **        **   
2  Provisional  4901032       None          G        S     **        **   
3  Provisional  4901050       None          G        S     **        **   
4  Provisional  4901182       None          G        D      V        **   

  SPECOND1 IRR_TYP1PA IRR_TYP1PB  ... REGION     ACRES  COUNTY    HYDRO_RGN  \
0        *          *          *  ...   NCRO  0.554073  Sonoma  North Coast   
1        *          *          *  ...   NCRO  3.381716  Sonoma  North Coast   
2        *          *          *  ...   NCRO  1.222060  Sonoma  North Coast   
3        *          *          *  ...   NCRO  1.247221  Sonoma  North Coast   
4        *          *          *  ...   NCRO  0.810619  Sonoma  North Coast   

            LIQ_REPORT MAIN_CROP MAIN_CROP_ Shape_Leng    Shape_Area  \
0 

In [9]:
# Check whether the pipeline's LandIQ extract step is also returning only
# 100 records, like the 100-row sample loaded earlier in this notebook.

from ca_biositing.pipeline.etl.extract.landiq import extract

# Relative path to the LandIQ shapefile that is handed to the extract step.
DEFAULT_SHAPEFILE_PATH = "data/landiq/i15_Crop_Mapping_2023_Provisional.shp"

# Last expression in the cell, so the notebook displays whatever `extract` returns.
extract(shapefile_path=DEFAULT_SHAPEFILE_PATH)

Unnamed: 0,DataStatus,UniqueID,DWR_REVISE,SYMB_CLASS,MULTIUSE,CLASS1,SUBCLASS1,SPECOND1,IRR_TYP1PA,IRR_TYP1PB,...,REGION,ACRES,COUNTY,HYDRO_RGN,LIQ_REPORT,MAIN_CROP,MAIN_CROP_,Shape_Leng,Shape_Area,geometry
0,Provisional,4900289,,G,S,**,**,*,*,*,...,NCRO,0.554073,Sonoma,North Coast,G6 **** **** ****,G6,118.0,0.002002,2.310881e-07,"POLYGON Z ((-122.98063 38.35045 0, -122.98068 ..."
1,Provisional,4900338,,G,S,**,**,*,*,*,...,NCRO,3.381716,Sonoma,North Coast,G6 **** **** ****,G6,20.0,0.004707,1.411760e-06,"POLYGON Z ((-122.90137 38.42115 0, -122.90132 ..."
2,Provisional,4901032,,G,S,**,**,*,*,*,...,NCRO,1.222060,Sonoma,North Coast,G6 **** **** ****,G6,86.0,0.004302,5.102894e-07,"POLYGON Z ((-122.7868 38.43781 0, -122.78679 3..."
3,Provisional,4901050,,G,S,**,**,*,*,*,...,NCRO,1.247221,Sonoma,North Coast,G6 **** **** ****,G6,126.0,0.003350,5.227858e-07,"POLYGON Z ((-122.84651 38.71665 0, -122.84545 ..."
4,Provisional,4901182,,G,D,V,**,*,*,*,...,NCRO,0.810619,Sonoma,North Coast,V G6 **** ****,G6,100.0,0.002245,3.382431e-07,"POLYGON Z ((-122.92076 38.38457 0, -122.92096 ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Provisional,5500006,,T,S,**,**,*,*,*,...,SCRO,1.633212,Tuolumne,San Joaquin River,T18 **** **** ****,T18,264.0,0.003672,6.771030e-07,"POLYGON Z ((-120.35042 37.90698 0, -120.34975 ..."
96,Provisional,5500005,,P,S,**,**,*,*,*,...,SCRO,4.639009,Tuolumne,San Joaquin River,P3 **** **** ****,P3,0.0,0.008668,1.923793e-06,"POLYGON Z ((-120.41246 37.92947 0, -120.41118 ..."
97,Provisional,5500004,,I,S,**,**,*,*,*,...,NCRO,1.763105,Tuolumne,San Joaquin River,I4 **** **** ****,I4,0.0,0.003435,7.319561e-07,"POLYGON Z ((-120.41064 38.00821 0, -120.41135 ..."
98,Provisional,5500003,,P,S,**,**,*,*,*,...,SCRO,16.002246,Tuolumne,San Joaquin River,P3 **** **** ****,P3,0.0,0.013549,6.632227e-06,"POLYGON Z ((-120.43476 37.88465 0, -120.43446 ..."


In [6]:
# Show the dtype of every column in the GeoDataFrame as loaded from the shapefile.
gdf.dtypes

DataStatus      object
UniqueID        object
DWR_REVISE      object
SYMB_CLASS      object
MULTIUSE        object
CLASS1          object
SUBCLASS1       object
SPECOND1        object
IRR_TYP1PA      object
IRR_TYP1PB      object
PCNT1           object
CLASS2          object
SUBCLASS2       object
SPECOND2        object
IRR_TYP2PA      object
IRR_TYP2PB      object
PCNT2           object
CLASS3          object
SUBCLASS3       object
SPECOND3        object
IRR_TYP3PA      object
IRR_TYP3PB      object
PCNT3           object
CLASS4          object
SUBCLASS4       object
SPECOND4        object
IRR_TYP4PA      object
IRR_TYP4PB      object
PCNT4           object
UCF_ATT         object
YR_PLANTED       int64
SEN_CROP        object
ADOY_SEN       float64
CROPTYP1        object
CTYP1_NOTE      object
ADOY1          float64
CROPTYP2        object
CTYP2_NOTE      object
ADOY2          float64
CROPTYP3        object
CTYP3_NOTE      object
ADOY3          float64
CROPTYP4        object
CTYP4_NOTE 

In [4]:
from ca_biositing.pipeline.etl.transform.landiq.landiq_record import transform_landiq_record

# Run the LandIQ record transform on the sample GeoDataFrame; as the cell's last
# expression, its return value is displayed by the notebook.
# NOTE(review): the captured DEBUG output indicates a database session is opened
# during this call — confirm which database is targeted before re-running.
transform_landiq_record(gdf)

DEBUG: Starting normalization for 1 DataFrames
DEBUG: Opening database session...
DEBUG: Database session opened


Unnamed: 0,record_id,acres,version,irrigated,dataset_id,main_crop,secondary_crop,tertiary_crop,quaternary_crop,polygon_id,geometry
0,4900289,0.554073,land use 2023,False,12,1414,1420,1423,1420,222219,"POLYGON Z ((-122.98063 38.35045 0, -122.98068 ..."
1,4900338,3.381716,land use 2023,False,12,1414,1420,1423,1420,222279,"POLYGON Z ((-122.90137 38.42115 0, -122.90132 ..."
2,4901032,1.222060,land use 2023,False,12,1414,1420,1423,1420,222205,"POLYGON Z ((-122.7868 38.43781 0, -122.78679 3..."
3,4901050,1.247221,land use 2023,False,12,1414,1420,1423,1420,222252,"POLYGON Z ((-122.84651 38.71665 0, -122.84545 ..."
4,4901182,0.810619,land use 2023,False,12,1414,1413,1423,1420,222264,"POLYGON Z ((-122.92076 38.38457 0, -122.92096 ..."
...,...,...,...,...,...,...,...,...,...,...,...
95,5500006,1.633212,land use 2023,False,12,1418,1420,1426,1420,222287,"POLYGON Z ((-120.35042 37.90698 0, -120.34975 ..."
96,5500005,4.639009,land use 2023,False,12,1416,1420,1425,1420,222248,"POLYGON Z ((-120.41246 37.92947 0, -120.41118 ..."
97,5500004,1.763105,land use 2023,False,12,1419,1420,1422,1420,222268,"POLYGON Z ((-120.41064 38.00821 0, -120.41135 ..."
98,5500003,16.002247,land use 2023,False,12,1416,1420,1425,1420,222244,"POLYGON Z ((-120.43476 37.88465 0, -120.43446 ..."


In [None]:
# Quick visual sanity check: draw all loaded geometries on one 10x6 figure
ax = gdf.plot(figsize=(10, 6))
ax.set_title('Shapefile Overview')
plt.show()


In [None]:
import os
import sys
import pandas as pd
import numpy as np
import janitor as jn
import logging
from IPython.display import display
from sqlalchemy.orm import Session
from sqlalchemy import select

# --- Basic Logging Configuration for Notebook ---
# Emit INFO and above, prefixed with timestamp and level name.
# NOTE(review): basicConfig is a no-op when the root logger already has handlers
# (which some notebook kernels install) — confirm that messages actually appear.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Root logger; the helper functions defined in this notebook log through it.
logger = logging.getLogger()

In [None]:
# Use the refactored cleaning/coercion helpers from the new package
from ca_biositing.pipeline.utils.cleaning_functions import standard_clean, coerce_columns, coerce_columns_list

def clean_the_gsheets(df, lowercase=True, replace_empty=True):
    """Clean *df* with the shared ``standard_clean`` helper.

    Args:
        df: The frame to clean. Anything that is not a ``pd.DataFrame`` is
            rejected.
        lowercase: Forwarded unchanged to ``standard_clean``.
        replace_empty: Forwarded unchanged to ``standard_clean``.

    Returns:
        The cleaned frame, with rows dropped where ``resource`` or ``value``
        is missing (only for whichever of those columns exist). Returns
        ``None`` when *df* is not a DataFrame or when cleaning raises; the
        failure is logged rather than propagated.
    """
    logger.info('Starting DataFrame cleaning via standard_clean.')

    if isinstance(df, pd.DataFrame):
        try:
            result = standard_clean(df, lowercase=lowercase, replace_empty=replace_empty)
            # Only enforce non-null on the key columns this frame actually has.
            key_columns = [name for name in ('resource', 'value') if name in result.columns]
            if key_columns:
                result = result.dropna(subset=key_columns)
            logger.info(f'Cleaning complete; rows remaining: {len(result)}')
        except Exception as e:
            # Best-effort by design: record the traceback and signal failure with None.
            logger.exception(f'An error occurred during DataFrame cleaning: {e}')
            return None
        return result

    logger.error('Input is not a pandas DataFrame.')
    return None


# --- Coercion Configuration Templates ---
# Template keyed by the explicit ``*_cols`` keyword names.
COERCION_CONFIG_EXPLICIT = dict(
    int_cols=['repl_no', 'sample_no'],
    float_cols=['value', 'measurement'],
    datetime_cols=['created_at', 'updated_at'],
    bool_cols=['is_valid'],
    category_cols=['status'],
    geometry_cols=[],
)

# Same column groupings, keyed by bare dtype name instead of ``*_cols``.
COERCION_CONFIG_DTYPE_MAP = dict(
    int=['repl_no', 'sample_no'],
    float=['value', 'measurement'],
    datetime=['created_at', 'updated_at'],
    bool=['is_valid'],
    category=['status'],
    geometry=[],
)

# Template for geospatial frames: includes a geometry column and output format.
GEOPANDAS_CONFIG = dict(
    int_cols=['id', 'repl_no'],
    float_cols=['area', 'value'],
    geometry_cols=['geometry'],
    geometry_format='geodataframe',
)


def coerce_the_gsheets(df, dtype_map=None, int_cols=None, float_cols=None, datetime_cols=None, bool_cols=None, category_cols=None, geometry_cols=None, geometry_format='wkt'):
    """Forward *df* and the column groupings to ``coerce_columns``.

    Every keyword argument is passed through unchanged under the same name.
    If *df* is not a ``pd.DataFrame`` an error is logged and the input is
    returned as-is, without calling ``coerce_columns``.

    Returns:
        Whatever ``coerce_columns`` returns for a DataFrame input; otherwise
        the original *df* object.
    """
    if isinstance(df, pd.DataFrame):
        coercion_kwargs = {
            'int_cols': int_cols,
            'float_cols': float_cols,
            'datetime_cols': datetime_cols,
            'bool_cols': bool_cols,
            'category_cols': category_cols,
            'geometry_cols': geometry_cols,
            'dtype_map': dtype_map,
            'geometry_format': geometry_format,
        }
        return coerce_columns(df, **coercion_kwargs)

    logger.error('coerce_the_gsheets: input is not a DataFrame')
    return df

In [None]:
# Show the column dtypes after running the sample through the cleaning wrapper.
# NOTE: clean_the_gsheets returns None on failure, in which case `.dtypes`
# raises AttributeError here — check the log output if that happens.
clean_the_gsheets(gdf).dtypes

In [None]:
# Export attributes to CSV for downstream processing (optional)
from pathlib import Path

output_csv = 'data/shapefile_attributes.csv'
# to_csv does not create missing parent directories and fails with OSError,
# so make sure the target folder exists before writing.
Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
# Geometry is dropped so the CSV holds only the tabular attributes.
gdf.drop(columns='geometry').to_csv(output_csv, index=False)
print(f'Attributes exported to {output_csv}')
