# Geopandas Shapefile Extraction

Utility notebook for loading a shapefile with **geopandas**, performing a simple inspection, and exporting data to a convenient format (e.g., CSV). This can be used as a starting point for any geospatial data preprocessing within the ETL pipeline.

In [None]:
# Install geopandas in the current environment if needed
# !pixi add --feature gis --pypi geopandas
# (Uncomment the line above and run the cell in the appropriate environment)


In [1]:
import geopandas as gpd
import matplotlib.pyplot as plt
""
# Path to your shapefile (update this path to point at the desired .shp file)
#shapefile_path = 'data/example_shapefile.shp'

# This is the path to the landiq dataset saved locally on Peter's machine
shapefile_path = '/Users/pjsmitty301/BioCirV/i15_crop_mapping_2023_provisional'

# Load the shapefile
gdf = gpd.read_file(shapefile_path)

# Quick inspection
print(gdf.head())
print(f'CRS: {gdf.crs}')


    DataStatus UniqueID DWR_REVISE SYMB_CLASS MULTIUSE CLASS1 SUBCLASS1  \
0  Provisional  4900289       None          G        S     **        **   
1  Provisional  4900338       None          G        S     **        **   
2  Provisional  4901032       None          G        S     **        **   
3  Provisional  4901050       None          G        S     **        **   
4  Provisional  4901182       None          G        D      V        **   

  SPECOND1 IRR_TYP1PA IRR_TYP1PB  ... REGION     ACRES  COUNTY    HYDRO_RGN  \
0        *          *          *  ...   NCRO  0.554073  Sonoma  North Coast   
1        *          *          *  ...   NCRO  3.381716  Sonoma  North Coast   
2        *          *          *  ...   NCRO  1.222060  Sonoma  North Coast   
3        *          *          *  ...   NCRO  1.247221  Sonoma  North Coast   
4        *          *          *  ...   NCRO  0.810619  Sonoma  North Coast   

            LIQ_REPORT MAIN_CROP MAIN_CROP_ Shape_Leng    Shape_Area  \
0 

In [16]:
gdf.dtypes

DataStatus      object
UniqueID        object
DWR_REVISE      object
SYMB_CLASS      object
MULTIUSE        object
CLASS1          object
SUBCLASS1       object
SPECOND1        object
IRR_TYP1PA      object
IRR_TYP1PB      object
PCNT1           object
CLASS2          object
SUBCLASS2       object
SPECOND2        object
IRR_TYP2PA      object
IRR_TYP2PB      object
PCNT2           object
CLASS3          object
SUBCLASS3       object
SPECOND3        object
IRR_TYP3PA      object
IRR_TYP3PB      object
PCNT3           object
CLASS4          object
SUBCLASS4       object
SPECOND4        object
IRR_TYP4PA      object
IRR_TYP4PB      object
PCNT4           object
UCF_ATT         object
YR_PLANTED       int64
SEN_CROP        object
ADOY_SEN       float64
CROPTYP1        object
CTYP1_NOTE      object
ADOY1          float64
CROPTYP2        object
CTYP2_NOTE      object
ADOY2          float64
CROPTYP3        object
CTYP3_NOTE      object
ADOY3          float64
CROPTYP4        object
CTYP4_NOTE 

In [18]:
import os
import pandas as pd
import geopandas as gpd
from prefect import task, get_run_logger
import ca_biositing.pipeline.utils.cleaning_functions.cleaning as cleaning_mod
import ca_biositing.pipeline.utils.cleaning_functions.coercion as coercion_mod
from ca_biositing.pipeline.utils.name_id_swap import normalize_dataframes

@task
def transform_landiq_record(
    gdf: gpd.GeoDataFrame,
    etl_run_id: str = None,
    lineage_group_id: int = None
) -> pd.DataFrame:
    """
    Transforms Land IQ GeoDataFrame into the LandiqRecord table format.

    Args:
        gdf: Raw GeoDataFrame from Land IQ shapefile.
        etl_run_id: ID of the current ETL run.
        lineage_group_id: ID of the lineage group.

    Returns:
        A pandas DataFrame formatted for the landiq_record table.
    """
    from ca_biositing.datamodels.schemas.generated.ca_biositing import (
        Dataset,
        Polygon,
        PrimaryAgProduct,
    )

    logger = get_run_logger()
    logger.info("Transforming Land IQ data for LandiqRecord table")

    if gdf is None or gdf.empty:
        logger.error("Input GeoDataFrame is empty or None")
        return pd.DataFrame()

    # 1. Initial Cleaning & Preparation
    # Convert GeoDataFrame to regular DataFrame to avoid issues with standard_clean
    df = pd.DataFrame(gdf.copy())

    # Set dataset name and version as requested
    df['dataset'] = 'landiq'
    df['version'] = 'land use 2023'

    # Map shapefile columns to model fields
    # MAIN_CROP is the main crop for single cropped fields
    if 'MAIN_CROP' in df.columns:
        df['main_crop'] = df['MAIN_CROP']
    if 'CLASS1' in df.columns:
        df['secondary_crop'] = df['CLASS1']
    if 'CLASS2' in df.columns:
        df['tertiary_crop'] = df['CLASS2']
    if 'CLASS3' in df.columns:
        df['quaternary_crop'] = df['CLASS3']

    # Map Confidence to confidence
    if 'CONFIDENCE' in df.columns:
        df['confidence'] = df['CONFIDENCE']

    # Load crop mapping
    # Load crop mapping
    try:
        mapping_path = os.path.join(os.path.dirname(__file__), 'crops_classification.csv')
        mapping_df = pd.read_csv(mapping_path)
        crop_map = {str(k).strip().upper(): v for k, v in zip(mapping_df['crop_code'], mapping_df['crop'])}
        logger.info(f"Loaded {len(crop_map)} crop mappings from {mapping_path}")

        # Convert crop codes to text
        for col in ['main_crop', 'secondary_crop', 'tertiary_crop', 'quaternary_crop']:
            if col in df.columns:
                # Ensure we handle potential whitespace and case sensitivity in codes
                df[col] = df[col].astype(str).str.strip().str.upper().map(crop_map).fillna(df[col])
    except Exception as e:
        # Use print as fallback if logger isn't initialized in some contexts
        msg = f"Could not load or apply crop mapping: {e}"
        try:
            logger.warning(msg)
        except:
            print(msg)

    # Map UniqueID to record_id for lineage and upsert
    if 'UniqueID' in df.columns:
        df['record_id'] = df['UniqueID']
    elif 'UNIQUEID' in df.columns:
        df['record_id'] = df['UNIQUEID']

    # Handle Irrigation status (IRR_TYP1PA/IRR_TYP2PA etc)
    if 'IRR_TYP1PA' in df.columns:
        df['irrigated'] = df['IRR_TYP1PA'].astype(str).str.lower().str.contains('irrigated')
    else:
        df['irrigated'] = False

    # 2. Standard Clean
    # We pass lowercase=False because standard_clean's to_lowercase_df implementation
    # has a bug where it tries to access .str on the DataFrame itself if columns is None.
    # 2. Standard Clean
    # We pass lowercase=False and replace_empty=False to avoid bugs in cleaning.py
    # that occur when processing DataFrames with certain column types.
    # Ensure crop columns are preserved after clean_names_df
    # We do this by explicitly passing them to clean_names_df if it supports it,
    # or re-adding them after.
    cleaned_df = cleaning_mod.clean_names_df(df)
    
    # Remove duplicate columns if any (e.g., if 'main_crop' already existed)
    # We do this BEFORE re-applying mapping to avoid 'DataFrame object has no attribute str'
    cleaned_df = cleaned_df.loc[:, ~cleaned_df.columns.duplicated()].copy()

    # Re-apply mapping to the cleaned dataframe to ensure correct values
    for col in ['main_crop', 'secondary_crop', 'tertiary_crop', 'quaternary_crop']:
        if col in cleaned_df.columns:
            cleaned_df[col] = cleaned_df[col].astype(str).str.strip().str.upper().map(crop_map).fillna(cleaned_df[col])

    # Manually lowercase string columns and handle empty strings
    # We iterate over columns and check if they are string-like to avoid AttributeError
    for i in range(len(cleaned_df.columns)):
        # Use iloc with integer index to handle potential duplicate column names
        # which can cause .loc to return a DataFrame instead of a Series
        series = cleaned_df.iloc[:, i]

        if series.dtype == "object" or pd.api.types.is_string_dtype(series):
            # Use Series-level .str accessor explicitly
            cleaned_df.iloc[:, i] = series.astype(str).str.lower().replace(r"^\s*$", None, regex=True)

    # Add lineage IDs
    if etl_run_id:
        cleaned_df['etl_run_id'] = etl_run_id
    if lineage_group_id:
        cleaned_df['lineage_group_id'] = lineage_group_id

    # 3. Coercion
    coerced_df = coercion_mod.coerce_columns(
        cleaned_df,
        float_cols=['acres'],
        int_cols=['confidence'] if 'confidence' in cleaned_df.columns else []
    )

    # 4. Normalization
    # We need to map names to IDs for related tables
    # We also normalize polygons using the geometry (WKT) as the identifier
    normalize_columns = {
        'dataset': (Dataset, 'name'),
        'main_crop': (PrimaryAgProduct, 'name'),
        'secondary_crop': (PrimaryAgProduct, 'name'),
        'tertiary_crop': (PrimaryAgProduct, 'name'),
        'quaternary_crop': (PrimaryAgProduct, 'name'),
        'geometry': (Polygon, 'geom'),
    }

    # Ensure geometry is in WKT format for normalization if it's a GeoSeries
    if 'geometry' in coerced_df.columns and hasattr(coerced_df['geometry'], 'to_wkt'):
        coerced_df['geometry'] = coerced_df['geometry'].to_wkt()

    normalized_df = normalize_dataframes(coerced_df, normalize_columns)

    # 5. Table Specific Mapping
    rename_map = {
        'record_id': 'record_id',
        'acres': 'acres',
        'version': 'version',
        'etl_run_id': 'etl_run_id',
        'lineage_group_id': 'lineage_group_id',
        'irrigated': 'irrigated',
        'confidence': 'confidence'
    }

    # Add normalized ID columns
    for col in normalize_columns.keys():
        norm_col = f"{col}_id"
        if norm_col in normalized_df.columns:
            # Special case: geometry_id maps to polygon_id in LandiqRecord
            target_col = 'polygon_id' if col == 'geometry' else norm_col
            rename_map[norm_col] = target_col

    # Ensure dataset_id is included if it was normalized
    if 'dataset_id' in normalized_df.columns:
        rename_map['dataset_id'] = 'dataset_id'

    # Ensure crop columns are preserved if they were normalized
    # We map the normalized ID columns (e.g., main_crop_id) back to the
    # model field names (e.g., main_crop) expected by the database.
    for col in ['main_crop', 'secondary_crop', 'tertiary_crop', 'quaternary_crop']:
        norm_col = f"{col}_id"
        if norm_col in normalized_df.columns:
            rename_map[norm_col] = col

    available_cols = [c for c in rename_map.keys() if c in normalized_df.columns]
    final_rename = {k: v for k, v in rename_map.items() if k in available_cols}

    try:
        record_df = normalized_df[available_cols].copy().rename(columns=final_rename)

        # Ensure record_id exists for lineage tracking
        if 'record_id' in record_df.columns:
            record_df = record_df.dropna(subset=['record_id'])
        else:
            logger.warning("record_id (UniqueID) missing from Land IQ transform")

        # Add geometry for polygon handling in load step
        if 'geometry' in gdf.columns:
            record_df['geometry'] = gdf['geometry'].values

        logger.info(f"Successfully transformed {len(record_df)} Land IQ records")
        return record_df

    except Exception as e:
        logger.error(f"Error during Land IQ transform: {e}", exc_info=True)
        return pd.DataFrame()


In [19]:
transform_landiq_record(gdf)

UnboundLocalError: cannot access local variable 'crop_map' where it is not associated with a value

In [None]:
# Plot the geometry (simple quick visual)
gdf.plot(figsize=(10, 6))
plt.title('Shapefile Overview')
plt.show()


In [None]:
import os
import sys
import pandas as pd
import numpy as np
import janitor as jn
import logging
from IPython.display import display
from sqlalchemy.orm import Session
from sqlalchemy import select

# --- Basic Logging Configuration for Notebook ---
# When running in a notebook, we use Python's standard logging.
# In the production pipeline, this will be replaced by Prefect's `get_run_logger()`
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

# --- Robustly find the project root ---
# This ensures that the notebook can be run from any directory within the project.
path = os.getcwd()
project_root = None
while path != os.path.dirname(path):
    if 'pixi.toml' in os.listdir(path):
        project_root = path
        break
    path = os.path.dirname(path)

if not project_root:
    raise FileNotFoundError("Could not find project root containing 'pixi.toml'.")

# Add the project root to the Python path to allow for module imports
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    logger.info(f"Added project root '{project_root}' to sys.path")
else:
    logger.info(f"Project root '{project_root}' is already in sys.path")


In [None]:
# Use the refactored cleaning/coercion helpers from the new package
from src.ca_biositing.pipeline.ca_biositing.pipeline.utils.cleaning_functions import standard_clean, coerce_columns, coerce_columns_list

def clean_the_gsheets(df, lowercase=True, replace_empty=True):
    """Wrapper that applies the standardized cleaning pipeline implemented in `cleaning_functions`."""
    logger.info('Starting DataFrame cleaning via standard_clean.')
    if not isinstance(df, pd.DataFrame):
        logger.error('Input is not a pandas DataFrame.')
        return None
    try:
        # Run the composed standard clean (names, empty->NA, lowercase, convert_dtypes)
        df_cleaned = standard_clean(df, lowercase=lowercase, replace_empty=replace_empty)
        # Preserve behaviour: drop rows missing key columns if present
        subset = [c for c in ['resource', 'value'] if c in df_cleaned.columns]
        if subset:
            df_cleaned = df_cleaned.dropna(subset=subset)
        logger.info(f'Cleaning complete; rows remaining: {len(df_cleaned)}')
        return df_cleaned
    except Exception as e:
        logger.error(f'An error occurred during DataFrame cleaning: {e}', exc_info=True)
        return None


# --- Coercion Configuration Templates ---
# You can define column coercions in two ways: explicit keyword arguments or a dtype_map dictionary.
# For geometry: use geopandas to load shapefiles (geometry column is already properly typed).
# Only use geometry_cols if you have WKT strings to parse.

# APPROACH 1: Explicit keyword arguments (clear and direct)
COERCION_CONFIG_EXPLICIT = {
    'int_cols': ['repl_no', 'sample_no'],
    'float_cols': ['value', 'measurement'],
    'datetime_cols': ['created_at', 'updated_at'],
    'bool_cols': ['is_valid'],
    'category_cols': ['status'],
    'geometry_cols': []  # Use only if you have WKT strings; prefer geopandas for shapefiles
}

# APPROACH 2: dtype_map dictionary (compact, useful for dynamic configs)
COERCION_CONFIG_DTYPE_MAP = {
    'int': ['repl_no', 'sample_no'],
    'float': ['value', 'measurement'],
    'datetime': ['created_at', 'updated_at'],
    'bool': ['is_valid'],
    'category': ['status'],
    'geometry': []  # Use only if you have WKT strings; prefer geopandas for shapefiles
}

# APPROACH 3: GeoPandas GeoDataFrame (for shapefiles and spatial data)
# When loading shapefiles with geopandas, geometry is already a GeoSeries.
# Use geometry_format='geodataframe' to skip coercion:
GEOPANDAS_CONFIG = {
    'int_cols': ['id', 'repl_no'],
    'float_cols': ['area', 'value'],
    'geometry_cols': ['geometry'],
    'geometry_format': 'geodataframe'  # Don't convert; already properly typed
}

# Usage: coerce_the_gsheets(df, **COERCION_CONFIG_EXPLICIT)
#   or: coerce_the_gsheets(df, dtype_map=COERCION_CONFIG_DTYPE_MAP)
#   or: coerce_the_gsheets(gdf, **GEOPANDAS_CONFIG)  # for GeoDataFrames


def coerce_the_gsheets(df, dtype_map=None, int_cols=None, float_cols=None, datetime_cols=None, bool_cols=None, category_cols=None, geometry_cols=None, geometry_format='wkt'):
    """Coerce specified columns on a cleaned DataFrame using coercion helpers.
    `dtype_map` is an alternative mapping where keys are 'int','float','datetime','bool','category','geometry'.
    `geometry_format` controls geometry coercion: 'wkt' (parse WKT strings) or 'geodataframe' (skip, already typed)."""
    if not isinstance(df, pd.DataFrame):
        logger.error('coerce_the_gsheets: input is not a DataFrame')
        return df
    return coerce_columns(df, int_cols=int_cols, float_cols=float_cols, datetime_cols=datetime_cols, bool_cols=bool_cols, category_cols=category_cols, geometry_cols=geometry_cols, dtype_map=dtype_map, geometry_format=geometry_format)


In [None]:
clean_the_gsheets(gdf).dtypes

In [None]:
# Export attributes to CSV for downstream processing (optional)
output_csv = 'data/shapefile_attributes.csv'
gdf.drop(columns='geometry').to_csv(output_csv, index=False)
print(f'Attributes exported to {output_csv}')
