# Cleaning Helpers
This notebook documents reusable data-cleaning helper functions for the CA Biositing ETL. Each helper is documented with purpose, behaviour, and examples so junior engineers can adopt them safely.


In [None]:
# Standard imports used by all helpers
import logging
import pandas as pd
import numpy as np
logger = logging.getLogger()

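# Note: `clean_names_df` calls the `DataFrame.clean_names()` method provided by pyjanitor. This cell does not import it, so make sure the package is installed and `import janitor` has been run in the session before calling that helper.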


## 1) `clean_names_df` — standardize column names
Purpose: make column names predictable (snake_case, no spaces) so downstream code can reference columns reliably.
When to use: immediately after loading raw data from sheets/CSV.


In [None]:
def clean_names_df(df):
    """Standardize the column names of `df` via `janitor.clean_names()`.

    Assumes `janitor` has already been imported in the session so that the
    `clean_names` method exists on DataFrames; this function does not import
    it itself.

    Args:
        df: the DataFrame whose column names should be cleaned.

    Returns:
        Whatever `df.clean_names()` returns when `df` is a DataFrame.
        Any other input is logged as an error and handed back unchanged.

    Example:
        df2 = clean_names_df(df)
    """
    if isinstance(df, pd.DataFrame):
        # All of the renaming work is delegated to janitor.
        return df.clean_names()
    logger.error('clean_names_df: input is not a DataFrame')
    return df


## 2) `replace_empty_with_na` — convert empty / whitespace-only strings to NaN
Purpose: make empty strings behave like missing values so pandas functions (dropna, isna) work as expected.
Parameters:
- `columns`: list of columns to target; `None` means all columns.
- `regex`: regex used to match empty/whitespace strings (default `'^\s*$'`).


In [None]:
def replace_empty_with_na(df, columns=None, regex=r'^\s*$'):
    """Turn cells matching `regex` (by default empty/whitespace-only) into `np.nan`.

    Args:
        df: the DataFrame to process; it is not modified.
        columns: iterable of column names to restrict the replacement to.
            `None` applies the replacement to every column. Names that are
            not present in `df` are ignored.
        regex: pattern handed to `DataFrame.replace(..., regex=True)`.

    Returns:
        A new DataFrame. Non-DataFrame input is logged as an error and
        returned unchanged. If `columns` is given but none of the names
        exist, a warning is logged and an unmodified copy is returned.
    """
    if not isinstance(df, pd.DataFrame):
        logger.error('replace_empty_with_na: input is not a DataFrame')
        return df

    if columns is None:
        # Whole-frame replacement; `replace` itself returns a new object.
        return df.replace(regex, np.nan, regex=True)

    result = df.copy()
    targets = [name for name in columns if name in result.columns]
    if targets:
        result[targets] = result[targets].replace(regex, np.nan, regex=True)
    else:
        logger.warning('replace_empty_with_na: no matching columns found; returning original DataFrame')
    return result


## 3) `to_lowercase_df` — normalize string values to lowercase
Purpose: reduce variations in human-provided text (e.g., 'Corn', 'corn', 'CORN') to a single canonical form.
Notes: preserves missing values (NaN) and only acts on string-like columns. You can pass a subset of columns to limit changes.


In [None]:
def to_lowercase_df(df, columns=None):
    """Return a copy of `df` with the selected columns lowercased.

    Args:
        df: the DataFrame to process; it is not modified.
        columns: iterable of column names to lowercase. Names missing from
            `df` are ignored. `None` selects every column that
            `select_dtypes(include=['object', 'string'])` reports.

    Returns:
        A new DataFrame. Non-DataFrame input is logged as an error and
        returned unchanged.
    """
    if not isinstance(df, pd.DataFrame):
        logger.error('to_lowercase_df: input is not a DataFrame')
        return df

    result = df.copy()
    if columns is not None:
        targets = [name for name in columns if name in result.columns]
    else:
        targets = result.select_dtypes(include=['object', 'string']).columns

    for name in targets:
        source = result[name]
        # Cast to the pandas string dtype so the vectorized `.str` accessor is available.
        lowered = source.astype('string').str.lower()
        # Wherever the source cell was missing, put the original missing value back.
        result[name] = lowered.where(source.notna(), source)
    return result


## 4) `standard_clean` — composed pipeline for common cleaning steps
Purpose: provide a one-line convenience wrapper that composes name-cleaning, empty->NA replacement, lowercase conversion, and dtype inference.
When to use: for quick interactive cleaning or as a default pre-processing step in flows. For strict per-column coercions, use the coercion helpers below.


In [None]:
def standard_clean(df, lowercase=True, replace_empty=True):
    """Apply the default cleaning pipeline to `df` and return the result.

    Pipeline, in order:
    1. `clean_names_df` (always)
    2. `replace_empty_with_na` (only when `replace_empty` is true)
    3. `to_lowercase_df` (only when `lowercase` is true)
    4. `DataFrame.convert_dtypes()` (always) so pandas picks nullable dtypes

    Args:
        df: the raw DataFrame.
        lowercase: whether to lowercase string columns.
        replace_empty: whether to turn empty/whitespace-only strings into NA.

    Returns:
        The cleaned DataFrame, or `None` (after logging an error) when `df`
        is not a DataFrame. Note that this differs from the individual
        helpers, which return the original value in that case.
    """
    if not isinstance(df, pd.DataFrame):
        logger.error('standard_clean: input is not a DataFrame')
        return None

    cleaned = clean_names_df(df)
    cleaned = replace_empty_with_na(cleaned) if replace_empty else cleaned
    cleaned = to_lowercase_df(cleaned) if lowercase else cleaned
    # Let pandas infer nullable integer / boolean / string dtypes.
    return cleaned.convert_dtypes()


## 5) Coercion helpers — targeted, explicit type conversions
Rationale: `standard_clean` calls `convert_dtypes()` which is useful, but for production ETL we need deterministic per-column coercion (Int64 nullable ints, float32 for numeric, datetime parsing, geometry parsing, etc.).
This section provides helpers for common coercion targets and a single `coerce_columns` wrapper.


In [None]:
def _coerce_int(df, cols):
    """Coerce listed columns to pandas nullable `Int64`. Non-parsable values become <NA>.
    """
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce').astype('Int64')
    return df

def _coerce_float(df, cols, float_dtype=np.float32):
    """Coerce columns to float (default `np.float32`) with errors coerced to NaN.
    """
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors='coerce').astype(float_dtype)
    return df

def _coerce_datetime(df, cols, **kwargs):
    """Parse datetime-like columns using `pd.to_datetime`. Extra kwargs (e.g., `dayfirst`) are passed through.
    """
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_datetime(df[c], errors='coerce', **kwargs)
    return df

def _coerce_bool(df, cols):
    """Coerce common boolean encodings to pandas nullable boolean dtype.
    Accepts True/False, 'true'/'false', '1'/'0'. Unknowns become <NA>.
    """
    mapping = {True: True, False: False, 'true': True, 'false': False, '1': True, '0': False}
    for c in cols:
        if c in df.columns:
            df[c] = df[c].map(mapping).astype('boolean')
    return df

def _coerce_category(df, cols):
    for c in cols:
        if c in df.columns:
            df[c] = df[c].astype('category')
    return df

def _coerce_geometry(df, cols, geom_format='wkt'):
    """Attempt to convert WKT strings to shapely geometry objects. If `shapely` is not installed, logs a warning and skips geometry coercion.
    Supported `geom_format`: currently only 'wkt'.
    """
    try:
        from shapely import wkt
    except Exception:
        logger.warning('shapely not available; geometry coercion skipped')
        return df
    for c in cols:
        if c in df.columns:
            df[c] = df[c].apply(lambda v: wkt.loads(v) if isinstance(v, str) and v.strip() else None)
    return df

def coerce_columns(df,
                   int_cols=None,
                   float_cols=None,
                   datetime_cols=None,
                   bool_cols=None,
                   category_cols=None,
                   geometry_cols=None,
                   dtype_map=None,
                   float_dtype=np.float32,
                   geometry_format='wkt'):
    """Top-level wrapper to coerce groups of columns.

    Args:
        df: the DataFrame to coerce; it is copied, never modified.
        int_cols, float_cols, datetime_cols, bool_cols, category_cols,
        geometry_cols: column names for each target type. Each may be a
            list of names or a single name given as a string. Names that
            are not present in `df` are ignored by the helpers.
        dtype_map: alternative mapping where keys are 'int' (or 'integer'),
            'float', 'datetime' (or 'date'), 'bool', 'category', 'geometry'
            and values are lists of column names. Explicit keyword lists
            override `dtype_map` entries; this includes an explicit empty
            list, which means "no columns of this type".
        float_dtype: dtype used for `float_cols` (default `np.float32`).
        geometry_format: encoding of `geometry_cols`. 'wkt' (default)
            parses WKT strings; 'geodataframe' means the columns already
            hold geometry objects, so geometry coercion is skipped.

    Returns:
        A new DataFrame with coerced columns where possible. Non-DataFrame
        input is logged as an error and returned unchanged.
    """
    if not isinstance(df, pd.DataFrame):
        logger.error('coerce_columns: input is not a DataFrame')
        return df
    df = df.copy()
    lookup = dtype_map or {}

    def _resolve(explicit, *keys):
        # Only fall back to dtype_map when the keyword was not supplied at all.
        chosen = explicit
        if chosen is None:
            chosen = next((lookup[k] for k in keys if lookup.get(k)), None)
        # A bare string is one column name, not an iterable of characters.
        return [chosen] if isinstance(chosen, str) else chosen

    int_cols = _resolve(int_cols, 'int', 'integer')
    float_cols = _resolve(float_cols, 'float')
    datetime_cols = _resolve(datetime_cols, 'datetime', 'date')
    bool_cols = _resolve(bool_cols, 'bool')
    category_cols = _resolve(category_cols, 'category')
    geometry_cols = _resolve(geometry_cols, 'geometry')

    if int_cols:
        df = _coerce_int(df, int_cols)
    if float_cols:
        df = _coerce_float(df, float_cols, float_dtype)
    if datetime_cols:
        df = _coerce_datetime(df, datetime_cols)
    if bool_cols:
        df = _coerce_bool(df, bool_cols)
    if category_cols:
        df = _coerce_category(df, category_cols)
    # Already-typed geometry columns must not be run through the WKT parser.
    if geometry_cols and str(geometry_format).lower() != 'geodataframe':
        df = _coerce_geometry(df, geometry_cols, geom_format=geometry_format)

    return df

def coerce_columns_list(dfs, **coerce_kwargs):
    """Apply `coerce_columns` to a list of dataframes and return the results in the same order.

    Args:
        dfs: iterable of DataFrames; `None` is treated as an empty iterable.
        **coerce_kwargs: forwarded unchanged to `coerce_columns` for every item.

    Returns:
        A list with one entry per input item. Non-DataFrame items are
        preserved as-is (with a warning).
    """
    if dfs is None:
        return []
    out = []
    for i, df in enumerate(dfs):
        if isinstance(df, pd.DataFrame):
            # `coerce_columns` works on its own copy of the frame, so the
            # input does not need to be copied again here.
            out.append(coerce_columns(df, **coerce_kwargs))
        else:
            logger.warning('Item %d is not a DataFrame; skipping', i)
            out.append(df)
    return out


## 6) Usage examples
These snippets demonstrate typical workflows; run them interactively with sample DataFrames.


In [None]:
# Example: a raw dataframe from Google Sheets (simulate)
# The sample values deliberately include whitespace-only and empty strings,
# mixed-case text, a literal 'NaN' string, a None and an unparsable date so
# that each cleaning step has something to act on.
raw = pd.DataFrame({
    'Sample No': ['1', '2', '  ', '4'],
    'Value': ['10.0', 'NaN', '', '7.5'],
    'Resource': ['Corn', 'Wheat', 'corn', None],
    'Created At': ['2020-01-01', '2020-02-01', '', 'not-a-date']
})

# 1) Standard clean: names, empty->NA, lowercase, dtype inference
# The explicit coercion in step 2 refers to the columns as 'sample_no',
# 'value' and 'created_at', i.e. it expects name cleaning to have produced
# those snake_case names.
cleaned = standard_clean(raw)
print(cleaned.dtypes)
# NOTE: `display` is not defined in this file; presumably the IPython/Jupyter
# builtin, so this cell is meant to run inside a notebook.
display(cleaned)

# 2) Explicit coercion
# Pins each column to a specific type (Int64 / float32 / datetime) instead of
# relying on what `convert_dtypes()` inferred; values that cannot be
# converted are coerced to missing by the helpers.
coerced = coerce_columns(cleaned,
                         int_cols=['sample_no'],
                         float_cols=['value'],
                         datetime_cols=['created_at'])
print(coerced.dtypes)
display(coerced)


## Notes & Next steps
- Geometry coercion requires `shapely`. To add it to the pipeline environment, use Pixi (see `pixi.toml`). Example (local):

```bash
pixi add --feature pipeline --pypi shapely
pixi install
```

- For production use we should move these helpers into `src/ca_biositing/pipeline/.../etl_utils.py` and decorate Prefect tasks where appropriate.


## 7) GeoPandas Integration for Shapefiles

For geospatial work, **geopandas** is the preferred approach to load and work with shapefiles and other vector data formats. Geopandas extends pandas DataFrames with a `geometry` column containing shapely geometry objects.

### Loading Shapefiles with GeoPandas

Instead of parsing WKT strings, you can load shapefiles directly:

```python
import geopandas as gpd

# Load a shapefile (creates a GeoDataFrame with geometry column)
gdf = gpd.read_file('path/to/shapefile.shp')

# The geometry column is already properly typed
print(gdf.geometry.dtype)  # geometry
print(type(gdf.geometry[0]))  # shapely.geometry object

# You can then merge this with your cleaned data
# or use spatial operations (intersects, buffer, distance, etc.)
```

### Coercing Geometry from GeoDataFrames

When you load a GeoDataFrame, the geometry column is already properly typed and needs no coercion. Leave it out of `geometry_cols` — that option is for parsing WKT strings, not for columns that already hold geometry objects:

```python
# After loading: gdf = gpd.read_file('shapefile.shp')
gdf_clean = standard_clean(gdf)  # clean as normal

# Coerce the other columns; the geometry column is left untouched (already correct type)
gdf_coerced = coerce_columns(
    gdf_clean,
    int_cols=['id'],
    float_cols=['area'],
)
```

### Mixed Workflows: GeoDataFrames + Tabular Data

A common pattern for bioeconomy site selection:

```python
# 1. Load tabular data (e.g. a CSV export of the Google Sheet)
df = pd.read_csv('sites.csv')  # columns: site_id, site_name, resource, value, lon, lat, etc.
df_clean = standard_clean(df)
df_coerced = coerce_columns(df_clean, int_cols=['site_id'], float_cols=['value'])

# 2. Load spatial data (shapefiles)
gdf = gpd.read_file('parcel_boundaries.shp')  # geometry column with polygons

# 3. Merge on spatial relationship (e.g., which parcels contain which sites)
# Turn the sites into points while keeping their tabular columns, then use sjoin for the spatial join
sites = gpd.GeoDataFrame(
    df_coerced,
    geometry=gpd.points_from_xy(df_coerced['lon'], df_coerced['lat']),
    crs=gdf.crs,  # assumes lon/lat use the same CRS as the parcels; reproject with .to_crs() otherwise
)
merged = gpd.sjoin(gdf, sites)

# 4. Now you have a GeoDataFrame with tabular + spatial info
print(merged.head())
```

### Setup: Adding GeoPandas to Your Environment

GeoPandas is included in the `vector` feature. Enable it:

```bash
pixi install --feature vector
# or for all GIS tools
pixi install --feature gis
```

This provides: `geopandas`, `shapely`, `pyproj`, and other geospatial libraries needed for shapefile work.
