In [2]:
# Standard imports used by geospatial helpers
import logging
import pandas as pd
import numpy as np

# Called without a name, so this is the root logger rather than a module-specific one.
logger = logging.getLogger()

## 1) `detect_latlon_columns()` — Auto-detect latitude/longitude columns

**Purpose:** Intelligently find lat/lon columns by name pattern matching.

**Detects:**
- Standard names: `latitude`/`longitude`, `lat`/`lon`, `desc_lat`/`desc_lon`
- Prefixed names: `sampling_lat`/`sampling_lon`, `prod_lat`/`prod_lon`, etc.
- Combined columns: `latlong`, `lat_lon`, `latitude_longitude`, `latlng`, `location`, `coordinates`

**Returns:** Dict with keys `latitude`, `longitude`, and `combined` (each a list of matching columns)

In [7]:
# Demo: run column auto-detection against a small hand-built frame.
# NOTE(review): the saved output of this cell is
# `ModuleNotFoundError: No module named 'src'`, so the `src.`-prefixed import
# presumably needs the repository root on sys.path — confirm before re-running.
from src.ca_biositing.pipeline.ca_biositing.pipeline.utils.cleaning_functions import detect_latlon_columns

# A separate lat/lon pair, one combined "lat,lon" string column, and two unrelated columns.
sample_df = pd.DataFrame(
    dict(
        site_id=[1, 2, 3],
        sampling_lat=[40.7128, 34.0522, 37.7749],
        sampling_lon=[-74.0060, -118.2437, -122.4194],
        prod_location=['40.5,-74.0', '34.2,-118.5', '37.9,-122.4'],
        name=['Site A', 'Site B', 'Site C'],
    )
)

detected = detect_latlon_columns(sample_df)

# Report each group of matches under its own label; the lookup key is the lower-cased label.
print("Detected columns:")
for _label in ("Latitude", "Longitude", "Combined"):
    print(f"  {_label}: {detected[_label.lower()]}")

ModuleNotFoundError: No module named 'src'

## 2) `split_combined_latlon()` — Parse combined lat/lon columns

**Purpose:** Split a single column containing both latitude and longitude into two separate columns.

**Features:**
- Handles multiple delimiters: comma, semicolon, pipe, tab, space
- Auto-detects delimiter if not specified
- Graceful error handling (parsing failures → NaN)
- Optionally keeps original column for verification

**Example formats:**
- `"40.7128, -74.0060"` (comma + space)
- `"40.7128,-74.0060"` (comma only)
- `"40.7128 -74.0060"` (space)
- `"40.7128;-74.0060"` (semicolon)
- `"40.7128|-74.0060"` (pipe)

In [None]:
from src.ca_biositing.pipeline.ca_biositing.pipeline.utils.cleaning_functions import split_combined_latlon

# One combined lat/lon string per row, each row using a different separator.
combined_df = pd.DataFrame(
    dict(
        site_id=[1, 2, 3, 4],
        prod_location=[
            '40.7128, -74.0060',  # comma followed by a space
            '34.0522,-118.2437',  # bare comma
            '37.7749 -122.4194',  # single space
            '41.8781;-87.6298',   # semicolon
        ],
        name=['NYC', 'LA', 'SF', 'Chicago'],
    )
)

print("Before split:", combined_df, sep="\n")

# No delimiter argument is supplied, and the original column is not requested back.
split_df = split_combined_latlon(
    combined_df, col='prod_location', lat_col='prod_lat', lon_col='prod_lon', keep_original=False
)

print("\nAfter split:", split_df, sep="\n")
print("\nData types:")
print(split_df.dtypes)

## 3) `standardize_latlon()` — Full workflow: detect, split, rename, coerce

**Purpose:** One-step function to standardize all lat/lon columns in a DataFrame.

**Workflow:**
1. Auto-detect lat/lon columns by name pattern (optional)
2. Split any combined lat/lon columns
3. Rename detected separate columns to standard names (`desc_lat`, `desc_lon`)
4. Coerce to float64 with error handling

**Output:** DataFrame with standardized `desc_lat` and `desc_lon` columns

In [None]:
from src.ca_biositing.pipeline.ca_biositing.pipeline.utils.cleaning_functions import standardize_latlon

# Mixed-quality input: floats and numeric strings share a column, and some cells are None or empty.
messy_df = pd.DataFrame(
    dict(
        site_id=[1, 2, 3, 4, 5],
        sampling_lat=[40.7128, 34.0522, 37.7749, '41.8781', None],  # floats, one string, one None
        sampling_lon=[-74.0060, -118.2437, -122.4194, '-87.6298', -120.0],
        prod_location=['40.5,-74.0', '34.2,-118.5', None, '', '37.5,-122.5'],  # includes None and ''
        name=['A', 'B', 'C', 'D', 'E'],
    )
)

print("Before standardization:", messy_df, sep="\n")
print(f"\nColumns: {messy_df.columns.tolist()}")

# Single call with auto-detection switched on; both output column names are spelled out.
clean_df = standardize_latlon(
    messy_df, auto_detect=True, output_lat='desc_lat', output_lon='desc_lon', coerce_to_float=True
)

print("\nAfter standardization:")
print(clean_df[['site_id', 'desc_lat', 'desc_lon', 'name']])
print(f"\nData types: {clean_df[['desc_lat', 'desc_lon']].dtypes.to_dict()}")

## 4) Common Naming Patterns Detected

The auto-detection patterns (case-insensitive) include:

### Latitude Patterns
- `latitude` (exact)
- `lat` (exact)
- `desc_lat` (exact)
- `*_lat` (ends with `_lat`): e.g., `sampling_lat`, `prod_lat`, `site_lat`
- `lat_*` (starts with `lat_`): e.g., `lat_decimal`, `lat_dms`

### Longitude Patterns
- `longitude` (exact)
- `lon` (exact)
- `desc_lon` (exact)
- `*_lon` (ends with `_lon`): e.g., `sampling_lon`, `prod_lon`, `site_lon`
- `lon_*` (starts with `lon_`): e.g., `lon_decimal`, `lon_dms`

### Combined Patterns
- `*latlong*`: e.g., `latlong`, `location_latlong`
- `*lat_lon*`: e.g., `lat_lon`, `sampling_lat_lon`
- `*latitude_longitude*`
- `*latlng*`: e.g., `latlng`
- `*location*`: e.g., `location`, `geo_location`
- `*coordinates*`: e.g., `coordinates`, `geo_coordinates`

## 5) Advanced: Explicit Column Specification

If you prefer explicit control (e.g., handling multiple lat/lon pairs), you can specify columns:

In [1]:
# NOTE(review): the saved output of this cell is
# `ModuleNotFoundError: No module named 'src'`, so the `src.`-prefixed imports
# presumably need the repository root on sys.path — confirm before re-running.
# The cell also uses `pd` without importing it, so the imports cell must run first.
from src.ca_biositing.pipeline.ca_biositing.pipeline.utils.cleaning_functions import split_combined_latlon
from src.ca_biositing.pipeline.ca_biositing.pipeline.utils.cleaning_functions import standardize_latlon

# Two hand-picked coordinate sources: a separate lat/lon pair and one combined string column.
df = pd.DataFrame(
    dict(
        sampling_lat=[40.7, 34.0],
        sampling_lon=[-74.0, -118.2],
        facility_location=['40.5,-74.1', '34.1,-118.3'],
    )
)

# Auto-detection is off: the separate pair and the combined column are all named by hand.
result = standardize_latlon(
    df,
    lat_cols=['sampling_lat'], lon_cols=['sampling_lon'],
    combined_cols=['facility_location'],
    auto_detect=False,
    output_lat='desc_lat', output_lon='desc_lon',
)

print(result)

ModuleNotFoundError: No module named 'src'

## 6) Error Handling & Data Quality

All functions handle missing/invalid data gracefully:

- **Missing values:** NaN in input → NaN in output
- **Parsing failures:** Invalid format → NaN (logged as debug/warning)
- **Type coercion:** Non-numeric strings → NaN (logged as warning)
- **Validation:** No range checking yet (planned for future)

In [None]:
# Demo: feed malformed combined values through standardize_latlon and count the nulls.
messy = pd.DataFrame(
    dict(
        location=[
            '40.7,-74.0',          # well-formed pair
            'not,numbers',         # two fields, neither numeric
            None,                  # missing value
            '40.7',                # only one field
            '',                    # empty string
            '40.7, -74.0, extra',  # three fields; presumably the first two are used — confirm
        ]
    )
)

result = standardize_latlon(messy, auto_detect=True)
print(result[['desc_lat', 'desc_lon']])
print(f"\nNull counts: lat={result['desc_lat'].isna().sum()}, lon={result['desc_lon'].isna().sum()}")

## 7) Future Enhancements

This geospatial module is designed to grow:

- **Validation:** Range checking (lat: -90 to 90, lon: -180 to 180)
- **Address parsing:** Extract/standardize address components
- **Projection support:** Convert between coordinate systems (WGS84, UTM, etc.)
- **Geocoding:** Look up lat/lon from addresses
- **Reverse geocoding:** Look up addresses from lat/lon
- **Distance calculations:** Compute distances between points