In [2]:
# setup
!pip install geopandas rasterio earthengine-api numpy pandas matplotlib scikit-learn scipy

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import ee
from datetime import datetime
import urllib.request
import zipfile
from scipy import stats
from shapely.geometry import Point, box

# authenticate and initialize Earth Engine
ee.Authenticate()
ee.Initialize(project='colonia-detection')

# load the sample data from previous notebook
zip_url = "https://github.com/wvg1/colonia-detection/archive/refs/heads/main.zip"
urllib.request.urlretrieve(zip_url, 'repo.zip')

with zipfile.ZipFile('repo.zip', 'r') as zip_ref:
    zip_ref.extractall()

shapefile_path = 'colonia-detection-main/data/raw/colonias_shapefile/NewMexicoCandidateColoniaBlocks.shp'
colonias_gdf = gpd.read_file(shapefile_path)
colonias_gdf = colonias_gdf.to_crs('EPSG:4326')

# group by colonia
colonias_grouped = colonias_gdf.groupby('Colonia').apply(
    lambda x: gpd.GeoSeries(x.geometry).union_all()
).reset_index()

colonias_grouped.columns = ['Colonia', 'geometry']
colonias_grouped = gpd.GeoDataFrame(colonias_grouped, crs='EPSG:4326')
colonias_grouped['type'] = 'colonia'

print(f"Colonias loaded: {len(colonias_grouped)}")

Colonias loaded: 80


  colonias_grouped = colonias_gdf.groupby('Colonia').apply(


In [4]:
# calculate spectral indices for colonias
def extract_indices(geometry):
    """extract mean spectral indices for a geometry"""

    sentinel2 = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED') \
        .filterBounds(geometry) \
        .filterDate('2024-01-01', '2024-12-31') \
        .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 20)) \
        .sort('CLOUD_COVERAGE_ASSESSMENT') \
        .first()

    nir = sentinel2.select('B8')
    red = sentinel2.select('B4')
    swir = sentinel2.select('B11')

    ndvi = nir.subtract(red).divide(nir.add(red))
    ndbi = swir.subtract(nir).divide(swir.add(nir))
    ndmi = nir.subtract(swir).divide(nir.add(swir))

    features = {
        'ndvi': ndvi.reduceRegion(ee.Reducer.mean(), geometry, 30).getInfo(),
        'ndbi': ndbi.reduceRegion(ee.Reducer.mean(), geometry, 30).getInfo(),
        'ndmi': ndmi.reduceRegion(ee.Reducer.mean(), geometry, 30).getInfo(),
    }

    return features

print("extracting features for colonias...")
colonia_features = []

for idx, row in colonias_grouped.iterrows():
    try:
        # convert shapely geometry to ee.Geometry, handles both Polygon and MultiPolygon
        if row.geometry.geom_type == 'Polygon':
            coords = list(row.geometry.exterior.coords)
            geom_ee = ee.Geometry.Polygon(coords)
        elif row.geometry.geom_type == 'MultiPolygon':
            # for MultiPolygon, use the first polygon or create a MultiPolygon
            coords = list(row.geometry.geoms[0].exterior.coords)
            geom_ee = ee.Geometry.Polygon(coords)
        else:
            # fallback: use the geo_interface but wrap coordinates properly
            geo_dict = row.geometry.__geo_interface__
            geom_ee = ee.Geometry(geo_dict)

        features = extract_indices(geom_ee)

        colonia_features.append({
            'Colonia': row['Colonia'],
            'type': 'colonia',
            'ndvi': list(features['ndvi'].values())[0] if features['ndvi'] else np.nan,
            'ndbi': list(features['ndbi'].values())[0] if features['ndbi'] else np.nan,
            'ndmi': list(features['ndmi'].values())[0] if features['ndmi'] else np.nan,
        })
        print(f"  {row['Colonia']}: done")
    except Exception as e:
        print(f"  {row['Colonia']}: error - {e}")

colonia_df = pd.DataFrame(colonia_features)
print(f"\nextracted features for {len(colonia_df)} colonias")
print(colonia_df.head())

extracting features for colonias...
  Alma: done
  Anthony: done
  Arenas Valley: done
  Bear Mountain: done
  Beaverhead: done
  Bent: done
  Berino: done
  Brazito: done
  Buckhorn: done
  Carlisle: done
  Catfish Farms: done
  Cattleland: done
  Chamberino: done
  Chaparral: done
  City of Sunland Park: done
  Cottage San: done
  Cotton: done
  Del Cerro: done
  Del Sol: done
  Dog Canyon: done
  Dona Ana: done
  Dungan: done
  El Milagro: done
  Fairacres: done
  Faywood: done
  Ft. Selden: done
  Garfield: done
  Gila Hot Springs: done
  Glen Acres: done
  Glenwood: done
  Hachita: done
  High Rolls: done
  Hill: done
  Joy Drive Subd.: done
  Keeler Farm Road: done
  La Mesa: done
  La Union: done
  Lake Roberts: done
  Las Palmeras: done
  Leasburg: done
  Luna: done
  Mayhill: done
  Mesquite: done
  Mimbres: done
  Mockingbird Hill: done
  Mogollon: done
  Montana Vista: done
  Moongate: done
  Mountain View: done
  Mule Creek: done
  Nogal: done
  Old Picacho: done
  Organ: d

In [8]:
# reload sample dataset
bounds = colonias_grouped.total_bounds

np.random.seed(42)
num_controls = len(colonias_grouped) * 2

random_lons = np.random.uniform(bounds[0], bounds[2], num_controls)
random_lats = np.random.uniform(bounds[1], bounds[3], num_controls)

control_points = gpd.GeoDataFrame(
    {'type': ['control'] * num_controls},
    geometry=[Point(lon, lat) for lon, lat in zip(random_lons, random_lats)],
    crs='EPSG:4326'
)

# remove controls inside colonias
control_points = gpd.sjoin(
    control_points,
    colonias_grouped[['geometry']],
    how='left',
    predicate='within'
)
control_points = control_points[control_points.index_right.isna()]
control_points = control_points[['type', 'geometry']].head(num_controls)

# combine colonias and controls
colonias_grouped['type'] = 'colonia'
sample_data = pd.concat([
    colonias_grouped[['type', 'geometry']],
    control_points
], ignore_index=True)

print(f"Sample dataset:")
print(f"Colonias: {(sample_data['type'] == 'colonia').sum()}")
print(f"Controls: {(sample_data['type'] == 'control').sum()}")

Sample dataset:
Colonias: 80
Controls: 152


In [9]:
# extract spectral indices for both colonias and control points
print("extracting features for colonias and controls...")
all_features = []

for idx, row in sample_data.iterrows():
    try:
        # convert shapely geometry to ee.Geometry
        if row.geometry.geom_type == 'Polygon':
            coords = list(row.geometry.exterior.coords)
            geom_ee = ee.Geometry.Polygon(coords)
        elif row.geometry.geom_type == 'MultiPolygon':
            coords = list(row.geometry.geoms[0].exterior.coords)
            geom_ee = ee.Geometry.Polygon(coords)
        elif row.geometry.geom_type == 'Point':
            # for point geometries, create a small buffer
            geom_ee = ee.Geometry.Point([row.geometry.x, row.geometry.y]).buffer(100)
        else:
            geo_dict = row.geometry.__geo_interface__
            geom_ee = ee.Geometry(geo_dict)

        features = extract_indices(geom_ee)

        all_features.append({
            'id': idx,
            'type': row['type'],
            'ndvi': list(features['ndvi'].values())[0] if features['ndvi'] else np.nan,
            'ndbi': list(features['ndbi'].values())[0] if features['ndbi'] else np.nan,
            'ndmi': list(features['ndmi'].values())[0] if features['ndmi'] else np.nan,
        })

        if (idx + 1) % 20 == 0:
            print(f"  processed {idx + 1} / {len(sample_data)} samples")

    except Exception as e:
        print(f"  id {idx} ({row['type']}): error - {e}")

sample_df = pd.DataFrame(all_features)
print(f"\nextracted features for {len(sample_df)} samples")
print(f"Colonias: {(sample_df['type'] == 'colonia').sum()}")
print(f"Controls: {(sample_df['type'] == 'control').sum()}")
print("\nSummary statistics by type:")
print(sample_df.groupby('type')[['ndvi', 'ndbi', 'ndmi']].describe())

extracting features for colonias and controls...
  processed 20 / 232 samples
  processed 40 / 232 samples
  processed 60 / 232 samples
  processed 80 / 232 samples
  processed 100 / 232 samples
  processed 120 / 232 samples
  processed 140 / 232 samples
  processed 160 / 232 samples
  processed 180 / 232 samples
  processed 200 / 232 samples
  processed 220 / 232 samples

extracted features for 232 samples
Colonias: 80
Controls: 152

Summary statistics by type:
          ndvi                                                             \
         count     mean       std       min       25%       50%       75%   
type                                                                        
colonia   80.0  0.23754  0.114562  0.026659  0.129757  0.224020  0.309354   
control  152.0  0.19190  0.112631  0.079401  0.122493  0.146609  0.218853   

                    ndbi            ...                       ndmi            \
              max  count      mean  ...       75%       max  count 