In [12]:
# setup and data loading
!pip install geopandas rasterio earthengine-api folium

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import ee
from datetime import datetime
import folium
from shapely.geometry import Point, box
import urllib.request
import zipfile

# authenticate and initialize Earth Engine
ee.Authenticate()
ee.Initialize(project='colonia-detection')

# load colonias from NM shapefile
zip_url = "https://github.com/wvg1/colonia-detection/archive/refs/heads/main.zip"
urllib.request.urlretrieve(zip_url, 'repo.zip')

with zipfile.ZipFile('repo.zip', 'r') as zip_ref:
    zip_ref.extractall()

shapefile_path = 'colonia-detection-main/data/raw/colonias_shapefile/NewMexicoCandidateColoniaBlocks.shp'
colonias_gdf = gpd.read_file(shapefile_path)
colonias_gdf = colonias_gdf.to_crs('EPSG:4326')

print(f"Blocks loaded: {len(colonias_gdf)}")

Blocks loaded: 2369


In [14]:
# group blocks by colonia
colonias_grouped = colonias_gdf.groupby('Colonia').apply(
    lambda x: gpd.GeoSeries(x.geometry).unary_union
).reset_index()

colonias_grouped.columns = ['Colonia', 'geometry']
colonias_grouped = gpd.GeoDataFrame(colonias_grouped, crs='EPSG:4326')

print(f"Unique colonias: {len(colonias_grouped)}")
print(colonias_grouped.head())

# get study area bounds
bounds = colonias_grouped.total_bounds
print(f"Study area bounds: {bounds}")

# get study area bounds
bounds = colonias_grouped.total_bounds
print(f"Study area bounds: {bounds}")

# sample random control points (not in colonias)
np.random.seed(42)
num_controls = len(colonias_grouped) * 2

random_lons = np.random.uniform(bounds[0], bounds[2], num_controls)
random_lats = np.random.uniform(bounds[1], bounds[3], num_controls)

control_points = gpd.GeoDataFrame(
    {'type': ['control'] * num_controls},
    geometry=[Point(lon, lat) for lon, lat in zip(random_lons, random_lats)],
    crs='EPSG:4326'
)

# remove controls inside colonias
control_points = gpd.sjoin(
    control_points,
    colonias_grouped[['geometry']],
    how='left',
    predicate='within'
)
control_points = control_points[control_points.index_right.isna()]
control_points = control_points[['type', 'geometry']].head(num_controls)

# combine colonias and controls
colonias_grouped['type'] = 'colonia'
sample_data = pd.concat([
    colonias_grouped[['type', 'geometry']],
    control_points
], ignore_index=True)

print(f"\nSample dataset:")
print(f"Colonias: {(sample_data['type'] == 'colonia').sum()}")
print(f"Controls: {(sample_data['type'] == 'control').sum()}")

  lambda x: gpd.GeoSeries(x.geometry).unary_union


Unique colonias: 80
         Colonia                                           geometry
0           Alma  POLYGON ((-108.90198 33.36348, -108.90203 33.3...
1        Anthony  POLYGON ((-106.61849 32.0005, -106.6185 32.000...
2  Arenas Valley  POLYGON ((-108.19999 32.78222, -108.20014 32.7...
3  Bear Mountain  POLYGON ((-108.83893 33.00512, -108.83782 33.0...
4     Beaverhead  POLYGON ((-108.11628 33.46093, -108.11621 33.4...
Study area bounds: [-109.049169   31.783148 -104.357401   34.441029]
Study area bounds: [-109.049169   31.783148 -104.357401   34.441029]

Sample dataset:
Colonias: 80
Controls: 152


  colonias_grouped = colonias_gdf.groupby('Colonia').apply(


In [15]:
# query Sentinel-2
geometry = ee.Geometry.Rectangle([bounds[0], bounds[1], bounds[2], bounds[3]])

sentinel2 = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED') \
    .filterBounds(geometry) \
    .filterDate('2024-01-01', '2024-12-31') \
    .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 20)) \
    .sort('CLOUD_COVERAGE_ASSESSMENT') \
    .first()

image_info = sentinel2.getInfo()
props = image_info['properties']

date_ms = props['system:time_start']
date = datetime.fromtimestamp(date_ms / 1000).strftime('%Y-%m-%d')

print(f"Sentinel-2 Image: {date}")
print(f"Cloud cover: {props['CLOUDY_PIXEL_PERCENTAGE']}%")

Sentinel-2 Image: 2024-01-16
Cloud cover: 0%


In [17]:
# visualize
# create interactive map
center_lat = (bounds[1] + bounds[3]) / 2
center_lon = (bounds[0] + bounds[2]) / 2

m = folium.Map(
    location=[center_lat, center_lon],
    zoom_start=7,
    tiles='OpenStreetMap'
)

# add colonias as polygons (handle both Polygon and MultiPolygon)
for idx, row in colonias_grouped.iterrows():
    geom = row.geometry

    if geom.geom_type == 'Polygon':
        coords = [[y, x] for x, y in zip(*geom.exterior.xy)]
        folium.Polygon(
            locations=coords,
            popup=row['Colonia'],
            color='red',
            fill=True,
            fillColor='red',
            fillOpacity=0.4,
            weight=1
        ).add_to(m)
    elif geom.geom_type == 'MultiPolygon':
        for poly in geom.geoms:
            coords = [[y, x] for x, y in zip(*poly.exterior.xy)]
            folium.Polygon(
                locations=coords,
                popup=row['Colonia'],
                color='red',
                fill=True,
                fillColor='red',
                fillOpacity=0.4,
                weight=1
            ).add_to(m)

# add control points
for idx, row in control_points.iterrows():
    folium.CircleMarker(
        location=[row.geometry.y, row.geometry.x],
        radius=3,
        color='blue',
        fill=True,
        fillColor='blue',
        fillOpacity=0.6
    ).add_to(m)

# add study area boundary
folium.Rectangle(
    bounds=[[bounds[1], bounds[0]], [bounds[3], bounds[2]]],
    color='green',
    fill=False,
    weight=2
).add_to(m)

m.save('nm_colonias_map.html')
print("Map saved!")
m

Map saved!
