# Analyze Maxar Image Availability

Takes in CSVs of:
- All polygon features
- All Maxar images available for those polygons
- Filtered image availability per polygon
- Filtered image availability per project (aggregated)
- Low coverage polygons

Analysis:

Outline:

Step 1: Load already computed CSVs for:
 - poly_img_avail_base_df
 - poly_img_avail_ev_df
 - proj_img_avail_base_df
 - proj_img_avail_ev_df
 - poly_gdf
 - maxar_gdf

Step 2: Merge plantstart years (from poly_gdf) into polygon-level image availability

Step 3: Calculate planting year distributions (by polygon)

Step 4: Assemble baseline & EV coverage from project-level files

Step 5: Calculate % area covered by baseline and by EV imagery

Step 6: Calculate % area with coverage at both points


In [1]:
import pandas as pd
import geopandas as gpd
import sys
sys.path.append('../src/')
import image_coverage as img_cover
import analyze_img_coverage as analyze
from datetime import datetime, timedelta

## Set file paths

In [2]:
# Set input file paths (all given relative to the notebook's working directory)
# Polygon metadata & geometries from TM API
feats = '../data/tf_cohort1/tm_api_cohort1_2025-04-02.csv' 

# Metadata for Maxar images corresponding to polygons
maxar_md = '../data/tf_cohort1/imagery_availability/comb_img_availability_cohort1_2025-04-02.csv' 

# List of approved projects (with country codes)
# NOTE(review): this path is not read by any of the cells visible in this section - confirm it is still needed
approved_projects = '../projects_all_approved_202502211226.csv'

# Image availability results, one set of three CSVs per time point:
#   polygon-level coverage, low-coverage polygons, and project-level coverage
# Baseline (results/baseline/)
poly_img_avail_base = '../data/tf_cohort1/results/baseline/polygon_imagery_coverage_cohort1_2025-04-02.csv'
low_cov_poly_base = '../data/tf_cohort1/results/baseline/low_coverage_polygons_cohort1_2025-04-02.csv'
proj_img_avail_base = '../data/tf_cohort1/results/baseline/project_imagery_coverage_cohort1_2025-04-02.csv'

# Early Verification (EV) - stored under results/year_2/
poly_img_avail_ev = '../data/tf_cohort1/results/year_2/polygon_imagery_coverage_cohort1_2025-04-02.csv'
low_cov_poly_ev = '../data/tf_cohort1/results/year_2/low_coverage_polygons_cohort1_2025-04-02.csv'
proj_img_avail_ev = '../data/tf_cohort1/results/year_2/project_imagery_coverage_cohort1_2025-04-02.csv'

## Read in files

In [3]:
# Load every input CSV into a DataFrame; each *_df name mirrors the path variable it is read from.

# Polygon feature data
poly_df = pd.read_csv(feats)

# Maxar data
maxar_df = pd.read_csv(maxar_md)

# Image availability
# Baseline: polygon-level coverage, low-coverage polygons, project-level coverage
poly_img_avail_base_df = pd.read_csv(poly_img_avail_base)
low_cov_poly_base_df = pd.read_csv(low_cov_poly_base)
proj_img_avail_base_df = pd.read_csv(proj_img_avail_base)

# Early Verification: same three tables as for baseline
poly_img_avail_ev_df = pd.read_csv(poly_img_avail_ev)
low_cov_poly_ev_df = pd.read_csv(low_cov_poly_ev)
proj_img_avail_ev_df = pd.read_csv(proj_img_avail_ev)

## Preprocess polygon and maxar image data

In [4]:
# Run the project's preprocessing helpers on the raw polygon and Maxar tables.
# debug=True makes them print the processing summary shown in this cell's output.
# Later cells read 'poly_id', 'plantstart', 'poly_geom' from poly_gdf and 'img_id', 'img_geom' from maxar_gdf.
poly_gdf = img_cover.preprocess_polygons(poly_df, debug=True)
maxar_gdf = img_cover.preprocess_images(maxar_df, debug=True)

Processing polygon data...
Cleaning geometries...

🧾 Geometry Cleaning Summary:
  ➤ Total geometries processed: 13537
  ➤ Invalid geometries:         0
  ➤ Repaired with buffer(0):    0
  ➤ Dropped:                    0
  ✅ Final valid polygons:       13537

Processing Maxar image data...
There are 175641 images for 12168 polygons in 78 projects in this dataset.


## Merge 'plantstart' and poly_geom into image availability data

In [5]:
# Attach each polygon's planting start date and geometry to the baseline and EV
# image-availability tables (left join on poly_id keeps every availability row)
plant_geom_cols = poly_gdf[['poly_id', 'plantstart', 'poly_geom']]
poly_img_avail_base_df = pd.merge(poly_img_avail_base_df, plant_geom_cols, how='left', on='poly_id')
poly_img_avail_ev_df = pd.merge(poly_img_avail_ev_df, plant_geom_cols, how='left', on='poly_id')

In [6]:
# Preview the first two baseline rows to confirm 'plantstart' and 'poly_geom' were merged in
poly_img_avail_base_df.head(2)

Unnamed: 0,poly_id,project_id,best_image,num_images,poly_area_ha,overlap_area_ha,percent_img_cover,plantstart,poly_geom
0,a40e322b-42ff-4008-8407-e611b170a39c,389aad5b-6577-4cea-bf9f-446dcfd94966,,0,0.231599,0.0,0.0,2022-01-08,"POLYGON ((31.39719 1.51653, 31.3972 1.51652, 3..."
1,9dcccf42-cd63-471b-a251-abd1009fb819,389aad5b-6577-4cea-bf9f-446dcfd94966,,0,0.467551,0.0,0.0,2022-01-08,"POLYGON ((31.39611 1.51079, 31.39611 1.51075, ..."


## Calculate planting year stats by project

In [7]:
# Derive the planting year from each polygon's plantstart timestamp
poly_img_avail_base_df['plant_year'] = poly_img_avail_base_df['plantstart'].dt.year

# One named aggregation per planting year: the share (%) of a project's polygons planted in that year
year_share_aggs = {
    f'pct_poly_plant_{year}': (
        'plant_year',
        lambda yrs, year=year: (yrs == year).sum() / len(yrs) * 100,
    )
    for year in (2022, 2023, 2024)
}

# Group by project: polygon count plus the planting year distribution
planting_stats = (
    poly_img_avail_base_df
    .groupby('project_id')
    .agg(num_poly=('poly_id', 'count'), **year_share_aggs)
    .reset_index()
)

## Calculate % polygons with imagery at baseline & early verification

In [8]:
# % of polygons per project with at least one baseline image
poly_img_avail_base_df['has_base_img'] = poly_img_avail_base_df['num_images'].gt(0)
base_img_stats = (
    poly_img_avail_base_df
    .groupby('project_id')['has_base_img']
    .mean()                               # fraction of polygons flagged True
    .mul(100)                             # fraction -> percent
    .rename('pct_poly_wi_base_img')
    .reset_index()
)

# % of polygons per project with at least one early verification image
poly_img_avail_ev_df['has_ev_img'] = poly_img_avail_ev_df['num_images'].gt(0)
ev_img_stats = (
    poly_img_avail_ev_df
    .groupby('project_id')['has_ev_img']
    .mean()
    .mul(100)
    .rename('pct_poly_wi_ev_img')
    .reset_index()
)

## Calculate % of polygons with high image coverage (≥ 70%) at both time points

In [9]:
# Coverage threshold (in percent) at or above which a polygon counts as "high coverage"
cover_thresh = 70

# Keep only the columns needed from each time point, renaming the coverage column so the
# baseline and EV values stay distinguishable after the merge
base_cov = (
    poly_img_avail_base_df[['poly_id', 'project_id', 'percent_img_cover']]
    .rename(columns={'percent_img_cover': 'base_pct_img_cover'})
)
ev_cov = (
    poly_img_avail_ev_df[['poly_id', 'percent_img_cover']]
    .rename(columns={'percent_img_cover': 'ev_pct_img_cover'})
)

# Pair up baseline and EV coverage for polygons present in both tables
joined_cov = pd.merge(base_cov, ev_cov, how='inner', on='poly_id')

# A polygon qualifies when coverage is >= the threshold at BOTH time points
meets_base = joined_cov['base_pct_img_cover'].ge(cover_thresh)
meets_ev = joined_cov['ev_pct_img_cover'].ge(cover_thresh)
joined_cov['high_cov_both'] = meets_base & meets_ev

# Per project: % of polygons with high coverage at both time points
high_cov_stats = (
    joined_cov
    .groupby('project_id')['high_cov_both']
    .mean()
    .mul(100)
    .rename('pct_poly_wi_high_cov_both')
    .reset_index()
)

## Assemble summary table at project level

In [10]:
# Base table: planting_stats (project_id, number of polygons, % of polygons planted each year)
project_summary_df = planting_stats.copy()

# Left-join, in order: baseline image availability, early verification image availability,
# and % of polygons with high coverage at both time points
for stats_df in (base_img_stats, ev_img_stats, high_cov_stats):
    project_summary_df = project_summary_df.merge(stats_df, on='project_id', how='left')

# Projects with the best coverage at both time points come first
project_summary_df = project_summary_df.sort_values(by='pct_poly_wi_high_cov_both', ascending=False)
project_summary_df.head(1)

Unnamed: 0,project_id,num_poly,pct_poly_plant_2022,pct_poly_plant_2023,pct_poly_plant_2024,pct_poly_wi_base_img,pct_poly_wi_ev_img,pct_poly_wi_high_cov_both
13,33274073-8a4e-4eca-8b97-0e8da3833105,8,100.0,0.0,0.0,100.0,100.0,100.0


## Calculate % of project area with imagery at baseline and early verification

### Functions

In [11]:
def extract_img_id(title):
    """
    Return the Maxar image ID, i.e. the last whitespace-separated token of the image title.

    Anything that is not a string starting with "Maxar" (e.g. NaN or None) yields None.
    """
    if not isinstance(title, str):
        return None
    if not title.startswith("Maxar"):
        return None
    tokens = title.split()
    return tokens[-1]

### Create a dataframe with the best image ID and footprint geometry for every polygon at baseline and early verification

In [12]:
# Derive the Maxar image ID of each polygon's best image from its title:
# stored as 'img_id_base' on the baseline table and 'img_id_ev' on the EV table
poly_img_avail_base_df['img_id_base'] = poly_img_avail_base_df['best_image'].map(extract_img_id)
poly_img_avail_ev_df['img_id_ev'] = poly_img_avail_ev_df['best_image'].map(extract_img_id)

In [14]:
# Create a lookup dataframe of each unique img_id and its associated footprint geometry from maxar_gdf.
# NOTE(review): duplicates are dropped on the (img_id, img_geom) pair, not on img_id alone. An img_id
# that appears with more than one distinct geometry would keep several rows here and would multiply
# polygon rows in any later merge keyed on img_id - confirm img_id -> img_geom is one-to-one.
img_geom_lookup = maxar_gdf[['img_id', 'img_geom']].drop_duplicates()

In [18]:
# Attach the footprint geometry of each polygon's best image to the baseline and EV tables.
# The lookup columns are renamed per time point so the join key and the geometry column
# carry the matching _base / _ev suffix.

# Baseline
base_footprints = img_geom_lookup.rename(
    columns={'img_id': 'img_id_base', 'img_geom': 'img_geom_base'}
)
poly_img_avail_base_df = pd.merge(
    poly_img_avail_base_df, base_footprints, how='left', on='img_id_base'
)

# Early verification
ev_footprints = img_geom_lookup.rename(
    columns={'img_id': 'img_id_ev', 'img_geom': 'img_geom_ev'}
)
poly_img_avail_ev_df = pd.merge(
    poly_img_avail_ev_df, ev_footprints, how='left', on='img_id_ev'
)

In [36]:
## Combine, per polygon, the best-image details at baseline and at early verification (EV) ##

# Step 1: Baseline side - pick the relevant columns and tag the shared ones with a _base suffix
base_keep = [
    'poly_id', 'project_id', 'plantstart', 'poly_geom',
    'best_image', 'img_id_base', 'percent_img_cover', 'img_geom_base',
]
base_renames = {
    'best_image': 'best_image_base',
    'percent_img_cover': 'percent_img_cover_base',
}
base_cols = poly_img_avail_base_df[base_keep].rename(columns=base_renames)

# Step 2: EV side - pick the relevant columns and tag the shared ones with an _ev suffix
ev_keep = ['poly_id', 'best_image', 'img_id_ev', 'percent_img_cover', 'img_geom_ev']
ev_renames = {
    'best_image': 'best_image_ev',
    'percent_img_cover': 'percent_img_cover_ev',
}
ev_cols = poly_img_avail_ev_df[ev_keep].rename(columns=ev_renames)

# Step 3: Inner-join baseline and EV on poly_id.
#  This keeps every polygon present in both tables, whether or not it has a best image.
poly_double_cov_df = pd.merge(base_cols, ev_cols, how='inner', on='poly_id')

print(f"Length of poly_double_cov_df before filtering: {len(poly_double_cov_df)}")

# Step 4: Drop polygons that lack a best image at either time point, leaving ONLY
#  polygons with a best image at both baseline & EV
poly_double_cov_df = poly_double_cov_df.dropna(subset=['best_image_base', 'best_image_ev'])
print(f"Length of poly_double_cov_df after filtering: {len(poly_double_cov_df)}")


Length of poly_double_cov_df before filtering: 13537
Length of poly_double_cov_df after filtering: 3753


In [65]:
# #53 - very small overlap
# #117
# test_row = poly_double_cov_df.iloc[117]
# print("Test row:")
# print(test_row)
# print()

# # Extract geometries
# poly_geom = test_row['poly_geom']
# img_geom_base = test_row['img_geom_base']
# img_geom_ev = test_row['img_geom_ev']

# # Get UTM CRS from polygon centroid
# centroid = poly_geom.centroid
# utm_crs = img_cover.get_utm_crs(centroid.x, centroid.y)
# print(f"Using UTM CRS: {utm_crs}")

# # Reproject all geometries to UTM
# poly_proj = gpd.GeoSeries([poly_geom], crs="EPSG:4326").to_crs(utm_crs).iloc[0]
# base_proj = gpd.GeoSeries([img_geom_base], crs="EPSG:4326").to_crs(utm_crs).iloc[0]
# ev_proj = gpd.GeoSeries([img_geom_ev], crs="EPSG:4326").to_crs(utm_crs).iloc[0]

# # Calculate polygon area (ha)
# poly_area_ha = poly_proj.area / 10_000
# print(f"Polygon area: {poly_area_ha:.2f} ha")

# # Get overlaps
# overlap_base = poly_proj.intersection(base_proj)
# overlap_ev = poly_proj.intersection(ev_proj)

# # Get intersection of overlaps
# overlap_both = overlap_base.intersection(overlap_ev)

# # Compute shared coverage area (ha)
# overlap_area_ha = overlap_both.area / 10_000
# percent_overlap = (overlap_area_ha / poly_area_ha) * 100

# # Print results
# print()
# print(f"Overlap with both images: {overlap_area_ha:.6f} ha")
# print(f"Percent polygon area with shared image coverage: {percent_overlap:.2f}%")

In [50]:
def compute_shared_image_overlap(row, debug=False):
    """
    Given a row with polygon and baseline/EV image footprints, compute shared area of image coverage
    (in hectares and as a % of the polygon's area).

    Args:
        row: Record indexable by column name (e.g. a DataFrame row) holding the geometries
            'poly_geom', 'img_geom_base' and 'img_geom_ev'. All three are interpreted as
            EPSG:4326 coordinates.
        debug (bool): If True, print the polygon area, shared overlap area and shared coverage.

    Returns:
        pd.Series with:
            'poly_area_ha_actual'  - polygon area in hectares, measured in the UTM projection
            'shared_overlap_ha'    - polygon area (ha) covered by BOTH image footprints
            'shared_pct_img_cover' - shared overlap as a % of polygon area (0 for a zero-area polygon)
    """
    # Extract geometries. These local names must match the ones used in the reprojection
    # below: previously the row values were bound to other names, so the reprojection hit
    # undefined names (NameError) or silently picked up stale notebook globals.
    poly_geom = row['poly_geom']
    img_geom_base = row['img_geom_base']
    img_geom_ev = row['img_geom_ev']

    # Use centroid to determine UTM zone
    centroid = poly_geom.centroid
    utm_crs = img_cover.get_utm_crs(centroid.x, centroid.y)

    # Reproject all geometries to UTM so that areas come out in square metres
    poly_proj = gpd.GeoSeries([poly_geom], crs="EPSG:4326").to_crs(utm_crs).iloc[0]
    base_proj = gpd.GeoSeries([img_geom_base], crs="EPSG:4326").to_crs(utm_crs).iloc[0]
    ev_proj = gpd.GeoSeries([img_geom_ev], crs="EPSG:4326").to_crs(utm_crs).iloc[0]

    # Compute area of polygon (m^2 -> ha)
    poly_area_ha = poly_proj.area / 10_000

    # Calculate overlap between polygon and imagery at baseline & EV
    overlap_base = poly_proj.intersection(base_proj)
    overlap_ev = poly_proj.intersection(ev_proj)

    # Get intersection of overlaps: the part of the polygon seen by both images
    shared_overlap = overlap_base.intersection(overlap_ev)

    # Calculate area of shared overlap & % of polygon area (guard against zero-area polygons)
    shared_overlap_area_ha = shared_overlap.area / 10_000
    shared_pct_cover = (shared_overlap_area_ha / poly_area_ha) * 100 if poly_area_ha > 0 else 0

    if debug:
        print(f"Polygon area (ha): {poly_area_ha:.2f}")
        print(f"Shared overlap area (ha): {shared_overlap_area_ha:.4f}")
        print(f"Shared coverage (%): {shared_pct_cover:.2f}")

    return pd.Series({
        'poly_area_ha_actual': poly_area_ha,
        'shared_overlap_ha': shared_overlap_area_ha,
        'shared_pct_img_cover': shared_pct_cover
    })

In [61]:
# Inspect one polygon (positional row 117) that has a best image at both baseline and EV
poly_double_cov_df.iloc[117]

poly_id                                74523827-22f2-48a7-9f87-f190dcab8003
project_id                             943bb150-f1b7-4ad2-bb9e-60a559df2ebd
plantstart                                              2023-06-05 00:00:00
poly_geom                 POLYGON ((-1.735648250031435 5.119492900932012...
best_image_base                           Maxar WV02 Image 10300100D4549C00
img_id_base                                                10300100D4549C00
percent_img_cover_base                                                100.0
img_geom_base             POLYGON ((-1.859111 6.014522, -1.85881 5.98260...
best_image_ev                             Maxar WV02 Image 103001010F0DA300
img_id_ev                                                  103001010F0DA300
percent_img_cover_ev                                                  100.0
img_geom_ev               POLYGON ((-1.778579 4.962744, -1.604997 4.9798...
Name: 930, dtype: object