# Maxar Image Availability Analysis

The Maxar image availability workflow takes as input a list of TerraFund project IDs and returns as output a CSV listing every project and how much of that project's area has Maxar imagery coverage.

#### Workflow:
1. Pull info on project characteristics for the entire portfolio using the TerraMatch API
    - Repo/notebook: terrafund-portfolio-analysis/tm-api.ipynb
    - Input: list of TerraFund project IDs
    - Output: csv of all project features
2. Using the TM API csv, pull Maxar metadata
    - Repo/notebook: maxar-tools/decision-tree-metadata.ipynb and maxar-tools/src/decision_tree.py (TODO: this reference may need to change because of my additions to the acquire_metadata function)
    - Input: csv of project features
    - Output: csv of maxar metadata
3. Create imagery features (??)
    - Repo/notebook: terrafund-portfolio-analysis/maxar-img-avail.py
    - Input: csv of maxar metadata and csv of TM project features
    - Output: csv of project features and percent imagery coverage
4. Identify projects with 100% imagery coverage

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from shapely.geometry import shape
from shapely.geometry import Polygon, Point
from shapely import union_all
import ast
from datetime import datetime, timedelta
import re
import os
import math
import requests
import yaml
import json
import pyproj
import sys
sys.path.append('../src/')
import image_availability as img
import process_api_results as clean
import decision_trees as tree
import tm_api_utils as api_request

%load_ext autoreload
%autoreload 2

### Parameters

In [2]:
# File paths
tm_auth_path = '../secrets.yaml'                                                                # YAML secrets file (presumably TerraMatch API credentials -- confirm)
tm_staging_url = "https://api-staging.terramatch.org/research/v3/sitePolygons?"                 # Use for testing queries
tm_prod_url = "https://api.terramatch.org/research/v3/sitePolygons?"                            # Use to pull data for analysis
approved_projects = '../terrafund-portfolio-analyses/projects_all_approved_202501091214.csv'    # List of projects with approved polygons
feats = '../data/tm_api_TEST.csv'                                                               # Polygon metadata & geometries from TM API
# NOTE(review): the next path is absolute and user-specific; it will not resolve on another machine.
maxar_feats = '/home/darby/github_repos/maxar-tools/data/tm_api_TEST.csv'                       # Polygon metadata & geometries from TM API saved to maxar-tools repo
maxar_md = '../data/imagery_availability/comb_img_availability_2025-02-26.csv'                  # Metadata for Maxar images corresponding to polygons

# Define thresholds
cloud_thresh = 50             # Threshold for removing cloudy imagery
off_nadir_thresh = 30         # Threshold for removing imagery too far off nadir
sun_elev_thresh = 30          # Threshold for removing imagery with too steep of a sun angle
img_count = 1                 # Threshold for identifying image availability
baseline_range = (-366, 0)    # Baseline window in days relative to plantstart (1 year before plantstart date)
ev_range = (730, 1095)        # Early verification window in days relative to plantstart (2-3 years after plant start date)

### Step 1: Load & Preprocess Data
Inputs: 
- TM API csv
- Maxar metadata csv

In [3]:
# Load the TM API polygon export and build a GeoDataFrame from it
polygons = pd.read_csv(feats)
polygons.columns = polygons.columns.str.lower()   # Enforce lowercase column names

# Rename 'name' and 'geometry' so they stay unambiguous next to the image columns
poly_df = polygons.rename(columns={'name': 'poly_name', 'geometry': 'poly_geom'})

# Convert 'plantstart' to datetime; values that cannot be parsed become NaT
poly_df['plantstart'] = pd.to_datetime(poly_df['plantstart'], errors='coerce')

# Convert 'poly_geom' to Shapely geometries in a single pass.
# Stringified geometry dictionaries are parsed with ast.literal_eval first;
# anything that is not a string is handed to shape() as-is.
poly_df['poly_geom'] = poly_df['poly_geom'].apply(
    lambda geom: shape(ast.literal_eval(geom) if isinstance(geom, str) else geom)
)

# Add a field for each polygon's own centroid.
# (Previously every row received the centroid of the first polygon only.)
poly_df['poly_centroid'] = poly_df['poly_geom'].apply(lambda geom: geom.centroid)

# Convert DataFrame to GeoDataFrame (geometries are in lon/lat, EPSG:4326)
poly_gdf = gpd.GeoDataFrame(poly_df, geometry='poly_geom', crs="EPSG:4326")

In [4]:
# Load Maxar image metadata and build a GeoDataFrame from it
images = pd.read_csv(maxar_md)

# Select relevant columns. Take an explicit copy so the assignments below
# modify an independent frame rather than a slice of `images`.
img_cols = ['title', 'project_id', 'poly_id', 'datetime', 'area:cloud_cover_percentage', 'eo:cloud_cover', 'area:avg_off_nadir_angle', 'view:sun_elevation', 'img_geom']
img_df = images[img_cols].copy()

# Parse 'datetime' and store it as 'img_date'. Plain column assignment is used
# (not .loc[:, col] = ...) so the column actually takes the datetime dtype.
# Values that do not match the format become NaT.
img_dates = pd.to_datetime(img_df['datetime'], format='%Y-%m-%dT%H:%M:%S.%fZ', errors='coerce')
img_df['datetime'] = img_dates.dt.tz_localize(None)   # Remove time zone info (no-op if already naive)
img_df = img_df.rename(columns={'datetime': 'img_date'})

# Convert 'img_geom' (image footprints) to Shapely geometries in a single pass.
# Stringified geometry dictionaries are parsed with ast.literal_eval first;
# anything that is not a string is handed to shape() as-is.
img_df['img_geom'] = img_df['img_geom'].apply(
    lambda geom: shape(ast.literal_eval(geom) if isinstance(geom, str) else geom)
)

# Add a field for each image footprint's own centroid.
# (Previously every row received the centroid of the first footprint only.)
img_df['img_centroid'] = img_df['img_geom'].apply(lambda geom: geom.centroid)

# Convert DataFrame to GeoDataFrame (geometries are in lon/lat, EPSG:4326)
img_gdf = gpd.GeoDataFrame(img_df, geometry='img_geom', crs="EPSG:4326")

### Step 2: Merge Images with Polygons
Inputs:
- poly_gdf: geodataframe of polygon metadata
- img_gdf: geodataframe of maxar image metadata

Outputs:
- merged_gdf: merged geodataframe of maxar image metadata + associated polygon metadata

In [5]:
# Left-join polygon attributes onto the image rows: every image row is kept and
# gains the columns of the polygon sharing its (project_id, poly_id) key.
merged_gdf = img_gdf.merge(poly_gdf, how='left', on=['project_id', 'poly_id'])

# Re-coerce both date columns to datetime after the merge (bad values -> NaT)
for date_col in ('plantstart', 'img_date'):
    merged_gdf[date_col] = pd.to_datetime(merged_gdf[date_col], errors='coerce')

### Step 3: PRE-FILTER IMAGE DATASET (GLOBAL FILTERING)
Inputs:
- merged_gdf: merged geodataframe of maxar image metadata + associated polygon metadata

Outputs:
- filtered_merged: a filtered version of the merged dataframe of maxar image metadata + associated polygon metadata

### STEP 4: CREATE DICTIONARY FOR PROJECT-POLYGON MAPPING

### STEP 5: ITERATE THROUGH EACH PROJECT

#### STEP 5.1 Get All Polygons for This Project

In [None]:
# Step 1: LOAD AND PREPROCESS DATA
# 1.1: Load polygon dataset
poly_csv = gpd.GeoDataFrame(polygon geometries & metadata)

# 1.2 Load image dataset
img_csv = gpd.GeoDataFrame(maxar image geometries & metadata)

# 1.3 Preprocess the data
poly_gdf = preprocess_polygons(poly_csv) # Clean data, convert geometries, enforce CRS
img_gdf = preprocess_images(img_csv) # Clean data, convert geometries, enforce CRS


# Step 2: MERGE POLYGON DATA WITH IMAGE DATA
merged_gdf = img_gdf.merge(poly_gdf, on=['project_id', 'poly_id'], how='left')

# Step 3: PRE-FILTER IMAGES
filtered_images = merged_gdf where:
    (date is within allowed date range) &
    (cloud cover < cloud_thresh) &
    (off-nadir angle < off_nadir_thresh) &
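    (sun elevation > sun_elev_thresh)   # keep images with the sun high enough above the horizon -- TODO confirm direction against decision tree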

# Step 4: ITERATE THROUGH PROJECTS AND POLYGONS TO CALCULATE IMAGERY COVERAGE
# 4.1 Create a dictionary for project-polygon mapping
project_polygons = {project_id: list of poly_ids associated with that project} # Create a dictionary

# 4.2 Initialize list to store low coverage cases
low_img_coverage_log = []

# 4.3 Iterate through each project
for each project_id in project_polygons:

    # 4.4 Get all polygons for this project
    project_polygons_list = list of poly_ids for this project_id

    # 4.5 Iterate through each polygon in the project
    for each poly_id in project_polygons_list:
    
        # 4.6 Get all images associated with this polygon
        poly_images = filtered_images[filtered_images['poly_id'] == poly_id]

        # Count the number of available images
        num_images = len(poly_images)

        # If no valid image exists, record 0% coverage
        if poly_images is empty:
            store result: (poly_id, project_id, None, num_images, None, 0, 0) # No images available (poly_area_ha not computed; overlap and percent cover are 0)
            continue

        # 4.7 Select the best image (lowest cloud cover)
        best_image = select_best_image(poly_images)

        # 4.8 Get polygon and image geometries
        poly_geom = poly_gdf[poly_gdf['poly_id'] == poly_id].geometry.iloc[0]
        best_img_geom = best_image['img_geom']

        # 4.9 Compute UTM Zone and reproject geometries
        poly_centroid = compute centroid of poly_geom
        utm_crs = get UTM CRS from centroid
        poly_geom_reprojected = reproject poly_geom to utm_crs
        best_img_geom_reprojected = reproject best_img_geom to utm_crs

        # 4.10 Calculate the polygon area dynamically (in hectares)
        poly_area_ha = poly_geom_reprojected.area / 10000

        # 4.11 Calculate area of overlap (intersection of polygon and image footprint, in m^2)
        overlap_area = area of (poly_geom_reprojected intersection best_img_geom_reprojected)
        overlap_area_ha = overlap_area / 10000

        # 4.12 Compute percent of polygon area covered (both areas in hectares)
        percent_img_cover = (overlap_area_ha / poly_area_ha) * 100

        # 4.13 Log cases where imagery coverage is unexpectedly low
        if percent_img_cover < 50:
            log_entry = {
                'poly_id': poly_id,
                'project_id': project_id,
                'best_image': best_image['title'],
                'num_images': num_images,
                'poly_area_ha': poly_area_ha,
                'overlap_area_ha': overlap_area_ha,
                'percent_img_cover': percent_img_cover
            }
            low_img_coverage_log.append(log_entry)

        # 4.14 Store results
        store result: (poly_id, project_id, best_image['title'], num_images, poly_area_ha, overlap_area_ha, percent_img_cover)

# STEP 5: EXPORT LOW COVERAGE LOG IF NEEDED
if low_img_coverage_log is not empty:
    export_to_csv(low_img_coverage_log, "low_coverage_polygons.csv")

In [None]:
##FUNCTIONS LIST 

# 1. LOAD AND PREPROCESS DATA
load_polygons(filepath)