# Maxar Image Availability Analysis: PPC Tree Count Eligibility

The Maxar image availability workflow takes as input a list of TerraFund project ids and returns as output a csv listing every project and how much of that project’s area has Maxar imagery coverage.

#### Workflow:
1. Pull info on project characteristics for the entire portfolio using the TerraMatch API
    - Repo/notebook: terrafund-portfolio-analysis/tm-api.ipynb
    - Input: list of TerraFund project IDs
    - Output: csv of all project features
2. Using the TM API csv, pull Maxar metadata
    - Repo/notebook: maxar-tools/decision-tree-metadata.ipynb and maxar-tools/src/decision_tree.py (TODO: confirm; this may need to change because of additions made to the acquire_metadata function)
    - Input: csv of project features
    - Output: csv of maxar metadata
3. Calculate the percent area of each project with available Maxar imagery
    - Repo/notebook: terrafund-portfolio-analysis/maxar-img-avail.ipynb and terrafund-portfolio-analysis/src/image_coverage.py
    - Input: csv of maxar metadata and csv of TM project features
    - Output: csv of project features and percent imagery coverage, csv of percent imagery coverage aggregated to project level, csv of polygons with low imagery coverage
4. Identify projects with highest imagery coverage to use for the RS image availability simulation

In [None]:
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
import sys
sys.path.append('../src/')
import image_coverage as img_cover
import analyze_img_coverage as analyze
from datetime import datetime

### Parameters

In [None]:
# Naming convention
# run_name is embedded in every input/output file name; run_dir is the sub-folder of ../data/ for this run;
# analysis names both the results sub-folder and the output file names.
run_name = 'ppc_2025_tree_count_elig_final'
run_dir = 'ppc_tree_count_elig'
analysis = 'baseline' # must change if you change the date_range

# Today's date
# NOTE: `today` is part of the INPUT file names below (feats, maxar_md) as well as the output names,
# so those CSVs must carry the same date stamp. Uncomment the override to run against files
# that were produced on an earlier day.
today = datetime.today().strftime('%Y-%m-%d')
#today = '2025-04-02'

# File paths
feats = f'../data/{run_dir}/tm_api_{run_name}_{today}.csv' # CSV of polygon metadata & geometries from TM API (infile)
maxar_md = f'../data/{run_dir}/imagery_availability/comb_img_availability_{run_name}_{today}.csv' # CSV of metadata for Maxar images corresponding to polygons (infile)
task_date_ranges_path = f'../data/{run_dir}/task_date_ranges_{run_name}.csv' # CSV of the date range to search for each task (infile, not date-stamped)
dropped_poly_path = f'../data/{run_dir}/dropped_poly_invalid_geom_{run_name}_{today}.csv' # Passed to preprocess_polygons as dropped_output_path (outfile)
results_path = f'../data/{run_dir}/results/{analysis}/' # File path to save results to

# Define filtering thresholds (stored in a dictionary)
# Exactly one 'date_range' entry should be left uncommented; values are (start, end) day offsets
# relative to plantstart. In the filtering cell this entry is only kept as the fallback
# (DEFAULT_DATE_RANGE) and is excluded from the global filter pass.
filters = {
    'cloud_cover': 50,           # Remove images with >50% cloud cover
    'off_nadir': 30,             # Remove images with >30° off-nadir angle
    'sun_elevation': 30,         # Keep only images where sun elevation >30°
    #'date_range': (-366, 0),    # Date range of 1 year before plantstart (TerraFund baseline)
    'date_range': (-366, 90),   # Date range of 1 year before plantstart through 3 months after (PPC baseline)
    #'date_range': (-549, 90),    # Date range of 1.5 years before plantstart through 3 months after (modified PPC baseline)
    #'date_range': (-732, 90),    # Date range of 2 years before plantstart through 3 months after (modified PPC baseline)
    #'date_range': (-1586, 90),    # Date range of 4 1/3 years before plantstart through 3 months after (modified PPC baseline - CERT 2021)
    #'date_range': (-366, 736),    # Date range of 1 year before plantstart through 2 years after (modified PPC baseline - CERT 2021)
    #'date_range': (730, 9999),  # Date range of 2 years post-plantstart through today (upper bound of maxar_md dataset is today's date) (early verification)
    #'date_range': (-151, 213),  # Custom all of 2022 with plantstart June 1 2022 (Rwanda, Mozambique Lidar)
    #'date_range': (579, 883),   # Custom May - Oct 2024 with plantstart June 1 2022 (Kenya lidar)
    #'date_range': (-59, 305),   # Custom all of 2023 with plantstart March 1 2023 (GEDI Landscapes & Global Lidar)
    'img_count': 1,             # Threshold for identifying image availability (REASSESS)
}

### Calculate Image Availability by Project

In [None]:
### 1. LOAD POLYGON AND IMAGE DATA ###
# Read the TM API polygon features first, then the Maxar image metadata
poly_df, img_df = map(pd.read_csv, (feats, maxar_md))

In [None]:
### 2.1. PREPROCESS POLYGON DATA ###
# Polygons dropped by the helper are saved to dropped_poly_path (save_dropped=True)
poly_gdf = img_cover.preprocess_polygons(
    poly_df,
    debug=False,
    save_dropped=True,
    dropped_output_path=dropped_poly_path,
)

# Create task_id as "<project_id>_<plantstart year>"
# errors="coerce" turns unparseable plantstart values into missing years rather than raising
plant_year = pd.to_datetime(poly_gdf["plantstart"], errors="coerce").dt.year
project_id_text = poly_gdf["project_id"].astype(str)
plant_year_text = plant_year.astype("Int64").astype(str)  # nullable int so years print without a ".0"
poly_gdf["task_id"] = project_id_text + "_" + plant_year_text

In [None]:
### 2.2. PREPROCESS IMAGE DATA ###
# Pass the raw Maxar metadata table (img_df) to the project helper with its debug flag on.
# NOTE(review): the cleaning/conversion it performs lives in ../src/image_coverage.py and is not visible here.
img_gdf = img_cover.preprocess_images(img_df, debug=True)

In [None]:
### 3. MERGE POLYGON METADATA INTO IMAGE DATA ###
# The helper returns two values: merged_gdf, which is what gets filtered in the next step,
# and missing_polygons_list, which is not used again in this part of the notebook.
# NOTE(review): missing_polygons_list is presumably the polygons with no matching image rows - confirm in image_coverage.py.
merged_gdf, missing_polygons_list = img_cover.merge_polygons_images(img_gdf, poly_gdf, debug=True)

In [None]:
### 4. FILTER IMAGES ###
# Read in task date ranges csv
task_ranges = pd.read_csv(task_date_ranges_path)

# Map each task_id to an integer (start, end) pair built from the
# date_range_start / date_range_end columns of the task date ranges csv
date_range_by_task = (
    task_ranges.set_index("task_id")[["date_range_start", "date_range_end"]]
    .apply(lambda r: (int(r["date_range_start"]), int(r["date_range_end"])), axis=1)
    .to_dict()
)

# Fallback date_range if task_id not in the csv
DEFAULT_DATE_RANGE = filters['date_range']
# NOTE(review): neither date_range_by_task nor DEFAULT_DATE_RANGE is passed to anything in the
# visible part of this notebook - confirm where (or whether) the per-task date window is applied.

# Create a global filters only dictionary: every entry of `filters` except date_range
# (i.e. cloud_cover, off_nadir, sun_elevation and img_count)
filters_global = {k: v for k, v in filters.items() if k != "date_range"}

# Filter images by global filters only
# NOTE(review): verify that filter_images accepts a filters dict without a 'date_range' key
img_gdf_filtered = img_cover.filter_images(merged_gdf, filters_global, debug=True)

In [None]:
### 5. COMPUTE POLYGON-LEVEL IMAGERY COVERAGE ###
# Shared list handed to the helper on every call; its contents are what the save step
# writes out as the low-coverage polygons CSV (presumably appended to inside the helper)
low_img_coverage_log = []

# One coverage record per polygon, in poly_gdf row order
results = [
    img_cover.compute_polygon_image_coverage(poly_id, project_id, poly_gdf, img_gdf_filtered, low_img_coverage_log)
    for poly_id, project_id in zip(poly_gdf['poly_id'], poly_gdf['project_id'])
]

# Tabulate the per-polygon records under fixed column names
result_columns = [
    'poly_id', 'project_id', 'best_image', 'img_date', 'num_images',
    'poly_area_ha', 'overlap_area_ha', 'percent_img_cover',
]
results_df = pd.DataFrame(results, columns=result_columns)

# Polygons without a selected image get the literal string "None" instead of a missing value
results_df['best_image'] = results_df['best_image'].fillna("None")

In [None]:
### 6. AGGREGATE TO PROJECT-LEVEL COVERAGE ###
# Roll the per-polygon results table up to one table at project level (saved in the next step).
# NOTE(review): the aggregation rule itself is defined in ../src/image_coverage.py, not shown here.
project_results_df = img_cover.aggregate_project_image_coverage(results_df, debug=True)

In [None]:
### 7. SAVE RESULTS ###
import os

# DataFrame.to_csv does not create missing folders, and results_path changes with `analysis`,
# so make sure the target folder exists before writing anything
os.makedirs(results_path, exist_ok=True)

# Percent imagery coverage by polygon
results_df.to_csv(f"{results_path}polygon_imagery_coverage_{run_name}_{analysis}_{today}.csv", index=False)

# Percent imagery coverage by project
project_results_df.to_csv(f"{results_path}project_imagery_coverage_{run_name}_{analysis}_{today}.csv", index=False)

# Polygons with low imagery coverage
# Written only when at least one polygon was logged, so this file may legitimately be absent
if low_img_coverage_log:
    low_coverage_polygons_df = pd.DataFrame(low_img_coverage_log)
    print(f"Logging low image coverage polygons to {results_path}.")
    # Same convention as the polygon results: missing best_image becomes the string "None"
    low_coverage_polygons_df['best_image'] = low_coverage_polygons_df['best_image'].fillna("None")
    low_coverage_polygons_df.to_csv(f"{results_path}low_coverage_polygons_{run_name}_{analysis}_{today}.csv", index=False)

print(f"Imagery coverage results saved to {results_path}")

## Analyze Maxar Image Availability

In [None]:
import os

# Read in files
# Image availability by project
project_img_avail = pd.read_csv(f"{results_path}project_imagery_coverage_{run_name}_{analysis}_{today}.csv")

# Image availability by polygon
poly_img_avail = pd.read_csv(f"{results_path}polygon_imagery_coverage_{run_name}_{analysis}_{today}.csv")

# Low coverage polygons
# The save step only writes this CSV when at least one polygon was logged as low coverage,
# so fall back to an empty table instead of raising FileNotFoundError when it is absent
low_coverage_path = f"{results_path}low_coverage_polygons_{run_name}_{analysis}_{today}.csv"
if os.path.exists(low_coverage_path):
    low_coverage_poly = pd.read_csv(low_coverage_path)
else:
    print(f"No low coverage polygon file found at {low_coverage_path}; using an empty table.")
    low_coverage_poly = pd.DataFrame()

In [None]:
# # Overall distribution of image availability
# analyze.img_avail_hist(project_img_avail)

In [None]:
# # High image availability projects
# qualifying_projects_list = analyze.count_projs_wi_img_avail(project_img_avail, 90)

In [None]:
# analyze.analyze_low_coverage_issues(low_coverage_poly)

In [None]:
# high_cov = project_img_avail[(project_img_avail['total_percent_area_covered'] > 90) & (project_img_avail['total_percent_area_covered'] <= 101)]
# print(len(high_cov))
# high_cov.sort_values('total_percent_area_covered', ascending=False)

### For PPC, calculate image availability by task (project_id + plantstart_year)

In [None]:
# Attach each polygon's 'plantstart' from poly_gdf to poly_img_avail (dataset of each polygon with
# associated best Maxar image), then derive 'plantstart_year' from the merged dates
plantstart_lookup = poly_gdf[['poly_id', 'plantstart']]
poly_img_avail_wi_yrs = poly_img_avail.merge(plantstart_lookup, on='poly_id', how='left')

# Unparseable or missing plantstart values become missing years (errors='coerce')
plantstart_dates = pd.to_datetime(poly_img_avail_wi_yrs['plantstart'], errors='coerce')
poly_img_avail_wi_yrs['plantstart_year'] = plantstart_dates.dt.year

In [None]:
# Aggregate the per-polygon coverage table (with plantstart and plantstart_year attached) for PPC.
# NOTE(review): presumably grouped by project_id + plantstart_year, i.e. per task - confirm in image_coverage.py.
task_results_df = img_cover.aggregate_project_image_coverage_ppc(poly_img_avail_wi_yrs)

In [None]:
# Save percent imagery coverage by task as a CSV
# Percent imagery coverage by task (same results folder and naming pattern as the other outputs)
task_results_df.to_csv(f"{results_path}task_imagery_coverage_{run_name}_{analysis}_{today}.csv", index=False)