In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import json
from shapely.geometry import shape, mapping
import sys
sys.path.append('../src/')
import geospatial_utils_NEW as utils
import analyze_img_coverage as analyze
import image_coverage as img_cover

In [None]:
# Read in CSVs
# All inputs are loaded here so later cells only work with in-memory frames.
# List of approved projects (with country codes)
approved_projects = pd.read_csv('../projects_all_approved_202502211226.csv')

# Polygon-level image availability ("baseline" vs. "year_2" results folders)
baseline_poly = pd.read_csv('../data/tf_cohort1/results/baseline/polygon_imagery_coverage_cohort1_2025-04-02.csv')
ev_poly = pd.read_csv('../data/tf_cohort1/results/year_2/polygon_imagery_coverage_cohort1_2025-04-02.csv')

# Polygon-level low coverage (the subset of polygons flagged as low coverage in each period)
baseline_low_cov = pd.read_csv('../data/tf_cohort1/results/baseline/low_coverage_polygons_cohort1_2025-04-02.csv')
ev_low_cov = pd.read_csv('../data/tf_cohort1/results/year_2/low_coverage_polygons_cohort1_2025-04-02.csv')

# Project-level image availability
baseline_proj = pd.read_csv('../data/tf_cohort1/results/baseline/project_imagery_coverage_cohort1_2025-04-02.csv')
ev_proj = pd.read_csv('../data/tf_cohort1/results/year_2/project_imagery_coverage_cohort1_2025-04-02.csv')

### Check # Low-Coverage Polygons

In [None]:
# TF Cohort 1 Polygons
# NOTE(review): this cell prints the same kind of report as calc_low_cover_poly_stats (defined in the
# next cell). Here BOTH periods use the baseline polygon count (num_poly) as the "all polygons" denominator.
num_poly = baseline_poly['poly_id'].nunique()
print('# Total TF Cohort 1 Polygons:', num_poly)
print('')

# Baseline Low Coverage
# The unique-polygon count comes from nunique(); the three buckets below count ROWS by 'num_images'.
num_base_low_cov = baseline_low_cov['poly_id'].nunique()
num_base_no_img = len(baseline_low_cov[baseline_low_cov['num_images'] == 0])
num_base_1_img = len(baseline_low_cov[baseline_low_cov['num_images'] == 1])
num_base_mult_img = len(baseline_low_cov[baseline_low_cov['num_images'] > 1])

# NOTE(review): every ratio below divides by num_base_low_cov or num_poly without a guard, so an
# empty input frame raises ZeroDivisionError.
print('# Poly wi/ Low Cover at Baseline:', num_base_low_cov)
print(f'{(num_base_low_cov/num_poly):.1%} of polygons have low image cover at baseline')
print(f'Of low cover at baseline polygons, {num_base_no_img} have 0 available images.')
print(f'    - This is {num_base_no_img/num_base_low_cov:.1%} of low-coverage polygons.')
print(f'    - This is {num_base_no_img/num_poly:.1%} of all polygons.')
print(f'Of low cover at baseline polygons, {num_base_1_img} have 1 available image (the low-cover image).')
print(f'    - This is {num_base_1_img/num_base_low_cov:.1%} of low-coverage polygons.')
print(f'    - This is {num_base_1_img/num_poly:.1%} of all polygons.')
print(f'Of low cover at baseline polygons, {num_base_mult_img} have > 1 available images (possible other images to select from).')
print(f'    - This is {num_base_mult_img/num_base_low_cov:.1%} of low-coverage polygons.')
print(f'    - This is {num_base_mult_img/num_poly:.1%} of all polygons.')
print('')

# Early Verification (1-Year Post-Planting) Low Coverage
# Same breakdown as above, computed from the year_2 low-coverage frame.
num_ev_low_cov = ev_low_cov['poly_id'].nunique()
num_ev_no_img = len(ev_low_cov[ev_low_cov['num_images'] == 0])
num_ev_1_img = len(ev_low_cov[ev_low_cov['num_images'] == 1])
num_ev_mult_img = len(ev_low_cov[ev_low_cov['num_images'] > 1])

# The "of all polygons" lines still divide by num_poly, i.e. the baseline polygon count.
print('# Poly wi/ Low Cover at EV:', num_ev_low_cov)
print(f'{(num_ev_low_cov/num_poly):.1%} of polygons have low image cover at EV')
print(f'Of low cover at EV polygons, {num_ev_no_img} have 0 available images.')
print(f'    - This is {num_ev_no_img/num_ev_low_cov:.1%} of low-coverage polygons.')
print(f'    - This is {num_ev_no_img/num_poly:.1%} of all polygons.')
print(f'Of low cover at EV polygons, {num_ev_1_img} have 1 available image (the low-cover image).')
print(f'    - This is {num_ev_1_img/num_ev_low_cov:.1%} of low-coverage polygons.')
print(f'    - This is {num_ev_1_img/num_poly:.1%} of all polygons.')
print(f'Of low cover at EV polygons, {num_ev_mult_img} have > 1 available images (possible other images to select from).')
print(f'    - This is {num_ev_mult_img/num_ev_low_cov:.1%} of low-coverage polygons.')
print(f'    - This is {num_ev_mult_img/num_poly:.1%} of all polygons.')

In [None]:
def calc_low_cover_poly_stats(all_polygons_df, low_cover_polygons_df, analysis_period):
    """
    Print statistics about the # and % of polygons with no or low (defined as an image covering < 50% of the polygon) image coverage at a given analysis time period.

    Any percentage whose denominator is 0 (no polygons at all, or no low-coverage polygons) is printed
    as 'n/a' instead of raising ZeroDivisionError.

    Args:
    - all_polygons_df (DataFrame): Dataframe of all polygons in the analysis. Must have 1 row for each unique polygon, and a unique polygon ID column called 'poly_id'.
    - low_cover_polygons_df (DataFrame): DataFrame of all low cover polygons in the analysis. Must include a 'poly_id' column and a 'num_images' column containing the # of available Maxar images for that polygon.
    - analysis_period (str): A string containing the time period for this analysis (Baseline or EV). Printed as given in the header line and lower-cased in the sentences.

    Returns:
    - None. The report is written to stdout.
    """
    def _pct(count, total):
        # Format count/total as a 1-decimal percentage; guard the empty-denominator case.
        return f'{count / total:.1%}' if total else 'n/a'

    num_poly = all_polygons_df['poly_id'].nunique()
    num_low_cov = low_cover_polygons_df['poly_id'].nunique()

    # Bucket the low-coverage rows by how many images are available for them.
    num_images = low_cover_polygons_df['num_images']
    num_no_img = int((num_images == 0).sum())
    num_1_img = int((num_images == 1).sum())
    num_mult_img = int((num_images > 1).sum())

    period = analysis_period.lower()

    print(f'# Poly wi/ Low Cover at {analysis_period}:', num_low_cov)
    print(f'{_pct(num_low_cov, num_poly)} of polygons have low image cover at {period}')
    print(f'Of low cover at {period} polygons, {num_no_img} have 0 available images.')
    print(f'    - This is {_pct(num_no_img, num_low_cov)} of low-coverage polygons.')
    print(f'    - This is {_pct(num_no_img, num_poly)} of all polygons.')
    print(f'Of low cover at {period} polygons, {num_1_img} have 1 available image (the low-cover image).')
    print(f'    - This is {_pct(num_1_img, num_low_cov)} of low-coverage polygons.')
    print(f'    - This is {_pct(num_1_img, num_poly)} of all polygons.')
    print(f'Of low cover at {period} polygons, {num_mult_img} have > 1 available images (possible other images to select from).')
    print(f'    - This is {_pct(num_mult_img, num_low_cov)} of low-coverage polygons.')
    print(f'    - This is {_pct(num_mult_img, num_poly)} of all polygons.')
    print('')

In [None]:
# Low-coverage report for each analysis period, each against its own polygon table.
calc_low_cover_poly_stats(
    all_polygons_df=baseline_poly,
    low_cover_polygons_df=baseline_low_cov,
    analysis_period='Baseline',
)
calc_low_cover_poly_stats(
    all_polygons_df=ev_poly,
    low_cover_polygons_df=ev_low_cov,
    analysis_period='Early Verification',
)

In [None]:
# Low-coverage baseline polygons whose selected image overlaps them at least partly (overlap > 0)
# AND that have more than one image available.
# The conditions are combined with `&` (element-wise logical AND) rather than arithmetic `*`,
# and the mask is built once instead of re-evaluating the filter for the count and the display.
overlap_and_multi_img = (baseline_low_cov['overlap_area_ha'] > 0) & (baseline_low_cov['num_images'] > 1)
print(len(baseline_low_cov[overlap_and_multi_img]))
baseline_low_cov[overlap_and_multi_img]

In [None]:
# Merge the country codes from the list of approved projects into the csvs of image availability.
# Build a one-row-per-project lookup first so the left merges below can never duplicate rows
# if a project_id is listed more than once in approved_projects.
project_countries = approved_projects[['project_id', 'country']].drop_duplicates(subset='project_id')

# Any existing 'country' column is dropped before merging so this cell can be re-run safely;
# without that, a second run would produce 'country_x'/'country_y' and break later filters on 'country'.
# Polygon-level
baseline_poly = baseline_poly.drop(columns='country', errors='ignore').merge(project_countries, on='project_id', how='left')
ev_poly = ev_poly.drop(columns='country', errors='ignore').merge(project_countries, on='project_id', how='left')

# Project-level
baseline_proj = baseline_proj.drop(columns='country', errors='ignore').merge(project_countries, on='project_id', how='left')
ev_proj = ev_proj.drop(columns='country', errors='ignore').merge(project_countries, on='project_id', how='left')

In [None]:
# Filter by landscapes: keep only rows whose country code is one of the focus-landscape countries
landscape_countries = ['BI', 'CD', 'GH', 'KE', 'RW']

# Polygon-level
baseline_poly_landscapes = baseline_poly.loc[lambda df: df['country'].isin(landscape_countries)]
ev_poly_landscapes = ev_poly.loc[lambda df: df['country'].isin(landscape_countries)]

# Project-level
baseline_proj_landscapes = baseline_proj.loc[lambda df: df['country'].isin(landscape_countries)]
ev_proj_landscapes = ev_proj.loc[lambda df: df['country'].isin(landscape_countries)]

# Print results (row counts before and after the landscape filter, per period)
n_base_all, n_base_landscapes = len(baseline_proj), len(baseline_proj_landscapes)
n_ev_all, n_ev_landscapes = len(ev_proj), len(ev_proj_landscapes)
print(f"There are {n_base_all} projects with imagery at baseline.")
print(f"There are {n_base_landscapes} projects in the TF focus landscapes with imagery at baseline.")
print()
print(f"There are {n_ev_all} projects with imagery 1 year+ post-plantstart.")
print(f"There are {n_ev_landscapes} projects in the TF focus landscapes with imagery 1 year+ post-plantstart.")

In [None]:
## FILTER BY COVERAGE THRESHOLD

# Set coverage threshold (compared against 'total_percent_area_covered')
thresh = 50

# Filter projects with >= X% coverage
baseline_landscapes_thresh = baseline_proj_landscapes.loc[
    baseline_proj_landscapes['total_percent_area_covered'].ge(thresh)
]
ev_landscapes_thresh = ev_proj_landscapes.loc[
    ev_proj_landscapes['total_percent_area_covered'].ge(thresh)
]

# Print individual results
print(f"There are {len(baseline_landscapes_thresh)} projects with >={thresh}% coverage at baseline.")
print(f"There are {len(ev_landscapes_thresh)} projects with >={thresh}% coverage 1 year+ post-planting")
print()

# Find common project ids (projects that pass the threshold in both periods)
common_project_ids_thresh = (
    set(baseline_landscapes_thresh['project_id']) & set(ev_landscapes_thresh['project_id'])
)

# Retrieve details of common projects
common_projects_baseline_thresh = baseline_landscapes_thresh.loc[
    baseline_landscapes_thresh['project_id'].isin(common_project_ids_thresh)
]
common_projects_ev_thresh = ev_landscapes_thresh.loc[
    ev_landscapes_thresh['project_id'].isin(common_project_ids_thresh)
]

# Display results
print(f"There are {len(common_project_ids_thresh)} projects with >= {thresh}% coverage at both baseline and 1-year post-plantstart")
print(list(common_project_ids_thresh))

In [None]:
# Create merged comparison dataframe for high coverage projects:
# one row per project with its baseline and EV coverage percentages side by side.
coverage_cols = ['project_id', 'total_percent_area_covered']
merged_high_cov = pd.merge(
    common_projects_baseline_thresh[coverage_cols],
    common_projects_ev_thresh[coverage_cols],
    on='project_id',
    suffixes=('_baseline', '_ev'),
)
print(f"\n Coverage Comparison from Baseline to Early Verification for Projects with {thresh}% Coverage:")
merged_high_cov

In [None]:
# Create merged comparison dataframe for ALL projects
# Find common project ids (present in both the baseline and EV landscape tables)
common_project_ids = (
    set(baseline_proj_landscapes['project_id']) & set(ev_proj_landscapes['project_id'])
)

# Retrieve details of common projects
common_projects_baseline = baseline_proj_landscapes.loc[
    baseline_proj_landscapes['project_id'].isin(common_project_ids)
]
common_projects_ev = ev_proj_landscapes.loc[
    ev_proj_landscapes['project_id'].isin(common_project_ids)
]

# Create merged comparison dataframe for all projects (baseline vs. EV coverage side by side)
merged = pd.merge(
    common_projects_baseline[['project_id', 'total_percent_area_covered']],
    common_projects_ev[['project_id', 'total_percent_area_covered']],
    on='project_id',
    suffixes=('_baseline', '_ev'),
)

# Persist the comparison table next to the project root
merged.to_csv('../tf_cohort1_landscapes_baseline_ev_pct_cover_comparison.csv', index=False)

print(f"\n Coverage Comparison from Baseline to Early Verification for All Projects in Landscapes:")
print(len(merged))
merged

In [None]:
# TerraFund polygons Cohort 1
# Load the polygon table, print its row count, and preview the first rows.
polygons = pd.read_csv('../data/tf_cohort1/tm_api_cohort1_2025-04-02.csv')
print(len(polygons))
polygons.head()

In [None]:
# Peek at the first 8 distinct 'plantstart' values, labelling missing ones explicitly.
l = list(polygons.plantstart.unique())[0:8]
value = float('nan')
print(type(value))
for i in l:
    # NaN never compares equal to anything (not even itself), so `i == value` could never be True.
    # pd.isna() is the reliable missing-value test and also covers None/NaT.
    if pd.isna(i):
        print('nan')
    else:
        print(i)

In [None]:
# Filter by ARCOS: keep only the polygons of this one project_id.
# .copy() gives an independent frame rather than a view of `polygons`.
arcos = polygons.loc[polygons['project_id'].eq('bbd88e69-cd85-429e-bebf-6234bf82dbb3')].copy()
print(len(arcos))
arcos.head()

In [None]:
# Inspect the 'geometry' value of the first ARCOS row to see how geometries are stored in the CSV.
arcos['geometry'].iloc[0]

In [None]:
# Export ARCOS polygons as GeoJSON (written one level above the working directory)
# NOTE(review): df_to_geojson is a project helper; presumably it parses the 'geometry' column — confirm.
utils.df_to_geojson(arcos, output_path='../arcos_polygons_2025-04-03.geojson')

### Calculating the % of Polygons with >X% Coverage for Both Baseline & EV

In [None]:
# NOTE(review): a notebook cell only renders its LAST bare expression, so the `baseline_poly`
# line below produces no output — only `ev_poly.head()` is displayed.
baseline_poly
ev_poly.head()

In [None]:
def compare_polygon_coverage(baseline_df, ev_df, threshold):
    """
    Summarize, per project, how many polygons meet a coverage threshold in BOTH periods.

    Only polygons present in both inputs (inner join on 'poly_id') are counted.

    Args:
    - baseline_df (DataFrame): Must contain 'poly_id', 'project_id' and 'percent_img_cover'.
    - ev_df (DataFrame): Must contain 'poly_id' and 'percent_img_cover'.
    - threshold (number): Minimum 'percent_img_cover' (inclusive) a polygon needs in each period.

    Returns:
    - DataFrame with one row per project_id and the columns 'total_polygons' (shared polygons),
      'polygons_high_both' (shared polygons meeting the threshold in both periods) and
      'percent_polygons_high_both' (the ratio of the two, times 100).
    """
    # Keep just the columns needed and give each period's coverage column a distinct name
    baseline_cover = baseline_df[['poly_id', 'project_id', 'percent_img_cover']].rename(
        columns={'percent_img_cover': 'base_pct_img_cover'}
    )
    ev_cover = ev_df[['poly_id', 'percent_img_cover']].rename(
        columns={'percent_img_cover': 'ev_pct_img_cover'}
    )

    # Pair up the two periods polygon by polygon
    paired = baseline_cover.merge(ev_cover, on='poly_id', how='inner')

    # Flag polygons that reach the threshold at baseline and at EV
    meets_baseline = paired['base_pct_img_cover'] >= threshold
    meets_ev = paired['ev_pct_img_cover'] >= threshold
    paired = paired.assign(both_high=meets_baseline & meets_ev)

    # Per project: number of shared polygons and number of flagged polygons
    per_project = paired.groupby('project_id').agg(
        total_polygons=('poly_id', 'count'),
        polygons_high_both=('both_high', 'sum'),
    )
    per_project = per_project.reset_index()

    # Share of flagged polygons, expressed as a percentage
    share_high = per_project['polygons_high_both'] / per_project['total_polygons']
    per_project['percent_polygons_high_both'] = share_high * 100

    return per_project

In [None]:
# Per-project share of landscape polygons with >= 10% image cover in both periods,
# ordered from the highest share to the lowest.
both_high_poly_cover = compare_polygon_coverage(
    baseline_df=baseline_poly_landscapes,
    ev_df=ev_poly_landscapes,
    threshold=10,
).sort_values('percent_polygons_high_both', ascending=False)
print(len(both_high_poly_cover))
both_high_poly_cover

In [None]:
# Save the per-project summary as CSV.
# NOTE(review): the file name says "gte_70_pct", but both_high_poly_cover is computed with a
# threshold of 10 in the cell above — confirm which value is intended before sharing this file.
both_high_poly_cover.to_csv('../poly_wi_gte_70_pct_cover_base_ev.csv', index=False)

### Calculating Overlap in Actual Imagery Coverage Between Baseline and EV Imagery Area 

In [None]:
### 1. LOAD POLYGON AND IMAGE DATA FOR COHORT 1 (ALL TIME PERIODS) ###
# maxar_df: combined image-availability table; poly_df: polygon table.
# poly_df is read from the same CSV path as `polygons` in an earlier cell.
maxar_df = pd.read_csv('../data/tf_cohort1/imagery_availability/comb_img_availability_cohort1_2025-04-02.csv')
poly_df = pd.read_csv('../data/tf_cohort1/tm_api_cohort1_2025-04-02.csv')

In [None]:
### 2. PREPROCESS POLYGON AND IMAGE DATA ###
# Both helpers come from the project module image_coverage (imported as img_cover).
# NOTE(review): presumably they return GeoDataFrames (hence the *_gdf names) — confirm in the module.
poly_gdf = img_cover.preprocess_polygons(poly_df, debug=True)
maxar_gdf = img_cover.preprocess_images(maxar_df, debug=True)

In [None]:
# Preview the preprocessed polygon table
poly_gdf.head()

In [None]:
# Preview the preprocessed image table
maxar_gdf.head()

In [None]:
### 3. MERGE POLYGON METADATA INTO IMAGE DATA ###
# The helper returns two values: the merged table and a second object stored as missing_polygons_list.
# NOTE(review): presumably that list holds polygons that could not be matched — confirm in image_coverage.
merged_gdf, missing_polygons_list = img_cover.merge_polygons_images(maxar_gdf, poly_gdf, debug=True)

In [None]:
### 4. DO INITIAL HARD FILTER OF IMAGES (INCLUDES DATE RANGE) ###
# The two filter dicts below are identical except for 'date_range'.
# For Baseline
# Set filters
base_filters = {
    'cloud_cover': 50,          # Remove images with >50% cloud cover
    'off_nadir': 30,            # Remove images with >30° off-nadir angle
    'sun_elevation': 30,        # Keep only images where sun elevation >30°
    'date_range': (-366, 0),    # Date range of 1 year before plantstart (baseline)
    'img_count': 1,             # Threshold for identifying image availability (REASSESS)
}
# Filter gdf
base_img_gdf_filtered = img_cover.filter_images(merged_gdf, base_filters, debug=True)

# For early verification (1 year+ post plantstart)
# Set filters
ev_filters = {
    'cloud_cover': 50,          # Remove images with >50% cloud cover
    'off_nadir': 30,            # Remove images with >30° off-nadir angle
    'sun_elevation': 30,        # Keep only images where sun elevation >30°
    'date_range': (365, 9999),    # Date range of 1 year post-plantstart through today (upper bound of maxar_md dataset is today's date) (year_2)
    'img_count': 1,             # Threshold for identifying image availability (REASSESS)
}
# Filter gdf
ev_img_gdf_filtered = img_cover.filter_images(merged_gdf, ev_filters, debug=True)

# Print results: image rows before/after filtering and distinct polygons left per period
print(f"Total images before filtering: {len(merged_gdf)}")
print()
print('BASELINE:')
print(f"Total images after filtering: {len(base_img_gdf_filtered)}")
print(f"Polygons with at least one valid image: {base_img_gdf_filtered['poly_id'].nunique()}")
print()
print('EARLY VERIFICATION:')
print(f"Total images after filtering: {len(ev_img_gdf_filtered)}")
print(f"Polygons with at least one valid image: {ev_img_gdf_filtered['poly_id'].nunique()}")

In [None]:
### 5. COMPUTE POLYGON-LEVEL IMAGERY COVERAGE ###
#### BASELINE ###
# Accumulators: one result per polygon, plus the log list handed to the helper on every call
# (presumably the helper appends low-coverage polygons to it — confirm in image_coverage)
base_low_img_coverage_log = []
base_results = []

# Visit every polygon (with its project id) and compute its imagery coverage against the baseline images
for poly_id, project_id in zip(poly_gdf['poly_id'], poly_gdf['project_id']):
    result = img_cover.compute_polygon_image_coverage(
        poly_id,
        project_id,
        poly_gdf,
        base_img_gdf_filtered,
        base_low_img_coverage_log,
    )
    base_results.append(result)

# Build the results table in one chain; polygons without a best image get the string "None"
base_results_df = pd.DataFrame(
    base_results,
    columns=[
        'poly_id', 'project_id', 'best_image', 'num_images',
        'poly_area_ha', 'overlap_area_ha', 'percent_img_cover',
    ],
).assign(best_image=lambda frame: frame['best_image'].fillna("None"))

In [None]:
# Row count and preview of the baseline per-polygon coverage results
print(len(base_results_df))
base_results_df.head()

In [None]:
### EARLY VERIFICATION ###
### 5. COMPUTE POLYGON-LEVEL IMAGERY COVERAGE ###
# Accumulators: one result per polygon, plus the log list handed to the helper on every call
# (presumably the helper appends low-coverage polygons to it — confirm in image_coverage)
ev_low_img_coverage_log = []
ev_results = []

# Visit every polygon (with its project id) and compute its imagery coverage against the EV images
for poly_id, project_id in zip(poly_gdf['poly_id'], poly_gdf['project_id']):
    result = img_cover.compute_polygon_image_coverage(
        poly_id,
        project_id,
        poly_gdf,
        ev_img_gdf_filtered,
        ev_low_img_coverage_log,
    )
    ev_results.append(result)

# Build the results table in one chain; polygons without a best image get the string "None"
ev_results_df = pd.DataFrame(
    ev_results,
    columns=[
        'poly_id', 'project_id', 'best_image', 'num_images',
        'poly_area_ha', 'overlap_area_ha', 'percent_img_cover',
    ],
).assign(best_image=lambda frame: frame['best_image'].fillna("None"))

In [None]:
# Preview of the early-verification per-polygon coverage results
ev_results_df.head()