# Maxar Image Availability Analysis

The Maxar image availability workflow takes a list of TerraFund project IDs as input and returns a CSV listing every project and how much of that project's area is covered by Maxar imagery.

#### Workflow:
1. Pull info on project characteristics for the entire portfolio using the TerraMatch API
    - Repo/notebook: terrafund-portfolio-analysis/tm-api.ipynb
    - Input: list of TerraFund project IDs
    - Output: csv of all project features
2. Using the TM API csv, pull Maxar metadata
    - Repo/notebook: maxar-tools/decision-tree-metadata.ipynb and maxar-tools/src/decision_tree.py (note: these references may need updating because of additions made to the `acquire_metadata` function)
    - Input: csv of project features
    - Output: csv of maxar metadata
3. Calculate the percent area of each project with available Maxar imagery
    - Repo/notebook: terrafund-portfolio-analysis/maxar-img-avail.ipynb and terrafund-portfolio-analysis/src/image_coverage.py
    - Input: csv of maxar metadata and csv of TM project features
    - Output: csv of project features and percent imagery coverage, csv of percent imagery coverage aggregated to project level, csv of polygons with low imagery coverage
4. Identify projects with highest imagery coverage to use for the RS image availability simulation

In [None]:
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
import sys
sys.path.append('../src/')
import image_coverage as img_cover
import analyze_img_coverage as analyze
from datetime import datetime

### Parameters

In [None]:
# Naming convention: these labels are embedded in the input/output file names built below
run_name = 'tm_polygons'
run_dir = 'maxar_spenddown'
analysis = 'most_recent' # names the results subfolder and files; must change if you change the date_range

# Today's date (YYYY-MM-DD). It is part of every file name below, so the input
# CSVs must carry the same date stamp; uncomment the override to pin another run date.
today = datetime.today().strftime('%Y-%m-%d')
#today = '2025-04-02'

# File paths
feats = f'../data/{run_dir}/tm_api_{run_name}_{today}.csv' # CSV of polygon metadata & geometries from TM API (infile)
maxar_md = f'../data/{run_dir}/imagery_availability/comb_img_availability_{run_name}_{today}.csv' # CSV of metadata for Maxar images corresponding to polygons (infile)
dropped_poly_path = f'../data/{run_dir}/dropped_poly_invalid_geom_{run_name}_{today}.csv' # Passed to preprocess_polygons as dropped_output_path (outfile)
results_path = f'../data/{run_dir}/results/{analysis}/' # File path to save results to

# Define filtering thresholds (stored in a dictionary)
# Exactly one 'date_range' entry should be active; the alternatives are kept commented out.
filters = {
    'cloud_cover': 50,          # Remove images with >50% cloud cover
    'off_nadir': 30,            # Remove images with >30° off-nadir angle
    'sun_elevation': 30,        # Keep only images where sun elevation >30°
    #'date_range': (-366, 0),    # Date range of 1 year before plantstart (TerraFund baseline)
    #'date_range': (-366, 90),    # Date range of 1 year before plantstart through 3 months after (PPC baseline)
    #'date_range': (365, 9999),    # Date range of 1 year post-plantstart through today (upper bound of maxar_md dataset is today's date) (year_2)
    #'date_range': (730, 9999),  # Date range of 2 years post-plantstart through today (upper bound of maxar_md dataset is today's date) (early verification)
    'date_range': (0, 9999),     # Date range of post plantstart through today (most recent)
    'img_count': 1,             # Threshold for identifying image availability (REASSESS)
}

### Calculate Image Availability by Project

In [None]:
### 1. LOAD POLYGON AND IMAGE DATA ###
# poly_df: polygon metadata & geometries from the TM API CSV (feats)
# img_df:  Maxar image metadata CSV (maxar_md)
poly_df, img_df = (pd.read_csv(csv_path) for csv_path in (feats, maxar_md))

In [None]:
# Number of polygon rows loaded from the TM API CSV (before any project filtering)
len(poly_df)

In [None]:
# Project ID cohorts used to subset the portfolio.
# NOTE(review): "ev" presumably stands for early verification — confirm.
ev_ids_list = [
    '449adf55-f6f8-4f17-97d3-ab6f6bf6676d',
    '9019106b-6e2d-4deb-97a5-2889f976a931',
    '39871658-bff0-49c2-aa20-ccac0b03a2c2',
    'bad12444-7180-4b29-a14c-d2b4305b7f52',
    'd5e0a4ff-8601-45d0-9020-8c104e5ea508',
    '24fc33cb-53ad-4383-82ca-f6e2ac3fd143',
    '96c86eae-d4f9-45d8-9780-69c55a9e36e9',
    '8a112e82-e191-44ad-b306-2578c064104b',
    'f449aef3-4453-42c9-b542-57acc7c2e5eb',
    '62043c88-f03d-475e-ac9c-2f057536e2a8',
    'b7f26543-0ddb-4d10-a215-abfc093b0ed0',
    'ed1cadff-e20f-43a7-8627-aee10f48cc7a',
    'e4fe2fa4-6869-4c1e-9347-ba9b135306f5',
]

# Cohort analysed with the "most recent" image window
most_recent_ids_list = [
    '82dd3a84-2562-4a6f-85d9-f83790daaaba',
    '9c93de5d-12cd-4e7e-b99c-49cb6f890e58',
    '179b90da-19b2-4103-9eb5-54e47378f100',
    '8f35f005-0876-4b87-b466-41aa86d6798f',
]

# NOTE(review): presumably projects assessed against the baseline window only — confirm.
base_only_ids_list = [
    '5e8a3c5e-7a28-4ff4-be07-f950361f56b2',
    'e4108d7a-58d8-4604-8dd8-2f95c9c181d5',
    '244eaf7e-e109-47b2-b84e-9ebe24508391',
]

In [None]:
# Keep only the polygons whose project is in most_recent_ids_list
# (swap in ev_ids_list or base_only_ids_list to analyse a different cohort).
# Filter first, then copy: this avoids duplicating the full portfolio and leaves
# poly_df_filt as an independent frame that later steps can modify without
# pandas' SettingWithCopyWarning.
poly_df_filt = poly_df[poly_df['project_id'].isin(most_recent_ids_list)].copy()
# Number of polygons remaining after the project filter
len(poly_df_filt)

In [None]:
### 2.1. PREPROCESS POLYGON DATA ###
# Run the polygon preprocessing helper on the filtered polygons. With
# save_dropped=True an output file is requested at dropped_poly_path
# (presumably the polygons dropped for invalid geometry, going by the file
# name — confirm in src/image_coverage.py).
poly_gdf = img_cover.preprocess_polygons(
    poly_df_filt,
    debug=False,
    save_dropped=True,
    dropped_output_path=dropped_poly_path,
)

In [None]:
### 2.2. PREPROCESS IMAGE DATA ###
# Runs the image preprocessing helper on the raw Maxar metadata table;
# debug=True presumably prints diagnostics — confirm in src/image_coverage.py.
img_gdf = img_cover.preprocess_images(img_df, debug=True)

In [None]:
### 3. MERGE POLYGON METADATA INTO IMAGE DATA ###
# Returns the merged table plus a second value, stored as missing_polygons_list
# (presumably polygons that could not be matched — confirm in src/image_coverage.py).
merged_gdf, missing_polygons_list = img_cover.merge_polygons_images(img_gdf, poly_gdf, debug=True)

In [None]:
### 4. FILTER IMAGES ###
# Applies the thresholds from the `filters` dictionary (cloud cover, off-nadir,
# sun elevation, date range, image count) to the merged image table.
img_gdf_filtered = img_cover.filter_images(merged_gdf, filters, debug=True)

In [None]:
### 5. COMPUTE POLYGON-LEVEL IMAGERY COVERAGE ###
# This list is handed to every coverage call below; it is saved later as the
# table of low-coverage polygons (the helper presumably appends to it — confirm
# in src/image_coverage.py).
low_img_coverage_log = []

# One coverage record per polygon, in poly_gdf row order
results = [
    img_cover.compute_polygon_image_coverage(poly_id, project_id, poly_gdf, img_gdf_filtered, low_img_coverage_log)
    for poly_id, project_id in zip(poly_gdf['poly_id'], poly_gdf['project_id'])
]

# Tabulate the records; polygons without a best image get the literal string "None"
results_df = pd.DataFrame(
    results,
    columns=[
        'poly_id',
        'project_id',
        'best_image',
        'img_date',
        'num_images',
        'poly_area_ha',
        'overlap_area_ha',
        'percent_img_cover',
    ],
)
results_df = results_df.fillna({'best_image': "None"})

In [None]:
### 6. AGGREGATE TO PROJECT-LEVEL COVERAGE ###
# Rolls the polygon-level results up to one row per project; the later analysis
# cells read a 'total_percent_area_covered' column from the saved output.
project_results_df = img_cover.aggregate_project_image_coverage(results_df, debug=True)

In [None]:
### 7. SAVE RESULTS ###
import os

# DataFrame.to_csv does not create missing parent directories, so make sure the
# results folder for this analysis exists before writing into it.
os.makedirs(results_path, exist_ok=True)

# Percent imagery coverage by polygon
results_df.to_csv(f"{results_path}polygon_imagery_coverage_{run_name}_{analysis}_{today}.csv", index=False)

# Percent imagery coverage by project
project_results_df.to_csv(f"{results_path}project_imagery_coverage_{run_name}_{analysis}_{today}.csv", index=False)

# Polygons with low imagery coverage
# Only written when at least one polygon was logged, so this file may be absent for a run.
if low_img_coverage_log:
    low_coverage_polygons_df = pd.DataFrame(low_img_coverage_log)
    print(f"Logging low image coverage polygons to {results_path}.")
    low_coverage_polygons_df['best_image'] = low_coverage_polygons_df['best_image'].fillna("None")
    low_coverage_polygons_df.to_csv(f"{results_path}low_coverage_polygons_{run_name}_{analysis}_{today}.csv", index=False)

print(f"Imagery coverage results saved to {results_path}")

## Analyze Maxar Image Availability

In [None]:
# Read in files
# Image availability by project
project_img_avail = pd.read_csv(f"{results_path}project_imagery_coverage_{run_name}_{analysis}_{today}.csv")

# Image availability by polygon
poly_img_avail = pd.read_csv(f"{results_path}polygon_imagery_coverage_{run_name}_{analysis}_{today}.csv")

# Low coverage polygons
# The save step writes this file only when at least one low-coverage polygon was
# logged, so a missing file is a legitimate outcome rather than an error.
try:
    low_coverage_poly = pd.read_csv(f"{results_path}low_coverage_polygons_{run_name}_{analysis}_{today}.csv")
except FileNotFoundError:
    print("No low coverage polygons file found for this run; continuing with an empty table.")
    low_coverage_poly = pd.DataFrame()

In [None]:
# Overall distribution of image availability
# (helper from src/analyze_img_coverage.py; presumably draws a histogram of
# project-level coverage — confirm there)
analyze.img_avail_hist(project_img_avail)

In [None]:
# High image availability projects
# 90 is passed as the threshold argument (presumably a minimum percent-coverage
# cut-off — confirm in src/analyze_img_coverage.py)
qualifying_projects_list = analyze.count_projs_wi_img_avail(project_img_avail, 90)

In [None]:
# Summarise the polygons logged as having low imagery coverage
# (what is reported is defined in src/analyze_img_coverage.py — not visible here)
analyze.analyze_low_coverage_issues(low_coverage_poly)

In [None]:
# Projects whose covered share is above 90% and at most 101%
# (the 101 ceiling presumably tolerates values slightly over 100 — confirm)
pct_covered = project_img_avail['total_percent_area_covered']
high_cov = project_img_avail[pct_covered.gt(90) & pct_covered.le(101)]
print(len(high_cov))
# Display the qualifying projects, best-covered first
high_cov.sort_values(by='total_percent_area_covered', ascending=False)

### For PPC, calculate image availability by task (project_id + plantstart_year)

In [None]:
# Attach each polygon's planting dates to its best-image record.
# Left join on poly_id: every row of poly_img_avail is kept, and polygons that
# are absent from poly_gdf end up with missing plantstart values.
plant_dates = poly_gdf[['poly_id', 'plantstart', 'plantstart_year']]
poly_img_avail_wi_yrs = pd.merge(poly_img_avail, plant_dates, how='left', on='poly_id')

In [None]:
# Aggregate polygon-level coverage with the PPC variant of the aggregation helper.
# Per the section heading a task is project_id + plantstart_year; the grouping
# itself lives in src/image_coverage.py and is not visible here.
task_results_df = img_cover.aggregate_project_image_coverage_ppc(poly_img_avail_wi_yrs)

In [None]:
# Save percent imagery coverage by task as a CSV
# (written next to the polygon- and project-level results, with the same naming scheme)
task_results_df.to_csv(f"{results_path}task_imagery_coverage_{run_name}_{analysis}_{today}.csv", index=False)

In [None]:
# Filtered images that carry a title
# (assumption: rows without a title have no matching image — confirm).
# .copy() makes x an independent frame, so the column assignment below does not
# write into a slice of img_gdf_filtered (avoids pandas' SettingWithCopyWarning).
x = img_gdf_filtered[img_gdf_filtered['title'].notna()].copy()
# Days from planting start to image date (positive = image taken after planting start)
x['date_diff'] = (x['img_date'] - x['plantstart']).dt.days
#x['title'].nunique()

In [None]:
# Preview the first two rows of x, including the new date_diff column
x.head(2)

In [None]:
# Mean number of days between planting start and image date per project (all filtered images in x)
x.groupby('project_id')['date_diff'].mean()

In [None]:
# Median number of days between planting start and image date per project (all filtered images in x)
x.groupby('project_id')['date_diff'].median()

In [None]:
# Load polygon-level coverage results saved by an earlier run.
# NOTE(review): this path is hard-coded (run name 'maxar_spenddown', date 2025-08-13)
# and does not follow run_name / analysis / today from the parameters cell —
# confirm it points at the intended run.
best_img_df = pd.read_csv("../data/maxar_spenddown/results/most_recent/polygon_imagery_coverage_maxar_spenddown_most_recent_2025-08-13.csv")

In [None]:
# Preview the first two rows of the loaded polygon-level results
best_img_df.head(2)

In [None]:
# Bring in each polygon's planting start date (inner join on project and polygon id)
plantstart_lookup = poly_df[['project_id', 'poly_id', 'plantstart']]
best_img_df = best_img_df.merge(plantstart_lookup, on=['project_id', 'poly_id'])

# Parse both date columns; values that cannot be parsed become NaT
for date_col in ('img_date', 'plantstart'):
    best_img_df[date_col] = pd.to_datetime(best_img_df[date_col], errors='coerce')

# Days from planting start to the best image's date
best_img_df['date_diff'] = (best_img_df['img_date'] - best_img_df['plantstart']).dt.days

In [None]:
# Preview the first two rows after merging plantstart and adding date_diff
best_img_df.head(2)

In [None]:
# Number of distinct best images per project
best_img_df.groupby('project_id')['best_image'].nunique()

In [None]:
# Mean days between planting start and image date per project, over all filtered images in x
# (repeats the earlier cell)
x.groupby('project_id')['date_diff'].mean()

In [None]:
# Mean image date per project, over all filtered images in x
x.groupby('project_id')['img_date'].mean()

In [None]:
# Mean days between planting start and best-image date per project
best_img_df.groupby('project_id')['date_diff'].mean()

In [None]:
# Median best-image date per project
best_img_df.groupby('project_id')['img_date'].median()

In [None]:
# Mean planting start date per project (over the polygons in best_img_df)
best_img_df.groupby('project_id')['plantstart'].mean()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from matplotlib.lines import Line2D

# Timeline plot: for every project, planting start dates (orange) and best-image
# dates (blue) on a shared date axis, with a dashed vertical line at today's date.

# Ensure datetime format
best_img_df['plantstart'] = pd.to_datetime(best_img_df['plantstart'])
best_img_df['img_date'] = pd.to_datetime(best_img_df['img_date'])

# Sort so projects appear nicely
best_img_df_sorted = best_img_df.sort_values(by=['project_id', 'plantstart'])

plt.figure(figsize=(12, 8))

# Plot planting start (orange)
sns.stripplot(
    data=best_img_df_sorted,
    x='plantstart',
    y='project_id',
    color='orange',
    alpha=0.7,
    size=4
)

# Plot best image (blue)
sns.stripplot(
    data=best_img_df_sorted,
    x='img_date',
    y='project_id',
    color='blue',
    alpha=0.7,
    size=4
)

# Add vertical line for today's date.
# Kept under its own name: `today` is the YYYY-MM-DD string that the parameters
# cell defines and the save/load cells embed in file names, so rebinding it to a
# Timestamp here would change those file names whenever an earlier cell is re-run.
today_ts = pd.Timestamp(datetime.today().date())
plt.axvline(today_ts, color='black', linestyle='--')

# Create custom legend handles (stripplot is called without labels, so build them by hand)
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor='orange', markersize=6, label='Planting start'),
    Line2D([0], [0], marker='o', color='w', markerfacecolor='blue', markersize=6, label='Best image'),
    Line2D([0], [0], color='black', linestyle='--', label='Today')
]

# Move legend to top left
plt.legend(handles=legend_elements, loc='upper left', bbox_to_anchor=(0, 1))

# Labels and formatting
plt.xlabel("Date")
plt.ylabel("Project")
plt.title("Planting Start vs. Best Image Dates per Project")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from matplotlib.lines import Line2D

# Timeline plot of planting start vs. best-image dates per project, extended with
# a per-project 90th-percentile planting-start marker and global reference lines
# at the portfolio-wide 90th percentile + 1 year and + 2 years.

# --- Ensure datetime format ---
best_img_df['plantstart'] = pd.to_datetime(best_img_df['plantstart'])
best_img_df['img_date']   = pd.to_datetime(best_img_df['img_date'])

# --- Sort so projects appear nicely and lock in a stable y-order ---
best_img_df_sorted = best_img_df.sort_values(by=['project_id', 'plantstart'])
project_order = best_img_df_sorted['project_id'].dropna().unique().tolist()

plt.figure(figsize=(12, 8))

# --- Plantstart points (orange) ---
sns.stripplot(
    data=best_img_df_sorted,
    x='plantstart',
    y='project_id',
    order=project_order,
    color='orange',
    alpha=0.7,
    size=4
)

# --- Best image points (blue) ---
sns.stripplot(
    data=best_img_df_sorted,
    x='img_date',
    y='project_id',
    order=project_order,
    color='blue',
    alpha=0.7,
    size=4
)

ax = plt.gca()

# --- Vertical reference lines ---
# Kept under its own name: `today` is the YYYY-MM-DD string that the parameters
# cell defines and the save/load cells embed in file names, so rebinding it to a
# Timestamp here would change those file names whenever an earlier cell is re-run.
today_ts = pd.Timestamp(datetime.today().date())
ax.axvline(today_ts, color='black', linestyle='--', linewidth=1.5)  # Today

# Global 90th percentile (across all projects)
plant_q90 = best_img_df_sorted['plantstart'].dropna().quantile(0.9)
q90_plus_1yr = plant_q90 + pd.DateOffset(years=1)
q90_plus_2yr = plant_q90 + pd.DateOffset(years=2)
ax.axvline(q90_plus_1yr, linestyle=':',  linewidth=2)  # 90th pct + 1 year
ax.axvline(q90_plus_2yr, linestyle='-.', linewidth=2)  # 90th pct + 2 years

# --- Per-project 90th percentile markers (one per project) ---
# Compute per-project Q90
proj_q90 = (
    best_img_df_sorted
    .dropna(subset=['plantstart'])
    .groupby('project_id')['plantstart']
    .quantile(0.9)
)

# Map project_id -> y-position used by seaborn (0-based)
# (Seaborn uses the category order we provided via `order=project_order`)
ypos_map = {pid: i for i, pid in enumerate(project_order)}

# Scatter one consistent marker per project at its Q90 date
# Using a triangle marker '^' with slight edge for visibility
for pid, qdate in proj_q90.items():
    if pd.notnull(qdate) and pid in ypos_map:
        ax.scatter(
            qdate, ypos_map[pid],
            marker='^',
            s=50,
            facecolor='none',    # hollow for clarity
            edgecolor='green',
            linewidth=1.5,
            zorder=3
        )

# --- Custom legend (single entry per thing) ---
legend_elements = [
    # Points
    Line2D([0], [0], marker='o', linestyle='None', color='orange',
           markerfacecolor='orange', markersize=6, label='Planting start'),
    Line2D([0], [0], marker='o', linestyle='None', color='blue',
           markerfacecolor='blue', markersize=6, label='Best image'),
    # Per-project Q90 marker
    Line2D([0], [0], marker='^', linestyle='None', color='green',
           markerfacecolor='none', markersize=7, markeredgewidth=1.5,
           label='90th pct (plantstart)'),
    # Vertical reference lines
    Line2D([0], [0], color='black', linestyle='--', linewidth=1.5, label='Today'),
    Line2D([0], [0], linestyle=':',  linewidth=2, label='90th pct + 1 year'),
    Line2D([0], [0], linestyle='-.', linewidth=2, label='90th pct + 2 years'),
]
plt.legend(handles=legend_elements, loc='upper left', bbox_to_anchor=(0, 1))

# --- Labels and formatting ---
plt.xlabel("Date")
plt.ylabel("Project")
plt.title("Planting Start vs. Best Image Dates per Project\n+ Per-Project 90th Percentile Markers")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from matplotlib.lines import Line2D

# Same timeline plot as for the best images, but drawn from x (every filtered
# image that has a title) instead of one best image per polygon.

# --- Ensure datetime format ---
# assign() builds a new frame for plotting instead of writing columns into x,
# which is itself a filtered selection of img_gdf_filtered (avoids pandas'
# SettingWithCopyWarning and leaves x untouched).
x_plot = x.assign(
    plantstart=pd.to_datetime(x['plantstart']),
    img_date=pd.to_datetime(x['img_date']),
)

# --- Sort so projects appear nicely and lock in a stable y-order ---
x_sorted = x_plot.sort_values(by=['project_id', 'plantstart'])
project_order = x_sorted['project_id'].dropna().unique().tolist()

plt.figure(figsize=(12, 8))

# --- Plantstart points (orange) ---
sns.stripplot(
    data=x_sorted,
    x='plantstart',
    y='project_id',
    order=project_order,
    color='orange',
    alpha=0.7,
    size=4
)

# --- Image date points (blue) ---
sns.stripplot(
    data=x_sorted,
    x='img_date',
    y='project_id',
    order=project_order,
    color='blue',
    alpha=0.7,
    size=4
)

ax = plt.gca()

# --- Vertical reference lines ---
# Kept under its own name: `today` is the YYYY-MM-DD string that the parameters
# cell defines and the save/load cells embed in file names, so rebinding it to a
# Timestamp here would change those file names whenever an earlier cell is re-run.
today_ts = pd.Timestamp(datetime.today().date())
ax.axvline(today_ts, color='black', linestyle='--', linewidth=1.5)  # Today

# Global 90th percentile (across all projects)
plant_q90 = x_sorted['plantstart'].dropna().quantile(0.9)
q90_plus_1yr = plant_q90 + pd.DateOffset(years=1)
q90_plus_2yr = plant_q90 + pd.DateOffset(years=2)
ax.axvline(q90_plus_1yr, linestyle=':',  linewidth=2)  # 90th pct + 1 year
ax.axvline(q90_plus_2yr, linestyle='-.', linewidth=2)  # 90th pct + 2 years

# --- Per-project 90th percentile markers (one per project) ---
# Compute per-project Q90
proj_q90 = (
    x_sorted
    .dropna(subset=['plantstart'])
    .groupby('project_id')['plantstart']
    .quantile(0.9)
)

# Map project_id -> y-position used by seaborn (0-based)
# (Seaborn uses the category order we provided via `order=project_order`)
ypos_map = {pid: i for i, pid in enumerate(project_order)}

# Scatter one consistent marker per project at its Q90 date
# Using a triangle marker '^' with slight edge for visibility
for pid, qdate in proj_q90.items():
    if pd.notnull(qdate) and pid in ypos_map:
        ax.scatter(
            qdate, ypos_map[pid],
            marker='^',
            s=50,
            facecolor='none',    # hollow for clarity
            edgecolor='green',
            linewidth=1.5,
            zorder=3
        )

# --- Custom legend (single entry per thing) ---
legend_elements = [
    # Points
    Line2D([0], [0], marker='o', linestyle='None', color='orange',
           markerfacecolor='orange', markersize=6, label='Planting start'),
    Line2D([0], [0], marker='o', linestyle='None', color='blue',
           markerfacecolor='blue', markersize=6, label='Image date'),
    # Per-project Q90 marker
    Line2D([0], [0], marker='^', linestyle='None', color='green',
           markerfacecolor='none', markersize=7, markeredgewidth=1.5,
           label='90th pct (plantstart)'),
    # Vertical reference lines
    Line2D([0], [0], color='black', linestyle='--', linewidth=1.5, label='Today'),
    Line2D([0], [0], linestyle=':',  linewidth=2, label='90th pct + 1 year'),
    Line2D([0], [0], linestyle='-.', linewidth=2, label='90th pct + 2 years'),
]
plt.legend(handles=legend_elements, loc='upper left', bbox_to_anchor=(0, 1))

# --- Labels and formatting ---
plt.xlabel("Date")
plt.ylabel("Project")
plt.title("Planting Start vs. Image Dates per Project\n+ Per-Project 90th Percentile Markers")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
