In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import re
import os
import math
import requests
import yaml
import json

import sys
sys.path.append('../src/')

# data gathering
import process_api_results as clean
import api_utils as tm
from tm_api_utils import pull_tm_api_data

# branches
import image_availability as img
import canopy_cover as cover

# decision tree
import decision_trees as tree

%load_ext autoreload
%autoreload 2

dlopen(/Users/jessica.ertel/miniforge3/envs/dtree/lib/gdalplugins/ogr_PG.dylib, 0x0001): Library not loaded: /usr/lib/libpq.5.dylib
  Referenced from: <2257BC9C-CDBC-3646-B30C-03545BF1554B> /Users/jessica.ertel/miniforge3/envs/dtree/lib/gdalplugins/ogr_PG.dylib
  Reason: tried: '/usr/lib/libpq.5.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/lib/libpq.5.dylib' (no such file), '/usr/lib/libpq.5.dylib' (no such file, not in dyld cache), '/usr/local/lib/libpq.5.dylib' (no such file)
dlopen(/Users/jessica.ertel/miniforge3/envs/dtree/lib/gdalplugins/ogr_PG.dylib, 0x0001): Library not loaded: /usr/lib/libpq.5.dylib
  Referenced from: <2257BC9C-CDBC-3646-B30C-03545BF1554B> /Users/jessica.ertel/miniforge3/envs/dtree/lib/gdalplugins/ogr_PG.dylib
  Reason: tried: '/usr/lib/libpq.5.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/lib/libpq.5.dylib' (no such file), '/usr/lib/libpq.5.dylib' (no such file, not in dyld cache), '/usr/local/lib/libpq.5.dylib' (no such 

# PARAMS

In [2]:
# Load the run parameters from the YAML file at `param_path` into `params`.
# NOTE(review): `params` is not read by any later cell visible in this
# notebook -- confirm whether this cell is still needed.
param_path = '../params.yaml'
with open(param_path) as file:
    params = yaml.safe_load(file)

# cohort 1 and cohort 2 would be run separately, updating params file in between runs
# these could also be defined in the scripts and not fed into the notebook


In [3]:
# Run configuration: date stamp, API endpoints and the input/output paths used
# for both cohorts, followed by the decision-tree thresholds.

# Date stamp embedded in the output filenames below (appears to be MMDDYY -- TODO confirm).
today = '052025'
tm_auth_path = '../secrets.yaml'
# NOTE(review): tm_staging_url is not used by any visible cell; all pulls use tm_prod_url.
tm_staging_url = "https://api-staging.terramatch.org/research/v3/sitePolygons?"       
tm_prod_url = "https://api.terramatch.org/research/v3/sitePolygons?"                 
portfolio = '../data/portfolio_csvs/terrafund_approved_projects_202504021529.csv'
c1_tm_response = f"../data/tm_api_response/c1_prod_{today}.json"
c2_tm_response = f"../data/tm_api_response/c2_prod_{today}.json"
c1_maxar_md = "../data/imagery_availability/comb_img_availability_2025-04-24_c1.csv"
c2_maxar_md = "../data/imagery_availability/comb_img_availability_2025-03-27_c2.csv"
c1_feats = f'../data/tm_api_c1_{today}.csv'  
c2_feats = f'../data/tm_api_c2_{today}.csv'   
# NOTE(review): absolute, user-specific paths -- these only resolve on the
# original author's machine; consider a configurable base directory.
c1_feats_maxar_query = f'/Users/jessica.ertel/github/maxar-tools/data/tm_api_c1_{today}.csv'
c2_feats_maxar_query = f'/Users/jessica.ertel/github/maxar-tools/data/tm_api_c2_{today}.csv'
# trimmed_branches = this is the clean csv for feeding into dtree

### Define tree thresholds ###
canopy_threshold = 40                                         # threshold for identifying open vs closed canopy projects
cloud_thresh = 50                                             # threshold for identifying image quality
# NOTE(review): img_count is not passed to any call visible in this notebook -- confirm where it is applied.
img_count = 1                                                 # threshold for identifying image availability
baseline_range = (-365, 0)                                    # baseline window, in days relative to plant start (1 year before plant start date)
ev_range = (730, 1095)                                        # EV window, in days relative to plant start (730-1095 days, i.e. 2 to 3 years after plant start date)

# Decision Tree
The decision tree workflow takes as **input** a list of TerraFund project ids and returns as **output** a csv assigning a verification method to each polygon within a project. It uses APIs to pull data from two applications. The [TM API submodule](https://github.com/wri/terramatch-researcher-api) is used to interact with the TM platform and a wrapper function is used to interact with the Maxar platform. See the [TM API documentation](https://api-staging.terramatch.org/research-service/documentation/api#/default/) for the full set of search parameters.
     
**Workflow**
1. Pull info on project characteristics for entire portfolio using TM API
   - repo/notebook: `terrafund-portfolio-analyses/tm-api.ipynb`
   - input: list of project ids
   - output: csv of all project features
2. Using TM API csv, pull Maxar metadata
    - repo/notebook: `maxar-tools/decision-tree-metadata.ipynb` and `maxar-tools/src/decision_tree.py`
    - input: csv of project features
    - output: csv of maxar metadata
3. Create imagery features using `image_availability.py`
    - repo/notebook: `terrafund-portfolio-analyses/decision-tree.ipynb`
    - input: csv maxar metadata and csv of project features
    - output: csv of project features & img count
4. Run decision tree
    - input: master csv
    - output: decisions csv

**things to confirm:**
- inconsistent project count for cohorts 1 & 2 (`projects_all_approved.csv` identifies c1: 89 projects, c2: 85 projects; TM API returns c1: 67 and c2: 64)
- large increase in polygons for C1: had 11,423 polygons, increased to 617,031
- confirm count of proj ids and poly ids at each gather and branch stage
- how to determine which ttc_ value (i.e., year) is the best match for baseline and EV windows relative to plantstart — especially since the number and years of ttc_ columns will vary.


**bottlenecks:**
- erroneous or missing plantstart dates
- still struggling with a "source of truth". Can we be sure that TM has the right info?

**updates** 
- as of 4/2 filtering on cohort not framework_key
- incorporate the % of the project that informed the decision (how many polygons dropped due to lack of data?)

## GATHER: Projects & Project Attributes
Uses the TerraMatch API to download project features for a provided set of project ids. Performs a series of cleaning steps to correctly format the API output. Drops any polygons without a start date.

In [8]:
# Read the TerraMatch credentials from the YAML file at `tm_auth_path` and
# build the bearer-token header that is passed to the `tm.pull_wrapper` calls.
with open(tm_auth_path) as auth_file:
    auth = yaml.safe_load(auth_file)
headers = {
    'Authorization': f"Bearer {auth['access_token']}"
    }

In [9]:
# Split the approved-project portfolio by cohort and collect each cohort's
# unique project ids for the TerraMatch pulls.
full = pd.read_csv(portfolio)

cohort1 = full.loc[full['cohort'] == 'terrafund']
cohort2 = full.loc[full['cohort'] == 'terrafund-landscapes']

# set() de-duplicates the ids, so the order of the resulting lists is arbitrary
c1_ids = list(set(cohort1['project_id']))
c2_ids = list(set(cohort2['project_id']))
print(len(c1_ids), len(c2_ids))

80 76


In [10]:
# Pull cohort 1 project data from the production TerraMatch API; the recorded
# output shows the results being saved to the `c1_tm_response` path.
# NOTE(review): the recorded run took ~12.5 minutes for 80 projects -- consider
# reloading the saved response instead of re-pulling on every run.
c1_results = tm.pull_wrapper(tm_prod_url, headers, c1_ids, outfile=c1_tm_response)

Pulling Projects: 100%|████████████████████████████████████████████████████████████████████████████████████| 80/80 [12:41<00:00,  9.52s/project]


Results saved to ../data/tm_api_response/c1_prod_052025.json


In [13]:
# Pull cohort 2 project data from the production TerraMatch API; the recorded
# output shows the results being saved to the `c2_tm_response` path.
# NOTE(review): the recorded run took ~14 minutes for 76 projects -- consider
# reloading the saved response instead of re-pulling on every run.
c2_results = tm.pull_wrapper(tm_prod_url, headers, c2_ids, outfile=c2_tm_response)

Pulling Projects: 100%|████████████████████████████████████████████████████████████████████████████████████| 76/76 [13:47<00:00, 10.89s/project]


Results saved to ../data/tm_api_response/c2_prod_052025.json


In [12]:
# Clean the cohort 1 API response. `outfile1` / `outfile2` presumably receive
# the cleaned features (`c1_feats` is read back by the image-availability cell).
# NOTE(review): drop_missing=False presumably keeps polygons with missing
# attributes, while the section text says polygons without a start date are
# dropped -- confirm which is intended.
c1_response_clean = clean.process_tm_api_results(c1_results,
                                                 outfile1=c1_feats, 
                                                 outfile2=c1_feats_maxar_query,
                                                 drop_missing=False)

  affected_rows.loc[is_feb_29] = non_leap_years
  affected_rows.loc[is_feb_29] = non_leap_years


⚠️ Polygons missing start and end plant date: 0
⚠️ Projects missing 'plantstart': 0
⚠️ Polygons missing 'plantstart': 0/13310
Projects fully affected: 0
Projects partially affected: 0
⚠️ Polygons missing 'ttc': 5068
⚠️ Polygons missing 'practice': 0
⚠️ Polygons missing 'target system': 0
0% data lost due to missing values.


In [15]:
# Clean the cohort 2 API response. `outfile1` / `outfile2` presumably receive
# the cleaned features (`c2_feats` is read back by the image-availability cell).
# NOTE(review): drop_missing=False presumably keeps polygons with missing
# attributes, while the section text says polygons without a start date are
# dropped -- confirm which is intended.
c2_response_clean = clean.process_tm_api_results(c2_results,
                                                 outfile1=c2_feats, 
                                                 outfile2=c2_feats_maxar_query,
                                                 drop_missing=False)

  affected_rows.loc[is_feb_29] = non_leap_years
  affected_rows.loc[is_feb_29] = non_leap_years


⚠️ Polygons missing start and end plant date: 0
⚠️ Projects missing 'plantstart': 0
⚠️ Polygons missing 'plantstart': 0/17265
Projects fully affected: 0
Projects partially affected: 0
⚠️ Polygons missing 'ttc': 5095
⚠️ Polygons missing 'practice': 0
⚠️ Polygons missing 'target system': 0
0% data lost due to missing values.


### Run for BEF

In [43]:
# Re-run the pull for the BEF shortlist only: read the shortlisted polygons,
# keep the id and image-coverage columns, and pull their unique projects.
# NOTE(review): `outfile=c1_tm_response` is the same path the full cohort 1
# pull writes to, so this presumably overwrites that saved response -- confirm
# this is intended.
# NOTE(review): the variable name `csv` is also the name of a stdlib module
# (not imported in this notebook); a more specific name would be safer.
csv = '../bef_shortlist_c1.csv'
shortlist = pd.read_csv(csv)
shortlist = shortlist[['poly_id', 'project_id', 'pct_img_cover_base','pct_img_cover_ev', 'pct_img_cover_both']] 
short_ids = list(set(shortlist.project_id))

results = tm.pull_wrapper(tm_prod_url, headers, short_ids, outfile=c1_tm_response)

Pulling Projects: 100%|████████████████████████████████████████████████████████████████████████████| 12/12 [00:18<00:00,  1.57s/project]


Results saved to ../data/tm_api_response/c1_prod_042425.json


In [44]:
# Clean the BEF shortlist response.
# NOTE(review): this rebinds `c1_response_clean` and reuses the cohort 1 output
# paths (`c1_feats`, `c1_feats_maxar_query`), so the full-cohort results are
# presumably replaced by the shortlist results -- downstream cells then depend
# on which of the two cleaning cells ran last.
c1_response_clean = clean.process_tm_api_results(results,
                                                 outfile1=c1_feats, 
                                                 outfile2=c1_feats_maxar_query)

Number of rows missing a 'plantstart' date: 0/460
Number of rows missing a 'plantend' date: 457/460
⚠️ Total rows missing start and end plant date: 0
⚠️ Total projects missing 'plantstart': 0
⚠️ Total polygons missing 'plantstart': 0
Projects fully removed: 0
Projects partially affected: 0


  affected_rows.loc[is_feb_29] = non_leap_years
  affected_rows.loc[is_feb_29] = non_leap_years


In [45]:
len(c1_response_clean.project_id.unique())

12

## Confirm Project Count
No projects are dropped by the cleaning script; it is unclear why the project count in the response differs.

In [20]:
len(c1_response_clean.project_id.unique())

71

In [21]:
len(c2_response_clean.project_id.unique())

53

## Confirm categories clean
Yay! This looks good.

In [22]:
c1_response_clean.practice.value_counts()

practice
tree-planting                                                 10615
direct-seeding                                                  872
assisted-natural-regeneration                                   118
assisted-natural-regeneration,tree-planting                      78
assisted-natural-regeneration,direct-seeding,tree-planting       36
assisted-natural-regeneration,direct-seeding                      5
direct-seeding,tree-planting                                      4
Name: count, dtype: int64

In [23]:
c2_response_clean.practice.value_counts()

practice
tree-planting                                                 10626
direct-seeding                                                  367
assisted-natural-regeneration,tree-planting                      25
assisted-natural-regeneration                                    14
direct-seeding,tree-planting                                     12
assisted-natural-regeneration,direct-seeding,tree-planting       12
Name: count, dtype: int64

In [24]:
c1_response_clean.target_sys.value_counts()

target_sys
agroforest                  11413
natural-forest                145
woodlot-or-plantation          54
riparian-area-or-wetland       42
silvopasture                   39
mangrove                       34
Name: count, dtype: int64

In [25]:
c2_response_clean.target_sys.value_counts()

target_sys
agroforest                  10877
natural-forest                 93
woodlot-or-plantation          65
riparian-area-or-wetland       36
silvopasture                   13
mangrove                       11
urban-forest                    2
Name: count, dtype: int64

## Confirm TTC Clean
- There are erroneous ttc years (-1), (1898), 2041

In [44]:
c2_response_clean.describe()

Unnamed: 0,plantstart,plantend,ttc_2023,ttc_2024,ttc_2022
count,11098,11098,9200.0,127.0,12.0
mean,2024-07-07 18:48:35.552351744,2024-10-31 03:11:54.290863104,34.165435,14.566929,77.166667
min,2023-05-02 00:00:00,2023-05-05 00:00:00,0.0,0.0,5.0
25%,2024-04-11 00:00:00,2024-07-31 00:00:00,0.0,0.0,76.5
50%,2024-06-05 00:00:00,2024-12-09 00:00:00,25.0,0.0,84.5
75%,2024-10-25 00:00:00,2024-12-31 00:00:00,63.0,23.0,92.5
max,2042-10-26 00:00:00,2029-07-21 00:00:00,97.0,97.0,96.0
std,,,32.790043,21.672377,25.757023


In [32]:
# NOTE(review): `null_ttc` is not defined or imported in any cell visible in
# this notebook, so this call fails on a fresh kernel unless it is defined
# elsewhere. Judging by the recorded output it prints a shape; presumably it
# selects the rows with no ttc values -- confirm and restore its definition.
c1_nulls = null_ttc(c1_response_clean)

(3134, 15)


In [34]:
# NOTE(review): `null_ttc` is not defined or imported in any cell visible in
# this notebook, so this call fails on a fresh kernel unless it is defined
# elsewhere. Judging by the recorded output it prints a shape; presumably it
# selects the rows with no ttc values -- confirm and restore its definition.
c2_nulls = null_ttc(c2_response_clean)

(1759, 13)


## GATHER: image metadata
This step is performed in the [maxar-tools repo](https://github.com/wri/maxar-tools/tree/jessica_meta).

## BRANCH: Image Availability
Takes in all of the maxar metadata for each polygon and returns 2 columns of image count at baseline and at early verification.

In [47]:
# Projects with no images available need to somehow be reincorporated into this df

In [50]:
# Load the cleaned cohort 1 project features and the Maxar metadata, then run
# the image-availability analysis for the baseline and EV windows.
# NOTE(review): `img_count` from the config cell is not passed here -- confirm
# where the image-count threshold is applied.
c1_prj = pd.read_csv(c1_feats)
c1_img = pd.read_csv(c1_maxar_md)

c1_main = img.analyze_image_availability(c1_prj, 
                                       c1_img, 
                                       baseline_range, 
                                       ev_range, 
                                       cloud_thresh)

In [36]:
# Load the cleaned cohort 2 project features and the Maxar metadata, then run
# the image-availability analysis for the baseline and EV windows.
# NOTE(review): `img_count` from the config cell is not passed here -- confirm
# where the image-count threshold is applied.
c2_prj = pd.read_csv(c2_feats)
c2_img = pd.read_csv(c2_maxar_md)

c2_main = img.analyze_image_availability(c2_prj, 
                                       c2_img, 
                                       baseline_range, 
                                       ev_range, 
                                       cloud_thresh)

## BRANCH: Canopy Cover
Takes in all of the TTC statistics and returns a single canopy designation (open or closed) at baseline and early verification.
For Cohort 1, the earliest planting date was 2021-02-11.
For Cohort 2, the earliest planting date was ..  


To figure out:
- for polys with more than one ttc number, should the earliest one be used as baseline? This is never the case - only one number for now.
- where did plantstart come into play for ttc calc?
- how to treat projects with multiple start dates for each polygon.
- Why some polygons are still all NaN values

In [51]:
# Classify canopy cover for cohort 1 using `canopy_threshold` and the
# baseline/EV windows. The value counts shown further down indicate that the
# result carries a `baseline_canopy` column with open / closed / invalid values.
c1_main_ttc = cover.apply_canopy_classification(c1_main,
                                                canopy_threshold,
                                                baseline_range,
                                                ev_range)

In [38]:
# Classify canopy cover for cohort 2 using `canopy_threshold` and the
# baseline/EV windows. The value counts shown further down indicate that the
# result carries a `baseline_canopy` column with open / closed / invalid values.
c2_main_ttc = cover.apply_canopy_classification(c2_main,
                                                canopy_threshold,
                                                baseline_range,
                                                ev_range)

In [39]:
c1_main_ttc.baseline_canopy.value_counts()

baseline_canopy
open       5271
closed     3408
invalid    3134
Name: count, dtype: int64

In [40]:
c2_main_ttc.baseline_canopy.value_counts()

baseline_canopy
open       5838
closed     3501
invalid    1759
Name: count, dtype: int64

In [94]:
# TODO -- determine how to handle invalids

# Run Decision Tree

In [52]:
rules = pd.read_csv('../data/rule_template.csv')
rules.head(20)

Unnamed: 0,baseline_canopy,target_sys,practice,img_count,decision
0,open,mangrove,tree-planting,>=1,field
1,open,mangrove,tree-planting,<1,field
2,open,mangrove,assisted-natural-regeneration,,field
3,open,natural-forest,tree-planting,>=1,remote
4,open,natural-forest,tree-planting,<1,field
5,open,natural-forest,direct-seeding,>=1,remote
6,open,natural-forest,direct-seeding,<1,field
7,open,natural-forest,assisted-natural-regeneration,,field
8,open,woodlot-or-plantation,tree-planting,>=1,remote
9,open,woodlot-or-plantation,tree-planting,<1,field


In [53]:
# Apply the baseline decision rules to the cohort 1 features and save the
# per-polygon decisions to CSV.
# NOTE(review): the date in the filename is hard-coded and does not follow
# the `today` stamp from the config cell.
c1_final, c1_decision_summary = tree.apply_rules_baseline(
    c1_main_ttc,
    save_to_csv='../data/c1_dtree_baseline_042425.csv',
)

In [54]:
c1_decision_summary

Unnamed: 0,decision,count,proportion
0,remote,389,0.845652
1,field,36,0.078261
2,review required,35,0.076087


In [41]:
# Apply the baseline decision rules to the cohort 2 features and save the
# per-polygon decisions to CSV.
# NOTE(review): the date in the filename is hard-coded and does not follow
# the `today` stamp from the config cell.
c2_final, c2_decision_summary = tree.apply_rules_baseline(
    c2_main_ttc,
    save_to_csv='../data/c2_dtree_baseline_040325.csv',
)

In [110]:
c2_decision_summary

Unnamed: 0,decision,count,proportion
0,remote,8064,0.726617
1,review required,1903,0.171472
2,field,1131,0.10191


# Review & Selection
Select 4 projects with the highest image availability at baseline and year 2 in cohort 1

In [29]:
#c1_final

In [141]:
def summarize_results(df):
    """Summarize decision-tree results per project.

    Prints the number of distinct ``project_id`` values, then builds one row
    per project holding the number of distinct ``poly_id`` values and, for
    every value found in ``decision``, the percentage of that project's rows
    carrying it (rounded to two decimals).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``project_id``, ``poly_id`` and ``decision`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``project_id``, ``polygon_count`` and one percentage column
        per decision value.
    """
    print(f"{df['project_id'].nunique()} total projects")

    # Distinct polygons per project
    n_polys = (
        df.groupby('project_id')['poly_id']
        .nunique()
        .reset_index(name='polygon_count')
    )

    # Rows per (project, decision), pivoted so each decision becomes a column
    tallies = df.groupby(['project_id', 'decision']).size().unstack(fill_value=0)

    # Convert the counts into row-wise percentages
    row_totals = tallies.sum(axis=1)
    pct = tallies.div(row_totals, axis=0).mul(100).round(2).reset_index()

    return n_polys.merge(pct, on='project_id')

In [142]:
# Summarize the cohort 1 decisions per project and rank projects by their
# percentage of 'remote' decisions (highest first).
# NOTE(review): `test` is rebound to a different frame in a later cell; a
# more specific name would avoid stale-state confusion.
test = summarize_results(c1_final)
sorted_test = test.sort_values(by='remote', ascending=False)

71 total projects


In [121]:
sorted_test[:20]

Unnamed: 0,project_id,polygon_count,field,remote,review required
6,243f93d2-0d4b-4dac-8b23-997e6528dc8e,1,0.0,100.0,0.0
64,ed1cadff-e20f-43a7-8627-aee10f48cc7a,114,0.0,100.0,0.0
56,c3d2858f-aa3d-458e-8e88-5a7f376e6ef4,58,0.0,100.0,0.0
65,f449aef3-4453-42c9-b542-57acc7c2e5eb,2,0.0,100.0,0.0
21,449adf55-f6f8-4f17-97d3-ab6f6bf6676d,6,0.0,100.0,0.0
9,292bbbda-ec32-40e2-a127-3670a052423d,5,0.0,100.0,0.0
8,24fc33cb-53ad-4383-82ca-f6e2ac3fd143,3,0.0,100.0,0.0
45,9100baf3-9ac4-4db5-85a7-bc12b236a370,6,0.0,100.0,0.0
42,8a112e82-e191-44ad-b306-2578c064104b,19,0.0,100.0,0.0
49,a8940698-ff28-456c-b8cd-f7289e612913,290,0.69,99.31,0.0


In [128]:
# NOTE(review): `test2` is not assigned in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel. The recorded columns match
# the decision-tree output, so `c1_final` may have been intended -- confirm.
test2.columns

Index(['project_id', 'poly_id', 'site_id', 'geometry', 'plantstart',
       'plantend', 'practice', 'target_sys', 'dist', 'project_phase',
       'ttc_2021', 'ttc_2023', 'ttc_2022', 'ttc_2024', 'ttc_2020',
       'baseline_img_count', 'ev_img_count', 'plantstart_year',
       'baseline_canopy', 'ev_canopy', 'plantstart_dt', 'decision'],
      dtype='object')

In [139]:
# Cohort 1 polygons assigned a 'remote' decision, ordered by baseline image
# count (highest first) and trimmed to the id, image-count and decision columns.
c1remotes = (
    c1_final
    .loc[c1_final['decision'] == 'remote']
    .sort_values(by='baseline_img_count', ascending=False)
    .loc[:, ['project_id', 'poly_id', 'baseline_img_count', 'ev_img_count', 'decision']]
)
c1remotes

Unnamed: 0,project_id,poly_id,baseline_img_count,ev_img_count,decision
6007,01918b25-6544-4027-82aa-6703c7c04784,bb064fa6-c72c-4768-a1cb-4bc55a0c32f8,12.0,0.0,remote
3190,5b02c3a0-0f64-4506-8cc5-719dfa8c1641,1acb3826-8bed-4490-a6b9-73ee21ecf72c,11.0,1.0,remote
3203,5b02c3a0-0f64-4506-8cc5-719dfa8c1641,1975b036-eb84-4235-9032-2abe48eb6bcc,11.0,1.0,remote
3202,5b02c3a0-0f64-4506-8cc5-719dfa8c1641,bc83d062-de54-4968-93a1-d33a626e2658,11.0,1.0,remote
3201,5b02c3a0-0f64-4506-8cc5-719dfa8c1641,e39e3b35-2391-4a9b-b46a-af7ea60e0bdb,11.0,1.0,remote
...,...,...,...,...,...
26,62043c88-f03d-475e-ac9c-2f057536e2a8,22092217-3321-47ab-81de-abfe4e4a3686,1.0,0.0,remote
25,62043c88-f03d-475e-ac9c-2f057536e2a8,d040f75c-6676-4416-ac27-16c926514cd7,1.0,0.0,remote
23,62043c88-f03d-475e-ac9c-2f057536e2a8,1264152b-a87a-47b5-808d-9bd4afa39a7a,1.0,0.0,remote
22,62043c88-f03d-475e-ac9c-2f057536e2a8,23aaf2b1-5b48-4f6a-85a3-ea11a95798a2,1.0,0.0,remote


In [136]:
# Cohort 1 polygons whose decision is 'review required', restricted to the
# attribute, ttc, image-count and canopy columns; show the first three rows.
review_req = c1_final.loc[
    c1_final['decision'] == 'review required',
    [
        'project_id', 'poly_id', 'site_id', 'plantstart',
        'practice', 'target_sys', 'dist',
        'ttc_2021', 'ttc_2023', 'ttc_2022', 'ttc_2024', 'ttc_2020',
        'baseline_img_count', 'ev_img_count', 'plantstart_year',
        'baseline_canopy', 'ev_canopy', 'plantstart_dt', 'decision',
    ],
]
review_req.head(3)

Unnamed: 0,project_id,poly_id,site_id,plantstart,practice,target_sys,dist,ttc_2021,ttc_2023,ttc_2022,ttc_2024,ttc_2020,baseline_img_count,ev_img_count,plantstart_year,baseline_canopy,ev_canopy,plantstart_dt,decision
3,62043c88-f03d-475e-ac9c-2f057536e2a8,2c7114b9-86dd-4323-83ba-d977e4c6a551,4daec6c6-307d-4b40-9d1e-3e33ff1815f8,2024-09-01,tree-planting,,full,,87.0,,,,0.0,0.0,2024,closed,not available,2024-09-01,review required
107,36504a4e-f7a3-4963-9ff2-9aa9982cf990,7d0c980b-cafb-4b04-93f9-c967fabac446,9d0e5c0a-aba6-435f-b77c-6ca0d8f6b791,2023-11-29,,agroforest,,,,,,,1.0,0.0,2023,invalid,invalid,2023-11-29,review required
108,36504a4e-f7a3-4963-9ff2-9aa9982cf990,1a74f19e-7e27-401c-9942-eaf629766967,9d0e5c0a-aba6-435f-b77c-6ca0d8f6b791,2023-12-08,,agroforest,,,,,,,2.0,0.0,2023,invalid,invalid,2023-12-08,review required


In [138]:
prj_id = 'ed1cadff-e20f-43a7-8627-aee10f48cc7a'
test = c1_final[c1_final.project_id == prj_id]
test

Unnamed: 0,project_id,poly_id,site_id,geometry,plantstart,plantend,practice,target_sys,dist,project_phase,...,ttc_2022,ttc_2024,ttc_2020,baseline_img_count,ev_img_count,plantstart_year,baseline_canopy,ev_canopy,plantstart_dt,decision
5882,ed1cadff-e20f-43a7-8627-aee10f48cc7a,e1aca94a-0227-4a94-82eb-6d493422679d,49a14713-de2e-4ffe-ab58-44100d756275,"{'type': 'Polygon', 'coordinates': [[[1.028873...",2023-03-10,NaT,tree-planting,agroforest,full,,...,37.0,,,1.0,0.0,2023,open,investigate,2023-03-10,remote
5883,ed1cadff-e20f-43a7-8627-aee10f48cc7a,5f1f07a6-4158-417b-8168-7ce68e267494,49a14713-de2e-4ffe-ab58-44100d756275,"{'type': 'Polygon', 'coordinates': [[[0.991051...",2023-03-10,NaT,tree-planting,agroforest,full,,...,67.0,,,1.0,0.0,2023,closed,investigate,2023-03-10,remote
5884,ed1cadff-e20f-43a7-8627-aee10f48cc7a,90418d50-32d7-4cb7-82ef-cb680b0973a2,49a14713-de2e-4ffe-ab58-44100d756275,"{'type': 'Polygon', 'coordinates': [[[0.993159...",2023-03-10,NaT,tree-planting,agroforest,full,,...,38.0,,,1.0,0.0,2023,open,investigate,2023-03-10,remote
5885,ed1cadff-e20f-43a7-8627-aee10f48cc7a,478a3329-30e8-43f7-b6bf-e917f4705ee9,49a14713-de2e-4ffe-ab58-44100d756275,"{'type': 'Polygon', 'coordinates': [[[0.995090...",2023-03-10,NaT,tree-planting,agroforest,full,,...,29.0,,,1.0,0.0,2023,open,investigate,2023-03-10,remote
5886,ed1cadff-e20f-43a7-8627-aee10f48cc7a,3b8ca1b4-6977-464e-975b-14c44fb7a6b8,49a14713-de2e-4ffe-ab58-44100d756275,"{'type': 'Polygon', 'coordinates': [[[0.990367...",2023-03-10,NaT,tree-planting,agroforest,full,,...,32.0,,,1.0,0.0,2023,open,investigate,2023-03-10,remote
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5991,ed1cadff-e20f-43a7-8627-aee10f48cc7a,de890bfb-a84f-4203-9763-264854c78676,99517d95-b0a2-4b78-a666-f7e79076a6c1,"{'type': 'Polygon', 'coordinates': [[[1.097087...",2023-03-10,2023-05-31,tree-planting,agroforest,full,,...,43.0,,,2.0,0.0,2023,closed,investigate,2023-03-10,remote
5992,ed1cadff-e20f-43a7-8627-aee10f48cc7a,e84c15e6-b296-484b-b237-b0f9092ce7f0,99517d95-b0a2-4b78-a666-f7e79076a6c1,"{'type': 'Polygon', 'coordinates': [[[1.096960...",2023-03-10,2023-05-31,tree-planting,agroforest,full,,...,35.0,,,2.0,0.0,2023,open,investigate,2023-03-10,remote
5993,ed1cadff-e20f-43a7-8627-aee10f48cc7a,9e5cee4d-c388-4bf4-b51e-0d5a9a831e78,99517d95-b0a2-4b78-a666-f7e79076a6c1,"{'type': 'Polygon', 'coordinates': [[[1.094125...",2023-03-10,2023-05-31,tree-planting,agroforest,full,,...,20.0,,,2.0,0.0,2023,open,investigate,2023-03-10,remote
5994,ed1cadff-e20f-43a7-8627-aee10f48cc7a,522ebcc2-40de-4961-a306-3fa729b43cdc,99517d95-b0a2-4b78-a666-f7e79076a6c1,"{'type': 'Polygon', 'coordinates': [[[1.098329...",2023-03-10,2023-05-31,tree-planting,agroforest,full,,...,30.0,,,2.0,0.0,2023,open,investigate,2023-03-10,remote


### Analysis for BEF

In [153]:
# Per-project totals over the cohort 1 polygons assigned a 'remote' decision:
# number of distinct polygons and summed baseline / EV image counts.
c1remotes = c1_final.loc[c1_final['decision'] == 'remote']

grouped_summary = (
    c1remotes
    .groupby("project_id")
    .agg(
        num_polygons=pd.NamedAgg(column="poly_id", aggfunc="nunique"),
        total_baseline_images=pd.NamedAgg(column="baseline_img_count", aggfunc="sum"),
        total_ev_images=pd.NamedAgg(column="ev_img_count", aggfunc="sum"),
    )
    .reset_index()
)

grouped_summary

Unnamed: 0,project_id,num_polygons,total_baseline_images,total_ev_images
0,01918b25-6544-4027-82aa-6703c7c04784,2,21.0,0.0
1,0733b059-5538-4fd8-8c9a-7adb78aea248,31,51.0,0.0
2,16b297b3-30a3-4624-bcc9-4333919f66fc,3,4.0,0.0
3,243f93d2-0d4b-4dac-8b23-997e6528dc8e,1,2.0,0.0
4,24fc33cb-53ad-4383-82ca-f6e2ac3fd143,3,7.0,1.0
5,292bbbda-ec32-40e2-a127-3670a052423d,5,26.0,3.0
6,2f4b5b55-49e2-4cec-a40a-521c5c142c91,2,3.0,0.0
7,33ee81da-5421-410e-a642-37fc3b0cb935,71,257.0,75.0
8,3418f474-5d5d-422f-a12b-459040034223,4,8.0,0.0
9,36504a4e-f7a3-4963-9ff2-9aa9982cf990,26,42.0,5.0


In [158]:
sorted_grouped_summary = grouped_summary.sort_values(by=["total_baseline_images", "total_ev_images"], ascending=[False, False])

In [159]:
sorted_grouped_summary.to_csv('../data/results/sorted_dtree_040325.csv')

In [161]:
sorted_grouped_summary

Unnamed: 0,project_id,num_polygons,total_baseline_images,total_ev_images
15,47673f46-0af9-4047-afc7-b4f4fefaa5e7,470,474.0,0.0
46,f4b92387-705a-40bf-a7bb-e84f942de1ec,390,423.0,0.0
33,a8940698-ff28-456c-b8cd-f7289e612913,288,362.0,423.0
7,33ee81da-5421-410e-a642-37fc3b0cb935,71,257.0,75.0
19,5b02c3a0-0f64-4506-8cc5-719dfa8c1641,29,243.0,24.0
44,ed1cadff-e20f-43a7-8627-aee10f48cc7a,114,163.0,0.0
22,62043c88-f03d-475e-ac9c-2f057536e2a8,98,158.0,0.0
10,389aad5b-6577-4cea-bf9f-446dcfd94966,41,148.0,0.0
27,802bb88c-5eb5-4ce2-836f-19bc8e0ddfc4,123,139.0,99.0
12,4124a403-45ca-4e48-be6f-6c15b3b48617,87,139.0,0.0


In [160]:

darby = pd.read_csv('../data/results/darby_analysis.csv')
darby

Unnamed: 0,project_id,total_polygons,polygons_high_both,percent_polygons_high_both
0,bbd88e69-cd85-429e-bebf-6234bf82dbb3,79,55,69.620253
1,f81c1422-025c-45b1-a2e1-d354177523ca,33,16,48.484848
2,943bb150-f1b7-4ad2-bb9e-60a559df2ebd,2727,1226,44.957829
3,f449aef3-4453-42c9-b542-57acc7c2e5eb,5,2,40.0
4,39871658-bff0-49c2-aa20-ccac0b03a2c2,5,2,40.0
5,36504a4e-f7a3-4963-9ff2-9aa9982cf990,1541,555,36.015574
6,cf16b937-a02b-4691-b816-28669ec348f2,23,8,34.782609
7,abdb9d09-7c55-4e26-8961-1aa26e991bbc,10,3,30.0
8,9100baf3-9ac4-4db5-85a7-bc12b236a370,6,1,16.666667
9,aa0f8df7-b668-48f0-a8ce-bf5558f2a9d6,305,34,11.147541


# Description

* total_polygons: The total number of polygons in that project
* polygons_high_both: The number of polygons in the project that have >=70% imagery coverage (70% overlap between the best maxar image footprint and the polygon geometry) at baseline and EV
* percent_polygons_high_both: (polygons_high_both / total_polygons) * 100


In [168]:
merged_df = sorted_grouped_summary.merge(darby, on='project_id', how='inner')
merged_df

Unnamed: 0,project_id,num_polygons,total_baseline_images,total_ev_images,total_polygons,polygons_high_both,percent_polygons_high_both
0,bbd88e69-cd85-429e-bebf-6234bf82dbb3,76,110.0,0.0,79,55,69.620253
1,47118e50-d4d1-4ba0-8094-59cfa441dbb0,28,89.0,1.0,118,11,9.322034
2,f81c1422-025c-45b1-a2e1-d354177523ca,31,78.0,22.0,33,16,48.484848
3,aa0f8df7-b668-48f0-a8ce-bf5558f2a9d6,37,53.0,37.0,305,34,11.147541
4,36504a4e-f7a3-4963-9ff2-9aa9982cf990,26,42.0,5.0,1541,555,36.015574
5,abdb9d09-7c55-4e26-8961-1aa26e991bbc,9,36.0,0.0,10,3,30.0
6,9019106b-6e2d-4deb-97a5-2889f976a931,9,24.0,0.0,10,0,0.0
7,d6481438-9603-4c68-b152-6586ed825b0a,19,19.0,38.0,447,10,2.237136
8,cf16b937-a02b-4691-b816-28669ec348f2,18,18.0,0.0,23,8,34.782609
9,e4fe2fa4-6869-4c1e-9347-ba9b135306f5,1,10.0,0.0,2,0,0.0


In [163]:
merged_df.to_csv('../data/results/priority_prj_bef.csv')

In [164]:
full.head()

Unnamed: 0,project_id,project_name,country,cohort,landscape,framework_key
0,c462918b-47f7-4ed5-99e0-7fec6e342036,"""Nakuru Eco-Reforestation Project""",KE,terrafund-landscapes,Greater Rift Valley of Kenya,enterprises
1,617601e0-9839-49fd-b48e-6c07404e7140,Afram Headwaters Restoration Initiative (AHRI),GH,terrafund-landscapes,Ghana Cocoa Belt,enterprises
2,943bb150-f1b7-4ad2-bb9e-60a559df2ebd,Agriculture and forest landscape restoration -...,GH,terrafund,Ghana Cocoa Belt,terrafund
3,0f4bdbba-adf9-4554-ba29-fb28d3b9c1fd,AGRICULTURE AND FOREST LANDSCAPES RESTORATION ...,GH,terrafund-landscapes,Ghana Cocoa Belt,terrafund-landscapes
4,47f07709-b503-4eeb-98fa-aea62e979ca8,Agroecology Project - Centre ValBio,MG,terrafund,,terrafund


In [169]:
# Attach each project's landscape from the portfolio table (`full`) and keep
# only the projects that have one.
# Any existing 'landscape' column (present when this cell has already been
# run) is dropped first so the cell is idempotent: without that, a re-run
# would merge into landscape_x / landscape_y and the dropna on 'landscape'
# would raise a KeyError.
merged_df = (
    merged_df
    .drop(columns=['landscape'], errors='ignore')
    .merge(full[['project_id', 'landscape']], on='project_id', how='inner')
    .dropna(subset=['landscape'])
)
merged_df

Unnamed: 0,project_id,num_polygons,total_baseline_images,total_ev_images,total_polygons,polygons_high_both,percent_polygons_high_both,landscape
0,bbd88e69-cd85-429e-bebf-6234bf82dbb3,76,110.0,0.0,79,55,69.620253,Lake Kivu & Rusizi River Basin
1,47118e50-d4d1-4ba0-8094-59cfa441dbb0,28,89.0,1.0,118,11,9.322034,Ghana Cocoa Belt
4,36504a4e-f7a3-4963-9ff2-9aa9982cf990,26,42.0,5.0,1541,555,36.015574,Lake Kivu & Rusizi River Basin
5,abdb9d09-7c55-4e26-8961-1aa26e991bbc,9,36.0,0.0,10,3,30.0,Ghana Cocoa Belt
7,d6481438-9603-4c68-b152-6586ed825b0a,19,19.0,38.0,447,10,2.237136,Greater Rift Valley of Kenya
8,cf16b937-a02b-4691-b816-28669ec348f2,18,18.0,0.0,23,8,34.782609,Greater Rift Valley of Kenya
9,e4fe2fa4-6869-4c1e-9347-ba9b135306f5,1,10.0,0.0,2,0,0.0,Greater Rift Valley of Kenya
11,9100baf3-9ac4-4db5-85a7-bc12b236a370,6,6.0,0.0,6,1,16.666667,Ghana Cocoa Belt
14,243f93d2-0d4b-4dac-8b23-997e6528dc8e,1,2.0,0.0,1,0,0.0,Ghana Cocoa Belt
16,529e1bae-2187-473f-a2a3-17e577720aba,1,1.0,0.0,2,0,0.0,Ghana Cocoa Belt


In [170]:
merged_df.to_csv('../data/results/priority_prj_bef.csv')

In [171]:
merged_df.shape

(10, 8)

### Opentopo testing

In [4]:
import richdem as rd
print(rd.GDAL_AVAILABLE)

True


In [20]:
from api_utils import opentopo_pull_wrapper

# Test run of the OpenTopography slope pull on a slice of the cohort 1 features.
# The API key is read from the same secrets file that `tm_auth_path` points to.
# `dem_url` and `api_key` defined here are reused by the slope/aspect cell
# further down.
config_path = "../secrets.yaml"
with open(config_path) as conf_file:
    config = yaml.safe_load(conf_file)
    
dem_url='https://portal.opentopography.org/API/globaldem'
api_key = config['opentopo_key']
df = pd.read_csv(c1_feats)
# Restrict the test to rows 500-1499 of the features table
df = df.iloc[500:1500, :] #test run
outfile = '../data/slope/slopestats_test.csv'

opentopo_pull_wrapper(dem_url, api_key, df, outfile)

Processing Projects:   0%|                                                                                           | 0/1 [00:00<?, ?project/s]
A Slope calculation (rise/run)[39m
C Horn, B.K.P., 1981. Hill shading and the reflectance map. Proceedings of the IEEE 69, 14–47. doi:10.1109/PROC.1981.11918[39m


A Aspect attribute calculation[39m
C Horn, B.K.P., 1981. Hill shading and the reflectance map. Proceedings of the IEEE 69, 14–47. doi:10.1109/PROC.1981.11918[39m

Processing Projects: 100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:14<00:00, 14.07s/project]


In [26]:
gdf = gpd.read_file('../data/slope/replaced_plots/replaced_plots.shp')
gdf.head()

Unnamed: 0,plot_id,polygon_id,PlantDate,Name,IntervType,Project,Country,SiteName,geometry
0,0,1001,2023-11-23,RV1209,Agroforestry,Birdlife International,RW,Bweyeye-Butare,"POLYGON ((29.17459 -2.59658, 29.17462 -2.5966,..."
1,1,827,2023-12-06,NU0743,Agroforestry,Birdlife International,RW,Bweyeye-Butare,"POLYGON ((29.17714 -2.61357, 29.17715 -2.61349..."
2,2,1277,2023-11-28,NU0800,Agroforestry,Birdlife International,RW,Bweyeye-Butare,"POLYGON ((29.18021 -2.61257, 29.17997 -2.61268..."
3,3,1436,2023-03-02,BA0102,Agroforestry,Birdlife International,RW,Bweyeye-Butare,"POLYGON ((29.17129 -2.59375, 29.17157 -2.59368..."
4,4,929,2023-12-07,RV1263,Agroforestry,Birdlife International,RW,Bweyeye-Butare,"POLYGON ((29.1767 -2.60008, 29.17667 -2.60012,..."


In [28]:
import tempfile
import ast
from shapely.geometry import shape
from osgeo import gdal
from rasterstats import zonal_stats
import richdem as rd
import rasterio as rs

# Compute per-plot slope and aspect statistics for the replaced-plots shapefile.
# A single DEM covering the bounding box of all plots is downloaded from
# OpenTopography, slope and aspect rasters are derived with richdem, zonal
# statistics are computed per polygon, and the result is written to CSV.
# Relies on `dem_url` and `api_key` from the OpenTopo test cell above.
plots_shp = '../data/slope/replaced_plots/replaced_plots.shp'
ZONAL_STATS = ["min", "max", "mean", "median", "majority"]
# (connect, read) timeouts in seconds so a stalled request cannot hang the kernel
DEM_REQUEST_TIMEOUT = (10, 300)

gdf = gpd.read_file(plots_shp)
all_polygons = []
# Bounding box of every plot in the shapefile
total_bounds = gdf.total_bounds  # (minx, miny, maxx, maxy)

query = {
    'demtype': 'NASADEM',
    'south': str(total_bounds[1]),
    'north': str(total_bounds[3]),
    'west': str(total_bounds[0]),
    'east': str(total_bounds[2]),
    'outputFormat': 'GTiff',
    'API_Key': api_key
}
# Reserve a temp file path; delete=False because the file is reopened by
# name below (richdem / rasterio) and removed in the `finally` clause.
with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as tmpfile:
    temp_path = tmpfile.name

try:
    # Download the DEM to the temp file; the context manager releases the
    # connection even if the download fails part-way.
    with requests.get(dem_url, stream=True, params=query,
                      timeout=DEM_REQUEST_TIMEOUT) as response:
        if response.status_code != 200:
            raise RuntimeError(
                f"Failed to download DEM for {plots_shp} "
                f"(HTTP {response.status_code}): {response.text}"
            )

        with open(temp_path, 'wb') as f:
            for chunk in response.iter_content(1024):
                f.write(chunk)

    # Derive slope and aspect rasters from the DEM
    dem = rd.LoadGDAL(temp_path)
    slope = rd.TerrainAttribute(dem, attrib='slope_riserun')
    aspect = rd.TerrainAttribute(dem, attrib='aspect')

    # Open the DEM with rasterio to get the affine transform for zonal_stats
    with rs.open(temp_path) as dem_r:
        affine = dem_r.transform

    # NOTE(review): no `nodata` value is passed to zonal_stats -- confirm
    # whether the richdem no-data value should be excluded from the statistics.
    for idx, row in gdf.iterrows():
        poly = row['geometry']

        # Default to NaN so a failed polygon still produces a complete row
        slope_stats = dict.fromkeys(ZONAL_STATS, np.nan)
        aspect_stats = dict.fromkeys(ZONAL_STATS, np.nan)

        try:
            slope_zs = zonal_stats(poly, slope, affine=affine, stats=ZONAL_STATS)
            aspect_zs = zonal_stats(poly, aspect, affine=affine, stats=ZONAL_STATS)
            # Update stats if available
            if slope_zs and slope_zs[0] is not None:
                slope_stats = slope_zs[0]
            if aspect_zs and aspect_zs[0] is not None:
                aspect_stats = aspect_zs[0]
        except Exception as e:
            # Best effort: report the polygon and keep its NaN stats. The id
            # column in this shapefile is 'polygon_id'.
            print(f"Warning: Unable to calculate stats for polygon {row['polygon_id']} in {plots_shp}: {e}")

        all_polygons.append({
            'plot_id': row['plot_id'],
            'poly_id': row['polygon_id'],
            'name': row['Name'],
            'slope_stats': slope_stats,
            'aspect_stats': aspect_stats
        })
# now delete the temp file
finally:
    if os.path.exists(temp_path):
        os.remove(temp_path)

# Write results to CSV: expand the two stats dicts into suffixed columns
result_df = pd.DataFrame(all_polygons)
result_df = pd.concat([result_df.drop(['slope_stats', 'aspect_stats'], axis=1),
               result_df['slope_stats'].apply(pd.Series).add_suffix('_slope'),
               result_df['aspect_stats'].apply(pd.Series).add_suffix('_aspect')], axis=1)
result_df.to_csv('../data/slope/birdlife_slope_stats.csv', index=False)


A Slope calculation (rise/run)[39m
C Horn, B.K.P., 1981. Hill shading and the reflectance map. Proceedings of the IEEE 69, 14–47. doi:10.1109/PROC.1981.11918[39m


A Aspect attribute calculation[39m
C Horn, B.K.P., 1981. Hill shading and the reflectance map. Proceedings of the IEEE 69, 14–47. doi:10.1109/PROC.1981.11918[39m

