In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import re
import os
import math
import requests
import yaml
import json
import sys
sys.path.append('../src/')
import image_availability as img
import process_api_results as clean
import decision_trees as tree
import tm_api_utils as api_request

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Decision Tree
The decision tree workflow takes as input a list of TerraFund project ids and returns as output a csv assigning a verification method to each polygon within a project. It uses APIs to pull data from two applications. See the [TM API documentation](https://api-staging.terramatch.org/research-service/documentation/api#/default/) for the full set of search parameters.
     
**Workflow**
1. Pull info on project characteristics for entire portfolio using TM API
   - repo/notebook: `terrafund-portfolio-analyses/tm-api.ipynb`
   - input: list of project ids
   - output: csv of all project features
2. Using TM API csv, pull Maxar metadata
   - repo/notebook: `maxar-tools/decision-tree-metadata.ipynb` and `maxar-tools/src/decision_tree.py`
   - input: csv of project features
   - output: csv of maxar metadata
   - **bottleneck:** add functionality to track dropped projects
3. Create imagery features using `image_availability.py`
   - repo/notebook: `terrafund-portfolio-analyses/decision-tree.ipynb`
   - input: csv maxar metadata and csv of project features
   - output: csv of project features & img count
4. Run decision tree
   - input: master csv
   - output: decisions csv

**bottlenecks:**
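- the staging environment is subject to changes, which break the request & cleaning functions
- inconsistent spelling and formatting of `practice` and `target_sys` values (e.g. `agroforest` vs `agroforesty`, `tree-planting` vs `tree planting`)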
- missing projects
- missing ttc stats

# PARAMS

In [3]:
### API access ###
tm_auth_path = '../secrets.yaml'                                                      # YAML file providing the 'access_token' used for the API header
tm_staging_url = "https://api-staging.terramatch.org/research/v3/sitePolygons?"       # use for testing queries
tm_prod_url = "https://api.terramatch.org/research/v3/sitePolygons?"                  # Use to pull data for analysis

### Input / output files ###
# Root of the local maxar-tools checkout. Set the MAXAR_TOOLS_DIR environment variable
# to run this notebook on another machine; the default reproduces the original paths.
maxar_tools_dir = os.environ.get('MAXAR_TOOLS_DIR', '/home/darby/github_repos/maxar-tools')

approved_projects = '../projects_all_approved_202501091214.csv'                      # csv of approved projects (framework_key, project_id)
#maxar_md = "../data/imagery_availability/comb_img_availability_2025-02-19.csv"
maxar_md = f"{maxar_tools_dir}/data/metadata/final/comb_img_availability_2025-02-24.csv"   # Maxar image metadata csv
#feats = '../data/tm_api_021925.csv'
feats = '../data/tm_api_TEST.csv'                                                     # cleaned project features (written by cleaning, read for image features)
#maxar_feats = '/Users/jessica.ertel/github/maxar-tools/data/tm_api_021925.csv'
maxar_feats = f"{maxar_tools_dir}/data/tm_api_TEST.csv"                               # second output path for the cleaned features, inside maxar-tools

### Define tree thresholds ###
canopy_threshold = 40                                         # threshold for identifying open vs closed canopy projects
cloud_thresh = 50                                             # threshold for identifying image quality
img_count = 1                                                 # threshold for identifying image availability
baseline_range = (-365, 0)                                    # baseline window (1 year before plant start date)
ev_range = (730, 1095)                                        # EV window (2-3 years after plant start date)

## Gather Projects & Attributes
Uses the TerraMatch API to download project features for a provided set of project ids.

In [5]:
# Load the TerraMatch credentials from the YAML secrets file and build the
# bearer-token header that is passed to the API request function.
with open(tm_auth_path) as auth_file:
    auth = yaml.safe_load(auth_file)

access_token = auth['access_token']
headers = {'Authorization': f"Bearer {access_token}"}

In [6]:
# Keep only the approved projects whose framework is one of the two TerraFund keys.
terrafund_frameworks = ['terrafund-landscapes', 'terrafund']
full = pd.read_csv(approved_projects)
full = full[full['framework_key'].isin(terrafund_frameworks)]
display(full['framework_key'].value_counts())

# De-duplicate the project ids before they are sent to the API.
ids = list(set(full['project_id']))
print(len(ids))

framework_key
terrafund               108
terrafund-landscapes     99
Name: count, dtype: int64

207


In [7]:
# Request every project id in `ids` from the production endpoint (the recorded run took
# about 7 minutes for 207 projects).
# NOTE(review): `outfile` is presumably where pull_tm_api_data saves the raw response —
# the next cell reloads this same path, so keep the two in sync; confirm in tm_api_utils.
project_results = api_request.pull_tm_api_data(tm_prod_url, headers, ids, outfile="../data/tm_api_response_prod_TEST.json")

Processing Projects: 100%|██████████| 207/207 [06:55<00:00,  2.01s/project]


## Clean Attributes
Performs a series of cleaning steps to correctly format the API output.

In [9]:
# Reload the saved API response from disk; this replaces the in-memory `project_results`.
# Alternative (non-TEST) response file: "../data/tm_api_response_prod.json"
response_path = "../data/tm_api_response_prod_TEST.json"
with open(response_path, "r") as file:
    project_results = json.load(file)

In [10]:
# Run the cleaning step on the raw API response and keep the result as `clean_api`.
# `feats` is read back later as the project-feature input for the image features step.
# NOTE(review): presumably both outfiles receive the cleaned table as csv — confirm in process_api_results.
clean_api = clean.process_tm_api_results(project_results, 
                                       outfile1=feats, 
                                       outfile2=maxar_feats) # save to maxar-tools repo

  df.loc[is_feb_29 & non_leap_years, column_name] = df.loc[is_feb_29 & non_leap_years, column_name].apply(
  df.loc[is_feb_29 & non_leap_years, column_name] = df.loc[is_feb_29 & non_leap_years, column_name].apply(


Number of rows missing a 'plantstart' date: 630/19701
Number of rows missing a 'plantend' date: 6836/19701
⚠️ Total rows missing start and end plant date: 560
⚠️ Total projects missing 'plantstart': 18
⚠️ Total polygons missing 'plantstart': 626
Projects fully removed: 0
Projects partially affected: 18


In [11]:
# Inspect column dtypes and non-null counts of the cleaned table.
clean_api.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19071 entries, 0 to 19700
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   project_id  19071 non-null  object        
 1   poly_id     19071 non-null  object        
 2   site_id     19071 non-null  object        
 3   geometry    19071 non-null  object        
 4   plantstart  19071 non-null  datetime64[ns]
 5   plantend    12795 non-null  datetime64[ns]
 6   practice    19029 non-null  object        
 7   target_sys  19069 non-null  object        
 8   dist        18995 non-null  object        
dtypes: datetime64[ns](2), object(7)
memory usage: 1.5+ MB


In [12]:
# Number of distinct projects in the cleaned table (compare with the number of ids requested).
clean_api['project_id'].nunique()

138

In [13]:
# Frequency of each target system label; useful for spotting inconsistent spellings.
clean_api['target_sys'].value_counts()

target_sys
agroforest                                        17213
woodlot-or-plantation                               664
agroforesty                                         566
natural-forest                                      307
riparian-area-or-wetland                            101
silvopasture                                         86
mangrove                                             64
Null                                                 43
riparian-area-or-wetland,woodlot-or-plantation       10
Tree Planting                                         9
urban-forest                                          5
peatland                                              1
Name: count, dtype: int64

In [14]:
# Frequency of each restoration practice label; useful for spotting inconsistent spellings.
clean_api['practice'].value_counts()

practice
tree-planting                                                   17411
direct-seedling                                                   820
assisted-natural-regeneration                                     219
tree planting                                                     147
direct-seeding                                                    115
Null                                                              107
tree-planting, assisted-natural-regeneration                       83
assisted-natural-regeneration,tree-planting                        38
assisted-natural-regeneration,tree-planting,direct-seeding         36
direct-seeding, tree-planting, assisted-natural-regeneration       11
direct-seeding, tree-planting                                      10
Agroforestry                                                        9
direct-seeding,tree-planting                                        4
assisted-naturalregeneration                                        4
direct-seed

In [None]:
# checking missing ttc: load the tiles JSON and count its top-level keys
# (presumably one key per project — verify against the file)
tiles_path = '../data/terrafund_projects_tiles.json'
with open(tiles_path, 'r') as file:
    missing_ttc_json = json.load(file)

missing_ids = [*missing_ttc_json.keys()]
len(missing_ids)

## Gather image metadata
This step is performed in the [maxar-tools repo](https://github.com/wri/maxar-tools/tree/jessica_meta).

## Create image features

In [15]:
## Image features
# Load the project-feature csv and the Maxar metadata csv, then pass both to
# analyze_image_availability together with the baseline/EV windows and the cloud threshold.
# NOTE(review): the recorded run of this cell failed with
#   KeyError: "['site_id'] not in index"
# so one of the frames involved probably lacks a `site_id` column. Check that `maxar_md`
# was generated from the current `feats` file — TODO confirm in image_availability.py.
# Outputs shown for later cells that use `main` come from an earlier execution.
proj_df = pd.read_csv(feats)
img_df = pd.read_csv(maxar_md)

main = img.analyze_image_availability(proj_df, 
                               img_df, 
                               baseline_range, 
                               ev_range, 
                               cloud_thresh)

KeyError: "['site_id'] not in index"

In [33]:
# Inspect column dtypes and non-null counts of the image-feature table, including the
# baseline_img_count and ev_img_count columns.
main.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16592 entries, 0 to 16591
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   project_id          16592 non-null  object 
 1   poly_id             16592 non-null  object 
 2   site_id             16592 non-null  object 
 3   geometry            16592 non-null  object 
 4   plantstart          16592 non-null  object 
 5   plantend            10316 non-null  object 
 6   practice            16551 non-null  object 
 7   target_sys          16590 non-null  object 
 8   dist                16516 non-null  object 
 9   baseline_img_count  16592 non-null  float64
 10  ev_img_count        16592 non-null  float64
dtypes: float64(2), object(9)
memory usage: 1.5+ MB


In [38]:
# Distribution of baseline image counts, ordered by count value:
# image count on left, poly count on right
baseline_counts = main['baseline_img_count'].value_counts()
baseline_counts.sort_index()

0.0     7249
1.0     4106
2.0     2951
3.0     1147
4.0      410
5.0      132
6.0      113
7.0       26
8.0       11
9.0        9
10.0      16
11.0      41
12.0      87
13.0       5
14.0       3
15.0       3
16.0       1
17.0       3
18.0      19
19.0      11
20.0       1
21.0       2
22.0       6
23.0       5
24.0      13
25.0       6
28.0       1
29.0      37
30.0      55
31.0      17
32.0      66
33.0       5
34.0       6
36.0       8
40.0       2
42.0       1
46.0       2
58.0       6
60.0       6
64.0       4
Name: baseline_img_count, dtype: int64