# Pull Polygons from TerraMatch API for PPC (Simple)

This notebook pulls polygon geometries and metadata from the TerraMatch API for PPC projects, including the
hectares-by-WWF-ecoregion indicator.

Updated to include indicators and to simplify preprocessing.

In [1]:
import yaml
import pandas as pd
from tm_api_utils import pull_tm_api_data, patch_tm_api_data
from tqdm import tqdm
import json
import sys
from datetime import datetime
sys.path.append('../src/')
import api_utils as api
import process_tm_api_results as clean

## Set file paths

In [55]:
## PARAMS
# Folder under ../data/ and file-name stem shared by every output of this run
run_dir, run_name = 'ppc_tree_count_elig', 'ppc_2025_tree_count_elig'

# Run date (YYYY-MM-DD) that gets stamped into the output file names.
# It comes from the system clock: if the clock is out of sync, run `sudo hwclock -s` first.
today = f"{datetime.today():%Y-%m-%d}"

In [56]:
## FILES
# --- Inputs ---
# Approved projects (all cohorts)
approved_projects_file = '../projects_all_approved_202502211226.csv'

# PPC 2025 potential tree count projects (eligibility check #1)
ppc_tree_count_file = '../data/ppc_tree_count_elig/ppc_2025_potential_tree_count_projects_2025-07-22.csv'

# --- Outputs (named after the run and stamped with the run date) ---
# Raw TM API response as JSON; it is written by the pull and read back in for parsing
tm_api_pull_results_file = f'../data/{run_dir}/tm_api_response_prod_{run_name}_{today}.json'

# Parsed polygon features as CSV
polygon_features_file = f'../data/{run_dir}/tm_api_{run_name}_{today}.csv'

## Set up token & API URL

In [57]:
# Read the API credentials from the local secrets file
auth_path = '../secrets.yaml'
with open(auth_path) as auth_file:
    auth = yaml.safe_load(auth_file)

# Bearer-token header that is handed to the API pull
headers = dict(Authorization=f"Bearer {auth['access_token']}")

In [58]:
# TerraMatch sitePolygons endpoints
# Production: use to pull data for analysis
prod_url = "https://api.terramatch.org/research/v3/sitePolygons?"
# Staging: use for testing queries
staging_url = "https://api-staging.terramatch.org/research/v3/sitePolygons?"

## Create list of projects to pull

In [59]:
# Load both project lists in one pass:
#   full          -> all approved projects
#   tree_count_df -> potential 2025 tree count projects
full, tree_count_df = (
    pd.read_csv(csv_path)
    for csv_path in (approved_projects_file, ppc_tree_count_file)
)

In [None]:
# Split the approved projects by cohort. Cohort 1 ('terrafund') is further divided into
# projects inside and outside the TF landscape countries.
landscape_countries = ['BI', 'CD', 'RW', 'KE', 'GH']

cohort1 = full.loc[full['cohort'].eq('terrafund')]
in_landscape = cohort1['country'].isin(landscape_countries)
cohort1_landscapes = cohort1.loc[in_landscape]
cohort1_non_landscapes = cohort1.loc[~in_landscape]

cohort2 = full.loc[full['cohort'].eq('terrafund-landscapes')]
ppc = full.loc[full['cohort'].eq('ppc')]

In [None]:
# Scratch list of project ids for testing queries.
# NOTE: the production pull further down uses `tree_count_proj_ids`, not `ids`.

# Full cohort alternative (disabled):
#ids = list(set(cohort1.project_id))

# Short list of ids for testing
ids = ['244eaf7e-e109-47b2-b84e-9ebe24508391', '24d8c9a2-b8ef-481c-930b-78c9aeaf239e', 'f17dd6cf-8187-4edd-895e-07013d4990c9', '1115dda6-0165-4099-b52f-0ac53595c3a9',
       '465f543e-d53a-4356-ae8d-9790aa42d30e', 'ad149677-7ee0-479c-8d23-aa8c3bf58532', '1977b649-908c-46c3-836d-f4f6485427c2', '6d9089aa-2a6f-4dc0-8064-32c5b67ffed6']

# PPC batch 1 alternative (disabled): `ppc_batch1_file` is commented out in the FILES cell,
# so reading it here raised a NameError on a fresh kernel. Re-enable that path together
# with these two lines if the batch 1 list is needed again.
#ppc_batch1 = pd.read_csv(ppc_batch1_file)
#ids = list(ppc_batch1['project_id'].unique())

# Two-project alternative (disabled):
#ids = ['465f543e-d53a-4356-ae8d-9790aa42d30e', '1977b649-908c-46c3-836d-f4f6485427c2']

ids

In [61]:
# Unique project IDs to query, taken from the potential tree count project list
# (order of first appearance is kept)
tree_count_proj_ids = tree_count_df['project_id'].unique().tolist()
tree_count_proj_ids

['244eaf7e-e109-47b2-b84e-9ebe24508391',
 '5e8a3c5e-7a28-4ff4-be07-f950361f56b2',
 'e4108d7a-58d8-4604-8dd8-2f95c9c181d5',
 'd2c2a1fe-c5e8-435a-b865-00dce7a9809f']

## Pull projects from TerraMatch API

In [62]:
# Pull the selected projects from the production endpoint; the response is also
# saved to tm_api_pull_results_file so it can be re-loaded without a new pull
results = api.pull_wrapper(
    prod_url,
    headers,
    tree_count_proj_ids,
    outfile=tm_api_pull_results_file,
)

Pulling Projects: 100%|██████████| 4/4 [00:06<00:00,  1.72s/project]

Results saved to ../data/ppc_tree_count_elig/tm_api_response_prod_ppc_2025_tree_count_elig_2025-07-22.json





## Parse and save the API output

In [63]:
# Re-load the saved API response from disk
with open(tm_api_pull_results_file, 'r') as file:
    raw_json = file.read()
results = json.loads(raw_json)

In [65]:
# Turn the JSON response into a dataframe of selected fields, indicators included.
# NOTE(review): `outfile` presumably makes the parser save the table to polygon_features_file; confirm in api_utils.
results_df = api.parse_tm_api_results(
    results,
    outfile=polygon_features_file,
    parse_indicators=True,
)

### Filter PPC polygons by desired plantstart years

In [66]:
# Parse plantstart into datetimes (values that cannot be parsed are coerced to NaT)
plantstart_dt = pd.to_datetime(results_df['plantstart'], errors='coerce')
results_df['plantstart'] = plantstart_dt

# Keep the planting year in its own column
results_df['plantstart_year'] = plantstart_dt.dt.year

In [67]:
# Allowed plantstart years per project: project_id -> list of years
year_dict = {
    # EMA Maranhão
    '244eaf7e-e109-47b2-b84e-9ebe24508391': [2022],
    # CERT Rondonia
    '5e8a3c5e-7a28-4ff4-be07-f950361f56b2': [2022],
    # MDPS Flagship
    'e4108d7a-58d8-4604-8dd8-2f95c9c181d5': [2023, 2024],
    # Faja Lobi Reforestation Project
    'd2c2a1fe-c5e8-435a-b865-00dce7a9809f': [2022, 2023],
}

In [68]:
# Flag each polygon whose planting year is allowed for its project.
# A project missing from year_dict gets an empty allow-list and a missing year (NaN)
# never matches, so both kinds of rows are dropped by the filter.
# The mask is built with a comprehension rather than a row-wise DataFrame.apply so it is
# always a boolean Series aligned to results_df (apply returns a DataFrame when the
# frame is empty) and avoids constructing a Series object per row.
mask = pd.Series(
    [
        year in year_dict.get(project_id, [])
        for project_id, year in zip(results_df['project_id'], results_df['plantstart_year'])
    ],
    index=results_df.index,
    dtype=bool,
)

# Keep only the flagged polygons; copy so later edits do not touch results_df
results_df_filt = results_df[mask].copy()

In [69]:
# Write the year-filtered polygons to a CSV named after the run and run date
filtered_features_file = f"../data/{run_dir}/tm_api_{run_name}_filt_by_yr_{today}.csv"
results_df_filt.to_csv(filtered_features_file, index=False)

In [70]:
# Report the polygon counts before and after the plantstart-year filter
n_polygons_all = len(results_df)
n_polygons_kept = len(results_df_filt)
print(f"results_df has {n_polygons_all} polygons")
print(f"results_df_filt has {n_polygons_kept} polygons")

results_df has 201 polygons
results_df_filt has 196 polygons
