# Pull Polygons from TerraMatch API for PPC (Simple)

This notebook pulls polygon geometries and metadata from the TerraMatch API for PPC projects, including the
hectares-by-WWF-ecoregion indicator.

Updated to include indicators and to simplify preprocessing.

In [None]:
import yaml
import pandas as pd
from tm_api_utils import pull_tm_api_data, patch_tm_api_data
from tqdm import tqdm
import json
import sys
from datetime import datetime
sys.path.append('../src/')
import api_utils as api
import process_tm_api_results as clean
import geospatial_utils_NEW as geo

## Set file paths

In [None]:
## PARAMS
# Naming convention: `run_dir` is the sub-folder under ../data/ and `run_name`
# is embedded in the output file names for this run.
run_dir = 'ppc_tree_count_elig'
run_name = 'ppc_tree_count_elig_final'

# Date stamp (YYYY-MM-DD) used in the output file names.
# Check the computer date before running (if out of sync, run `sudo hwclock -s`).
today = f"{datetime.today():%Y-%m-%d}"

In [None]:
## FILES
# --- Inputs ---
# CSV listing all approved projects
approved_projects_file = "../projects_all_approved_202502211226.csv"

# PPC 2025 Batch 3 Projects (currently disabled)
#ppc_batch3_file = '../data/ppc/ppc_2025_batch3_project_list.csv'

# PPC 2025 Potential Tree Count Projects (Eligibility Check #3)
ppc_tree_count_file = "../data/ppc_tree_count_elig/ppc_2025_potential_tree_count_projects_final_2025-12-10.csv"

# --- Outputs ---
# Raw TM API response (JSON): stored by the pull step, then read back in to clean the results
tm_api_pull_results_file = f"../data/{run_dir}/tm_api_response_prod_{run_name}_{today}.json"

# Parsed polygon features from the TM API pull (CSV)
polygon_features_file = f"../data/{run_dir}/tm_api_{run_name}_{today}.csv"

## Set up token & API URL

In [None]:
# Read the TerraMatch access token from the local secrets file and build
# the Authorization header sent with every API request.
auth_path = '../secrets.yaml'
with open(auth_path) as auth_file:
    auth = yaml.safe_load(auth_file)

headers = dict(Authorization=f"Bearer {auth['terramatch']['access_token']}")

In [None]:
# TerraMatch API URLs (sitePolygons endpoint)
# Staging: use for testing queries
staging_url = "https://api-staging.terramatch.org/research/v3/sitePolygons?"
# Production: use to pull data for analysis
prod_url = "https://api.terramatch.org/research/v3/sitePolygons?"

## Create list of projects to pull

In [None]:
# Load the two project tables used by this notebook:
#   projects - the project list read from ppc_tree_count_file
#   full     - the list of approved projects (2025-02-21)
projects = pd.read_csv(ppc_tree_count_file)
full = pd.read_csv(approved_projects_file)

In [None]:
# # Create lists of projects by Cohort (and split cohort 1 into projects within the TF landscapes and outside of the TF landscapes)
# cohort1 = full[full['cohort'] == 'terrafund']
# cohort1_landscapes = cohort1[cohort1['country'].isin(['BI', 'CD', 'RW', 'KE', 'GH'])]
# cohort1_non_landscapes = cohort1[~cohort1['country'].isin(['BI', 'CD', 'RW', 'KE', 'GH'])]
# cohort2 = full[full['cohort'] == 'terrafund-landscapes']

# ppc = full[full['cohort'] == 'ppc']

In [None]:
# Create a list of project ids to query
#ids = list(set(cohort1.project_id))
#len(ids)

# Create a short list of ids for testing 
#ids = ['244eaf7e-e109-47b2-b84e-9ebe24508391', '24d8c9a2-b8ef-481c-930b-78c9aeaf239e', 'f17dd6cf-8187-4edd-895e-07013d4990c9', '1115dda6-0165-4099-b52f-0ac53595c3a9']
#len(ids)

In [None]:
# Build the list of project IDs to query from the `projects` table.
# Earlier variants, kept for reference:
# From the list of Tree Count Eligibility (Round 2) projects (9/5/25)
# tree_count_proj_ids = list(projects['project_id'].unique())
# tree_count_proj_ids

# tree_count_proj_ids = ['5e8a3c5e-7a28-4ff4-be07-f950361f56b2']
# tree_count_proj_ids


# ## From the list of PPC Batch 2 Projects (9/22/25)
# NOTE(review): the label above says "Batch 2", the variable says "batch3", and
# `projects` is read from ppc_tree_count_file - confirm which list is intended.
# Unique project IDs, in order of first appearance.
batch3_ids = list(pd.unique(projects['project_id']))
batch3_ids

## Pull projects from TerraMatch API

In [None]:
# Query the production API for every project ID in batch3_ids, passing the
# JSON results path as the outfile (no modified-since filter is applied).
results = api.pull_wrapper(
    prod_url,
    headers,
    batch3_ids,
    modified_since=None,
    outfile=tm_api_pull_results_file,
)

## Parse and save the API output

In [None]:
# Read the saved API response back in from the JSON file
with open(tm_api_pull_results_file) as json_file:
    results = json.load(json_file)

In [None]:
# Turn the JSON output into a dataframe of selected fields, with indicator
# parsing enabled; polygon_features_file is passed as the outfile.
results_df = api.parse_tm_api_results(
    results,
    outfile=polygon_features_file,
    parse_indicators=True,
)
print(len(results_df))

In [None]:
# Number of distinct project IDs in the parsed results
results_df.project_id.nunique()

### Filter PPC polygons by desired plantstart years

In [None]:
# Reload the polygons dataframe from the CSV at polygon_features_file
df = pd.read_csv(filepath_or_buffer=polygon_features_file)

In [None]:
# Filter the reloaded polygons by the year(s) of interest; the `projects`
# table is passed as the years lookup.
results_df_filt = clean.filter_by_years_of_interest(
    polygons_df=df,
    years_df=projects,
)

In [None]:
# Report polygon counts before and after the year filter
print(
    f"results_df had {len(results_df)} polygons",
    f"results_df_filt has {len(results_df_filt)} polygons",
    sep="\n",
)

In [None]:
# Non-null poly_id count per project, before filtering
(
    results_df
    .groupby('project_id')['poly_id']
    .count()
)

In [None]:
# Non-null poly_id count per project, after filtering
(
    results_df_filt
    .groupby('project_id')['poly_id']
    .count()
)

In [None]:
# Save the filtered dataframe (overwrite the unfiltered data).
# Reuse polygon_features_file rather than re-building the same f-string, so the
# overwrite target cannot drift from the CSV path that was written and read
# earlier in the notebook.
results_df_filt.to_csv(polygon_features_file, index=False)
print(f"Saved {len(results_df_filt)} features to {polygon_features_file}")

In [None]:
# Optional: also write the filtered dataframe out as a GeoJSON file
geo.df_to_geojson(
    df=results_df_filt,
    geometry_col='geometry',
    output_path=f'../data/{run_dir}/tm_api_{run_name}_{today}.geojson',
    crs='EPSG:4326',
)