# Pull PPC Polygons from TerraMatch API

This notebook sets up the process to pull PPC polygon geometries and metadata from the TerraMatch API.

In [None]:
import yaml
import pandas as pd
from tm_api_utils import pull_tm_api_data, patch_tm_api_data
from tqdm import tqdm
import json
import sys
from datetime import datetime
sys.path.append('../src/')
import api_utils as api
import process_tm_api_results as clean
import geospatial_utils_NEW as geo

## Set file paths

In [None]:
# Naming convention: run_name is embedded in both output file names below
run_name = 'ppc_2025_batch2'
#run_dir = 'ppc_batch2'

# Today's date (YYYY-MM-DD); it is embedded in both output file names below
today = datetime.today().strftime('%Y-%m-%d') # Check computer date before running

## Input Files
# List of all approved projects on TerraMatch
approved_projects_file = '../projects_all_approved_202502211226.csv'

# PPC Batch 2 Project List
batch2_projects = "../data/ppc/ppc_2025_batch2_project_list.csv"

# PPC Prospective Tree Count (Group 1) Projects
#tree_count_group1_file = '/home/darby/github_repos/tf-biophysical-monitoring/data/ppc/ppc_tree_count_projects_group1_20250509.csv'
#tree_count_file = '~/github_repos/tf-biophysical-monitoring/data/ppc/ppc_2025_potential_tree_count_projects_2025-07-16.csv'

## Output Files
# Directory that receives both output files. This absolute path is specific to
# one machine; it is defined once here so that running the notebook elsewhere
# only requires editing this single line.
ppc_data_dir = '/home/darby/github_repos/tf-biophysical-monitoring/data/ppc'

# A JSON file that stores the results of the TM API pull; we'll read it back in to clean the results (outfile, infile)
tm_api_pull_results_file = f'{ppc_data_dir}/tm_api_response_prod_{run_name}_{today}.json'

# The cleaned polygon features csv
polygon_features_file = f'{ppc_data_dir}/tm_api_{run_name}_{today}.csv'

## Read in files

In [None]:
# Load both input tables in one pass:
#   approved_projects_df - list of all approved projects on TerraMatch
#   batch2_projects_df   - PPC 2025 Batch 2 projects
approved_projects_df, batch2_projects_df = (
    pd.read_csv(csv_path)
    for csv_path in (approved_projects_file, batch2_projects)
)

# PPC Prospective Tree Count (Group 1) Projects
#tree_count_df = pd.read_csv(tree_count_group1_file)
#tree_count_df = pd.read_csv(tree_count_file)

## Set up token and API URL

In [None]:
# Read the access token from the local secrets file and build the
# bearer-token request headers
auth_path = '../secrets.yaml'
with open(auth_path) as auth_file:
    auth = yaml.safe_load(auth_file)
headers = dict(Authorization=f"Bearer {auth['access_token']}")

In [None]:
# TerraMatch API URLs (sitePolygons endpoint)
# Production: use to pull data for analysis
prod_url = "https://api.terramatch.org/research/v3/sitePolygons?"
# Staging: use for testing queries
staging_url = "https://api-staging.terramatch.org/research/v3/sitePolygons?"

## Create list of projects to pull

#### Pull Projects from group list (Batch 1, prospective tree count group 1, etc.)

In [None]:
# Collect the unique project_ids from the Batch 2 project list, then display them
batch2_proj_ids = [*batch2_projects_df['project_id'].unique()]
batch2_proj_ids
# Alternative: hard-coded project_id list
# batch2_proj_ids = ['02b3119e-9505-4dba-b58d-f2a967b71ef9', '5e8a3c5e-7a28-4ff4-be07-f950361f56b2', '5bb542b2-0efb-4b52-841f-2b5898f533b8',
#                     'ad149677-7ee0-479c-8d23-aa8c3bf58532', '7e7d390b-1894-4a1b-acc2-c531f213c1ca', 'd2c2a1fe-c5e8-435a-b865-00dce7a9809f',
#                     'c8ef8d8e-a75a-46f4-88d4-8057ed5a50f8', 'e4108d7a-58d8-4604-8dd8-2f95c9c181d5']

# Potential tree count projects list from Asana: https://app.asana.com/1/25496124013636/project/1208493878648584/task/1210412708479452?focus=true
#tree_count_proj_ids = list(tree_count_df['project_id'].unique())
#tree_count_proj_ids

#### OR Pull projects from the list of all approved projects

In [None]:
# Step 1: keep only the approved projects whose cohort is 'ppc'
ppc = approved_projects_df.loc[approved_projects_df['cohort'].eq('ppc')]

# Step 2: narrow those down to the project_ids in the Batch 2 list
batch2 = ppc.loc[ppc['project_id'].isin(batch2_proj_ids)]

## Pull polygons from TM API

In [None]:
# Query the production API for the Batch 2 project_ids; outfile is the JSON
# path that is read back in further down for cleaning
results = api.pull_wrapper(
    prod_url,
    headers,
    batch2_proj_ids,
    outfile=tm_api_pull_results_file,
)

In [None]:
# Wrap the raw pull results in a DataFrame for quick inspection
df = pd.DataFrame(data=results)

In [None]:
# Summarize the pull: row count, unique projects, unique polygons, rows per project_id
print(len(df))
print(f"df has {df.project_id.nunique()} unique projects")
print(f"df has {df.poly_id.nunique()} unique polygons")
# Printed explicitly: by default Jupyter renders only a cell's final expression,
# so a bare value_counts() in the middle of the cell would be computed and discarded.
print(df['project_id'].value_counts())
df.head(2)

In [None]:
# Check for NA values
#df.isna().sum()

## Clean attributes and save as csv

In [None]:
# Read the saved API pull results back in from the JSON file
with open(tm_api_pull_results_file) as json_file:
    project_results = json.load(json_file)

In [None]:
# Clean the API results loaded from the JSON file and save them as a csv
## Per the original notes, invalid plantstart and plantend dates are identified and converted to NaT
## outfile1 is the cleaned polygon features csv path; outfile2 is None here, so
## presumably no second copy is written -- confirm in process_tm_api_results
clean_api = clean.process_tm_api_results(
    project_results,
    '2020-01-01',  # NOTE(review): the meaning of this date is set by process_tm_api_results -- confirm
    outfile1=polygon_features_file,
    outfile2=None,
)

#### Check Processed CSV

In [None]:
# Read back the csv written by the cleaning step for this run. Using
# polygon_features_file (rather than a hard-coded, dated path) keeps the checks
# and the filtering below tied to the current run's output.
tc_df = pd.read_csv(polygon_features_file)

In [None]:
# Summarize the processed csv: row count, unique projects, unique polygons, rows per project_id
print(len(tc_df))
print(f"df has {tc_df.project_id.nunique()} unique projects")
print(f"df has {tc_df.poly_id.nunique()} unique polygons")
# Printed explicitly: by default Jupyter renders only a cell's final expression,
# so a bare value_counts() in the middle of the cell would be computed and discarded.
print(tc_df['project_id'].value_counts())
tc_df.head()

In [None]:
# Count missing values per column in the processed csv
tc_df.isnull().sum()

## Filter PPC polygons by desired plantstart years

In [None]:
# Filter the polygons using the Batch 2 project table, then report the
# resulting shape and preview the first two rows
tc_df_filt = clean.filter_by_years_of_interest(
    tc_df,
    batch2_projects_df,
)
print(tc_df_filt.shape)
tc_df_filt.head(n=2)

In [None]:
# Write the filtered polygons to polygon_features_file without the index column.
# This overwrites any existing file at that path.
tc_df_filt.to_csv(path_or_buf=polygon_features_file, index=False)

In [None]:
# Re-read the saved csv and print its shape as a sanity check
tc_df = pd.read_csv(filepath_or_buffer=polygon_features_file)
print(tc_df.shape)

In [None]:
# Count missing values per column in the re-loaded csv
tc_df.isnull().sum()