# Pull Polygons from TerraMatch API (Simple)

This notebook sets up the process to pull polygon geometries and metadata from the TerraMatch API.

Updated to include indicators and to simplify preprocessing.

In [1]:
import yaml
import pandas as pd
from tm_api_utils import pull_tm_api_data, patch_tm_api_data
from tqdm import tqdm
import json
import sys
from datetime import datetime
sys.path.append('../src/')
import api_utils as api
import process_tm_api_results as clean

## Set file paths

In [2]:
## PARAMS
# Labels for this run (referenced when the output file path is built)
run_dir = 'test'
run_name = 'ppc_test'

# Date stamp (YYYY-MM-DD) taken from the local system clock --
# check the computer's date before running
today = f"{datetime.today():%Y-%m-%d}"

In [3]:
## FILES
# CSV listing the approved projects
approved_projects_file = '../projects_all_approved_202502211226.csv'

# JSON file holding the TM API pull results: written by the pull step and
# read back in afterwards to clean the results (outfile, infile)
tm_api_pull_results_file = (
    f'../data/{run_dir}/'
    f'tm_api_response_prod_{run_name}_{today}.json'
)


## Set up token & API URL

In [4]:
# Set up token access
# The bearer token is read from a local YAML secrets file.
auth_path = '../secrets.yaml'
with open(auth_path) as auth_file:
    auth = yaml.safe_load(auth_file)

# Fail early with a clear message: an empty secrets file makes safe_load return
# None, and a missing key would otherwise surface as an opaque error below.
if not isinstance(auth, dict) or not auth.get('access_token'):
    raise KeyError(f"'access_token' not found in {auth_path}")

# Authorization header sent with every API request
headers = {
    'Authorization': f"Bearer {auth['access_token']}"
    }

In [5]:
# TerraMatch API URLs (sitePolygons endpoint, research v3)
# Production: use to pull data for analysis
prod_url = "https://api.terramatch.org/research/v3/sitePolygons?"
# Staging: use for testing queries
staging_url = "https://api-staging.terramatch.org/research/v3/sitePolygons?"

## Create list of projects to pull

In [6]:
# Load the list of approved projects (2025-02-21) into a DataFrame
full = pd.read_csv(filepath_or_buffer=approved_projects_file)

In [7]:
# Split the approved projects by cohort. Cohort 1 ('terrafund') is further
# divided into projects inside and outside the TF landscape countries.
tf_landscape_countries = ['BI', 'CD', 'RW', 'KE', 'GH']

cohort1 = full.loc[full['cohort'] == 'terrafund']
in_tf_landscape = cohort1['country'].isin(tf_landscape_countries)
cohort1_landscapes = cohort1.loc[in_tf_landscape]
cohort1_non_landscapes = cohort1.loc[~in_tf_landscape]

cohort2 = full.loc[full['cohort'] == 'terrafund-landscapes']

ppc = full.loc[full['cohort'] == 'ppc']

In [8]:
# All unique PPC project ids, in order of first appearance so the list is
# the same on every run
ppc_ids = ppc['project_id'].dropna().unique().tolist()

# Short list of ids for testing
# (second test project, currently not pulled: '465f543e-d53a-4356-ae8d-9790aa42d30e')
test_ids = ['1977b649-908c-46c3-836d-f4f6485427c2']

# Project ids to query; switch to `ppc_ids` to pull every PPC project
ids = test_ids
ids

['1977b649-908c-46c3-836d-f4f6485427c2']

## Pull projects from TerraMatch API

In [9]:
# Pull the projects listed in `ids` from the production API; the results are
# also saved to `tm_api_pull_results_file`
results = api.pull_wrapper(
    prod_url,
    headers,
    ids,
    outfile=tm_api_pull_results_file,
)

Pulling Projects: 100%|██████████| 1/1 [00:00<00:00,  1.41project/s]

Results saved to ../data/test/tm_api_response_prod_ppc_test_2025-05-16.json





In [10]:
# Read the saved API response back in from the JSON file
with open(tm_api_pull_results_file) as results_fh:
    project_results = json.load(results_fh)

In [11]:
# Inspect the raw records loaded from the JSON file
project_results

[{'status': 'approved',
  'plantStart': '2024-12-01',
  'calcArea': 1.056797182082,
  'plantEnd': '2025-01-31',
  'practice': 'direct-seeding',
  'targetSys': 'natural-forest',
  'distr': 'partial',
  'numTrees': 0,
  'name': '3561-02',
  'siteId': 'a7b60544-61e1-42de-8a3e-8d88e459d1eb',
  'projectId': '1977b649-908c-46c3-836d-f4f6485427c2',
  'indicators': [{'indicatorSlug': 'treeCoverLoss',
    'yearOfAnalysis': 2025,
    'value': {'2015': 0,
     '2016': 0,
     '2017': 0,
     '2018': 0,
     '2019': 0,
     '2020': 0,
     '2021': 0,
     '2022': 0,
     '2023': 0,
     '2024': 0}},
   {'indicatorSlug': 'treeCoverLossFires',
    'yearOfAnalysis': 2025,
    'value': {'2015': 0,
     '2016': 0,
     '2017': 0,
     '2018': 0,
     '2019': 0,
     '2020': 0,
     '2021': 0,
     '2022': 0,
     '2023': 0,
     '2024': 0}},
   {'indicatorSlug': 'restorationByStrategy',
    'yearOfAnalysis': 2025,
    'value': {'direct-seeding': 1.0567971820820106}},
   {'indicatorSlug': 'restorationBy

In [13]:
# Tabulate the raw records and list the resulting columns
project_df = pd.DataFrame(data=project_results)
project_df.columns

Index(['status', 'plantStart', 'calcArea', 'plantEnd', 'practice', 'targetSys',
       'distr', 'numTrees', 'name', 'siteId', 'projectId', 'indicators',
       'siteName', 'geometry', 'establishmentTreeSpecies', 'reportingPeriods',
       'lightResource', 'poly_id', 'project_id'],
      dtype='object')

In [None]:
def parse_tm_api_results(results, outfile=None):
    """
    Converts TerraMatch API results JSON into a structured DataFrame with selected fields.

    Each record in `results` becomes one row. A fixed set of basic attributes is
    copied into snake_case columns, and every recognised entry of the record's
    'indicators' list is stored (as the full indicator dict) in its own column.

    Args:
        results (list): Raw JSON results from the API (list of dicts).
            None is treated as an empty list.
        outfile (str, optional): Path to save the resulting DataFrame as CSV.
            Nothing is written when this is None or empty.
    Returns:
        final_df (pd.DataFrame): Structured dataframe with selected fields.
            Indicator columns are only present for indicators found in at
            least one record. Empty input yields an empty frame that still
            carries the basic attribute columns.
    """
    # Mapping from indicator slug to preferred column name
    slug_to_colname = {
        'treeCover': 'tree_cover',
        'treeCoverLoss': 'tree_cover_loss',
        'treeCoverLossFires': 'tree_cover_loss_fires',
        'restorationByStrategy': 'restoration_by_strat',
        'restorationByLandUse': 'restoration_by_land_use'
        }

    # Mapping from output column name to the key used in the API record
    base_fields = {
        'project_id': 'project_id',
        'poly_id': 'poly_id',
        'site_id': 'siteId',
        'geometry': 'geometry',
        'plantstart': 'plantStart',
        'plantend': 'plantEnd',
        'practice': 'practice',
        'target_sys': 'targetSys',
        'dist': 'distr',
        }

    extracted_data = []

    # Iterate over each entry in the results JSON to extract polygon
    for poly in results or []:
        # Basic attributes
        row_data = {col: poly.get(key) for col, key in base_fields.items()}
        row_data['project_phase'] = poly.get('projectPhase', '')  # default if missing

        # Parse the 'indicators' list into separate columns. The key may be
        # absent or explicitly null, so fall back to an empty list.
        for indicator in poly.get('indicators') or []:
            if not isinstance(indicator, dict):
                continue  # skip malformed entries rather than crash on .get()
            slug = indicator.get('indicatorSlug')
            if slug in slug_to_colname:
                row_data[slug_to_colname[slug]] = indicator  # Keep the full dictionary here

        extracted_data.append(row_data)

    if extracted_data:
        final_df = pd.DataFrame(extracted_data)
    else:
        # Keep the basic columns so downstream column lookups still work
        final_df = pd.DataFrame(columns=[*base_fields, 'project_phase'])

    # Save results only when a destination was given
    if outfile:
        final_df.to_csv(outfile, index=False)

    return final_df

In [27]:
# Run the parser on the pulled results (no output path supplied) and show the table
test = parse_tm_api_results(results, outfile=None)
test

Unnamed: 0,project_id,poly_id,site_id,geometry,plantstart,plantend,practice,target_sys,dist,project_phase,tree_cover_loss,tree_cover_loss_fires,restoration_by_strat,restoration_by_land_use
0,1977b649-908c-46c3-836d-f4f6485427c2,0e9b287a-fd4f-4eb1-8665-2e9e9cc269fc,a7b60544-61e1-42de-8a3e-8d88e459d1eb,"{'type': 'Polygon', 'coordinates': [[[-40.1244...",2024-12-01,2025-01-31,direct-seeding,natural-forest,partial,,"{'indicatorSlug': 'treeCoverLoss', 'yearOfAnal...","{'indicatorSlug': 'treeCoverLossFires', 'yearO...","{'indicatorSlug': 'restorationByStrategy', 'ye...","{'indicatorSlug': 'restorationByLandUse', 'yea..."
1,1977b649-908c-46c3-836d-f4f6485427c2,2005a901-3d5d-4591-805a-b765a82b995b,a7b60544-61e1-42de-8a3e-8d88e459d1eb,"{'type': 'Polygon', 'coordinates': [[[-40.1193...",2024-12-01,2025-01-31,direct-seeding,natural-forest,partial,,"{'indicatorSlug': 'treeCoverLoss', 'yearOfAnal...","{'indicatorSlug': 'treeCoverLossFires', 'yearO...","{'indicatorSlug': 'restorationByStrategy', 'ye...","{'indicatorSlug': 'restorationByLandUse', 'yea..."


In [20]:
# Look at the full treeCoverLoss indicator dict stored for the first row
test['tree_cover_loss'].iloc[0]

{'indicatorSlug': 'treeCoverLoss',
 'yearOfAnalysis': 2025,
 'value': {'2015': 0,
  '2016': 0,
  '2017': 0,
  '2018': 0,
  '2019': 0,
  '2020': 0,
  '2021': 0,
  '2022': 0,
  '2023': 0,
  '2024': 0}}

In [None]:
# Mapping from indicator slug to preferred column name.
# NOTE(review): parse_tm_api_results names the 'restorationByStrategy' column
# 'restoration_by_strat' -- confirm which spelling downstream code expects.
slug_to_colname = {
    'treeCover': 'tree_cover',
    'treeCoverLoss': 'tree_cover_loss',
    'treeCoverLossFires': 'tree_cover_loss_fires',
    'restorationByStrategy': 'restoration_by_strategy',
    'restorationByLandUse': 'restoration_by_land_use'
}

def extract_indicators(row):
    """
    Spread one record's 'indicators' list into one entry per recognised indicator.

    Args:
        row (pd.Series or dict): Record with an 'indicators' entry, expected to
            hold a list of indicator dicts.
    Returns:
        pd.Series: Indexed by column name (see slug_to_colname); each value is
            the full indicator dict. Empty when 'indicators' is not a list or
            holds no recognised slug.
    """
    indicators_list = row['indicators']
    # Missing values show up as None/NaN rather than a list; treat them as "no indicators"
    if not isinstance(indicators_list, (list, tuple)):
        indicators_list = []

    result = {}
    for indicator in indicators_list:
        if not isinstance(indicator, dict):
            continue  # skip malformed entries rather than crash on .get()
        slug = indicator.get('indicatorSlug')
        if slug in slug_to_colname:
            col_name = slug_to_colname[slug]
            result[col_name] = indicator
    # Explicit dtype keeps the empty case well-defined (values are dicts anyway)
    return pd.Series(result, dtype=object)

# Expand each record's indicators into one column per recognised slug
indicator_cols = project_df.apply(extract_indicators, axis=1)

# Drop indicator columns left over from an earlier run of this cell, so that
# re-running it replaces them instead of appending duplicate columns.
project_df = pd.concat(
    [
        project_df.drop(columns=list(slug_to_colname.values()), errors='ignore'),
        indicator_cols,
    ],
    axis=1,
)

In [None]:
# Show the polygon table, including the indicator columns added by the previous cell
project_df