In [None]:
# IMPORTANT: The parameters below are defaults used only when this notebook is run on its own.
# When the full Ploomber pipeline is executed, these values are overridden by the settings in `pipeline.yaml`,
# so modifications made here do not carry over to pipeline runs.
upstream = None
COUNTRY =  'ARM' # Code of the country to process; source rows are filtered to those whose `country` equals this value
product = {'data': f'../data/processed/{COUNTRY}/CPD_priorities.xlsx'}  # Output location: the final Excel file is written to the path stored under the 'data' key
data_source = 'data/raw/unicef-ecaro-cpd-priorities.xlsx'  # Source Excel workbook (a file, not a directory); joined onto PROJ_ROOT when it is read

# Relevance

This notebook uses a large language model (LLM) to extract priorities from Country Programme Documents (CPDs) by matching section titles (it is a copy of the **CPD Mining** step in 01_main). Its input is a manually compiled dataset, built by copy-pasting text from the relevant sections of each CPD.

In [None]:
import re
import pandas as pd
from pathlib import Path
import unicef_cpe as cpe
from tqdm import tqdm
from unicef_cpe.config import PROJ_ROOT

# Model name handed to the summarisation helper further down the notebook.
MODEL = 'gpt-4o'

# Restrict the ECARO country mapping to the configured country.
# COUNTRY is normally a single code string; testing `key in COUNTRY` on a string is a
# substring check (e.g. 'AR' would match 'ARM'), so compare against an explicit set of
# codes instead. A list/tuple of codes is still accepted for backward compatibility.
_selected_codes = {COUNTRY} if isinstance(COUNTRY, str) else set(COUNTRY)
country_map = {
    key: value
    for key, value in cpe.utils.get_ecaro_countries_mapping(priority=False).items()
    if key in _selected_codes
}
# Inverse lookup (mapping value -> mapping key) of the filtered mapping above.
country_code_map = {value: key for key, value in country_map.items()}


In [None]:
# Read the CPD priorities workbook and keep only the rows for the configured country.
file_path = PROJ_ROOT / data_source
df_priorities = pd.read_excel(file_path)

print('Shape before:', df_priorities.shape)

mask = df_priorities['country'].eq(COUNTRY)
# Take an explicit copy of the filtered rows: later cells add columns to this frame,
# and assigning onto a boolean-mask slice triggers pandas' SettingWithCopyWarning.
df_priorities = df_priorities.loc[mask].copy()
print('Shape after:', df_priorities.shape)

# Last expression of the cell, so the notebook renders it with the rich table display.
df_priorities.head()

In [None]:
print('Shape before:', df_priorities.shape)

# Build one prompt per row from the priority title, programme period and section text.
# The prompts are kept in a plain list so no temporary column is added to
# (and later dropped from) the frame.
prompts = [
    '### {priority} ({period})\n\n{text}'.format(**row)
    for _, row in df_priorities.iterrows()
]

# One LLM call per prompt. Wrapping the list (rather than a generator) lets tqdm
# infer the total and show real progress instead of a bare counter.
summaries = [
    cpe.genai.summarise_cpd_priority(prompt, model=MODEL)
    for prompt in tqdm(prompts, desc='Summarising CPD priorities')
]

# Attach all summaries in one step. This yields a new frame (no mutation while
# iterating) and guarantees the 'summary' column exists even when there are no rows.
df_priorities = df_priorities.assign(summary=summaries)

print('Shape after:', df_priorities.shape)

# Last expression of the cell, so the notebook renders it with the rich table display.
df_priorities.head()

In [None]:
# Persist the summarised priorities to the location declared as this task's product.
output_path = Path(product['data'])

# The country-specific output folder may not exist yet, so create it (and any parents) first.
output_dir = output_path.parent
output_dir.mkdir(exist_ok=True, parents=True)

df_priorities.to_excel(product['data'], index=False)