In [None]:
# NOTE: These defaults exist only so the notebook can be executed on its own.
# During a full Ploomber pipeline run they are replaced by the values declared
# in `pipeline.yaml`, so edits made in this cell have no effect on the pipeline.

COUNTRY = 'ARM'  # Country code; also used as the sub-folder name under data/processed

# Input: narratives split by section, read from the "03_coars_narratives" entry.
upstream = {"03_coars_narratives": {"data": f"../data/processed/{COUNTRY}/narrative_by_section.xlsx"}}

# Output: needs and challenges per goal area, written by this notebook.
product = {"data": f"../data/processed/{COUNTRY}/coar_needs_by_goal_area.xlsx"}




This notebook uses an LLM to analyze the COAR summaries and identify the main needs and challenges, which are then categorized into one or more goal areas.

In [None]:
import  unicef_cpe
import pandas as pd
from unicef_cpe.config import PROJ_ROOT
from pathlib import Path
from unicef_cpe import utils, genai

In [None]:
# Restrict the mapping to the selected country only, to avoid unexpected LLM
# costs and long execution times.
# COUNTRY is a single code string, so the filter must be an equality test:
# `code in COUNTRY` would be a substring match and could also let through any
# mapping key that merely happens to be contained in the code.
country_map = {
    code: name
    for code, name in utils.get_ecaro_countries_mapping(priority=False).items()
    if code == COUNTRY
}
# Reverse lookup: country name -> country code.
country_code_map = {name: code for code, name in country_map.items()}
# Model name passed to the extraction call.
MODEL = 'gpt-4o'

### Extract needs from COARs

In [None]:
# Load the COAR narratives from the file declared for the upstream task.
narratives_path = upstream['03_coars_narratives']['data']
df_coars = pd.read_excel(narratives_path)
print(f'Shape: {df_coars.shape}')
print(df_coars.head())


In [None]:
## Keep a single row per country and year: the first paragraph, which is the
## one describing the context.
# NOTE(review): GroupBy.first() returns the first non-null value of each column
# within a group, which is not necessarily one single source row - confirm this
# is the intended behaviour when the first section has missing values.
group_keys = ['country_code', 'country', 'year']
df_coars = df_coars.groupby(group_keys).first().reset_index()
print(df_coars.head())

In [None]:
## NOTE: only the context AFTER 2015 is analysed. The comparison is strict, so
## the earliest year kept is 2016.
print(f'Shape before: {df_coars.shape}')
year_mask = df_coars['year'] > 2015
df_coars = df_coars.loc[year_mask].copy()  # .copy() detaches the result from the unfiltered frame
print(f'Shape After: {df_coars.shape}')

In [None]:
def _parse_goal_area_items(response):
    """Split an LLM bullet-list answer into ``(goal_area, description)`` pairs.

    The answer is expected to contain one bullet per goal area, formatted as
    ``- **Goal area**: description``.

    Args:
        response: Raw text returned by the LLM. ``None`` or an empty string
            yields an empty list.

    Returns:
        A list of ``(goal_area, description)`` tuples, in the order in which
        the bullets appear. Bullets without the ``**:`` separator are reported
        and skipped instead of raising ``IndexError``.
    """
    pairs = []
    # Prepend a newline so the first bullet is matched by the same delimiter as
    # all the others; the element before the first bullet is discarded.
    for item in ('\n' + (response or '')).split('\n- **')[1:]:
        # partition() splits on the FIRST "**:" only, so a description that
        # itself contains "**:" (e.g. a bold sub-label) is kept in full.
        label, separator, description = item.partition('**:')
        if not separator:
            print(f"Skipping malformed item in LLM response: {item[:80]!r}")
            continue
        goal_area = label.replace('- **', '').replace('**', '').replace('\n', '').strip()
        pairs.append((goal_area, description.replace('\n', '').strip()))
    return pairs


df_list = []

country_name = country_map.get(COUNTRY)
if country_name is None:
    # Without a name no row can match below, so the output would be silently
    # empty; make the cause visible in the notebook output.
    print(f"Country code {COUNTRY!r} not found in the country mapping; no needs will be extracted.")

country_mask = df_coars['country'] == country_name

years = sorted(df_coars[country_mask]['year'].unique())

for year in years:
    year_mask = df_coars['year'] == year

    try:
        # We take only the text from the first section as it is the **context** section.
        # This section is where we can find the **needs**.
        text = df_coars[country_mask & year_mask]['section_text'].iloc[0]
    except IndexError:
        # Skip if there's no narrative text for this combination of country and year
        print(f"No narrative text available for {country_name} in {year}")
        continue

    if not isinstance(text, str) or not text.strip():
        # Missing (NaN) or blank narrative: do not spend an LLM call on it.
        print(f"No narrative text available for {country_name} in {year}")
        continue

    # Extract needs and categorize them by goal area
    subjects = unicef_cpe.genai.extract_by_goal_area(text, subject = "needs and challenges", model=MODEL, api_type='openai')

    # One output row per goal area mentioned in the LLM answer.
    for goal_area, need in _parse_goal_area_items(subjects):
        df_list.append([COUNTRY, country_name, year, goal_area, need])

df_needs = pd.DataFrame(df_list, columns=['country_code', 'country', 'year', 'goal_area', 'need'])

In [None]:
# Quick sanity check: preview the first five extracted rows.
print(df_needs.head(5))

In [None]:
# Write the extracted needs to the location declared in `product`.
output_path = Path(product['data'])
# Make sure the destination folder exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_needs.to_excel(output_path, index=False)

In [10]:
################################################################################################################################################################################################