In [None]:
# IMPORTANT: The parameters below are defaults that apply only when this notebook is run on its own.
# When the full Ploomber pipeline is executed, these values are overridden by the settings in `pipeline.yaml`,
# so any modification made here does not persist when running the pipeline.
upstream = None
COUNTRY =  'ARM' # Country code; selects the country to process and is interpolated into the output path
# Output file written at the end of this notebook (stored under the 'data' key).
# NOTE(review): 'reccomendations' is misspelled in the file name; it is left unchanged here because
# other tasks may read this exact path -- confirm before renaming.
product = {'data': f'../data/processed/{COUNTRY}/sitans_reccomendations_by_goal_area.xlsx'}
data_source = 'data/corpora/sitans/'  # Directory holding the SitAn PDF files; joined with PROJ_ROOT when it is read

This notebook uses an LLM to analyze the Situation Analysis (SitAn) reports and identify the recommendations they contain. Each recommendation is categorized into one or more goal areas.

In [None]:
import re
from pathlib import Path
import pandas as pd

import unicef_cpe 
from unicef_cpe.config import PROJ_ROOT


In [None]:
# Name of the LLM passed to the extraction call further down.
MODEL = 'gpt-4o'

# Keep only the mapping entries whose key passes the `in COUNTRY` test, then
# build the reverse lookup (mapped value -> key) from the filtered mapping.
ecaro_mapping = unicef_cpe.utils.get_ecaro_countries_mapping(priority=False)
country_map = {
    code: name
    for code, name in ecaro_mapping.items()
    if code in COUNTRY
}
country_code_map = dict(zip(country_map.values(), country_map.keys()))

### Extract Recommendations from SitAns

In [None]:
# Build one record per SitAn PDF found in the source directory.
#   - country code: the part of the file name before the first '-', upper-cased
#   - year:         the first run of four digits in the file name
#   - section_text: whatever `extract_text_from_file` returns for the PDF
data = []
source_path = PROJ_ROOT / Path(data_source)
year_pattern = re.compile(r'\d{4}')

# Sorted so that files are processed (and any messages printed) in a stable order.
for file_path in sorted(source_path.glob('*.pdf')):
    year_match = year_pattern.search(file_path.name)
    if year_match is None:
        # A file without a year in its name cannot be placed on the timeline;
        # skip it explicitly instead of failing on `.group()` of None.
        print(f'Skipping {file_path.name}: no 4-digit year found in the file name')
        continue

    country_code = file_path.name.split('-')[0].upper()
    # `country_map` only holds the configured country, so other countries map to None.
    country = country_map.get(country_code)
    # The extractor is given the path itself, so the file is not opened here.
    text = unicef_cpe.extraction.extract_text_from_file(file_path)
    data.append((file_path.name, country_code, country, int(year_match.group()), text))

df_sitans = pd.DataFrame(
    data,
    columns=['file_name', 'country_code', 'country', 'year', 'section_text'],
).sort_values(['country_code', 'country', 'year', 'file_name'], ignore_index=True)

print('Shape:', df_sitans.shape)
print(df_sitans.head())

In [None]:
# For every year that has a SitAn report for the selected country, ask the LLM
# for recommendations grouped by goal area and flatten its reply into rows of
# [country_code, country, year, goal_area, recommendation].

# Upper bound on the number of characters sent to the model.
# NOTE(review): presumably sized to fit the model's context window -- confirm.
MAX_INPUT_CHARS = 110_000

df_list = []

country_name = country_map.get(COUNTRY)
country_mask = df_sitans['country'] == country_name
years = sorted(df_sitans[country_mask]['year'].unique())

for year in years:
    year_mask = df_sitans['year'] == year
    year_texts = df_sitans.loc[country_mask & year_mask, 'section_text']
    if year_texts.empty:
        # Skip if there's no narrative text for this combination of country and year
        print(f"No narrative text available for {country_name} in {year}")
        continue

    # When several reports share the same country and year, only the first is used.
    text = year_texts.iloc[0]
    print(f"Summarizing recommendations for {country_name} in {year}")

    # Extract recommendations and categorize them by goal area
    subjects = unicef_cpe.genai.extract_by_goal_area(
        text[:MAX_INPUT_CHARS],
        subject="recommendations",
        model=MODEL,
        api_type='openai',
    )

    # The reply is parsed as a list of "- **<goal area>**: <description>" items.
    # A leading newline is added so that the first item is split like the others;
    # everything before the first item marker is discarded.
    for subject in ('\n' + subjects).split('\n- **')[1:]:
        # Split on the FIRST '**:' only, so a description that itself contains
        # '**:' is kept whole instead of being truncated.
        heading, separator, description = subject.partition('**:')
        if not separator:
            # An item without the '**:' separator has no description to record;
            # report it and move on rather than failing with an IndexError.
            print(f"Skipping malformed item for {country_name} in {year}: {subject[:80]!r}")
            continue

        goal_area = heading.replace('- **', '').replace('**', '').replace('\n', '').strip()
        new_subject = description.replace('\n', '').strip()
        df_list.append([COUNTRY, country_name, year, goal_area, new_subject])

In [7]:
# Turn the collected rows into a table; the column order matches the order in
# which each row's values were appended.
recommendation_columns = ['country_code', 'country', 'year', 'goal_area', 'recommendation']
df_recommendations = pd.DataFrame(data=df_list, columns=recommendation_columns)

In [None]:
# Quick visual check of the first rows before exporting.
print(df_recommendations.head(n=5))

In [None]:
# Write the recommendations table to the Excel file declared under product['data'],
# creating the destination folder first if it does not exist yet.
output_path = Path(product['data'])
output_dir = output_path.parent
output_dir.mkdir(parents=True, exist_ok=True)
df_recommendations.to_excel(output_path, index=False)

In [10]:
################################################################################################################################################################################################