In [None]:
# IMPORTANT: The parameters below are set only for running this notebook independently.
# When executing the full Ploomber pipeline, these values will be overridden by the settings in `pipeline.yaml`.
# Any modifications made here will not persist when running the pipeline.
upstream = None
COUNTRY =  'ARM' # Code of the country to process; used as the lookup key into the country mapping and in the output path
product = {'data': f'../data/processed/{COUNTRY}/unsdcf_df_priorities_by_goal_area.xlsx'}  # Path to save the final data product (stored under the 'data' key)
data_source = 'data/corpora/unsdcf/'  # Source data directory; joined onto PROJ_ROOT when the PDFs are loaded

This notebook uses an LLM to analyze United Nations Sustainable Development Cooperation Framework (UNSDCF) documents and summarize the national priorities of the country selected by `COUNTRY`. These priorities are categorized into one or more goal areas.

In [None]:
import re
from pathlib import Path
import pandas as pd
import unicef_cpe 
from unicef_cpe.config import PROJ_ROOT

In [None]:
# Model name passed to the goal-area extraction call further down.
MODEL = 'gpt-4o'

# Keep only the configured country in the code -> country mapping.
# The comparison is an exact match on purpose: COUNTRY is a string, so a
# membership test (`in`) would be a substring test and could also keep any key
# that merely happens to be a fragment of COUNTRY.
country_map = {
    code: name
    for code, name in unicef_cpe.utils.get_ecaro_countries_mapping(priority=False).items()
    if code == COUNTRY
}

# Reverse lookup (mapped value -> code) for the same, already filtered, entries.
country_code_map = {name: code for code, name in country_map.items()}

### Extract Recommendations from UNSDCF

In [None]:
data = []
source_path = PROJ_ROOT / Path(data_source)

# Build one record per PDF found directly under the source directory.
for file_path in source_path.glob('*.pdf'):
    # The leading token of the file name (up to the first '-') is taken as the country code.
    country_code, *_ = file_path.name.split('-')
    country_code = country_code.upper()
    # Resolves to None for any code that is not present in `country_map`.
    country = country_map.get(country_code)

    # The period covered by the document is read from a 'YYYY-YYYY' token in the file name.
    # Fail with an explicit message instead of an AttributeError on `None.group()`.
    year_match = re.search(r'\d{4}-\d{4}', file_path.name)
    if year_match is None:
        raise ValueError(
            f"Cannot find a 'YYYY-YYYY' period in UNSDCF file name: {file_path.name}"
        )
    year = year_match.group()

    # The extraction helper is given the path itself, so no file handle is
    # opened here (a text-mode handle on a PDF would be unused anyway).
    text = unicef_cpe.extraction.extract_text_from_file(file_path)
    data.append((file_path.name, country_code, country, year, text))

# Sort into a new frame (no in-place mutation) so re-running the cell is idempotent.
df_doc = pd.DataFrame(
    data,
    columns=['file_name', 'country_code', 'country', 'year', 'section_text'],
).sort_values(['country_code', 'country', 'year', 'file_name'], ignore_index=True)
print('Shape:', df_doc.shape)
print(df_doc.head())


In [5]:
def iterate_until_non_one_start(items):
    """Return the entries of ``items`` that start with a chapter-1 prefix.

    An entry qualifies when it begins with the digit ``1`` immediately followed
    by a period or a space (for example ``"1. Intro"``, ``"1.2 Scope"`` or
    ``"1 Context"``). Despite the function name, the scan never stops early:
    the whole iterable is walked and every qualifying entry is kept, in its
    original order.

    Args:
        items: Iterable of strings (section titles).

    Returns:
        list: The qualifying entries; empty when none qualifies.
    """
    chapter_one_prefix = re.compile(r"^1(\.| )")
    return [title for title in items if chapter_one_prefix.match(title)]

In [6]:
data = []

# Break every loaded document into its titled sections.
for _index, row in df_doc.iterrows():

    # Extract section titles
    section_titles = unicef_cpe.document_processing.extract_section_titles(
        row['section_text'],
        threshold=50,
        base_min_lines_between_titles=10,
        start_with_digit=True,
    )

    # Keep only the titles of the first chapter; this is the relevant one
    filtered_section_titles = iterate_until_non_one_start(section_titles)

    # Pair each retained title with the text that belongs to it
    sections = unicef_cpe.document_processing.extract_sections_with_text(
        row['section_text'], filtered_section_titles
    )

    # One output record per (title, text) pair, with the text cleaned
    for section_title, section_text in sections:
        cleaned_section_text = unicef_cpe.utils.clean_text(section_text)
        data.append({
            'country_code': row['country_code'],
            'country': row['country'],
            'section_title': section_title,
            'section_text': cleaned_section_text,
            'year': row['year'],
            'file_name': row['file_name'],
        })

# Declare the columns explicitly so the frame keeps its schema even when no
# section was extracted; a column-less empty frame would raise KeyError on the
# first column lookup.
df_sections = pd.DataFrame(
    data,
    columns=['country_code', 'country', 'section_title', 'section_text', 'year', 'file_name'],
)

In [None]:
df_list = []

country_name = country_map.get(COUNTRY)
# Take the code from the configured country instead of relying on whatever
# value an earlier loop left behind in `country_code`; that leftover belongs to
# the last file processed, which need not be a file of this country.
country_code = country_code_map.get(country_name, COUNTRY)

country_mask = df_sections['country'] == country_name
years = sorted(df_sections[country_mask]['year'].unique())

for year in years:
    year_mask = df_sections['year'] == year

    # Concatenate every section of this country/period as "<title>\n<text>",
    # with a blank line between sections.
    df_filtered = df_sections[country_mask & year_mask].copy()
    df_filtered['title_text_combined'] = df_filtered['section_title'] + '\n' + df_filtered['section_text']
    text = '\n\n'.join(df_filtered['title_text_combined'].tolist())

    # Explicit guard in place of an `except IndexError` that nothing above could trigger.
    if not text.strip():
        print(f"No narrative text available for {country_name} in {year}")
        continue

    print(f"Summarizing national priorities for {country_name} in {year}")
    # Extract the priorities and categorize them by goal area
    subjects = unicef_cpe.genai.extract_by_goal_area(text, subject = "national priorities", model=MODEL, api_type='openai')

    # The parsing below expects top-level bullets shaped like
    # "- **<goal area>**: <description>" (assumed model output format - confirm
    # against the prompt used by extract_by_goal_area).
    # A newline is prepended so the first bullet is split off like the others.
    subjects = '\n' + subjects
    subject_list = subjects.split('\n- **')[1:]  # Skip whatever precedes the first bullet
    for subject in subject_list:
        # Split on the FIRST '**:' only, so a description that itself contains
        # '**:' (e.g. nested bold labels) is kept whole instead of being cut.
        subject_description = subject.split('**:', 1)
        if len(subject_description) > 1:
            goal_area = subject_description[0].replace('- **', '').replace('**', '').replace('\n', '').strip()
            new_subject = subject_description[1].replace('\n', '').strip()
            df_list.append([country_code, country_name, year, goal_area, new_subject])

In [8]:
# Turn the collected rows (one per parsed priority bullet) into a table.
priority_columns = ['country_code', 'country', 'year', 'goal_area', 'priority']
df_priorities = pd.DataFrame(data=df_list, columns=priority_columns)

In [None]:
# Print the first rows of the priorities table as a quick sanity check.
priorities_preview = df_priorities.head()
print(priorities_preview)

In [None]:
# Write the priorities table to the product location as an Excel workbook
# (without the index column), creating the destination folder if needed.
output_path = Path(product['data'])
output_folder = output_path.parent
output_folder.mkdir(parents=True, exist_ok=True)
df_priorities.to_excel(output_path, index=False)

In [11]:
################################################################################################################################################################################################