In [None]:
# IMPORTANT: The parameters below are set only for running this notebook independently.
# When executing the full Ploomber pipeline, these values will be overridden by the settings in `pipeline.yaml`.
# Any modifications made here will not persist when running the pipeline.
upstream = None
COUNTRY =  'ARM' # Country code; rows are kept only when their derived `country_code` equals this value
product = {'data': f'../data/processed/{COUNTRY}/programme_structure.xlsx'}  # Path to save the final data product (stored under the 'data' key)
data_source = 'data/raw/insight-programme-programme-structure/Programme Structure.csv'  # Path to the source CSV file, resolved against PROJ_ROOT when loading

# Relevance

This notebook extracts the programme structure: outcomes and outputs with their codes and descriptions, together with the country programme they belong to.

In [None]:
import re
from pathlib import Path

import pandas as pd

import unicef_cpe
from unicef_cpe.config import PROJ_ROOT
from unicef_cpe.utils import *
from unicef_cpe.genai import *
from unicef_cpe.genai import *

In [None]:
# Restrict the ECARO mapping to the entries whose key is contained in COUNTRY.
ecaro_countries = unicef_cpe.utils.get_ecaro_countries_mapping(priority=False)
country_map = {
    key: value
    for key, value in ecaro_countries.items()
    if key in COUNTRY
}

# Inverted lookup (value -> key), used further down to turn the name taken
# from BUSINESS_AREA into the code that is compared against COUNTRY.
country_code_map = {value: key for key, value in country_map.items()}

In [None]:
# Resolve the raw CSV location against the project root, then load it as-is.
file_path = PROJ_ROOT.joinpath(data_source)
df_programme = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Two four-digit years separated by at least one non-digit character,
# e.g. "2016-2020" or "2016 to 2020". Compiled once rather than on every row.
_CP_YEAR_REGEX = re.compile(r'(\d{4})\D+(\d{4})')


def cp_start_end_year(row):
    """Extract the country-programme period from a row's COUNTRY_PROGRAMME_NAME.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must support ``row['COUNTRY_PROGRAMME_NAME']``.

    Returns
    -------
    tuple
        ``(start_year, end_year, label)`` where both years are ints and
        ``label`` is ``'CP (<start>-<end>)'``. ``(None, None, None)`` is
        returned when the name is missing, is not a string, or does not
        contain two four-digit years.
    """
    cp_name = row['COUNTRY_PROGRAMME_NAME']

    # Missing CSV cells arrive as float NaN; searching them with a regex
    # would raise TypeError, so treat any non-string as "no period found".
    if not isinstance(cp_name, str):
        return None, None, None

    match = _CP_YEAR_REGEX.search(cp_name)
    if match is None:
        return None, None, None

    start_year, end_year = match.groups()
    return int(start_year), int(end_year), f'CP ({start_year}-{end_year})'

# Run cp_start_end_year on every row; result_type='expand' spreads the
# returned 3-tuple over three columns: start year, end year and CP label.
cp_period = df_programme.apply(cp_start_end_year, axis=1, result_type='expand')
df_programme[['CP START YEAR', 'CP END YEAR', 'CP']] = cp_period

In [None]:
# Keep the text before the first hyphen of BUSINESS_AREA, trimmed of whitespace.
# The vectorised .str accessor leaves missing values in a text column as NaN,
# whereas a per-row `x.split(...)` lambda raises AttributeError on them.
df_programme['country'] = (
    df_programme['BUSINESS_AREA']
    .str.split('-', n=1)
    .str[0]
    .str.strip()
)
# Translate values found in country_code_map; anything not listed there is left unchanged.
df_programme['country_code'] = df_programme['country'].replace(country_code_map)

In [None]:
# Narrow the frame to the rows whose country code equals COUNTRY,
# then display how many rows remain per code as a quick sanity check.
is_selected_country = df_programme['country_code'] == COUNTRY
df_programme = df_programme.loc[is_selected_country].copy()
df_programme['country_code'].value_counts()

In [None]:
# Keep only rows whose country programme starts after 2014. Rows without a
# parsed start year compare as False and are dropped here as well.
df_wp = df_programme[df_programme['CP START YEAR'] > 2014].copy()

# UTILIZED is text with thousands separators (e.g. "1,234.5") when any value in
# the CSV contains a comma, but read_csv yields a numeric column when none do.
# The .str accessor raises AttributeError on numeric data, so only strip the
# separators when the column is not already numeric.
if not pd.api.types.is_numeric_dtype(df_wp['UTILIZED']):
    df_wp['UTILIZED'] = df_wp['UTILIZED'].str.replace(',', '', regex=False)
df_wp['UTILIZED'] = df_wp['UTILIZED'].astype(float)

# Swap the 'New Element' text inside IR_FULL_TEXT for an explicit notice
# (literal match, independent of the pandas version's regex default).
df_wp['IR_FULL_TEXT'] = df_wp['IR_FULL_TEXT'].str.replace(
    'New Element', 'No Description Available', regex=False
)



In [None]:
# Normalise the capitalisation of the three name columns to Title Case.
for name_column in ('COUNTRY_PROGRAMME_NAME', 'PCR_NAME', 'INTERMEDIATE_RESULT_NAME'):
    df_wp[name_column] = df_wp[name_column].str.title()

In [None]:
# Collapse to one row per combination of the grouping keys, summing UTILIZED.
# NOTE(review): groupby's default drops rows with a missing value in any key
# (e.g. an empty IR_FULL_TEXT) — confirm that is intended.
group_keys = [
    'country',
    'country_code',
    'CP',
    'PCR_NAME',
    'INTERMEDIATE_RESULT_NAME',
    'IR_FULL_TEXT',
]
utilized_totals = df_wp.groupby(group_keys)['UTILIZED'].sum()
df_wp = utilized_totals.reset_index()

In [None]:

# Write the aggregated table to the location given by product['data'].
output_path = Path(product['data'])
# Create any missing parent directories so the write cannot fail on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
# Write through the resolved path object so the directory that was created
# and the file that is written always refer to the same location.
df_wp.to_excel(output_path, index=False)

In [None]:
################################################################################################################################################################################################