In [None]:
# IMPORTANT: The parameters below are defaults used only when this notebook is run on its own.
# When the full Ploomber pipeline is executed, these values are overridden by the settings in `pipeline.yaml`,
# so any modifications made here will not persist when running the pipeline.
upstream = None
COUNTRY =  'ARM' # Code of the country; used to build the output path and to filter the country mapping
product = {'data': f'../data/processed/{COUNTRY}/funds_goal_result_view.xlsx'}  # Output locations; the final Excel data product is written to the path stored under the 'data' key
data_source = 'data/raw/insight-funds-utilization-by-goal-and-result-area'  # Source data directory; joined onto PROJ_ROOT when the .xlsx files are loaded

This notebook summarizes funds utilization by goal area and result area.

In [None]:
import re
import pandas as pd
from pathlib import Path
from unicef_cpe.config import PROJ_ROOT

from unicef_cpe.utils import *

In [None]:
# Restrict the ECARO mapping to the entries whose key is contained in COUNTRY.
# NOTE(review): keys are presumably country codes and values country names — confirm against
# get_ecaro_countries_mapping.
ecaro_mapping = get_ecaro_countries_mapping(priority=False)
country_map = {
    key: value
    for key, value in ecaro_mapping.items()
    if key in COUNTRY
}

# Inverse lookup of country_map (value -> key).
country_code_map = dict(zip(country_map.values(), country_map.keys()))

In [3]:
def prepare_df_from_funds_utilization_xlsx(df_funds, columns):
    """Strip the preamble of a raw funds-utilization sheet and rename its columns.

    The sheet is expected to contain a header row whose ``'Year'`` cell holds the
    literal text ``'Row Labels'``; everything up to and including that row is
    discarded.

    Parameters
    ----------
    df_funds : pandas.DataFrame
        Raw sheet as loaded from Excel. Must have a ``'Year'`` column. It is not
        modified.
    columns : list of str
        New column names; the length must equal the number of columns in
        ``df_funds``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows below the header row, re-indexed from 0
        and labelled with ``columns``.

    Raises
    ------
    IndexError
        If no row has ``'Row Labels'`` in the ``'Year'`` column.
    """
    # Step 1: locate the header row that contains "Row Labels".
    header_rows = df_funds.index[df_funds['Year'] == 'Row Labels']
    if len(header_rows) == 0:
        raise IndexError("No 'Row Labels' header row found in the 'Year' column")

    # Step 2: keep only the rows below the header. reset_index returns a new
    # frame, so nothing is mutated in place on a slice of the input.
    data_rows = df_funds.loc[header_rows[0] + 1:].reset_index(drop=True)

    # Step 3: rename the columns.
    data_rows.columns = columns
    return data_rows

In [4]:
# Parsing function to extract and categorize details
def extract_details(row):
    """Classify one 'Row Labels' value and split it into name and code.

    Three label shapes are recognised:

    * ``"<name> - <digits>"``  -> business area (name may contain several words)
    * ``"NN <text>"``          -> goal area
    * ``"NN-NN <text>"``       -> result area

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'Row Labels'`` (e.g. a DataFrame row).

    Returns
    -------
    tuple
        ``(business_area, business_area_code, goal_area, goal_area_code,
        result_area, result_area_code)``. Only the pair matching the label shape
        is filled; all other positions are ``None``. Labels that are not strings
        or match no shape yield six ``None`` values.
    """
    text_line = row['Row Labels']

    # Blank cells come through as NaN (a float); re.match would raise on them.
    if not isinstance(text_line, str):
        return None, None, None, None, None, None

    # Matching patterns
    if re.match(r"^[A-Za-z\s]+ - \d+$", text_line):
        # Split on the final " - " so multi-word names stay intact and the code
        # holds only the digits.
        name_part, code_part = text_line.rsplit(' - ', 1)
        business_area = ' '.join(name_part.split())
        business_area_code = code_part.strip()
        return business_area, business_area_code, None, None, None, None
    elif re.match(r"^\d{2} [A-Za-z\s]+$", text_line):
        # Extract goal area code and description
        parts = text_line.split()
        goal_area_code = parts[0]
        goal_area = ' '.join(parts[1:])
        return None, None, goal_area, goal_area_code, None, None
    elif re.match(r"^\d{2}-\d{2}", text_line):
        # Extract result area code and description
        parts = text_line.split()
        result_area_code = parts[0]
        result_area = ' '.join(parts[1:])
        return None, None, None, None, result_area, result_area_code
    else:
        return None, None, None, None, None, None

In [5]:
def normalize_dataframe(df_funds):
    """Flatten the hierarchical 'Row Labels' column into one row per result area.

    Each label is parsed with ``extract_details``; business-area and goal-area
    values are then forward-filled onto the rows beneath them. Result-area
    values are not forward-filled, so only rows parsed as a result area keep a
    'Result Area Code' and survive the final filter.

    Parameters
    ----------
    df_funds : pandas.DataFrame
        Frame with a 'Row Labels' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the six parsed columns added, restricted to rows that
        have both a 'Goal Area Code' and a 'Result Area Code', with exact
        duplicate rows removed. The original index labels are kept.
    """
    parsed_columns = ['Business Area', 'Business Area Code', 'Goal Area', 'Goal Area Code', 'Result Area', 'Result Area Code']

    # Work on a copy so the caller's frame does not silently gain columns.
    normalized = df_funds.copy()

    # Apply the parser to each row and expand the six results into columns.
    normalized[parsed_columns] = normalized.apply(extract_details, axis=1, result_type='expand')

    # Forward fill the business area and goal area onto the subsequent rows.
    for column in ('Business Area', 'Business Area Code', 'Goal Area', 'Goal Area Code'):
        normalized[column] = normalized[column].ffill()

    # Drop rows missing EITHER code (how='any').
    normalized = normalized.dropna(subset=['Goal Area Code', 'Result Area Code'], how='any')

    # Remove exact duplicate rows.
    return normalized.drop_duplicates()

In [None]:
# Load every yearly funds-utilization workbook and stack them into one frame.
df_list = []
file_paths = PROJ_ROOT / data_source
columns = ['Row Labels', 'Allocation', 'Utilized']
for file_path in sorted(Path(file_paths).glob('*.xlsx')):
    # Excel leaves "~$name.xlsx" lock files next to open workbooks; they are not readable.
    if file_path.name.startswith('~$'):
        continue

    # The year is taken from the second underscore-separated token of the file
    # name. Using the stem keeps a trailing ".xlsx" out of that token.
    name_parts = file_path.stem.split('_')
    year = name_parts[1] if len(name_parts) > 1 else ''
    if not year.isdecimal():
        print(f'Could not find a match in {file_path.name}. Skipping...')
        continue
    df_funds = pd.read_excel(file_path)

    df_funds = prepare_df_from_funds_utilization_xlsx(df_funds, columns)

    df_funds = normalize_dataframe(df_funds)
    df_funds['year'] = int(year)
    df_list.append(df_funds)

# Fail with an actionable message instead of pandas' "No objects to concatenate".
if not df_list:
    raise ValueError(f'No usable .xlsx files with a year in their name were found in {file_paths}')

df_funds = pd.concat(df_list, axis=0, ignore_index=True)
print('Shape:', df_funds.shape)

In [None]:
# Expand the short business-area names to full country names in a single pass.
# Applying the two replacements as separate assignments from 'Business Area'
# would let the second one discard the first.
df_funds['country'] = df_funds['Business Area'].replace({
    'Bosnia': 'Bosnia and Herzegovina',
    'Macedonia': 'North Macedonia',
})

# country_map is keyed by the values tested with `k in COUNTRY`; country_code_map
# is its inverse, so it is the lookup that turns a country name into its code.
# Names without an entry are left unchanged by replace().
df_funds['country_code'] = df_funds['country'].replace(country_code_map)

In [8]:
# Rank the goal areas by their total utilized funds, largest first.
total_utilized_by_goal = (
    df_funds
    .groupby('Goal Area')['Utilized']
    .sum()
    .sort_values(ascending=False)
)
sorted_goal_areas = total_utilized_by_goal.index.tolist()

# Store 'Goal Area' as an ordered categorical that follows that ranking.
goal_area_dtype = pd.CategoricalDtype(categories=sorted_goal_areas, ordered=True)
df_funds['Goal Area'] = df_funds['Goal Area'].astype(goal_area_dtype)

In [None]:

# Write the final data product to the path stored under product['data'],
# creating any missing parent directories first.
output_path = Path(product['data'])
output_dir = output_path.parent
output_dir.mkdir(parents=True, exist_ok=True)
df_funds.to_excel(output_path, index=False)

In [10]:
################################################################################################################################################################################################