In [None]:
# IMPORTANT: The parameters below are defaults used only when this notebook is run on its own.
# When the full Ploomber pipeline is executed, these values are overridden by the settings in `pipeline.yaml`,
# so edits made here have no effect on pipeline runs.
# `upstream` is not referenced anywhere else in this notebook.
upstream = None
COUNTRY =  'ARM' # Country code; rows are kept when their 'country' value equals this
# NOTE(review): the output file name spells "stucture" (not "structure") — confirm downstream consumers expect this exact name before changing it.
product = {'data': f'../data/processed/{COUNTRY}/activities_programme_stucture.xlsx'}  # Output Excel path for the final data product (stored under the 'data' key)
data_source = 'data/raw/insight-programme-cubes/Funds Utilization Analysis Cube - Activity By Year.xlsx'  # Source Excel workbook, given relative to PROJ_ROOT

In [None]:
import pandas as pd
from unicef_cpe.config import PROJ_ROOT
import unicef_cpe
from pathlib import Path

In [None]:

# Locate the source workbook under the project root.
file_path = PROJ_ROOT / data_source

# Parse it with the project reader, keep only the rows whose 'country' equals
# COUNTRY, and copy so later column assignments act on an independent frame.
df_activities = (
    unicef_cpe.processing.read_vision_programme_data(file_path)
    .loc[lambda frame: frame['country'] == COUNTRY]
    .copy()
)
print(df_activities.head())

In [None]:
# Load the programme structure through the project helper and remove its 'country'
# column before it is joined to the activity rows.
# drop() is used without inplace=True so the object returned by the helper is never
# mutated (relevant if the helper hands back a shared or cached frame — not visible
# from this notebook).
df_programme = unicef_cpe.processing.get_programme_structure().drop(columns='country')
print(df_programme.head())

In [None]:
print('Shape before:', df_activities.shape)

# Split every 'activity' value once, at the first run of whitespace; both columns
# below are taken from the same pieces.
activity_parts = df_activities['activity'].str.split(r'\s+', n=1, regex=True)

# First piece -> 'activity_wbs' (the key later matched against 'activity_code').
df_activities['activity_wbs'] = activity_parts.str[0]

# Last piece -> replaces 'activity'.
# NOTE(review): a value without any whitespace produces a single piece, so the last
# piece is the code itself and 'activity' ends up equal to 'activity_wbs' — confirm
# this is the intended fallback.
df_activities['activity'] = activity_parts.str[-1]

print('Shape after:', df_activities.shape)
print(df_activities.head())

In [None]:
# Left-join the programme structure onto the activity rows: the WBS code extracted
# from 'activity' is matched against the structure's 'activity_code'.
merged_activities = df_activities.merge(
    df_programme,
    how='left',
    left_on='activity_wbs',
    right_on='activity_code',
)

# Keep only the rows that found a matching activity code in df_programme.
# Unmatched rows are outside the structure, i.e. they belong to earlier years or
# are not in the country list.
in_structure = merged_activities['activity_code'].notna()
df_activities = merged_activities[in_structure].copy()

In [None]:
# Add a float 'expenditure' column derived from 'value' (the 'value' column itself
# is kept) and store 'year' as an integer.
df_activities = df_activities.assign(
    expenditure=lambda frame: frame['value'].astype(float),
    year=lambda frame: frame['year'].astype(int),
)

In [None]:
# Preview the first five rows of the merged, typed table before it is written out.
print(df_activities.head(n=5))

In [None]:

# Resolve the configured output location and make sure its folder exists.
output_path = Path(product['data'])
output_path.parent.mkdir(exist_ok=True, parents=True)

# Write the final table to Excel without the DataFrame index.
df_activities.to_excel(output_path, index=False)

In [None]:
################################################################################################################################################################################################