In [None]:
# IMPORTANT: The parameters below are set only for running this notebook independently.
# When executing the full Ploomber pipeline, these values will be overridden by the settings in `pipeline.yaml`.
# Any modifications made here will not persist when running the pipeline.
upstream = None
COUNTRY =  'ARM' # Country code; rows are filtered on it after business areas are mapped to ISO codes
product = {
    'data_gender_marker': f'../data/processed/{COUNTRY}/gender_marker.xlsx',
    'data_gender_marker_count': f'../data/processed/{COUNTRY}/gender_marker_count.xlsx',
    'data_disability_tag': f'../data/processed/{COUNTRY}/disability_tag.xlsx',
    'data_disability_tag_count': f'../data/processed/{COUNTRY}/disability_tag_count.xlsx',
    'data_humanitarian_marker': f'../data/processed/{COUNTRY}/humanitarian_marker.xlsx',
    # NOTE(review): this file name repeats '_count' ('..._count_count.xlsx'), unlike the
    # sibling entries - looks like a typo; confirm no consumer relies on it before renaming.
    'data_humanitarian_marker_count': f'../data/processed/{COUNTRY}/humanitarian_marker_count_count.xlsx',
    'data_adolescent_tag': f'../data/processed/{COUNTRY}/adolescent_tag.xlsx',
    'data_adolescent_tag_count': f'../data/processed/{COUNTRY}/adolescent_tag_count.xlsx',
 
 }  # Output paths: one Excel file per result table, keyed by product name

data_source = 'data/raw/insight-programme-management'  # Source data directory, joined onto PROJ_ROOT when the files are listed

This notebook processes the programme structure to extract information about the gender and humanitarian markers and the disability and adolescent tags by year. The markers are set at output level and the tags at activity level.

In [None]:
import re
import pandas as pd
from pathlib import Path
from unicef_cpe.utils import *
from unicef_cpe.config import PROJ_ROOT

In [None]:
# Keep only the mapping entry for the configured country.
# COUNTRY is a single code string (it is compared with == when the rows are filtered),
# so match it exactly: `k in COUNTRY` would be a substring test on that string.
# NOTE(review): assumes the mapping is keyed by country code - confirm against get_ecaro_countries_mapping.
country_map = {k: v for k, v in get_ecaro_countries_mapping(priority=False).items() if k == COUNTRY}
# Reverse lookup: mapped value -> country code
country_code_map = {v: k for k, v in country_map.items()}

### 5. Programme Structure

In [None]:
# Resolve the source directory against the project root and list its Excel workbooks
# in a stable (sorted) order.
file_path = PROJ_ROOT / data_source
file_paths = list(Path(file_path).glob('*.xlsx'))
file_paths.sort()
file_paths

Read in the data

In [None]:
# Fail early with a clear message; otherwise pd.concat raises an unhelpful
# "No objects to concatenate" when the source directory holds no workbooks.
if not file_paths:
    raise FileNotFoundError(f'No .xlsx files found in {file_path}')

# NOTE(review): skiprows=12 presumably skips a report header above the table - confirm against the export layout.
dfs = [pd.read_excel(path, skiprows=12) for path in file_paths]

# Stack all workbooks and normalise column names to snake_case.
df_programmes = pd.concat(dfs, axis=0, ignore_index=True)
df_programmes = df_programmes.rename(columns=lambda x: x.lower().replace(' ', '_'))
print('Shape:', df_programmes.shape)

In [None]:
# Drop rows that have no activity WBS, then verify that every remaining activity
# appears exactly once (the check fails loudly instead of silently de-duplicating).
print('Shape before:', df_programmes.shape)
df_programmes = df_programmes.dropna(subset=['activity_wbs'], ignore_index=True)
assert not df_programmes.duplicated(subset=['activity_wbs']).any(), 'Duplicated activities'
print('Shape after:', df_programmes.shape)


In [None]:
# Columns kept for the marker/tag analysis
# ('outcome_wbs' and 'output_funding' are currently not kept).
to_keep = [
    'business_area',
    'cp_wbs',
    'cp_short_text',
    'output_wbs',
    'activity_wbs',
    'gender_equality_marker_name',
    'humanitarian_marker_name',
    'disability_tag',
    'adolescent_tag',
    'utilized',
]

print('Shape before:', df_programmes.shape)

df_programmes = df_programmes.loc[:, to_keep].copy()

# Label missing markers/tags explicitly so they are not confused with a zero rating,
# and treat missing utilization as no funds spent.
tag_columns = ['gender_equality_marker_name', 'humanitarian_marker_name', 'disability_tag', 'adolescent_tag']
df_programmes = df_programmes.assign(
    **{name: df_programmes[name].fillna('Missing values') for name in tag_columns},
    utilized=df_programmes['utilized'].fillna(0.),
)
# Any other gap (e.g. a missing WBS or CP name) is unexpected and stops the run.
assert not df_programmes.isna().values.any(), 'Missing values'

# Map business areas to ISO codes and expose the column as `country`
df_programmes['business_area'] = df_programmes['business_area'].apply(replace_business_areas_with_iso)
df_programmes = df_programmes.rename(columns={'business_area': 'country'})

# Keep only the rows of the configured country
in_country = df_programmes['country'] == COUNTRY
df_programmes = df_programmes.loc[in_country].copy()
print('Shape after:', df_programmes.shape)

In [None]:
def extract_two_years(string):
    '''
    Build a uniform country-programme label from the years found in a text.

    string: free-text CP name, e.g. 'Armenia CP 2016 - 2020'
    returns: 'CP <first year>-<second year>' when the text contains exactly two
        four-digit years starting with '20'; None otherwise, including when the
        input is not a string (e.g. None or NaN)
    '''
    # Non-string input (None, NaN from an empty cell) cannot be searched by `re`.
    if not isinstance(string, str):
        return None

    # Four-digit years of the form 20xx, in order of appearance
    year_pattern = r'(20\d{2})'
    years = re.findall(year_pattern, string)

    # Anything other than exactly two years is ambiguous, so no label is produced.
    if len(years) != 2:
        return None
    return 'CP ' + years[0] + '-' + years[1]

In [None]:
# Make the CP names uniform, e.g. 'CP 2015-2020'.
# Names without exactly two years come back as None from extract_two_years.
df_programmes['cp_short_text'] = df_programmes['cp_short_text'].map(extract_two_years)

**Gender, Disability, Adolescent and Humanitarian Marker/Tags Processing for Funds**

Rating for the Gender Equality Marker (by output): 0 = NONE, 1 = MARGINAL, 2 = SIGNIFICANT, 3 = PRINCIPAL

The Disability Tag (by activity) is supposed to rate every activity against a three-category scale: 3, 2, or 0. In the xlsx file, however, the rating seems to follow the same structure as the Gender Equality Marker, so that structure is used here.

Rating for the Humanitarian Marker (by output):  0 = NONE, 1 = MARGINAL, 2 = SIGNIFICANT, 3 = PRINCIPAL

The Adolescent Tag (by activity) is rated yes/no.

***Marker/Tag Processing Function***

In [None]:
def marker_tag_funds(df, tag_type, tag_order=None):
    '''
    Share of utilized funds (in %) per marker/tag rating for each country programme.

    input df: contains data on CP utilization with the columns 'country',
        'cp_short_text', 'utilized' and the column named by tag_type
    tag_type: marker/tag column that needs to be evaluated
    tag_order: optional list of rating labels that fixes the order of the output
        columns; defaults to ['PRINCIPAL', 'SIGNIFICANT', 'MARGINAL', 'NONE',
        'Missing values']. Pass e.g. ['YES', 'NO', 'Missing values'] for YES/NO tags.
        Ratings that are not listed are left out of the output columns, although
        their funds still count towards 'total_utilized'.
    output df: one row per (country, cp_short_text) with 'total_utilized' and, for
        every listed rating that occurs in df, the normalized funds in %
    '''
    if tag_order is None:
        tag_order = ['PRINCIPAL', 'SIGNIFICANT', 'MARGINAL', 'NONE', 'Missing values']
    keys = ['country', 'cp_short_text']

    # Step 1: Aggregate the total utilized funds for each combination
    agg = df.groupby(keys + [tag_type])['utilized'].sum().reset_index()

    # Step 2: Normalize the values to % of the CP total
    agg['total_utilized'] = agg.groupby(keys)['utilized'].transform('sum')
    agg['normalized'] = 100 * (agg['utilized'] / agg['total_utilized'])

    # Step 3: Pivot to one row per CP and one column per rating;
    # ratings that do not occur in a CP get 0
    df_out = agg.pivot_table(index=keys,
                             columns=tag_type,
                             values='normalized',
                             fill_value=0).reset_index()

    # Add the total utilized funds as a separate column in df_out
    total_utilized = agg[keys + ['total_utilized']].drop_duplicates()
    df_out = df_out.merge(total_utilized, on=keys, how='left')

    # Remove the name the pivot left on the columns index
    df_out = df_out.rename_axis(None, axis=1)

    # Fixed column order, restricted to the columns that are actually present
    order = [o for o in keys + ['total_utilized'] + list(tag_order) if o in df_out.columns]
    return df_out[order]

In [None]:
def adolescent_tag_funds(df, tag_type = 'adolescent_tag'):
    '''
    Share of utilized funds (in %) per YES/NO tag value for each country programme.

    input df: contains data on CP utilization
    tag_type: adolescent_tag or another YES/NO tag column
    output df: one row per (country, cp_short_text) with 'total_utilized' and the
        normalized funds in % for those of ['YES', 'NO', 'Missing values'] that occur
    '''
    cp_keys = ['country', 'cp_short_text']

    # Funds utilized per (country, CP, tag value)
    per_tag = df.groupby([*cp_keys, tag_type])['utilized'].sum().reset_index()

    # Express each tag value's funds as a percentage of its CP total
    per_tag['total_utilized'] = per_tag.groupby(cp_keys)['utilized'].transform('sum')
    per_tag['normalized'] = per_tag['utilized'].div(per_tag['total_utilized']).mul(100)

    # One row per CP, one column per tag value; absent values become 0
    shares = per_tag.pivot_table(
        index=cp_keys,
        columns=tag_type,
        values='normalized',
        fill_value=0,
    ).reset_index()

    # Bring the CP total back next to the percentages and drop the columns-index name
    cp_totals = per_tag[[*cp_keys, 'total_utilized']].drop_duplicates()
    shares = shares.merge(cp_totals, on=cp_keys, how='left').rename_axis(None, axis=1)

    # Fixed column order, limited to the columns that exist
    wanted = [*cp_keys, 'total_utilized', 'YES', 'NO', 'Missing values']
    return shares[[name for name in wanted if name in shares.columns]]

***Gender Marker Processing***

In [None]:
# Share of utilized funds per gender equality marker rating; shown via rich display
df_gender = marker_tag_funds(df_programmes, 'gender_equality_marker_name')
df_gender.head()

***Disability Tag Processing***

In [None]:
# Share of utilized funds per disability tag rating; shown via rich display
df_disability = marker_tag_funds(df_programmes, 'disability_tag')
df_disability.head()

***Humanitarian Marker Processing***

In [None]:
# Share of utilized funds per humanitarian marker rating; shown via rich display
df_humanitarian = marker_tag_funds(df_programmes, 'humanitarian_marker_name')
df_humanitarian.head()

***Adolescent Tag Processing***

In [None]:
# Share of utilized funds per adolescent tag value; shown via rich display
df_adolescent = adolescent_tag_funds(df_programmes)
df_adolescent.head()

**Number of outputs/activities with certain Gender, Disability and Humanitarian Marker/Tag**

Gender Equality Marker: by output

Disability Tag: by activity

Humanitarian Marker: by output

Adolescent Tag: by activity

In [None]:
# Give every row 1/n of its output (or activity), where n is the number of rows sharing
# that WBS, so that summing the column counts each output/activity exactly once.
output_counts = df_programmes['output_wbs'].value_counts()
activity_counts = df_programmes['activity_wbs'].value_counts()
# Map the inverse counts back onto the rows
df_programmes = df_programmes.assign(
    output_proportion=df_programmes['output_wbs'].map(1. / output_counts),
    activity_proportion=df_programmes['activity_wbs'].map(1. / activity_counts),
)

In [None]:
def marker_tag_counting(df, tag_type, level, tag_order=None):
    '''
    Number of outputs/activities per marker/tag rating for each country programme.

    input df: contains the columns 'country', 'cp_short_text', the column named by
        tag_type and the proportion column named by level
    tag_type: marker/tag column that needs to be evaluated
    level: column whose values are summed to obtain the count, e.g.
        'output_proportion' or 'activity_proportion'
    tag_order: optional list of rating labels that fixes the order of the output
        columns; defaults to ['PRINCIPAL', 'SIGNIFICANT', 'MARGINAL', 'NONE',
        'Missing values']. Ratings that are not listed are left out of the output
        columns, although they still count towards 'total_count'.
    output df: one row per (country, cp_short_text) with 'total_count' and the
        summed level for every listed rating that occurs in df
    '''
    if tag_order is None:
        tag_order = ['PRINCIPAL', 'SIGNIFICANT', 'MARGINAL', 'NONE', 'Missing values']
    keys = ['country', 'cp_short_text']

    # Step 1: Sum the proportions for each (country, CP, rating) combination
    agg = df.groupby(keys + [tag_type])[level].sum().reset_index()

    # Step 2: Total count per CP
    agg['total_count'] = agg.groupby(keys)[level].transform('sum')

    # Step 3: Pivot to one row per CP and one column per rating;
    # ratings that do not occur in a CP get 0
    df_out = agg.pivot_table(index=keys,
                             columns=tag_type,
                             values=level,
                             fill_value=0).reset_index()

    # Add the total count as a separate column in df_out
    total_count = agg[keys + ['total_count']].drop_duplicates()
    df_out = df_out.merge(total_count, on=keys, how='left')

    # Remove the name the pivot left on the columns index
    df_out = df_out.rename_axis(None, axis=1)

    # Fixed column order, restricted to the columns that are actually present
    order = [o for o in keys + ['total_count'] + list(tag_order) if o in df_out.columns]
    return df_out[order]

In [None]:
def adolescent_tag_counting(df, tag_type='adolescent_tag', level='activity_proportion'):
    '''
    Number of activities per YES/NO tag value for each country programme.

    input df: contains the columns 'country', 'cp_short_text', tag_type and level
    tag_type: adolescent_tag or another YES/NO tag column
    level: proportion column that is summed to obtain the count
    output df: one row per (country, cp_short_text) with 'total_count' and the summed
        level for those of ['YES', 'NO', 'Missing values'] that occur
    '''
    cp_keys = ['country', 'cp_short_text']

    # Summed proportions per (country, CP, tag value)
    per_tag = df.groupby([*cp_keys, tag_type])[level].sum().reset_index()

    # Overall count per CP
    per_tag['total_count'] = per_tag.groupby(cp_keys)[level].transform('sum')

    # One row per CP, one column per tag value; absent values become 0
    counts = per_tag.pivot_table(
        index=cp_keys,
        columns=tag_type,
        values=level,
        fill_value=0,
    ).reset_index()

    # Bring the CP total back next to the counts and drop the columns-index name
    cp_totals = per_tag[[*cp_keys, 'total_count']].drop_duplicates()
    counts = counts.merge(cp_totals, on=cp_keys, how='left').rename_axis(None, axis=1)

    # Fixed column order, limited to the columns that exist
    wanted = [*cp_keys, 'total_count', 'YES', 'NO', 'Missing values']
    return counts[[name for name in wanted if name in counts.columns]]

**Gender (level=output)**

In [None]:
# Number of outputs per gender equality marker rating; shown via rich display
df_gender_count = marker_tag_counting(df_programmes, 'gender_equality_marker_name', 'output_proportion')
df_gender_count.head()

**Disability (level = activity)**

In [None]:
# Number of activities per disability tag rating; shown via rich display
df_disability_count = marker_tag_counting(df_programmes, 'disability_tag', 'activity_proportion')
df_disability_count.head()

**Humanitarian (level = output)**

In [None]:
# Number of outputs per humanitarian marker rating; shown via rich display
df_humanitarian_count = marker_tag_counting(df_programmes, 'humanitarian_marker_name', 'output_proportion')
df_humanitarian_count.head()

In [None]:
# Number of activities per adolescent tag value; shown via rich display
df_adolescent_count = adolescent_tag_counting(df_programmes)
df_adolescent_count.head()

**Output to Excel**

In [None]:
# Write each result table to its own Excel file, in the order of the product keys below.
outputs = {
    'data_gender_marker': df_gender,
    'data_gender_marker_count': df_gender_count,
    'data_disability_tag': df_disability,
    'data_disability_tag_count': df_disability_count,
    'data_humanitarian_marker': df_humanitarian,
    'data_humanitarian_marker_count': df_humanitarian_count,
    'data_adolescent_tag': df_adolescent,
    'data_adolescent_tag_count': df_adolescent_count,
}

for product_key, frame in outputs.items():
    output_path = Path(product[product_key])
    # Product paths can be overridden by pipeline.yaml and need not share one folder,
    # so make sure the target directory exists for every file, not only the first.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    frame.to_excel(output_path, index=False)