In [None]:
# IMPORTANT: The parameters below are defaults used only when this notebook is run on its own.
# When the full Ploomber pipeline is executed, these values are overridden by the settings in `pipeline.yaml`,
# so any modification made here will not persist when running the pipeline.


# Country key: interpolated into the processed-data paths below and used later to look up the country code.
COUNTRY = "ARM"
# Raw inputs; the reading cells join each entry onto PROJ_ROOT:
#   [0] workbook holding the donor hierarchy
#   [1] folder whose *.xlsx donor-statement files are all read
data_source = [
    "data/raw/insights-contributions-income-analysis-cube/insights-contributions-income-analysis-cube-donor-name-country-level2.xlsx",
    "data/raw/insight-finance-admin-donor-statement-cube",
]

# Spreadsheet written by the final cell of this notebook.
product = {
    "data": f"../data/processed/{COUNTRY}/cp_funds_stucture.xlsx",
}

# Spreadsheet produced by the upstream task; read below to map output codes to country programmes (CP).
upstream = {
    "20_activities_programme_stucture": {
        "data": f"../data/processed/{COUNTRY}/activities_programme_stucture.xlsx",
    }
}


This notebook reads in the donor hierarchy (Donor Name, Donor Country Name and Donor Level 2 thematic group) and matches the donor names to the donors in the 'insight finance admin donor statement cube' in order to determine the allocated funding by CP, funding source (RR, OR and OR - Emergency), level 2 thematic group and donor.

In [None]:
import pandas as pd
import re
from openpyxl import load_workbook
from unicef_cpe.config import PROJ_ROOT
from unicef_cpe.utils import *
import unicef_cpe
from pathlib import Path


# Keep only the lookup entries whose key passes the `in COUNTRY` test.
# NOTE(review): COUNTRY is a string in the parameters cell, so `key in COUNTRY` is a substring
# test rather than an equality test; for the name-keyed mapping it presumably never matches.
# Confirm whether COUNTRIES_NAME is expected to be populated.
COUNTRIES_CODE = {
    iso: code
    for iso, code in get_ecaro_countries_mapping('iso', 'code', priority=False).items()
    if iso in COUNTRY
}
COUNTRIES_NAME = {
    name: code
    for name, code in get_ecaro_countries_mapping('name', 'code', priority=False).items()
    if name in COUNTRY
}
# Example:
# COUNTRIES_CODE = {'ARM': '0260', 'AZE': '0310', 'BIH': '0530', 'GEO': '1600', 'KAZ': '2390', 'MKD': '2660'}



### Read in the Donor information:
- Donor Name
- Donor Country Name
- Donor level 2 Thematic group

In [None]:
# Optional check (disabled): every entry of donors_names_list should be an upper-case string.
# all_capital_strings = all(isinstance(name, str) and name.isupper() for name in donors_names_list)

# If it prints False, check that the extraction of the donor names is correct.
# print(all_capital_strings)  # True if all conditions are met, otherwise False

**Read in Donor Names, Donor Country Name and Donor Class Level 2 and build a hierarchy**

In [None]:
# Workbook with the hierarchy: Donor Name -> Donor Country Name -> Donor Level 2 Class.
# TODO: check if there is a method to read the hierarchy from Excel files directly.
file_path = PROJ_ROOT / data_source[0]

# Load only the first column; the first three rows of the sheet are skipped.
df_donors = pd.read_excel(file_path, usecols=[0], skiprows=3)

# Normalise the header: lower-case, trimmed, spaces replaced by underscores.
df_donors.columns = (
    df_donors.columns
    .str.lower()
    .str.strip()
    .str.replace(' ', '_')
)

print('Shape at start', df_donors.shape)

In [None]:
# Cleaning: drop the 'Grand Total' row, the Unknown placeholders (handled separately
# further down) and rows without a label, then renumber the index from 0.
print('Shape before', df_donors.shape)
to_remove = ['Grand Total', 'Unknown', 'UNKNOWN']
df_donors = (
    df_donors[df_donors['row_labels'].notna() & ~df_donors['row_labels'].isin(to_remove)]
    .reset_index(drop=True)
)
print('Entries should be removed in multiples of 3')
print('Shape after ', df_donors.shape)

In [None]:
# Labels recognised as "donor class level 2" rows in the donor hierarchy sheet.
level2_list = [
    'Governments',
    'Field Offices PSFR',
    'Individuals (Others)',
    'Inter-Governmental Organisations',
    'Inter-Organisational Arrangements',
    'National Committees',
    'Non-Governmental Organisation',
    'Multi-donor',
    'Regular Resources',
    'Thematic',
]

In [None]:
# Goal of the next cells: label every row as 1. donor name, 2. donor country name
# or 3. donor class level 2.

# Start with an empty 'type' column (None everywhere).
df_donors = df_donors.assign(type=None)

# Rows whose label appears in level2_list are the class-level-2 entries
# (it would be better to read this list from an Excel file).
df_donors.loc[df_donors['row_labels'].isin(level2_list), 'type'] = 'donor_class_level2'

In [None]:
# Mark the still-unlabelled row directly above each 'donor_class_level2' row as
# 'donor_country_name'.
# Done with a shifted boolean mask instead of a positional loop: the previous loop read rows
# by position (.iloc) but wrote them by label (.at), which is only correct on a fresh
# 0..n-1 index, and it compared against None with `==`.
is_level2_row = df_donors['type'].eq('donor_class_level2')
# True on row j when row j+1 is a class-level-2 row; the last row has no successor.
precedes_level2 = is_level2_row.shift(-1, fill_value=False)
df_donors.loc[precedes_level2 & df_donors['type'].isna(), 'type'] = 'donor_country_name'

In [None]:
# Every row that is still unlabelled at this point is a donor name.
df_donors.loc[df_donors['type'].isna(), 'type'] = 'donor_name'

In [None]:
# Sanity check: there must be exactly as many class-level-2 rows as donor-country rows.
assert (
    df_donors['type'].eq('donor_class_level2').sum()
    == df_donors['type'].eq('donor_country_name').sum()
), 'There is an issue with the donor -> donor country -> donor class2 classification'
# Sanity check: labels of the rows typed 'donor_name' must be upper-case.
assert all(
    df_donors.loc[df_donors['type'].eq('donor_name'), 'row_labels'].str.isupper()
), 'Donor names have to be capitalized'

In [None]:
# 'donor': the label of each donor-name row, carried forward onto the rows below it.
df_donors['donor'] = (
    df_donors['row_labels']
    .where(df_donors['type'].eq('donor_name'))
    .ffill()
)

# 'donor_level2': the label of each class-level-2 row, carried backward onto the rows above it.
df_donors['donor_level2'] = (
    df_donors['row_labels']
    .where(df_donors['type'].eq('donor_class_level2'))
    .bfill()
)

In [None]:
# Build the hierarchy table (donor -> donor country -> donor class2) from the
# donor-country rows of df_donors, whose label becomes the 'donor_country' column.
to_keep = ['donor', 'donor_country', 'donor_level2']
df_donors_hierarchy = (
    df_donors.loc[df_donors['type'].eq('donor_country_name')]
    .rename(columns={'row_labels': 'donor_country'})
    .loc[:, to_keep]
    .reset_index(drop=True)
)

# Append the UNKNOWN -> Unknown -> Unknown entry as the last row.
df_donors_hierarchy.loc[len(df_donors_hierarchy)] = ['UNKNOWN', 'Unknown', 'Unknown']

donors_names_list = df_donors_hierarchy['donor'].unique().tolist()
print('Number of donor entries in df:', len(df_donors_hierarchy))
print('Number of unique donors:', len(donors_names_list))

Read in Funds and Output information:
 - Funds: RR, OR, OR - Emergency and Other funds 
 - Donor Name if known
 - Output code

In [None]:
# Read every donor-statement workbook in the folder into one DataFrame and tag each
# file's rows with the fund sub-category found in that file's header area.
file_path = PROJ_ROOT / data_source[1]
# Sorted so that the concatenation order does not depend on filesystem enumeration order.
file_paths = sorted(Path(file_path).glob('*.xlsx'))
if not file_paths:
    # Fail with an explicit message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f'No .xlsx donor statement files found in {file_path}')

df_list = []

for files in file_paths:
    # Open the workbook to inspect the header area of its active sheet.
    workbook = load_workbook(filename=files, data_only=True)
    sheet = workbook.active

    # Look in column A, rows 1-14 (adjust the range as needed), for the
    # 'Fund Sub-Category' label; its value is in column B of the same row.
    # Only the first occurrence is used; the value stays None when the label is absent.
    funds_type = None
    for row in range(1, 15):
        if sheet[f'A{row}'].value == 'Fund Sub-Category':
            funds_type = sheet[f'B{row}'].value
            break

    # Read the table itself, skipping the first five rows of the sheet.
    df_funds = pd.read_excel(files, skiprows=5)
    df_funds['funds_type'] = funds_type
    df_list.append(df_funds)

df_funds = pd.concat(df_list, axis=0, ignore_index=True)

print('Shape before:', df_funds.shape)
# Remove any columns from the DataFrame that contain only NaN
df_funds = df_funds.dropna(axis=1, how="all")
print('Shape after:', df_funds.shape)

In [None]:
# Normalise the header: lower-case, trimmed, spaces replaced by underscores.
df_funds.columns = df_funds.columns.str.lower().str.strip().str.replace(' ', '_')

# Drop the 'Grand Total' row and keep only the needed columns.
# 'allocation' refers to allocated funds (check this for plots).
to_keep = ['row_labels', 'allocation', 'funds_type']
df_funds = df_funds.loc[df_funds['row_labels'].ne('Grand Total'), to_keep]
print(df_funds.shape)

In [None]:
# Split the sheet rows into output rows and donor rows, and attach to every donor row the
# output code of the output row above it.
# Vectorised replacement for the previous row-wise `apply` calls, which are slow and do not
# return a Series when the frame is empty.

# Work on an independent copy so the column assignments below never target a slice of an
# earlier frame.
df_funds = df_funds.copy()

# A row is an output row when its label contains at least three '/' characters.
labels = df_funds['row_labels'].astype(str)
is_output = labels.str.count('/') >= 3
df_funds['type'] = None
df_funds.loc[is_output, 'type'] = 'output'

# Avoid counting the allocation twice: output rows get an allocation of 0.
df_funds['allocation'] = df_funds['allocation'].where(~is_output, 0)

# The output code is the first space-separated token of an output row's label;
# forward-fill it onto the rows that follow.
df_funds['output_code'] = labels.str.split(' ').str[0].where(is_output).ffill()

# Remove the output rows themselves and keep only the donor rows.
df_funds = df_funds[~is_output].copy()
print(df_funds.shape)

**Matching CP to Output** through 20_activities_programme_stucture 

- **Dictionary with CPs**

In [None]:
# Load the spreadsheet produced by the upstream task.
df_activities = pd.read_excel(io=upstream["20_activities_programme_stucture"]["data"])

In [None]:
# Display the total of 'value' per CP.
df_activities.groupby(by='cp')['value'].agg('sum')

In [None]:
# Map each output code to the first CP value listed for it in df_activities.
# NOTE(review): when an output code is listed under several CPs, only the first one is kept.
cp_output_map = {
    code: cps[0]
    for code, cps in df_activities.groupby("output_code")['cp'].unique().items()
}

In [None]:
# Tag the rows whose output code starts with this country's code followed by '/', then keep
# only those rows.
# The code is looked up once up front so that a missing entry fails with a clear message
# instead of a per-row "NoneType + str" TypeError.
country_code = COUNTRIES_CODE.get(COUNTRY)
if country_code is None:
    raise KeyError(f"No country code found for COUNTRY={COUNTRY!r} in COUNTRIES_CODE")

in_country = df_funds['output_code'].astype(str).str.startswith(country_code + '/')
df_funds['country'] = None
df_funds.loc[in_country, 'country'] = COUNTRY

print(f"Before: {df_funds.shape}")
# remove all outputs that do not belong to COUNTRY
df_funds = df_funds[in_country].copy()
print(f"After: {df_funds.shape}")

In [None]:
# Short labels for the fund sub-categories (applied to 'funds_type' in the next cell).
funds_dictionary = {'Other Resources - Emergency': 'OR - Emergency', 'Other Resources - Regular': 'OR', 
                    'Regular Resources': 'RR'}
print('Shape before', df_funds.shape)
# Translate each output code into its CP; codes without an entry in the map are left unchanged.
df_funds['cp'] = df_funds['output_code'].replace(cp_output_map)

# Filter out outputs that do not belong to a mapped CP.
# na=False: a missing cp value counts as "no match". Without it, str.contains returns NaN for
# such rows and the boolean indexing raises.
df_funds = df_funds[df_funds['cp'].str.contains("CP", na=False)].copy()
print('Shape after', df_funds.shape)

In [None]:
# Shorten the fund type labels, copy the row label into 'donor' and keep the needed columns.
# NOTE(review): fund types that are not keys of funds_dictionary become NaN here.
to_keep = ['donor', 'allocation', 'funds_type', 'country', 'output_code', 'cp']
df_funds = df_funds.assign(
    funds_type=df_funds['funds_type'].map(funds_dictionary),
    donor=df_funds['row_labels'],
)[to_keep]


print('Shape before', df_funds.shape)
# Drop cycles that can't be classified (DM cycles from previous CP).
df_funds = df_funds.dropna(subset=['cp'])
print('Shape after', df_funds.shape)

**Matching to Donors Hierarchy**

In [None]:
# Donor name / donor country pairs are not unique, but donor / donor class level 2 is (mostly!).
# About 2 donors have more than one class level 2. For those, the classes are joined with
# ', ' into a single combined class, so that the donor is not assigned the wrong class.
df_to_merge = (
    df_donors_hierarchy[['donor', 'donor_level2']]
    .drop_duplicates()
    .groupby('donor', as_index=False)
    .agg({'donor_level2': ', '.join})
)
print('Number of donors in df to merge:', len(df_to_merge))
print('Number of unique donors:', len(df_donors_hierarchy['donor'].drop_duplicates()))

In [None]:
# Left join: every funds row is kept and gains the donor's class level 2 where one is known.
print('Shape before', df_funds.shape)
df_funds = pd.merge(
    df_funds,
    df_to_merge[['donor', 'donor_level2']],
    how='left',
    on='donor',
)
print('Shape after', df_funds.shape)

In [None]:
# Add a combined "cp: funds type" label and the first four-digit number found in 'cp'
# (the start year) for easier sorting.
df_funds = df_funds.assign(
    cp_funds_type=df_funds['cp'] + ': ' + df_funds['funds_type'],
    start_year=df_funds['cp'].str.extract(r'(\d{4})', expand=False),
)

In [None]:
# Replace missing 'donor' and 'donor_level2' values with the placeholder 'N.D.'.
df_funds = df_funds.fillna({'donor': 'N.D.', 'donor_level2': 'N.D.'})

**Aggregate Funds by Donor, Country and CP and write to Spreadsheet**

In [None]:
# Sum the allocation per donor, donor class, CP, funds type and start year.
# NOTE(review): with groupby's default dropna=True, rows that have NaN in any grouping key
# (e.g. an unmapped funds_type) are left out of the result; confirm that is intended.
df_funds_agg = (
    df_funds
    .groupby(['donor', 'donor_level2', 'cp', 'funds_type', 'start_year'], as_index=False)
    ['allocation']
    .sum()
)
print('Aggregated funds df', df_funds_agg.shape)

# Keep only entries whose absolute allocation is at least 1 USD.
df_funds_agg = df_funds_agg.loc[df_funds_agg['allocation'].abs().ge(1)]
print('Cleaned aggregated funds df', df_funds_agg.shape)

In [None]:
# Display the total allocation per CP and funds type.
df_funds_agg.groupby(by=['cp', 'funds_type'])['allocation'].agg('sum')

In [None]:
# Number of distinct donors left after aggregation.
print('Unique donors in aggregated funds:', len(pd.unique(df_funds_agg['donor'])))

**Output to Excel** 

Output funds to cp_funds_stucture.xlsx

In [None]:
# Write the aggregated funds table to the product path, without the index column.
df_funds_agg.to_excel(excel_writer=product['data'], index=False)

In [None]:
####################################################################################################