In [None]:
# IMPORTANT: The parameters below are defaults used only when this notebook is run on its own.
# When the full Ploomber pipeline is executed, these values are overridden by the settings in `pipeline.yaml`,
# so any modifications made here do not persist in pipeline runs.
upstream = None
COUNTRY =  'ARM' # Country code; the combined indicators are filtered to rows whose `country` equals this value
product = {'data': f'../data/processed/{COUNTRY}/insight_indicators.xlsx'}  # Path to save the final data product (stored under the 'data' key)
data_source = 'data/raw/insight-ram-data-sets'  # Source data directory; joined onto PROJ_ROOT when the files are listed

In [None]:
import re
from pathlib import Path
import pandas as pd
import unicef_cpe as cpe
from unicef_cpe.config import PROJ_ROOT
from unicef_cpe.utils import get_ecaro_countries_mapping

# Keep only the mapping entries whose key occurs in COUNTRY.
# NOTE(review): when COUNTRY is a string, `in` is a substring test rather than an
# equality test — confirm this is the intended filter.
country_map = dict(
    (key, value)
    for key, value in get_ecaro_countries_mapping(priority=False).items()
    if key in COUNTRY
)
# Inverse lookup: mapped value -> original key.
country_code_map = dict(zip(country_map.values(), country_map.keys()))

This notebook extracts indicator data (standard and additional indicators) and produces a spreadsheet for further analysis (a copy of **RAM Indicators** in 01_main).

In [None]:
# Directory holding the source workbooks, resolved against the project root.
source_path = PROJ_ROOT / Path(data_source)
# List the workbooks in a stable (sorted) order. Excel lock files ("~$<name>.xlsx"),
# which exist while a workbook is open, match the glob but cannot be parsed, so skip them.
file_paths = sorted(
    path for path in source_path.glob('*.xlsx') if not path.name.startswith('~$')
)
file_paths

In [129]:
def _coerce_or_none(value, caster):
    """Return ``caster(value)``, or ``None`` when ``value`` cannot be converted."""
    try:
        return caster(value)
    except (TypeError, ValueError, OverflowError):
        # Unparseable cells (free text, blanks, NaN passed to int, ...) become missing.
        return None


def clean_indicators(df: pd.DataFrame, verbose: bool = False) -> pd.DataFrame:
    """Clean one raw indicator sheet and return it with a fixed set of columns.

    Steps, in order:

    1. Normalise column names (strip, lower-case, whitespace runs -> ``_``).
    2. Keep rows whose ``result_area`` starts with a digit and whose
       ``indicator_rating_type`` is ``'End-year assessment'``.
    3. Drop rows without an ``indicator_actual`` value.
    4. Convert year columns with ``int`` and value columns with ``float``;
       values that cannot be converted become missing.
    5. Derive ``country`` from the last ``" - "``-separated part of
       ``business_area``, translated through the project country mapping.
    6. Reindex to the output columns (absent ones are added empty).

    Parameters
    ----------
    df : pd.DataFrame
        Raw sheet. It is not modified; a cleaned copy is returned.
    verbose : bool, default False
        Print the frame shape before and after cleaning.

    Returns
    -------
    pd.DataFrame
        Cleaned rows (possibly none) with the fixed output columns.

    Raises
    ------
    KeyError
        If a column the cleaning logic reads is absent from ``df``.
    """
    if verbose:
        print("Shape before:", df.shape)

    # Work on a renamed copy so the caller's frame keeps its original headers.
    df = df.rename(columns=lambda name: re.sub(r"\s+", "_", str(name).strip().lower()))

    required = (
        "result_area",
        "indicator_rating_type",
        "indicator_actual",
        "baseline_year",
        "target_year",
        "baseline_value",
        "target_value",
        "business_area",
    )
    missing = [column for column in required if column not in df.columns]
    if missing:
        raise KeyError(f"clean_indicators: missing expected column(s): {missing}")

    mask = df["result_area"].fillna("").str.match(r"\d+") & (df["indicator_rating_type"] == 'End-year assessment')
    # Dropping all-NaN columns can remove columns used below (all of them when no
    # row matched), so every later column access is guarded.
    df = df.loc[mask].dropna(axis=1, how="all")
    if "indicator_actual" in df.columns:
        df = df.dropna(subset=["indicator_actual"]).copy()
    else:
        # No remaining row carries an actual value, so nothing is kept.
        df = df.iloc[0:0].copy()

    conversions = (
        ("baseline_year", int),
        ("target_year", int),
        ("baseline_value", float),
        ("target_value", float),
        ("indicator_actual", float),
    )
    for column, caster in conversions:
        if column in df.columns:
            df[column] = df[column].apply(_coerce_or_none, args=(caster,))

    if "business_area" in df.columns:
        mapping = cpe.utils.get_ecaro_countries_mapping(keys="code", values="iso")
        df["country"] = df["business_area"].str.split(" - ").str.get(-1).replace(mapping)

    to_keep = [
        "country",
        "indicator_code",
        "indicator",
        "indicator_category",
        "indicator_unit",
        "baseline_year",
        "baseline_value",
        "target_year",
        "target_value",
        "finalization_date",
        "indicator_actual",
        "indicator_rating_type",
        "rating",
    ]
    # Reindexing both selects the output columns and re-adds any that were dropped.
    df = df.reindex(columns=to_keep)
    if verbose:
        print("Shape after:", df.shape)
    return df

In [None]:
# Fail early with a clear message: with no input workbooks, pd.concat would
# raise "No objects to concatenate", which hides the actual cause.
if not file_paths:
    raise FileNotFoundError(f'No .xlsx files found in {source_path}')

# Read every workbook (skipping the 7 rows above the header) and clean it.
dfs = [
    clean_indicators(pd.read_excel(file_path, skiprows=7))
    for file_path in file_paths
]

df_indicators = pd.concat(dfs, axis=0, ignore_index=True)
print('Shape before:', df_indicators.shape)

# Remove exact duplicate rows, then order the rows deterministically.
df_indicators = (
    df_indicators
    .drop_duplicates(ignore_index=True)
    .sort_values(
        by=['country', 'indicator_code', 'baseline_year'],
        ignore_index=True,
    )
)
# Keep only the rows of the country this run is parameterised for.
df_indicators = df_indicators.loc[df_indicators['country'].eq(COUNTRY)].copy()

print('Shape after:', df_indicators.shape)


In [None]:
# Deduplicate: keep one row per (country, indicator_code, target_year).
print('Shape before:', df_indicators.shape)

to_keep = ['country', 'indicator_code', 'target_year', 'finalization_date']
*group_keys, date_column = to_keep

# Parse the finalisation date so that sorting is chronological rather than textual.
df_indicators[date_column] = pd.to_datetime(df_indicators[date_column], format='%d.%m.%Y')

# After sorting by keys and date, the last row of each group is the one with the
# most recent finalisation date.
# NOTE(review): groupby drops rows whose key contains NaN by default, so rows with
# a missing target_year do not survive this step — confirm this is intended.
df_indicators = (
    df_indicators
    .sort_values(to_keep)
    .groupby(group_keys)
    .tail(1)
)
assert not df_indicators.duplicated(subset=group_keys).any(), 'Duplicates'

print('Shape after:', df_indicators.shape)

In [None]:
print('Shape before:', df_indicators.shape)

# Per row: is the actual value strictly above the target?
# NOTE(review): an actual equal to the target is not counted as achieved — confirm.
achieved_flags = df_indicators.eval('indicator_actual > target_value')

# Share of achieved rows within each (country, indicator_code) group,
# broadcast back onto the rows of that group.
achieved_share = achieved_flags.groupby(
    [df_indicators['country'], df_indicators['indicator_code']]
).transform('mean')

# Bucket the share into three labels; the first edge sits just below 0 so that
# a share of exactly 0 falls into the first bin.
progress_bins = [-1e-3, 1/3, 2/3, 1.]
progress_labels = ['Off Track', 'Mixed Progress', 'On Track']
df_indicators['progress'] = pd.cut(
    x=achieved_share,
    bins=progress_bins,
    labels=progress_labels,
).astype(str)

print('Shape after:', df_indicators.shape)

In [133]:
# Create the output directory first so the export also works when the notebook
# is run on its own and the country folder does not exist yet.
Path(product['data']).parent.mkdir(parents=True, exist_ok=True)
df_indicators.to_excel(product['data'], index=False)