In [None]:
# IMPORTANT: The parameters below are defaults used only when this notebook is run on its own.
# When executing the full Ploomber pipeline, these values will be overridden by the settings in `pipeline.yaml`,
# so any modifications made here will not persist when running the pipeline.
upstream = None
COUNTRY =  'ARM' # Country code; rows are later filtered to the files whose name carries this code
product = {'data': f'../data/processed/{COUNTRY}/insight_trips.xlsx'}  # Path to save the final data product (stored under the 'data' key)
data_source = 'data/raw/insight-finance-trips'  # Source data directory; joined onto PROJ_ROOT when the CSV files are listed

# Efficiency
## Efficiency - 4
### To what extent were human resources allocated in alignment with the defined priorities? Are staff structures defined based on fund availability/priorities? How is this enabling/restricting UNICEF delivery?

This Notebook summarizes the trip approval reports from Insight (copy of **Trip Approval Report** in 01_main).

In [None]:
import re
from pathlib import Path
import pandas as pd
import unicef_cpe as cpe

from unicef_cpe.config import PROJ_ROOT


# COUNTRY may be a single code (string) or a collection of codes. Normalise it
# to a set so the filter below is an exact match: testing `key in COUNTRY`
# against a plain string would be a substring test and could match partial codes.
selected_countries = {COUNTRY} if isinstance(COUNTRY, str) else set(COUNTRY)

# Subset of the ECARO mapping restricted to the selected country/countries.
# NOTE(review): keys are presumably country codes, since they are matched
# against COUNTRY - confirm against cpe.utils.get_ecaro_countries_mapping.
country_map = {
    key: value
    for key, value in cpe.utils.get_ecaro_countries_mapping(priority=False).items()
    if key in selected_countries
}

# Reverse lookup: mapping values back to their keys.
country_code_map = {value: key for key, value in country_map.items()}

In [None]:
# Resolve the source directory against the project root and collect every
# CSV file directly inside it, in sorted path order.
source_path = Path(PROJ_ROOT, data_source)
file_paths = sorted(source_path.glob('*.csv'))

In [None]:
# Raw export column names -> short names used in the rest of the notebook.
# Only these columns are read from each file.
to_rename = {
    'TRIP_REASON3': 'reason',
    'BEGIN_DATE3': 'date',
    'TA_AMOUNT2': 'amount',
}

# Fail early with a clear message: pd.concat raises an opaque
# "No objects to concatenate" error when it receives an empty list.
if not file_paths:
    raise FileNotFoundError(f'No CSV files found in {source_path}')

df_list = []
for file_path in file_paths:
    # The country label is the second-to-last whitespace-separated token of
    # the file name (extension included in the last token).
    # NOTE(review): relies on the export file naming convention - confirm.
    country = file_path.name.split()[-2]
    df_country = pd.read_csv(file_path, usecols=list(to_rename))
    df_country.insert(0, 'country', country)
    df_list.append(df_country)

df_trips = (
    pd.concat(df_list, axis=0, ignore_index=True)
    .rename(columns=to_rename)
    .drop_duplicates(ignore_index=True)  # records duplicated by 'Approved by'
)
print('Shape:', df_trips.shape)
print(df_trips.head())

In [None]:
# Parse the trip start date (day.month.year text) into datetimes.
print('Shape before:', df_trips.shape)
df_trips = df_trips.assign(
    date=pd.to_datetime(df_trips['date'], format='%d.%m.%Y'),
)
# NOTE(review): 'amount' is used exactly as parsed by read_csv; the earlier
# comma-stripping float conversion is disabled - confirm the column is numeric.
print('Shape after:', df_trips.shape)

# Keep only the records that belong to the country under review.
mask = df_trips['country'] == COUNTRY
df_trips = df_trips.loc[mask].copy()
print(df_trips.head())

In [None]:
# Number of missing values in each column.
df_trips.isna().sum(axis=0)

In [None]:
# Discard every record with a missing value in any column and renumber the index.
print('Shape before:', df_trips.shape)
df_trips = df_trips.dropna(axis=0, how='any', ignore_index=True)
print('Shape after:', df_trips.shape)
print(df_trips.head())

In [None]:
# Number of trips per reason (rows) and calendar year (columns).
trip_year = df_trips['date'].dt.year
trips_per_year_and_reason = df_trips.groupby([trip_year, 'reason']).size()
trips_per_year_and_reason.unstack(level=0)

In [None]:
# Restrict the data to trips starting in 2018 or later.
print('Shape before:', df_trips.shape)
mask = df_trips['date'].dt.year >= 2018  # before 2018 reason is always OTHER
df_trips = df_trips[mask].copy()
print('Shape after:', df_trips.shape)
print(df_trips.head())

In [None]:
# Frequency of each trip reason, most common first.
reason_counts = df_trips['reason'].value_counts()
reason_counts

In [None]:
# Keep the ten most frequent reasons as they are; relabel all others as 'OTHER'.
top_reasons = df_trips['reason'].value_counts().nlargest(10)
to_keep = top_reasons.index.tolist()
print('Shape before:', df_trips.shape)
is_top_reason = df_trips['reason'].isin(to_keep)
df_trips['reason'] = df_trips['reason'].where(is_top_reason, 'OTHER')
print('Shape after:', df_trips.shape)
print(df_trips.head())

In [None]:
# Total amount per country, month ('ME' frequency bins on 'date') and reason.
group_keys = ['country', pd.Grouper(key='date', freq='ME'), 'reason']
print('Shape before:', df_trips.shape)
df_trips = df_trips.groupby(group_keys, as_index=False).agg(amount=('amount', 'sum'))
print('Shape after:', df_trips.shape)
print(df_trips.head())

In [None]:
# Write the aggregated table to the Excel file declared under product['data'],
# creating the destination folder first if it does not exist yet.
output_path = Path(product['data'])
output_path.parent.mkdir(parents=True, exist_ok=True)
df_trips.to_excel(output_path, index=False)