In [None]:
# Notebook parameters. The three `None` values are placeholders: later cells index
# `data_source[...]` and `product[...]`, so real values must be in place before
# those cells run (presumably injected by the pipeline runner — TODO confirm).
upstream = None
product = None
data_source = None
# Country selector; a later cell uses it to filter the ECARO country mapping.
COUNTRY = 'ARM'

In [None]:
import os 
import re
from pathlib import Path
import pandas as pd


import importlib
from typing import List, Dict


from unicef_cpe.config import PROJ_ROOT
from unicef_cpe.extraction import extract_text_from_file
from unicef_cpe.utils import *
from unicef_cpe.genai import *

import unicef_cpe.processing as cpe_processing


In [None]:
# Normalise COUNTRY into a set so the filter below is an exact membership test.
# With a plain string, `key in COUNTRY` is a *substring* test (e.g. 'AR' in 'ARM'
# is True) and could select unintended keys. A list/tuple/set of selectors is
# accepted as well.
selected_country_codes = {COUNTRY} if isinstance(COUNTRY, str) else set(COUNTRY)

# Subset of the ECARO mapping restricted to the selected countries
# (keys are compared against COUNTRY, so presumably they are country codes — TODO confirm).
country_map = {
    key: value
    for key, value in get_ecaro_countries_mapping(priority=False).items()
    if key in selected_country_codes
}
# Reverse lookup (value -> key) of country_map.
country_code_map = {value: key for key, value in country_map.items()}
# Model identifier constant (not referenced in the cells shown here).
MODEL = 'gpt-4o'

In [None]:
def clean_raw_file(df: pd.DataFrame, min_year: int = 2013, max_year: int = 2025) -> pd.DataFrame:
    """Turn one raw Excel export (as read by ``pd.read_excel``) into a tidy table.

    The input frame is left untouched. Processing steps:

    1. Read the business-area name from the cell at row 9 / column 0, which must
       contain text of the form ``Business Area: <name> - <code>``, and translate
       it to the matching key of ``COUNTRIES``.
    2. Drop the first 15 rows, every 'Total' row, and the columns at positions 10 and 16.
    3. Assign the fixed column names, derive ``collaboration_type`` from the rows
       whose only populated cell is the first one, and remove those rows.
    4. Forward-fill the sparse identifier/date columns, parse the ``dd.mm.yyyy``
       dates, and add ``month`` / ``year`` period-start timestamps derived from
       ``fr_start_date``.
    5. Keep only rows whose start year lies in ``[min_year, max_year]`` (inclusive).

    Args:
        df: Raw sheet content with the default integer column labels.
        min_year: First start year to keep (inclusive).
        max_year: Last start year to keep (inclusive). The default of 2025 mirrors
            the original filter; NOTE(review): the original comment said 2024 — confirm.

    Returns:
        A new DataFrame with snake_case column names and ``transfers_to_ip`` as float.

    Raises:
        ValueError: If the business-area cell cannot be parsed, or its name is not
            a value of ``COUNTRIES``.
    """
    header_cell = df.iloc[9, 0]
    match = re.search(r'Business Area:\s+([\w\s]+)\s+-\s+\w+', str(header_cell))
    if match is None:
        raise ValueError(
            f"Could not find 'Business Area: <name> - <code>' in row 9, column 0: {header_cell!r}"
        )
    country_name = match.group(1)

    # COUNTRIES maps key -> name; look up the key whose value equals the parsed name.
    country = next((k for k, v in COUNTRIES.items() if v == country_name), None)
    if country is None:
        raise ValueError(f"Business area {country_name!r} is not a value of COUNTRIES")

    # drop first rows
    df = df.iloc[15:]

    # remove rows where third column equals 'Total'
    df = df[~(df.iloc[:, 2] == 'Total')]

    # remove rows where first / 7th column ends with 'Total'.
    # na=False makes non-string cells count as "not a total" instead of producing
    # NaN, which the `~` negation cannot handle.
    df = df[~df.iloc[:, 0].fillna('').str.endswith('Total', na=False)]
    df = df[~df.iloc[:, 6].fillna('').str.endswith('Total', na=False)]

    # drop some columns invisible in excel; explicit copy so the column
    # assignments below never write into a slice of the caller's frame
    df = df.drop(df.columns[[10, 16]], axis=1).copy()

    # Column names are hard-coded because the header cells in the sheet are merged.
    columns = ['Implementing Partner','eTools Ref Number','FR Number','FR Start Date','FR End Date','Itm','FR Item Description','FR Amount','FR Due Date','Outcome/Output','Grant','Commitment Type','Commitment Reference','Commitment Description','Commitment Amount','Transfers to IP']
    df.columns = columns

    # Rows where only the first column is populated act as section headers:
    # carry their text down as 'Collaboration type', then remove them.
    mask = df.iloc[:, 1:].isna().all(axis=1)
    df['Collaboration type'] = df.iloc[:, 0].where(mask, None).ffill()
    df = df[~mask].copy()
    df['Country'] = country

    df.columns = [col.lower().replace(' ', '_') for col in df.columns]

    # Forward-fill implementing_partner, fr_number, fr_start_date and fr_end_date
    # (positions 0, 2, 3, 4), then fill the remaining descriptive columns within
    # each (implementing_partner, fr_number) group.
    df.iloc[:, [0, 2, 3, 4]] = df.iloc[:, [0, 2, 3, 4]].ffill()
    df[['etools_ref_number', 'itm', 'fr_item_description', 'fr_due_date', ]] = df.groupby(['implementing_partner', 'fr_number'])[['etools_ref_number', 'itm', 'fr_item_description', 'fr_due_date', ]].ffill()

    # convert to date from dd.mm.yyyy (unparseable values become NaT)
    df['fr_start_date'] = pd.to_datetime(df['fr_start_date'], format='%d.%m.%Y', errors='coerce')
    df['fr_end_date'] = pd.to_datetime(df['fr_end_date'], format='%d.%m.%Y', errors='coerce')
    df['fr_due_date'] = pd.to_datetime(df['fr_due_date'], format='%d.%m.%Y', errors='coerce')
    df['month'] = df['fr_start_date'].dt.to_period('M').dt.to_timestamp(how='s')
    df['year'] = df['fr_start_date'].dt.to_period('Y').dt.to_timestamp(how='s')

    # Start years outside [min_year, max_year] are treated as data errors; rows
    # with an unparseable start date (NaT) fail both comparisons and are dropped too.
    first_allowed = pd.Timestamp(year=min_year, month=1, day=1)
    last_allowed = pd.Timestamp(year=max_year, month=1, day=1)
    df = df[(df['year'] >= first_allowed) & (df['year'] <= last_allowed)].copy()

    df['transfers_to_ip'] = df['transfers_to_ip'].astype(float)

    return df


In [None]:
# Folder with the raw Excel exports: first entry of data_source, relative to the project root.
data_path = PROJ_ROOT / data_source[0]

# Sorted so the workbooks are processed in a stable, name-based order.
excels = sorted(Path(data_path).glob('*.xlsx'))

# Read and clean every workbook, then stack the results into one frame with a fresh index.
dfs = [clean_raw_file(pd.read_excel(excel)) for excel in excels]
df = pd.concat(dfs, ignore_index=True)

In [None]:
# Preview the first rows of the combined, cleaned frame.
df.head()

In [None]:
from unicef_cpe.processing import get_programme_structure

# Get the programme structure dataframe
programme_structure_df = get_programme_structure()

# Show the available column names (later cells select columns of this frame by name)
programme_structure_df.columns

In [None]:

# Paths of the three auxiliary workbooks: entries 1, 2 and 3 of data_source, in that
# order, each resolved against the project root.
data_path_funds_info, data_path_partner_fr_activity_rel, data_path_partner_by_country = (
    PROJ_ROOT / data_source[position] for position in (1, 2, 3)
)


def drop_first_rows(df: pd.DataFrame, *args) -> pd.DataFrame:
    """Discard the five leading rows and promote the sixth row to column labels.

    Any extra positional arguments are accepted and ignored, so the function can be
    called with the same argument list as the other preprocessing steps.
    The resulting column labels are printed.

    Returns:
        The rows after the header row, labelled with the header row's values.
    """
    header = df.iloc[5]      # sixth row holds the real column names
    body = df.iloc[6:]       # everything below the header row
    body.columns = header
    print(body.columns)
    return body

def drop_totals_rows(df: pd.DataFrame, *args) -> pd.DataFrame:
    """Remove the rows whose first-column value is a string ending in 'Total'.

    Missing values and non-string cells (e.g. numbers) in the first column are
    treated as "not a total" and kept. Extra positional arguments are accepted and
    ignored so the function shares a call signature with the other preprocessing steps.

    Returns:
        A filtered DataFrame; the input is not modified.
    """
    first_column = df.iloc[:, 0].fillna('')
    # na=False: without it, non-string cells yield NaN and the `~` negation below
    # fails on the resulting object-dtype mask.
    is_total = first_column.str.endswith('Total', na=False)
    return df[~is_total]

def forward_fill(df: pd.DataFrame, columns: List, *args) -> pd.DataFrame:
    """Forward-fill missing values in the given columns.

    Works on a copy, so the caller's frame is never modified (the previous in-place
    assignment mutated the argument and triggered chained-assignment warnings when
    the argument was a filtered slice).

    Args:
        df: Frame to fill.
        columns: Column labels to forward-fill.
        *args: Ignored; accepted so all preprocessing steps share a call signature.

    Returns:
        A new DataFrame with the selected columns forward-filled.
    """
    filled = df.copy()
    filled.loc[:, columns] = filled.loc[:, columns].ffill()
    return filled

# Cleaning steps, applied in this order to each auxiliary sheet.
preprocessing_funcs = [drop_first_rows, drop_totals_rows, forward_fill]

# All three workbooks expose the data on a sheet with the same name.
df_funds_info, df_activity_rel, df_partner_country = (
    pd.read_excel(path, sheet_name='by BA, Partner & FR')
    for path in (data_path_funds_info, data_path_partner_fr_activity_rel, data_path_partner_by_country)
)

# Run each step over the three frames before moving on to the next step.
# Every step receives the frame's current column labels as its second argument.
for func in preprocessing_funcs:
    df_funds_info, df_activity_rel, df_partner_country = (
        func(frame, frame.columns)
        for frame in (df_funds_info, df_activity_rel, df_partner_country)
    )

# Keep only fund rows that carry an FR document number.
df_funds_info = df_funds_info[df_funds_info['FR Document Number'].notna()]

# Funds -> activity relation (by FR document) -> partner-by-country (by vendor);
# both joins use the default join type of DataFrame.merge.
df_extra_info = (
    df_funds_info.copy()
    .merge(df_activity_rel, left_on='FR Document Number', right_on=['FR Document Number'])
    .merge(df_partner_country, left_on=['Vendor Code', 'Vendor Name'], right_on=['Vendor Code', 'Vendor Name'])
)


In [None]:
# Filter partners by country.
# The country code is taken as the first three characters of the business-area label,
# upper-cased. The vectorised .str accessor leaves missing / non-string labels as NaN
# (which then fail the isin() filter) instead of raising like a per-row slice would.
# TODO: still a heuristic — it relies on the label starting with the country code.
df_extra_info['country_code'] = df_extra_info['Implementing Business Area'].str[:3].str.upper()
df_extra_info = df_extra_info[df_extra_info['country_code'].isin(COUNTRIES.keys())]

# Add vendor code by matching on partner name + FR number.
# Shapes are printed before and after every left join so that accidental row
# duplication caused by non-unique join keys is visible.
print(df.shape)
tmp_df = df.merge(
    df_extra_info[['Vendor Code', 'Vendor Name', 'FR Document Number']].drop_duplicates(),
    how='left',
    left_on=['implementing_partner', 'fr_number'],
    right_on=['Vendor Name', 'FR Document Number'],
    suffixes=('', '_partner'),
)
print(tmp_df.shape)

# Spread the first non-null vendor code found for a (partner, country) pair to all
# rows of that pair, including rows whose FR number did not match above.
partner_map = tmp_df.groupby(['implementing_partner', 'country'])['Vendor Code'].first(skipna=True).to_dict()
tmp_df['vendor_code'] = tmp_df.set_index(['implementing_partner', 'country']).index.map(partner_map)

# The raw join columns are no longer needed (reassigned rather than dropped in place).
tmp_df = tmp_df.drop(['Vendor Code', 'Vendor Name', 'FR Document Number'], axis=1)

# join activities to partners
print(tmp_df.shape)
tmp_df = tmp_df.merge(
    df_extra_info[['Vendor Code', 'FR Document Number', 'WBS Level4 - Activity']].drop_duplicates(),
    left_on=['vendor_code', 'fr_number'],
    right_on=['Vendor Code', 'FR Document Number'],
    how='left',
    suffixes=('', '_rel'),
)
print(tmp_df.shape)

# join donor
print(tmp_df.shape)
tmp_df = tmp_df.merge(
    df_extra_info[['Grant', 'Donor Name']].drop_duplicates(),
    left_on='grant',
    right_on='Grant',
    how='left',
    suffixes=('', '_donor'),
)
print(tmp_df.shape)

# join activity description; the activity code is the text before the first space
# of the 'WBS Level4 - Activity' value
tmp_df['activity_code'] = tmp_df['WBS Level4 - Activity'].str.split(' ').str[0]
print(tmp_df.shape)
tmp_df = tmp_df.merge(
    programme_structure_df[['activity_code', 'activity_description']].drop_duplicates(),
    how='left',
    left_on='activity_code',
    right_on='activity_code',
)
print(tmp_df.shape)

In [None]:
# Distinct combinations of the programme-structure attributes used for the lookups below.
code_mapping_df = programme_structure_df[['outcome_code', 'outcome_name', 'output_code', 'output_name', 'output_description', 'activity_code', 'activity_name', 'activity_description', 'goal_area_code', 'goal_area', 'strategy_name']].drop_duplicates()

# {column -> {activity_code -> value}}
code_mapping_dict = code_mapping_df.set_index('activity_code').to_dict()

# Attach each attribute to the transfers table through its activity code.
for attribute in ('outcome_name', 'output_name', 'output_description', 'goal_area_code', 'goal_area', 'strategy_name'):
    tmp_df[attribute] = tmp_df['activity_code'].map(code_mapping_dict[attribute])

In [None]:
#goals_path = OUTPUT_DATA_DIR.joinpath('cp_funds_and_SDG.xlsx')

# Workbook location: entry 4 of data_source, resolved against the project root.
goals_path  = PROJ_ROOT / data_source[4]

# Read with the read_excel defaults; the next cell uses its 'activity_wbs' and 'goal_area' columns.
goals_df = pd.read_excel(goals_path)

In [None]:
# Lookup activity_wbs -> first 'goal_area' value recorded for that activity in goals_df.
sdg_activity_map = goals_df.groupby(['activity_wbs'])['goal_area'].first().to_dict()

# Stored under 'sdg'; activity codes without an entry in the lookup become NaN.
# NOTE(review): the values come from a column named 'goal_area' — confirm they are SDGs.
tmp_df['sdg'] = tmp_df['activity_code'].map(sdg_activity_map)

In [None]:
# locations = pd.read_excel(RAW_DATA_DIR / 'pbi_programme_data_explorer_subnational [wbs_activity_location]' / 'activity_location.xlsx')
# Workbook location: entry 5 of data_source, resolved against the project root.
locations_path = PROJ_ROOT / data_source[5]
locations = pd.read_excel(locations_path)
# The row labelled 1 holds the real header names: promote it to column labels and keep
# the rows from label 2 onwards (.loc slices by label, not by position).
locations.columns = locations.loc[1]
locations = locations.loc[2:]

In [None]:

# Collapse the locations to one row per WBS element, joining the distinct location
# names with ';'. Missing locations are skipped and values are cast to str so that
# a NaN or non-string cell cannot make str.join raise.
locations = locations.groupby(['Subnational[Outcome/Output WBS Element]']).agg({
    'Subnational[Location]': lambda x: ';'.join(x.dropna().astype(str).unique())
})
# WBS element -> ';'-separated location names
location_map = locations['Subnational[Location]'].to_dict()

In [None]:
# Build the lookup key for location_map: remove the '/' characters from 'outcome/output'
# and append '000' (presumably to match the WBS element format of the locations data — TODO confirm).
tmp_df['outcome/output_no_slash'] = tmp_df['outcome/output'].str.replace('/', '') + '000'
# Rows whose key is not in location_map get NaN.
tmp_df['locations'] = tmp_df['outcome/output_no_slash'].map(location_map)

In [None]:
# Export the enriched transfers table. Create the target folder first (as is done for
# product['data'] in the final export cell) so the write does not fail on a missing directory.
Path(product['data2']).parent.mkdir(parents=True, exist_ok=True)
tmp_df.to_excel(product['data2'], index=False)

In [None]:
# One row per distinct (country, year, implementing_partner), sorted by those columns.
# Later cells merge additional attributes into this frame before it is exported.
df_partner_list = df[['country', 'year', 'implementing_partner']].drop_duplicates().sort_values(['country', 'year', 'implementing_partner'])
df_partner_list.head()

### Plot unique partners by country

In [None]:
# Number of distinct implementing partners per country and year.
df_to_plot = df.groupby(['country', 'year'])['implementing_partner'].nunique().reset_index()
# Keep a separate copy for the Excel export, because df_to_plot is reassigned by later cells.
df_partner_count_by_country_year = df_to_plot.copy()

### Plot new partners compared to previous years

In [None]:
# One row per distinct (country, year, partner), ordered by year then partner name.
df_to_plot = (
    df[['country', 'year', 'implementing_partner']]
    .drop_duplicates()
    .sort_values(['year', 'implementing_partner'], ascending=[True, True])
)

# A partner is 'Existing' in a given year when the same partner name already appears in
# an earlier year, i.e. when the row's year is later than the partner's first year.
# This replaces a row-by-row loop that special-cased index label 0: labels are not
# positions after de-duplication and sorting, so whichever row carried label 0 was
# always marked 'New partner', even if the partner had appeared in an earlier year.
first_year_seen = df_to_plot.groupby('implementing_partner')['year'].transform('min')
df_to_plot['existing'] = (df_to_plot['year'] > first_year_seen).map(
    {True: 'Existing partner', False: 'New partner'}
)

# Attach the new/existing flag to the partner list.
df_partner_list = df_partner_list.merge(df_to_plot, on=['country', 'year', 'implementing_partner'])

# Count new vs existing partners per country and year. Years before 2014 are left out
# of the result.
df_to_plot = df_to_plot.groupby(['country', 'year', 'existing']).size().unstack().reset_index().query('year >= "2014-01-01"')
partner_count_new_to_past = df_to_plot.copy()
df_to_plot.head()

### Plot area chart with partners split into groups: below 100K per year, 100K to 1M per year, and above 1M per year

In [None]:
# Summary statistics of the total transfers per partner and year.
df.groupby(['implementing_partner', 'year']).agg(
        {'transfers_to_ip': 'sum'}
    ).reset_index().describe()

In [None]:
# Display the partner list as built so far (country, year, partner and the new/existing flag).
df_partner_list

In [None]:
# Define the bins and labels: up to 100K, above 100K up to 1M, above 1M
bins = [-float('inf'), 1e5, 1e6, float('inf')]
labels = ['Below 100K', '100K to 1M', 'Above 1M']

df_to_plot = df.copy()

# Total FR amount per country, partner and year
df_partner_year_transfers = df_to_plot.groupby(['country', 'implementing_partner', 'year']).agg(
        {'fr_amount': 'sum'}
    ).reset_index()

# Bin the yearly 'fr_amount' total of each partner (note: fr_amount, not transfers_to_ip)
df_partner_year_transfers['partner_rating_binned'] = pd.cut(df_partner_year_transfers['fr_amount'], bins=bins, labels=labels)

# Create a dictionary for mapping: (country, partner, year) -> bin label
rating_dict = df_partner_year_transfers.set_index(['country', 'implementing_partner', 'year'])['partner_rating_binned'].to_dict()

# Map the binned ratings back to the original dataframe and to the partner list
df_to_plot['partner_rating_binned'] = df_to_plot.set_index(['country', 'implementing_partner', 'year']).index.map(rating_dict)
df_partner_list['partner_rating_binned'] = df_partner_list.set_index(['country', 'implementing_partner', 'year']).index.map(rating_dict)

# Per country, year and bin: total FR amount and number of distinct partners
df_to_plot = df_to_plot.groupby(['country', 'year', 'partner_rating_binned']).agg(
    {'fr_amount': 'sum',
     'implementing_partner': 'nunique'},
    ).reset_index()

# Keep a copy for the Excel export, because df_to_plot is reassigned by later cells
df_partner_fr_consumed_by_year = df_to_plot.copy()

df_to_plot.head()


### Agreement duration plot (not used)

In [None]:
# Exploratory cell (the section heading marks it as not used): groups the partners
# active in each quarter, per country.
freq = 'QS'
# Define the bins and labels. bins are periods of length freq of min and max df month
bins = pd.date_range(start=df['month'].min(), end=df['month'].max(), freq=freq)
# One label per interval between consecutive bin edges, named after the interval start
labels = [f'{start.year} - Q{start.quarter}' for start, end in zip(bins, bins[1:])]

# NOTE(review): the result of this call is discarded (it is neither assigned nor the
# last expression of the cell).
pd.cut(df['month'], bins=bins, labels=labels)

# NOTE(review): df_country is overwritten on every iteration, so only the last
# country's result survives the loop and is displayed below.
for country in df['country'].unique():
    df_country = df[df['country'] == country].copy()

    df_country['bins'] = pd.cut(df_country['month'], bins=bins, labels=labels)

    # Set of distinct partners per quarter bin
    df_country = df_country.groupby(['bins']).agg(
        {'implementing_partner': set}
    )

    
df_country.head()


### Get partner types dataframe and map

In [None]:
# Partner-type reference data, with both type columns normalised to Title Case.
df_ecaro_partner_list = cpe_processing.read_partner_types_data()
df_ecaro_partner_list['partner_type'] = df_ecaro_partner_list['partner_type'].str.title()

cso_type = df_ecaro_partner_list['partner_and_cso_type'].str.title()
# Values ending in ' -' have every ' -' removed (same result as before for strings).
# The vectorised form leaves missing values untouched, whereas calling .endswith on
# each element raised for NaN.
has_dangling_dash = cso_type.str.endswith(' -', na=False)
df_ecaro_partner_list['partner_and_cso_type'] = cso_type.where(
    ~has_dangling_dash, cso_type.str.replace(' -', '', regex=False)
)
df_ecaro_partner_list.head()

In [None]:
### When a partner is associated with several types, only the first one is kept.
### This may not be entirely correct.

# Lookups keyed by vendor name
_by_vendor_name = df_ecaro_partner_list.groupby(['vendor_name'])
partner_type_map = _by_vendor_name['partner_type'].first().to_dict()
cso_type_map = _by_vendor_name['partner_and_cso_type'].first().to_dict()

# Lookup keyed by vendor code
cso_code_type_map = df_ecaro_partner_list.groupby(['vendor_code'])['partner_and_cso_type'].first().to_dict()

In [None]:
# Partner/CSO type looked up by vendor code; codes missing from the lookup give NaN.
# NOTE(review): tmp_df is written to product['data2'] in an earlier cell, so when the
# cells run top to bottom this column is not part of that export — confirm this is intended.
tmp_df['partner_type'] = tmp_df['vendor_code'].map(cso_code_type_map)

In [None]:
# Type lookups by partner name (the transfers frame `df` carries no vendor code).
# 'partner_type' receives the combined partner/CSO type, 'partner_macro_type' the plain
# partner type; names missing from the lookups give NaN.
df['partner_type'] = df['implementing_partner'].map(cso_type_map)
df['partner_macro_type'] = df['implementing_partner'].map(partner_type_map)
df_partner_list['partner_type'] = df_partner_list['implementing_partner'].map(cso_type_map)

### Map outputs to goal area

In [None]:
# NOTE(review): this repeats the import and the load done in an earlier cell;
# programme_structure_df is rebuilt here by calling get_programme_structure() again.
from unicef_cpe.processing import get_programme_structure

# Get the programme structure dataframe
programme_structure_df = get_programme_structure()

# Display the first rows of the dataframe
programme_structure_df.head()

In [None]:
# output_mapping = programme_structure_df[['goal_area', 'output_code']].drop_duplicates().set_index('output_code')['goal_area'].to_dict()
# Distinct (output_code, generic_intervention_name) pairs, joined onto the transfers in the next cell.
df_output_mapping = programme_structure_df[['output_code', 'generic_intervention_name']].drop_duplicates()

df_output_mapping.head()

### Plot partner type count by gic

In [None]:
df_to_plot = df.copy()
# Attach the generic intervention name by matching 'outcome/output' to output_code
# (left join: transfers without a match keep NaN).
df_to_plot = df_to_plot.merge(df_output_mapping, left_on='outcome/output', right_on='output_code', how='left')
# Distinct partner / type / intervention / country combinations
df_to_plot = df_to_plot[['implementing_partner', 'partner_type', 'generic_intervention_name', 'country']].drop_duplicates()
# Add the intervention name to the partner list (merge with its default join type).
# NOTE(review): a partner with several interventions yields several rows here — confirm intended.
df_partner_list = df_partner_list.merge(df_to_plot, on=['country', 'implementing_partner', 'partner_type'])
# Number of distinct partners per country, intervention and partner type
df_to_plot = df_to_plot.groupby(['country', 'generic_intervention_name', 'partner_type']).agg({'implementing_partner': 'nunique'}).reset_index()
# Keep a copy for the Excel export
df_partner_count_by_gic = df_to_plot.copy()
df_to_plot.head()

### Write excels

In [None]:
# Write the dataframes to separate sheets of a single workbook

output_path = Path(product['data'])
output_path.parent.mkdir(parents=True, exist_ok=True)  # Create missing directories

# Sheet name -> frame, in the order the sheets are written.
# NOTE(review): 'df_partner_count_by_country_year' is 32 characters long, above Excel's
# 31-character sheet-name limit, and is the only name with a 'df_' prefix. It is kept
# unchanged because downstream readers may look the sheet up by this name — confirm.
sheets = {
    'df_partner_count_by_country_year': df_partner_count_by_country_year,
    'partner_count_new_to_past': partner_count_new_to_past,
    'partner_fr_consumed_by_year': df_partner_fr_consumed_by_year,
    'partner_count_by_gic': df_partner_count_by_gic,
    'partner_list': df_partner_list,
}

# The context manager saves and closes the workbook even if one of the writes fails.
with pd.ExcelWriter(path=product['data'], engine='openpyxl', mode='w') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)