In [None]:
# NOTE: the defaults in this cell exist only so that the notebook can be run on its own.
# During a full Ploomber pipeline run they are replaced by the values configured in
# `pipeline.yaml`, so edits made here have no effect on pipeline executions.

COUNTRY = 'ARM'  # country code

# Output locations written at the end of the notebook.
product = {
    "data": f"../data/processed/{COUNTRY}/unsdcf-partner_type_mapping.xlsx",
    "data2": f"../data/processed/{COUNTRY}/unsdcf-framework-with-partner-types.xlsx",
}

# Input files, in order: [0] the UNSDCF framework export (CSV), [1] the partner list (Excel).
data_source = [
    'data/raw/unsdcf-framework-2021-2025-exported-2024-11-18-3.csv',
    'data/raw/insight-ram3-partner-types/ecar-partner-list.xlsx',
]



This notebook uses UNSDCF data and Insight partner-type data to classify contributing partners and implementation partners by partner type, using an LLM.

In [None]:
import pandas as pd
import re
from openpyxl import load_workbook
import json
import tqdm
from typing import List

from unicef_cpe.config import PROJ_ROOT

from unicef_cpe.utils import *
import unicef_cpe
from pathlib import Path

from unicef_cpe.genai import prompt

In [None]:


# Read the UNSDCF framework export (first entry of `data_source`), resolved against the project root.
data_path = PROJ_ROOT / data_source[0]
df = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Columns whose cell values are split into individual partners and classified.
columns_to_classify = ['Contributing partners', 'Implementation partners']

# Add one integer flag column per classified column, initialised to 0 for every row.
for partner_col in columns_to_classify:
    df[f'{partner_col} uses comma separator'] = 0

In [None]:
# Load the partner list workbook (second entry of `data_source`) from the project root.
partner_data_path = PROJ_ROOT / data_source[1]
partner_types_df = read_excel_sheet(partner_data_path, 'by BA, Partner & FR')

# The row at position 5 supplies the column labels; the data starts at position 6.
partner_types_df.columns = partner_types_df.iloc[5]
partner_types_df = (
    partner_types_df.iloc[6:][['Partner Type', 'Partner and CSO Type']]
    .drop_duplicates()
    .rename(columns={'Partner Type': 'category', 'Partner and CSO Type': 'sub_category'})
)

# Show the distinct (category, sub_category) combinations as a list of records.
partner_types_df.to_dict(orient='records')


In [None]:
# Allowed (category, sub_category) combinations, in the order they are serialised into the
# classification prompt. Each entry is a dict with the keys 'category' and 'sub_category'.
classification_map = [
    {'category': category, 'sub_category': sub_category}
    for category, sub_category in (
        ('GOVERNMENT', 'LOCAL GOVERNMENT'),
        ('GOVERNMENT', 'CENTRAL GOVERNMENT'),
        ('CIVIL SOCIETY ORGANIZATION', 'CIVIL SOCIETY ORGANIZATION - NATIONAL NGO'),
        ('PRIVATE SECTOR', 'PRIVATE SECTOR'),
        ('MULTI-LATERAL ORGANIZATION', 'MULTI-LATERAL ORGANIZATION'),
        ('CIVIL SOCIETY ORGANIZATION', 'CIVIL SOCIETY ORGANIZATION'),
        ('CIVIL SOCIETY ORGANIZATION', 'CIVIL SOCIETY ORGANIZATION - INTERNATIONAL NGO'),
        ('UN AGENCY', 'UN AGENCY'),
        ('CIVIL SOCIETY ORGANIZATION', 'CIVIL SOCIETY ORGANIZATION - ACADEMIC INSTITUTION'),
        ('CIVIL SOCIETY ORGANIZATION', 'CIVIL SOCIETY ORGANIZATION - COMMUNITY BASED ORGANIZATION'),
        ('CIVIL SOCIETY ORGANIZATION', 'CIVIL SOCIETY ORGANIZATION - RED CROSS/RED CRESCENT NATIONAL SOCIETIES'),
        ('FINANCIAL SERVICE PROVIDER', 'FINANCIAL SERVICE PROVIDER'),
    )
]

In [None]:
# System prompt for the comma-detection call: given a (country, partner(s) name) tuple, the
# model is asked to reply 1 when the text lists several comma-separated partners, else 0.
detect_comma_separator_message = """You are a helpful assistant. 
Your task is to determine if each line contains multiple organization/government/entity names separated by a comma. 
You will receive names in the form of a tuple: (country, partner(s) name). 
'Core funds' is a specification and not a separate partner name.
If multiple partner names are separated by a comma, reply with 1. If there is only one partner name, reply with 0."""

# System prompt for the classification call. The single %s placeholder is filled below with
# the JSON-serialised `classification_map`; the model is asked to answer with a JSON list
# containing one {"category", "sub_category"} object per input line.
system_message = """You are a helpful assistant. Your task is to classify the following partners from the UNSDCF report into the following categories and subcategories: %s
You will receive a list of partners, with each partner on a separate line. 
Each line includes the country associated with the report and the partner's name. 
Your task is to provide for each line one of the JSON above.
Provide the result as a list of JSON, code for list only, nothing more. 
If there is significant uncertainty about the partner type, use 'uncertain' for both the category and subcategory.
Example:

Input:
(Armenia, UNICEF)
(Armenia, UNFPA)

Output:
[{"category": "UN AGENCY", "sub_category": "UN AGENCY"}, {"category": "UN AGENCY", "sub_category": "UN AGENCY"}]
""" % json.dumps(classification_map)

# Display the fully rendered classification prompt for inspection.
system_message

In [None]:
# Shared state for the cells below:
#   seen              - empty set
#   type_map          - (country, partner) -> classification dict ({} while unclassified)
#   tested_for_commas - str((country, cell value)) keys already sent to the comma-detection prompt
seen, type_map, tested_for_commas = set(), {}, []

In [None]:
# Pass 1: register every distinct (country, partner) pair found in the columns to classify.
# Cells equal to '-' (or missing) are skipped, cells containing ';' are split on it, and any
# other cell is treated as a single partner name. Each new pair gets an empty dict in
# `type_map` as a "not yet classified" placeholder.
for idx, row in tqdm.tqdm(df.iterrows(), total=len(df)):  # total= lets tqdm show real progress
    country = row['Country']
    for col in columns_to_classify:
        col_value = row[col]
        # Empty CSV cells are loaded by pandas as NaN (a float); the string operations below
        # would raise on them, so they are skipped like the '-' placeholder.
        if pd.isna(col_value) or col_value == '-':
            continue
        if ';' in col_value:
            partners = col_value.split(';')
        elif ',,,,,,' in col_value:  # NOT USED, BAD RESULTS
            # The ',,,,,,' sentinel keeps the LLM-based comma detection effectively disabled.
            # NOTE(review): if re-enabled, a repeated (country, cell) is skipped entirely here,
            # so its row never gets the ' uses comma separator' flag - revisit before use.
            comma_key = str((country, col_value))
            if comma_key in tested_for_commas:
                continue
            comma_separator = prompt(comma_key, detect_comma_separator_message, model='gpt-4o')
            tested_for_commas.append(comma_key)
            print(comma_key, comma_separator)
            try:
                comma_separator = int(comma_separator)
            except (TypeError, ValueError):
                # Report the cell under test; no individual partner name exists yet at this point.
                print(f"An error occurred when dealing with commas in partner {(country, col_value)}")
                comma_separator = 0

            if comma_separator == 1:
                df.loc[idx, col + ' uses comma separator'] = comma_separator
                partners = col_value.split(',')
            else:
                partners = [col_value]
        else:
            partners = [col_value]

        for partner in partners:
            partner = partner.strip()
            # Keep an existing classification; only add the placeholder for unseen pairs.
            type_map.setdefault((country, partner), {})

In [None]:
# Pass 2: classify every partner whose `type_map` entry is still the empty placeholder.
# Partners are sent to the LLM in batches, one "(country, partner)" tuple per line. Results are
# stored batch by batch, so re-running this cell after a failure only re-sends what is missing.
partner_list = [p for p, classification in type_map.items() if classification == {}]
batch_size = 10
for i in range(0, len(partner_list), batch_size):
    sublist = partner_list[i:i+batch_size]
    partners_str = '\n'.join(str(p) for p in sublist)
    ai_classification = prompt(partners_str, system_message, model='gpt-4o')
    if isinstance(ai_classification, str):
        # Models sometimes wrap JSON in a Markdown code fence (```json ... ```); remove it so
        # json.loads receives the bare list.
        ai_classification = re.sub(r'^```(?:json)?\s*|\s*```$', '', ai_classification.strip())
    result = json.loads(ai_classification)
    # Require a list: a JSON object of the right length would otherwise pass the count check.
    if not isinstance(result, list) or len(result) != len(sublist):
        raise ValueError("Incorrect number of result received back")
    # Validate the whole batch before storing anything, so a malformed item fails here with
    # context rather than as a bare KeyError when the classifications are read back.
    for p, t in zip(sublist, result):
        if not isinstance(t, dict) or 'category' not in t or 'sub_category' not in t:
            raise ValueError(f"Malformed classification received for partner {p}: {t!r}")
    for p, t in zip(sublist, result):
        type_map[p] = t
    

In [None]:
# Pass 3: write the classifications back onto the framework rows.
# For each classified column, the cell is split into partners (on ',' when the row's
# ' uses comma separator' flag is 1, otherwise on ';'), each partner is looked up in `type_map`,
# and the categories / sub-categories are stored as ';'-joined strings in two new columns.
for idx, row in tqdm.tqdm(df.iterrows(), total=len(df)):  # total= lets tqdm show real progress
    country = row['Country']
    for col in columns_to_classify:
        tmp_category, tmp_subcategory = [], []
        col_value = row[col]
        # Empty CSV cells are loaded by pandas as NaN (a float) and have no .split();
        # they are skipped like the '-' placeholder, leaving the output columns empty.
        if pd.isna(col_value) or col_value == '-':
            continue
        separator = ',' if row[col + ' uses comma separator'] == 1 else ';'
        partners = col_value.split(separator)
        for partner in partners:
            partner = partner.strip()
            json_dict = type_map[(country, partner)]
            tmp_category.append(json_dict['category'])
            tmp_subcategory.append(json_dict['sub_category'])

        df.loc[idx, col + ' partner category'] = ';'.join(tmp_category)
        df.loc[idx, col + ' partner sub category'] = ';'.join(tmp_subcategory)

In [None]:
# Manual check of the comma-detection prompt on one sample cell value.
# NOTE: `country` is whatever value the preceding loops left bound; run those cells first.
# This cell deliberately does not write to `df`: `idx` and `col` here would be leftovers from
# the preceding loop, so a write would flag an unrelated row in the exported frame.
col_value = 'USAID, Government of the United Kingdom, Government of Germany, Government of the Republic of Korea'
if ';' in col_value:
    partners = col_value.split(';')
elif ',' in col_value:
    comma_key = str((country, col_value))
    comma_separator = prompt(comma_key, detect_comma_separator_message, model='gpt-4o')
    tested_for_commas.append(comma_key)
    try:
        comma_separator = int(comma_separator)
    except (TypeError, ValueError):
        # Report the cell under test rather than a partner name left over from an earlier loop.
        print(f"An error occurred when dealing with commas in partner {(country, col_value)}")
        comma_separator = 0

    # Split on commas only when the model answered 1 (several partners in the cell).
    partners = col_value.split(',') if comma_separator == 1 else [col_value]
else:
    partners = [col_value]

partners

In [None]:
# Flatten `type_map` into one row per (country, partner) and export it.
# Columns are filled by key, not by position: the classification dicts come from LLM output,
# so their key order is not guaranteed and positional renaming could swap 'category' and
# 'sub_category'. Missing keys (e.g. a still-unclassified placeholder) become empty cells.
partner_type_df = pd.DataFrame(
    [
        {
            'country': country_name,
            'partner': partner_name,
            'category': classification.get('category'),
            'sub_category': classification.get('sub_category'),
        }
        for (country_name, partner_name), classification in type_map.items()
    ],
    # Explicit columns keep the header stable even when `type_map` is empty.
    columns=['country', 'partner', 'category', 'sub_category'],
)

partner_type_df.to_excel(product['data'], index=False)

In [None]:
# Export the framework frame, including the partner category columns, to the second product path.
df.to_excel(excel_writer=product['data2'], index=False)