In [None]:
# No upstream value is set for this notebook.
upstream = None

# Code of the country to process.
COUNTRY = 'ARM'

# Path to the source data directory.
data_source = 'data/raw/insight-ram-summary-narrative-analysis'

# Path to save the final data product (stored under the 'data' key).
product = {
    'data': f'../data/processed/{COUNTRY}/COAR_Sections.xlsx',
}

This notebook performs COAR mining and produces a spreadsheet for further analysis (a copy of the **COAR Mining** section in `01_main`):

- Mine innovations from COAR 'Lessons Learned and Innovations' sections using GenAI.
- Mine collaborations and partnerships from COAR 'UN Collaboration and Other Partnerships' sections.
- COARs come from RAM Summary Narratives.

In [None]:
import re
from pathlib import Path
import pandas as pd
import unicef_cpe as cpe
from pathlib import Path
from unicef_cpe.config import PROJ_ROOT
from unicef_cpe import utils

**Preparing Narratives**

In [None]:
# Keep only the mapping entry whose key is the selected country code.
# COUNTRY is a plain string, so `key in COUNTRY` would be a substring test and
# would also accept any key that merely occurs inside the code; compare for
# equality instead.
country_map = {
    code: name
    for code, name in utils.get_ecaro_countries_mapping(priority=False).items()
    if code == COUNTRY
}

In [None]:
# Read every CSV in the source directory, tag its rows with the year found in
# the file name, and keep only the rows belonging to the selected country.
source_path = PROJ_ROOT / Path(data_source)

# The country filter is identical for every file, so resolve it once. Without a
# mapped name the comparison below would match nothing and the notebook would
# quietly produce an empty spreadsheet, so fail early with a clear message.
country_name = country_map.get(COUNTRY)
if country_name is None:
    raise ValueError(f'No country name is mapped for COUNTRY={COUNTRY!r}.')

df_list = []
for file_path in sorted(source_path.glob('*.csv')):
    # The year is the first run of four digits in the file name.
    match = re.search(r'\d{4}', file_path.name)
    if not match:
        print(f'Could not find a match in {file_path.name}. Skipping...')
        continue
    df_year = pd.read_csv(file_path)
    df_year['year'] = int(match.group())

    country_mask = df_year['BUSINESS_AREA_NAME'].eq(country_name)
    df_list.append(df_year[country_mask])

# pd.concat raises an opaque ValueError on an empty list; say what went wrong.
if not df_list:
    raise ValueError(f'No CSV files with a 4-digit year in their name were found in {source_path}.')

df_coars = pd.concat(df_list, axis=0, ignore_index=True)
print('Shape:', df_coars.shape)
print(df_coars.head())

In [None]:
# Source column -> output column. The values also define which columns are
# kept, and in which order, when the frame is reindexed later on.
to_rename = {
    'BUSINESS_AREA_NAME': 'country',
    'year': 'year',
    'NarrativeTitle': 'section',
    'NarrativeText': 'text',
}

# Country name -> ISO code lookup used to normalise the 'country' column.
to_replace = cpe.utils.get_ecaro_countries_mapping(keys='name', values='iso')

# Alternative spellings that should resolve to the code of an existing entry.
name_aliases = {
    'Moldova': 'Republic of Moldova',
    'Republic of Kyrgyzstan': 'Kyrgyzstan',
    'Republic of Montenegro': 'Montenegro',
    'Rep of Uzbekistan': 'Uzbekistan',
}
for alias, canonical_name in name_aliases.items():
    to_replace[alias] = to_replace[canonical_name]

# This entry maps to the literal label 'ECARO' rather than to another entry.
to_replace['ECARO, Switzerland'] = 'ECARO'

In [None]:
print('Shape before:', df_coars.shape)

# Keep only the renamed columns, drop rows whose text is missing or shorter
# than 10 characters after stripping, translate country names through
# to_replace, and order the rows by country and year.
df_coars = (
    df_coars
    .rename(columns=to_rename)
    .reindex(columns=list(to_rename.values()))
    .dropna(subset=['text'])
    .assign(text=lambda frame: frame['text'].str.strip())
    .loc[lambda frame: frame['text'].str.len().ge(10)]  # remove empty or short texts
    .reset_index(drop=True)
    .assign(country=lambda frame: frame['country'].replace(to_replace))
    .sort_values(['country', 'year'], ignore_index=True)
)

# Every remaining country value must be one of the codes to_replace can produce.
unmapped = set(df_coars['country']).difference(to_replace.values())
assert not unmapped, f'Add missing replacements for the following items: {unmapped}.'
print('Shape after:', df_coars.shape)
print(df_coars.head())

In [None]:
print('Shape before:', df_coars.shape)

# Separate the rows labelled 'ECARO' from the rest.
is_ecaro = df_coars['country'] == 'ECARO'
df_roars = df_coars[is_ecaro].reset_index(drop=True)  # copy ECARO
df_coars = df_coars[~is_ecaro].copy()  # remove ECARO

print('Shape after:', df_coars.shape)
print(df_coars.head())

In [None]:
# Number of rows per section title, shown as the cell output.
section_counts = df_coars['section'].value_counts()
section_counts

In [None]:
# Write the sections to the Excel file named by product['data'].
output_path = Path(product['data'])
output_dir = output_path.parent
output_dir.mkdir(parents=True, exist_ok=True)  # Create missing directories
df_coars.to_excel(output_path, index=False)