In [None]:
# IMPORTANT: The parameters below are set only for running this notebook independently.
# When executing the full Ploomber pipeline, these values will be overridden by the settings in `pipeline.yaml`.
# Any modifications made here will not persist when running the pipeline.
COUNTRY =  'ARM' # Country code used to filter the COAR rows and to build the paths below (presumably ISO 3166-1 alpha-3 -- confirm)
# Input: Excel workbook of COAR sections, read with `pd.read_excel` further down.
upstream = {
    "100_coar_sections": {
        "data": f"../data/processed/{COUNTRY}/COAR_Sections.xlsx",
    }
}
# Output: Excel workbook that the final cell writes the summaries table to.
product = {
    "data": f"../data/processed/{COUNTRY}/coars_innovation_summary.xlsx",
}

This notebook is a revision of the previously AI-generated **Summarising Narratives** section in 01_main. It produces a summary of the COAR contributions from the COAR text.

In [None]:
import re
from pathlib import Path
import importlib
import pandas as pd
from tqdm import tqdm
import unicef_cpe as cpe
from unicef_cpe.config import PROJ_ROOT


In [None]:
# Model name forwarded to every summarisation call in this notebook.
MODEL = 'gpt-4o'

# Normalise COUNTRY to a set of codes so the filter below is an exact match.
# With a plain string, `key in COUNTRY` is a substring test and would also accept
# partial (or empty) keys; a list/tuple of codes keeps working unchanged.
_selected_codes = {COUNTRY} if isinstance(COUNTRY, str) else set(COUNTRY)

# Mapping restricted to the selected country code(s). Keys are the codes stored in
# the `country` column; values are presumably display names -- confirm in cpe.utils.
country_map = {
    code: label
    for code, label in cpe.utils.get_ecaro_countries_mapping(priority=False).items()
    if code in _selected_codes
}
# Reverse lookup: mapped value -> country code.
country_code_map = {label: code for code, label in country_map.items()}

In [None]:

# Read the COAR sections workbook produced by the upstream task.
df_coars = pd.read_excel(upstream['100_coar_sections']['data'])

# Keep only the rows of the configured country; copy so later column
# assignments do not touch a slice of the full frame.
mask = df_coars['country'].eq(COUNTRY)
df_coars = df_coars.loc[mask].copy()

# One row per distinct country, indexed by country. Summary columns are
# added to this frame by the summarisation cell.
df_summaries = (
    df_coars[['country']]
    .drop_duplicates(ignore_index=True)
    .set_index('country')
)
print('Shape:', df_summaries.shape)
print(df_summaries.head())

In [None]:
print('Shape before:', df_coars.shape)


def _as_prompt_block(record):
    """Render one COAR row as a markdown block: a heading with section and year, then the text."""
    return '### {section} in the Country in {year}\n\n{text}'.format(
        section=record['section'],
        year=record['year'],
        text=record['text'],
    )


# Format every row, then merge all blocks of the same (country, section)
# into a single string separated by blank lines.
df_coars['input'] = df_coars.apply(_as_prompt_block, axis=1)
df_coars = (
    df_coars
    .groupby(['country', 'section'], as_index=False)
    .agg({'input': list})
    .assign(input=lambda grouped: grouped['input'].str.join('\n\n'))
)

print('Shape after:', df_coars.shape)
print(df_coars.head())

In [None]:
# Sanity check: how many rows each section label has in the current frame.
df_coars['section'].value_counts()

In [None]:
# Rough size check: number of whitespace-separated tokens in each input string.
input_word_counts = df_coars['input'].str.split().str.len()
input_word_counts.describe().round(1)

In [None]:
# Re-import the project's genai module so edits to it are picked up without
# restarting the kernel.
importlib.reload(cpe.genai)

# Maximum number of characters of section text sent per summarisation call
# (presumably sized to fit the model's context window -- confirm).
MAX_INPUT_CHARS = 110_000

# Section keyword -> summariser. A row is handled by the first entry whose keyword
# occurs in the lower-cased section name; the result is stored in the
# `<keyword>_summary` column. Commented-out entries are currently disabled.
mapping = {
   # 'context': cpe.genai.summarise_coar_context,
    'contributions': cpe.genai.summarise_coar_contributions,
    #'innovations': cpe.genai.summarise_coar_innovations,
    #'partnerships': cpe.genai.summarise_coar_partnerships,
}

# `iterrows()` is a generator with no length, so pass `total` explicitly;
# otherwise tqdm cannot show a percentage or an ETA.
for index, row in tqdm(df_coars.iterrows(), total=len(df_coars)):
    for keyword, summarise_fn in mapping.items():
        if keyword in row['section'].lower():
            summary = summarise_fn(
                row['input'][:MAX_INPUT_CHARS],
                country_map.get(row['country']),
                model=MODEL,
            )
            df_summaries.loc[row['country'], f'{keyword}_summary'] = summary
            break
    else:
        # for/else: reached only when no keyword matched this section.
        print('No matches for {} section at index {}.'.format(row['section'], index))
print('Shape:', df_summaries.shape)
print(df_summaries.head())

In [None]:
# Resolve the product path once, make sure its folder exists, then write the
# summaries table (indexed by country) to that Excel file.
output_path = Path(product['data'])
output_path.parent.mkdir(parents=True, exist_ok=True)
df_summaries.to_excel(output_path)