In [None]:
# IMPORTANT: The parameters below are set only for running this notebook independently.
# When executing the full Ploomber pipeline, these values will be overridden by the settings in `pipeline.yaml`.
# Any modifications made here will not persist when running the pipeline.

# Country to process. The code is interpolated into every input/output path below and
# is used further down to look up the country name in the ECARO country mapping.
COUNTRY =  'ARM' # Code of the country

# Input files read by this notebook, keyed by what look like upstream task names
# (presumably matching `pipeline.yaml` -- TODO confirm).
upstream = {
    # Narratives by section; filtered below on `section_title` to keep innovation-related sections.
    "03_coars_narratives": {
        "data": f"../data/processed/{COUNTRY}/narrative_by_section.xlsx",
    },
    # Output-level summary narrative; its `lessons_learned_and_innovations` column is used below.
    "02_summary_narrative_ouput_level": {
        "data": f"../data/processed/{COUNTRY}/output-summary-narrative.xlsx",
    }
}
# Output file written at the end of this notebook.
product = {
    "data": f"../data/processed/{COUNTRY}/evidence_of_innovations.xlsx",
}


# Identify and extract innovations

**Description**
Innovation can be found in different sources:
1. **COARs** documents
2. End-of-year Outcome/Output Summary Narrative
3. Programme Structure

The notebook extracts the innovations from the COARs documents and the end-of-year Outcome/Output Summary Narrative into a structured format.


In [None]:
import re
import pandas as pd
from pathlib import Path
from unicef_cpe.config import PROJ_ROOT
from unicef_cpe import utils, genai


In [None]:
# Restrict the mapping to the configured country only, to avoid unexpected costs
# and long execution times.
# `COUNTRY` is a single code string, so the keys are compared with `==`: the former
# `k in COUNTRY` was a substring test and would also accept any key that is merely
# a fragment of the configured code.
country_map = {
    code: name
    for code, name in utils.get_ecaro_countries_mapping(priority=False).items()
    if code == COUNTRY
}
# Reverse lookup: country name -> country code.
country_code_map = {name: code for code, name in country_map.items()}
# Model used for the summarisation calls further down.
MODEL = 'gpt-4o'

In [None]:
# Load both upstream workbooks in a fixed order: the COAR narratives first,
# then the output-level summary narrative.
df_narrative, df_output_narrative = (
    pd.read_excel(upstream[task_name]['data'])
    for task_name in ('03_coars_narratives', '02_summary_narrative_ouput_level')
)

In [None]:
# Preview the COAR narratives. Leaving the frame as the last expression lets the
# notebook render it as a table instead of the plain-text dump produced by print().
df_narrative.head()

In [None]:
# Preview the output-level summary narrative. Leaving the frame as the last expression
# lets the notebook render it as a table instead of the plain-text dump produced by print().
df_output_narrative.head()

In [None]:
# Collapse the output-level narratives into a single text per (country, year),
# keeping each distinct narrative once and separating them with a blank line.
#
# Missing values are dropped before joining: `unique` keeps NaN in the list and
# `.str.join` returns NaN as soon as a list holds a non-string item, so a single
# empty cell used to wipe out every other narrative of that country/year.
# A group without any text now yields '' (it is treated like a missing value when
# the `innovations` column is built further down).
df_output_narrative = (
    df_output_narrative
    .groupby(['country', 'year'], as_index=False)
    .agg({
        'lessons_learned_and_innovations':
            lambda texts: '\n\n'.join(texts.dropna().astype(str).unique()),
    })
)
df_output_narrative.head()

In [None]:
# Identify sections that potentially discuss innovation. The pattern 'nnovation'
# matches both "Innovation" and "innovation".
# `na=False` makes a missing section title count as "no match"; without it the mask
# contains NaN for those rows and using it for boolean indexing raises a ValueError.
innovation_mask = df_narrative['section_title'].str.contains('nnovation|novel', na=False)
print(df_narrative.loc[innovation_mask, 'section_title'].unique())

In [None]:
# Keep only the innovation-related sections and collapse them into a single text
# per (country, year), each distinct section once, separated by a blank line.
#
# Missing values are dropped before joining: `unique` keeps NaN in the list and
# `.str.join` returns NaN as soon as a list holds a non-string item, so a single
# empty section used to wipe out every other section of that country/year.
# A group without any text now yields '' (it is treated like a missing value when
# the `innovations` column is built further down).
df_narrative = (
    df_narrative[innovation_mask]
    .groupby(['country', 'year'], as_index=False)
    .agg({
        'section_text':
            lambda texts: '\n\n'.join(texts.dropna().astype(str).unique()),
    })
)
df_narrative.head()

In [None]:
# Attach the COAR innovation sections to the output-level narratives, matching rows
# on country and year.
# NOTE(review): this is a left join, so only (country, year) pairs present in the
# output-level narrative are kept -- confirm that COAR-only years should be dropped.
df_innovation = pd.merge(
    df_output_narrative,
    df_narrative,
    how='left',
    on=['country', 'year'],
)

In [None]:
# Build one text per row from both sources, with missing values treated as empty strings.
lessons_text = df_innovation['lessons_learned_and_innovations'].fillna('')
coar_text = df_innovation['section_text'].fillna('')

# Join the two parts with a blank line, then trim the newlines left over at either
# end when one (or both) of the parts is empty.
df_innovation['innovations'] = (lessons_text + '\n\n' + coar_text).str.strip('\n')


In [None]:
def _parse_innovations(markdown_text):
    """Split a generated bullet list into ``(title, description)`` pairs.

    Every bullet is expected to look like ``- **Title**: description``.

    Parameters
    ----------
    markdown_text : str
        Generated text, already stripped of its section header.

    Returns
    -------
    list[tuple[str, str]]
        One ``(title, description)`` pair per well-formed bullet, in order of appearance.
        Bullets without a ``**:`` marker are reported and skipped.
    """
    pairs = []
    # Prefix a newline so that a bullet sitting on the very first line is split off like
    # every other one; the first chunk is then always the (possibly empty) text that
    # precedes the first bullet and can be skipped safely.
    bullets = ('\n' + markdown_text).split('\n-')[1:]
    for bullet in bullets:
        # Split on the FIRST marker only, so a description that itself contains '**:'
        # is kept whole instead of being cut short.
        title_part, marker, description_part = bullet.partition('**:')
        if not marker:
            # Previously this raised an IndexError and aborted the whole run.
            if bullet.strip():
                print(f"Skipping bullet without a '**Title**:' marker: {bullet.strip()[:80]!r}")
            continue
        title = title_part.replace('- **', '').replace('**', '').replace('\n', '').strip()
        description = description_part.replace('\n', '').strip()
        pairs.append((title, description))
    return pairs


df_list = []

country_name = country_map.get(COUNTRY)
country_mask = df_innovation['country'].eq(country_name)

years = sorted(df_innovation[country_mask]['year'].unique())

for year in years:
    year_mask = df_innovation['year'] == year

    # The sources were aggregated per (country, year) before being merged, so the
    # first matching row carries the combined text for that year.
    year_texts = df_innovation.loc[country_mask & year_mask, 'innovations']
    text = year_texts.iloc[0] if not year_texts.empty else ''

    if not isinstance(text, str) or not text.strip():
        # Nothing to summarise: skip the year instead of paying for a model call on empty input.
        print(f"No narrative text available for {country_name} in {year}")
        continue

    print(f"Summarizing innovations for {country_name} in {year}")
    innovation = genai.summarise_innovations(text, country_name, model=MODEL, api_type='openai')

    # Clean text by removing potential generated headers
    innovation = utils.remove_section_title(innovation)

    for innovation_title, innovation_text in _parse_innovations(innovation):
        df_list.append([COUNTRY, country_name, year, innovation_title, innovation_text])


df_innovations = pd.DataFrame(df_list, columns=['country_code', 'country', 'year', 'innovation_title', 'innovation_text'])

In [None]:
# Preview the extracted innovations. Leaving the frame as the last expression lets the
# notebook render it as a table instead of the plain-text dump produced by print().
df_innovations.head()

In [None]:

# Persist the extracted innovations to the product location.
output_path = Path(product['data'])
output_path.parent.mkdir(parents=True, exist_ok=True)  # Create missing directories

# The former `df_innovations.reset_index()` call discarded its result and therefore did
# nothing; it is dropped because the frame is written without its index anyway.
df_innovations.to_excel(output_path, index=False)

In [None]:
################################################################################################################################################################################################