In [None]:
# IMPORTANT: The parameters below are set only for running this notebook independently. 
# When executing the full Ploomber pipeline, these values will be overridden by the settings in `pipeline.yaml`. 
# Any modifications made here will not persist when running the pipeline.

# Country to process; the value is interpolated into the input and output paths below.
COUNTRY =  'ARM' # Code of the Country

# Input file, keyed by the name of the upstream entry ("03_coars_narratives").
upstream = {
    "03_coars_narratives": {
        "data": f"../data/processed/{COUNTRY}/narrative_by_section.xlsx",
    },
}
# Output file written by the last cell of this notebook.
product = {
    "data": f"../data/processed/{COUNTRY}/evidence_of_gender_activity.xlsx",
}



This notebook uses an LLM to identify and extract approaches related to gender equality from COAR summaries.

# Identify and extract innovations

**Description**
Innovation can be found in different sources:
1. **COARs** documents
2. End of the year Outcome/Output Summary Narrative
3. Programme Structure

The notebook extracts the innovations from the COARs documents and the End of the year Outcome/Output Summary Narrative into a structured format.


In [None]:
import re
import pandas as pd
from pathlib import Path
from unicef_cpe.config import PROJ_ROOT
from unicef_cpe import utils, genai


In [None]:
# Restrict the mapping to the single configured country to avoid unexpected
# LLM costs and long execution times.
# COUNTRY is a string, so `k in COUNTRY` would be a substring test; an exact
# comparison is what is intended here.
country_map = {
    code: name
    for code, name in utils.get_ecaro_countries_mapping(priority=False).items()
    if code == COUNTRY
}
# Reverse lookup: country name -> country code.
country_code_map = {name: code for code, name in country_map.items()}
# Model identifier passed to the summarisation call.
MODEL = 'gpt-4o'

In [None]:
# Load the narrative sections from the file declared by the upstream entry.
narrative_path = upstream['03_coars_narratives']['data']
df_narrative = pd.read_excel(narrative_path)

In [None]:
# Distinct section texts available for each (country, year) pair.
texts_by_country_year = df_narrative.groupby(['country', 'year'])['section_text'].unique()
print(texts_by_country_year)

In [None]:
# Preview only the first rows, consistent with the other preview cells,
# instead of printing the whole frame.
print(df_narrative.head())

In [None]:
# Normalise section titles: title case, colons removed, surrounding
# whitespace trimmed.
# Missing titles are turned into '' first: an empty Excel cell is read as NaN,
# and calling `.title()` on it would raise AttributeError.
df_narrative['section_title'] = (
    df_narrative['section_title']
    .fillna('')
    .astype(str)
    .str.title()
    .str.replace(':', '', regex=False)
    .str.strip()
)

In [None]:
# Flag the sections whose title mentions "Contributions" or "Context";
# only these sections are kept for the gender summarisation below.
section_titles = df_narrative['section_title']
gender_mask = section_titles.str.contains('Contributions|Context')
matching_titles = df_narrative.loc[gender_mask, 'section_title'].unique()
print(matching_titles)

In [None]:
# Collapse the selected sections to one row per (country, year): collect the
# distinct section texts, then join them with a blank line in between.
selected_sections = df_narrative.loc[gender_mask]
df_narrative = (
    selected_sections
    .groupby(['country', 'year'], as_index=False)
    .agg({'section_text': 'unique'})
    .assign(section_text=lambda frame: frame['section_text'].str.join('\n\n'))
)
print(df_narrative.head())

In [None]:
def _parse_gender_activities(markdown_text):
    """Split a bullet-list response into ``(title, description)`` pairs.

    Each bullet is expected to look like ``- **Title**: description``
    (NOTE(review): this format is inferred from the parsing below; confirm
    against the prompt used by ``genai.summarise_gender``).

    Parameters
    ----------
    markdown_text : str
        Text to parse. Anything before the first line starting with ``-``
        is ignored.

    Returns
    -------
    list[tuple[str, str]]
        One ``(title, description)`` tuple per well-formed bullet, with
        newlines removed and surrounding whitespace stripped. Bullets without
        a ``**:`` delimiter are reported and skipped instead of raising.
    """
    # Prepending a newline makes a bullet on the very first line split like
    # every other bullet instead of being discarded with the preamble.
    bullets = ('\n' + markdown_text.lstrip()).split('\n-')[1:]

    parsed = []
    for bullet in bullets:
        if not bullet.strip():
            continue
        # partition() splits on the first '**:' only, so a later '**:' inside
        # the description no longer truncates it.
        title_part, delimiter, text_part = bullet.partition('**:')
        if not delimiter:
            print(f"Skipping bullet without a '**Title**:' prefix: {bullet.strip()[:80]!r}")
            continue
        title = title_part.replace('- **', '').replace('**', '').replace('\n', '').strip()
        description = text_part.replace('\n', '').strip()
        parsed.append((title, description))
    return parsed


df_list = []
country_name = country_map.get(COUNTRY)

country_mask = df_narrative['country'] == country_name
years = sorted(df_narrative[country_mask]['year'].unique())

for year in years:
    year_mask = df_narrative['year'] == year

    # section_text holds the joined text of the selected sections for this
    # country and year; take the first matching row.
    year_texts = df_narrative.loc[country_mask & year_mask, 'section_text']
    if year_texts.empty:
        # Skip if there's no narrative text for this combination of country and year
        print(f"No narrative text available for {country_name} in {year}")
        continue
    text = year_texts.iloc[0]

    print(f"Summarizing gender for {country_name} in {year}")
    gender_activity = genai.summarise_gender(text, country_name, model=MODEL, api_type='openai')

    # Clean text by removing potential generated headers
    gender_activity = utils.remove_section_title(gender_activity)

    for gender_activity_title, gender_activity_text in _parse_gender_activities(gender_activity):
        df_list.append([COUNTRY, country_name, year, gender_activity_title, gender_activity_text])


df_gender_activity = pd.DataFrame(
    df_list,
    columns=['country_code', 'country', 'year', 'gender_activity_title', 'gender_activity_text'],
)

In [None]:
# Show the first extracted gender activities as a quick sanity check.
gender_activity_preview = df_gender_activity.head()
print(gender_activity_preview)

In [None]:
# Write the extracted gender activities to the product path, creating the
# output directory first if it does not exist yet.
# The previous bare `df_gender_activity.reset_index()` call discarded its
# result and had no effect, so it is dropped; the written file is unchanged.
output_path = Path(product['data'])
output_path.parent.mkdir(parents=True, exist_ok=True)  # Create missing directories
df_gender_activity.to_excel(output_path, index=False)

In [None]:
################################################################################################################################################################################################