In [None]:
# IMPORTANT: The parameters below are defaults used only when this notebook is run on its own.
# When the full Ploomber pipeline is executed, these values are overridden by the settings in `pipeline.yaml`,
# so any modification made here does not persist in pipeline runs.

upstream = None
COUNTRY =  'ARM' # Country code to process; compared with the code parsed from each source file name
product = {'data': f'../data/processed/{COUNTRY}/output-summary-narrative.xlsx'}  # Path of the final Excel data product (stored under the 'data' key)
data_source = 'data/raw/insight-ram-combined-outcomes-outputs-and-end-year-summary-narratives-report'  # Source data directory, joined onto PROJ_ROOT when loading

This Notebook extracts output narratives from the annual End Year Summary Narratives Report.

# Extract narratives from outcomes and outputs
## End of the year summaries

**Description**
Each Outcome and Output has updates every year with the following paragraphs:
Headline Statement
1. Output Analytical Statement of Progress
2. Lessons Learned and Innovations
3. Contributions
4. Partnerships

Source data from: **data/raw/insight-ram-combined-outcomes-outputs-and-end-year-summary-narratives-report** (default value of the `data_source` parameter)


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from unicef_cpe.config import PROJ_ROOT
import re
import unicef_cpe.utils as utils

In [None]:
# Keep only the mapping entries whose key is contained in COUNTRY, then build
# the reverse lookup (value -> key) from that restricted mapping.
# NOTE(review): when COUNTRY is a string, `in` is a substring test — confirm
# that an equality check is not what is intended here.
country_map = {
    code: name
    for code, name in utils.get_ecaro_countries_mapping(priority=False).items()
    if code in COUNTRY
}
country_code_map = dict(zip(country_map.values(), country_map.keys()))

In [None]:
# Resolve the source directory under the project root and pick the first CSV
# in sorted order; the last three dash-separated tokens of its file name
# (minus the extension) give the country programme label.
source_path = PROJ_ROOT / data_source
file_path = sorted(Path(source_path).glob('*.csv'))[0]
country_programme = '-'.join(
    file_path.name.split('-')[-3:]
).replace('.csv','')

In [None]:
# Load every CSV report that belongs to COUNTRY and stack them into one frame.
# File names are presumably shaped like
# `<prefix>-<country>-<year>-<three programme tokens>.csv` — TODO confirm
# against the real files in `data_source`.
df_list = []
for file_path in sorted(Path(source_path).glob('*.csv')):
    name_parts = file_path.name.split('-')
    if len(name_parts) < 5:
        # Too few tokens to locate the country code: report and move on,
        # mirroring how a missing year is handled below.
        print(f'Unexpected file name format in {file_path.name}. Skipping...')
        continue
    # Index from the end so that names with a longer prefix still parse;
    # the country code is the fifth token from the end.
    country = name_parts[-5]
    country_programme = '-'.join(name_parts[-3:]).replace('.csv','')
    # The reporting year is the first 4-digit run found in the file name.
    match = re.search(r'\d{4}', file_path.name)
    if not match:
        print(f'Could not find a match in {file_path.name}. Skipping...')
        continue
    if COUNTRY == country:
        report = pd.read_csv(file_path)
        report['year'] = int(match.group())
        report['country_code'] = country
        report['country'] = country_map.get(country)
        report['country_programme'] = country_programme
        df_list.append(report)

if not df_list:
    # pd.concat would raise a ValueError with an opaque message on an empty
    # list; fail with the same exception type but an actionable message.
    raise ValueError(f'No CSV report found for country {COUNTRY!r} in {source_path}')

df = pd.concat(df_list, axis=0, ignore_index=True)
print('Shape:', df.shape)
print(df.head())

In [None]:
# #Identify which column to keep and which to remove
# to_keep = ['country_code', 'country', 'country_programme']
# to_remove =[]
# for col in df.columns:
#     # if the column always has the same value, it is safe to remove it
#     if df[col].nunique()<=1:
#         pass#to_remove.append(col)
#     # We are not interested in the indicator as for now
#     # elif 'indicator' not in col.lower():
#     else:
#         to_keep.append(col)
# print(len(to_keep),len(to_remove))
# print('Shape before:', df.shape)
# df = df[to_keep].copy()


In [None]:

# Pass every cell through utils.clean_text, then drop the rows that are exact
# duplicates of an earlier row.
df = df.map(utils.clean_text).drop_duplicates()
print('Shape after:', df.shape)
print(df)

Structure of the dataframe, depending on the value of the **PCR_FULL_TEXT** column:
1. **Outcome Statement**
   1. **Textbox1** -> Outcome code + name
   2. **PCR_FULL_TEXT** -> Outcome Statement
   3. **Textbox3** -> Output code + name
   4. **Textbox12** -> Output Statement 
2. **NarrativeTitle**  is empty
3.  **Update on the context and situation of children** has only:
    1.   **Textbox1**-> **Document Title**
    2.   **Textbox9**-> **Text Description**
4.  **Major contributions and drivers of results** has only:
    1.   **Textbox1**-> **Document Title**
    2.   **Textbox9**-> **Text Description**
5.  **UN Collaboration and Other Partnerships**  has only:
    1.   **Textbox1**-> **Document Title**
    2.   **Textbox9**-> **Text Description**
6.  **Lessons Learned and Innovations**  has only:
    1.   **Textbox1**-> **Document Title**
    2.   **Textbox9**-> **Text Description**

In [None]:
# Column groups used to slice the frame in the following cells.

# Identification columns shared by both output frames.
general_columns = [
    'country_programme',
    'year',
    'country_code',
    'country',
    'section',
]
# Columns filled for the non-outcome (COAR) sections.
coars_columns = ['coar_title', 'section_description']
# Columns filled for 'Outcome Statement' records.
narrative_columns = [
    'outcome_code', 'outcome_name', 'outcome_description',
    'output_code', 'output_name', 'output_description',
]
# Every raw column whose name begins with 'Indicator_Status'.
indicator_columns = [name for name in df.columns if name.startswith('Indicator_Status')]
# Raw narrative text columns (spelling follows the source column names).
description_columns = [
    'Progress_headline_statement3', 'Progress_headline_statement4', 'Progress_headline_statement7',
    'Require_adjustments4', 'Require_adjustments5',
    'Details_of_Contribution_Test4', 'Details_of_Contribution_Test5',
    'Partnerhsips4', 'Partnerhsips5',
]

In [None]:
def extract_section(row):
    """Return the section label for a single ``PCR_FULL_TEXT`` value.

    Args:
        row: One cell value (despite the parameter name, not a whole row).

    Returns:
        ``'Outcome Statement'`` when the value is a string starting with that
        prefix; otherwise the value itself, unchanged. Non-string values such
        as ``None`` or NaN are passed through instead of raising
        ``AttributeError``.
    """
    if isinstance(row, str) and row.startswith('Outcome Statement'):
        return 'Outcome Statement'
    return row

# Label each record with its section, derived from its PCR_FULL_TEXT value
df['section'] = df['PCR_FULL_TEXT'].map(extract_section)

def process_outcome_and_output(row):
    """Split one record into outcome/output fields or COAR section fields.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'section' key.
            'Outcome Statement' records are read from 'Textbox1',
            'PCR_FULL_TEXT', 'Textbox3' and 'Textbox12'; any other record is
            read from 'Textbox1' and 'Textbox9' via ``.get``.

    Returns:
        An 8-tuple ``(outcome_code, outcome_name, outcome_description,
        output_code, output_name, output_description, coar_title,
        section_description)``. Outcome records fill the first six slots,
        other records fill the last two, and an outcome record that cannot be
        parsed yields eight ``None`` values.
    """
    if row['section'] != 'Outcome Statement':
        # Other sections only carry a title (Textbox1) and a text (Textbox9).
        return (None,) * 6 + (row.get('Textbox1', None), row.get('Textbox9', None))

    try:
        # Textbox1: label 'Outcome:' followed by "<code> <name>".
        outcome_code, outcome_name = (
            row['Textbox1'].replace('Outcome:', '').strip().split(' ', 1)
        )
        # PCR_FULL_TEXT: the outcome statement itself.
        outcome_description = row['PCR_FULL_TEXT'].replace('Outcome Statement:', '').strip()
        # Textbox3: label 'Output:' followed by "<code> <name>".
        output_code, output_name = (
            row['Textbox3'].replace('Output:', '').strip().split(' ', 1)
        )
        # Textbox12: the output statement.
        output_description = row['Textbox12'].replace('Output Statement:', '').strip()
    except (ValueError, AttributeError):
        # ValueError: no space between code and name; AttributeError: a
        # non-string (e.g. missing) cell. Either way the record is blanked.
        return (None,) * 8

    return (
        outcome_code,
        outcome_name,
        outcome_description,
        output_code,
        output_name,
        output_description,
        None,
        None,
    )

# Parse every row; the 8-tuple returned for each one is expanded into the six
# narrative columns followed by the two COAR columns.
df[narrative_columns + coars_columns] = df.apply(
    process_outcome_and_output,
    axis=1,
    result_type='expand',
)

In [None]:
# Split the records in two frames, depending on whether they are
# 'Outcome Statement' records or not.
mask = df['section'].eq('Outcome Statement')

# 1. df_coars: sections of the final approved COAR, de-duplicated and without
#    the 'NarrativeTitle' records.
df_coars = df.loc[~mask, general_columns + coars_columns].drop_duplicates()
df_coars = df_coars.loc[df_coars['section'].ne('NarrativeTitle')].copy()

# 2. df_narrative: individual Outcome and Output statements, de-duplicated.
to_keep = general_columns + narrative_columns + description_columns + indicator_columns
df_narrative = df.loc[mask, to_keep].drop_duplicates()

In [None]:

# Collapse the numbered raw columns into one column per narrative paragraph.
# Each paragraph takes its preferred raw column and falls back to the
# alternate one where the preferred value is missing; the analytical statement
# takes, per row, the first non-missing value across the indicator columns.
new_description_columns = [
    'headline_statement',
    'output_analytical_statement_of_progress',
    'lessons_learned_and_innovations',
    'contributions',
    'partnerhsips',
]
new_narrative_columns = general_columns + narrative_columns + new_description_columns

df_narrative = df_narrative.assign(
    headline_statement=lambda d: d['Progress_headline_statement3'].fillna(d['Progress_headline_statement4']),
    output_analytical_statement_of_progress=lambda d: d[indicator_columns].bfill(axis=1).iloc[:, 0],
    lessons_learned_and_innovations=lambda d: d['Require_adjustments5'].fillna(d['Require_adjustments4']),
    contributions=lambda d: d['Details_of_Contribution_Test5'].fillna(d['Details_of_Contribution_Test4']),
    partnerhsips=lambda d: d['Partnerhsips5'].fillna(d['Partnerhsips4']),
)[new_narrative_columns].copy()

In [None]:
def aggregate_non_nan(values):
    """Pick the first usable entry of ``values``.

    Entries that are missing (as judged by pandas) or equal to the empty
    string are skipped.

    Args:
        values: Iterable of scalar values, e.g. one group of a column.

    Returns:
        The first entry that is neither missing nor ``''``, or ``np.nan``
        when no such entry exists.
    """
    for candidate in values:
        if pd.isna(candidate) or candidate == '':
            continue
        return candidate
    return np.nan

# Collapse the rows that share the same general + narrative keys, keeping for
# each description column its first non-empty value.
# NOTE(review): groupby drops groups whose key contains NaN by default
# (dropna=True) — confirm this is intended for records that failed parsing.
df_narrative = (
    df_narrative
    .groupby(general_columns + narrative_columns)
    .agg(dict.fromkeys(new_description_columns, aggregate_non_nan))
    .reset_index()
)

In [None]:
# Preview the first five aggregated rows
print(df_narrative.head(n=5))

In [None]:

# Write the aggregated narratives to the Excel product, creating the target
# folder (and any missing parents) first.
output_path = Path(product['data'])
output_path.parent.mkdir(exist_ok=True, parents=True)
df_narrative.to_excel(output_path, index=False)

In [None]:
#######################################################################################################################################################################