In [None]:
# IMPORTANT: The parameters below are set only for running this notebook independently. 
# When executing the full Ploomber pipeline, these values will be overridden by the settings in `pipeline.yaml`. 
# Any modifications here will not persist when running the pipeline.

COUNTRY =  'ARM' # Code of the country to process
upstream = {'01_text_extraction':{'data':f'../data/interim/{COUNTRY}/unicef-ecaro-cpe-corpus.jsonl'}} # Output of the upstream text-extraction task: path to the JSONL corpus (stored under the 'data' key)
product = {'data':f'../data/processed/{COUNTRY}/narrative_by_section.xlsx'} # Path to save the final data product (stored under the 'data' key)
data_source = '../data/raw/insight-ram-summary-narrative-analysis' # Directory scanned for the summary-narrative CSV files
STARTING_YEAR = 2011 # When < 2018, sections are additionally extracted from the COAR text corpus

### Extract Summary Narratives
**Description**
The **End-of-Year Summary Narratives** are documented in Country Office Annual Reports (**COARs**). Summary narratives from **2018 onwards are available in CSV** format, while earlier years require extraction from PDF documents and categorization by section.

This notebook performs the following tasks:

1.	Extracts **summary narratives from 2018 onwards** using available CSV data.
2.	If `STARTING_YEAR < 2018`, the following steps are executed:
	* Uses the `extract_section_titles_multiple_documents` function to retrieve section titles from the text extracted from the PDF files. Since exact extraction is not possible, an approximate, rule-based method is applied.
	* Creates a structured DataFrame, organizing the extracted sections by country and year for periods **before 2018**.
3.	Merges both DataFrames to generate a comprehensive dataset covering all available years.

In [None]:
import re
from pathlib import Path
import pandas as pd
import unicef_cpe 
from unicef_cpe.config import PROJ_ROOT, DATA_DIR

In [None]:
# Country code -> country name, restricted to the configured country.
# COUNTRY is a single code string, so it must be compared with `==`;
# `k in COUNTRY` would be a substring test and could match partial codes.
country_map = {
    code: name
    for code, name in unicef_cpe.utils.get_ecaro_countries_mapping(priority=False).items()
    if code == COUNTRY
}
# Inverse lookup (country name -> country code), used to tag rows that only carry the name.
country_code_map = {name: code for code, name in country_map.items()}

In [None]:
## Read the end-of-year summary narratives (COAR equivalent) for the years available as CSV.
df_list = []
for file_path in sorted(Path(data_source).glob('*.csv')):
    # The reporting year is taken from the first 4-digit run in the file name.
    match = re.search(r'\d{4}', file_path.name)
    if not match:
        print(f'Could not find a match in {file_path.name}. Skipping...')
        continue

    # Per-file frame gets its own name so it does not shadow the combined `df_coars` below.
    df_year = pd.read_csv(file_path)
    df_year['year'] = int(match.group())
    df_year['file_name'] = file_path.name
    df_year['file_type'] = 'summary-narrative'
    df_list.append(df_year)

# pd.concat fails with an opaque "No objects to concatenate" on an empty list;
# stop early with a message that points at the actual problem.
if not df_list:
    raise FileNotFoundError(
        f'No summary narrative CSV files with a year in their name were found in {data_source}'
    )

df_coars = pd.concat(df_list, axis=0, ignore_index=True)

# Rows carry the country name; translate it to the country code and keep only the relevant country.
df_coars['country_code'] = df_coars['BUSINESS_AREA_NAME'].replace(country_code_map)
df_coars = df_coars[df_coars['country_code'].eq(COUNTRY)].copy()

# Clean the narrative text after filtering so only the rows that are kept get processed.
df_coars['NarrativeText'] = df_coars['NarrativeText'].apply(unicef_cpe.utils.clean_text)

# Positional rename: assumes each CSV provides exactly four columns, in the order
# region, business area name, section title, narrative text — TODO confirm against the raw files.
df_coars.columns = ['region', 'country', 'section_title', 'section_text', 'year', 'file_name', 'document_type', 'country_code']
df_coars = df_coars[['country_code', 'country', 'year', 'section_title', 'section_text', 'file_name', 'document_type']].copy()

print('Shape:', df_coars.shape)
print(df_coars.head())

In [None]:
if STARTING_YEAR < 2018:
    # Load the 'coars' documents from the text corpus produced by the upstream extraction task.
    df_text = unicef_cpe.utils.read_corpora_jsonl(Path(upstream['01_text_extraction']['data']), 'coars')
    # The corpus keeps the country code in 'country'; preserve it as 'country_code'.
    df_text['country_code'] = df_text['country'].copy()
    # Translate the code into the country name so the column matches df_coars['country'].
    # The previous `replace(COUNTRY, value=None)` blanked (or forward-filled, depending on
    # the pandas version) the column instead of mapping it.
    df_text['country'] = df_text['country_code'].replace(country_map)
    print('Shape:', df_text.shape)
    print(df_text.head())

In [None]:
if STARTING_YEAR < 2018:
    # Map each year found in the corpus to the candidate section titles of that year's documents.
    year_sections = {}
    for year in sorted(df_text['year'].unique()):
        # Distinct document texts published in this year.
        yearly_texts = df_text.loc[df_text['year'] == year, 'text'].unique().tolist()

        # IMPORTANT: title extraction is rule based, so the result may be inexact.
        title_counts = unicef_cpe.document_processing.extract_section_titles_multiple_documents(yearly_texts)

        # Titles containing "page" are most likely just page numbers; drop them.
        year_sections[year] = [
            candidate for candidate in title_counts if 'page' not in candidate.lower()
        ]

In [None]:
if STARTING_YEAR < 2018:
    # Fixed schema so the frame has its columns even when no section was extracted;
    # otherwise a later `df_sections['year']` lookup would raise KeyError.
    # NOTE(review): this frame uses 'file_type' while df_coars uses 'document_type',
    # so concatenating them yields two half-empty columns — confirm which name downstream expects.
    section_columns = ['country_code', 'country', 'year', 'section_title', 'section_text', 'file_name', 'file_type']
    data = []

    # Split every document into (title, text) pairs using the titles found for its year.
    for _, row in df_text.iterrows():
        section_titles = year_sections[row['year']]
        sections = unicef_cpe.document_processing.extract_sections_with_text(row['text'], section_titles)

        # One output record per extracted section, with cleaned text.
        for section_title, section_text in sections:
            data.append({
                'country_code': row['country_code'],
                'country': row['country'],
                'year': row['year'],
                'section_title': section_title,
                'section_text': unicef_cpe.utils.clean_text(section_text),
                'file_name': row['file_name'],
                'file_type': row['file_type'],
            })

    # Build the frame once from the collected records.
    df_sections = pd.DataFrame(data, columns=section_columns)

In [None]:
if STARTING_YEAR < 2018:
    # Preview the extracted sections. A bare `df_sections.head()` inside an `if` block is not
    # the cell's last expression and would display nothing, so print it explicitly.
    print('Shape:', df_sections.shape)
    print(df_sections.head())

In [None]:
# Summary narratives only exist from 2018 onwards, so earlier years are covered by the
# sections extracted directly from the COARs and stacked on top of the narratives.
if STARTING_YEAR < 2018:
    pre_2018_sections = df_sections[df_sections['year'] < 2018]
    df = pd.concat([pre_2018_sections, df_coars]).copy()
else:
    df = df_coars.copy()
print(df.head())

In [None]:

# Write the combined table to the product path, creating the target folder first if it is missing.
output_path = Path(product['data'])
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_excel(output_path, index=False)

In [None]:
################################################################################################################################################################################################