In [None]:
# IMPORTANT: The parameters below are defaults used only when this notebook is run on its own.
# When the full Ploomber pipeline is executed, these values are overridden by the settings in `pipeline.yaml`,
# so any modification made here does not carry over to pipeline runs.


YEAR = 2018  # lower bound (inclusive) applied to the `year` column of the narratives
COUNTRY =  'ARM' # Country code; used as the key into the country mapping and inside the data paths below
STARTING_YEAR = 2018  # NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed
# Input file of this notebook; presumably the product of the `03_coars_narratives` task - verify against pipeline.yaml
upstream = {
    "03_coars_narratives": {
        "data": f"../data/processed/{COUNTRY}/narrative_by_section.xlsx",
    }
}
# Output workbook written by the last cell (sheets `context_by_year` and `context_overall`)
product = {
    "data": f"../data/processed/{COUNTRY}/coar_context_summaries.xlsx",
}

# Context

This notebook uses an LLM to summarise the country context described in the COAR narratives, both for each year and overall.

In [None]:
import re
import pandas as pd
from unicef_cpe.config import PROJ_ROOT
from unicef_cpe import utils, genai
from pathlib import Path

In [None]:
# Restrict the mapping to the configured country to avoid unexpected costs and long execution times.
# COUNTRY is a single code string, so `code in COUNTRY` would be a *substring* test
# (any key that happens to be a fragment of the code would slip through); compare for equality instead.
country_map = {
    code: name
    for code, name in utils.get_ecaro_countries_mapping(priority=False).items()
    if code == COUNTRY
}
# Reverse lookup: country name -> country code.
country_code_map = {name: code for code, name in country_map.items()}
# Model identifier passed to the summarisation helper in the cells below.
MODEL = 'gpt-4o'

In [None]:
# Load the narratives workbook declared as this notebook's upstream input and take a first look.
narratives_path = upstream['03_coars_narratives']['data']
df_coars = pd.read_excel(narratives_path)
print(f'Shape: {df_coars.shape}')
print(df_coars.head())

In [None]:
## Keep ONLY the first row for each country and year, as it describes the context.
# NOTE: `groupby(...).first()` returns the first *non-null value of every column*, so it can stitch
# together values from different rows (e.g. the title of one section with the text of another).
# Selecting the first row per group keeps each record intact.
group_keys = ['country_code', 'country', 'year']
df_coars = (
    df_coars
    .dropna(subset=group_keys)                          # groupby silently dropped rows with missing keys; keep doing so
    .drop_duplicates(subset=group_keys, keep='first')   # first row of every (country_code, country, year)
    .sort_values(group_keys)                            # same ordering the groupby produced
    .reset_index(drop=True)
)
print(df_coars.head())

In [None]:
## NOTE. Keep only the context from YEAR onwards (YEAR itself included).
print(f'Shape before: {df_coars.shape}')
year_mask = df_coars['year'].ge(YEAR)
df_coars = df_coars.loc[year_mask].copy()
print(f'Shape After: {df_coars.shape}')

In [None]:
# Prompt text for one row: a markdown heading with the section title and year, followed by the section text.
INPUT_TEMPLATE = '#### {section_title} in the Country in {year}\n\n{section_text}'


def _render_input(row):
    """Fill the prompt template with the columns of a single narrative row."""
    return INPUT_TEMPLATE.format(**row)


print(f'Shape before: {df_coars.shape}')

df_coars['input'] = df_coars.apply(_render_input, axis=1)

print(f'Shape after: {df_coars.shape}')
print(df_coars.head())

In [None]:
# Distribution of the number of whitespace-separated words per prompt.
words_per_input = df_coars['input'].str.split().str.len()
words_per_input.describe().round(1)

In [None]:
# Display the reverse lookup (country name -> country code) built from `country_map`.
country_code_map

In [None]:
# Summarise the context of the selected country separately for every available year.
yearly_rows = []
yearly_summary_options = dict(
    number_of_paragraphs='one',
    paragraph_length=100,
    model=MODEL,
    api_type='openai',
)

country_name = country_map.get(COUNTRY)
country_mask = df_coars['country'] == country_name

years = sorted(df_coars.loc[country_mask, 'year'].unique())

for year in years:
    year_mask = df_coars['year'] == year
    inputs_for_year = df_coars.loc[country_mask & year_mask, 'input']

    if inputs_for_year.empty:
        # Skip if there's no narrative text for this combination of country and year
        print(f"No narrative text available for {country_name} in {year}")
        continue

    # We take only the text from the first section as it is the **context** section
    text = inputs_for_year.iloc[0]

    print(f"Summarizing context for {country_name} in {year}")
    context = genai.summarise_coar_context(text, country_name, **yearly_summary_options)
    yearly_rows.append([year, COUNTRY, country_name, context])

df_context_by_year = pd.DataFrame(
    yearly_rows,
    columns=['year', 'country_code', 'country', 'context'],
)

In [None]:
# Quick look at the per-year summaries.
print(f'Shape: {df_context_by_year.shape}')
print(df_context_by_year.head())

In [None]:
# Merge the yearly prompts of every (country, section title) pair into one text,
# with a blank line between consecutive entries.
df_coars = (
    df_coars
    .groupby(['country', 'section_title'], as_index=False)
    .agg({'input': lambda parts: '\n\n'.join(parts)})
)
print(f'Shape: {df_coars.shape}')
print(df_coars.head())

In [None]:
# Summarise the context of the selected country over all retained years.
df_list = []

country_mask = df_coars['country'] == country_name
country_inputs = df_coars.loc[country_mask, 'input']

if country_inputs.empty:
    # Nothing to summarise. Previously the code printed this message and then still called the LLM
    # with whatever `text` was left over from the per-year loop (or failed with a NameError).
    print(f"No narrative text available for {country_name}")
else:
    # We take only the text from the first section as it is the **context** section
    # NOTE(review): if section titles differ between years there is more than one row per country
    # and only the first one is summarised - confirm this is intended.
    text = country_inputs.iloc[0]

    print(f"Summarizing context for {country_name}")
    context = genai.summarise_coar_context(text, country_name, number_of_paragraphs='two', paragraph_length=300, model=MODEL, api_type='openai')

    df_list.append([COUNTRY, country_name, context])

# Built even when empty so the export cell always finds the frame with the expected columns.
df_context_overall = pd.DataFrame(df_list, columns=['country_code', 'country', 'context'])

In [None]:
# Quick look at the overall summary.
print(f'Shape: {df_context_overall.shape}')
print(df_context_overall.head())

In [None]:
# Write both summary tables to the product workbook, one sheet each.
# (The former bare `reset_index()` calls were removed: their results were discarded, so they had no effect.)
output_path = Path(product['data'])
output_path.parent.mkdir(parents=True, exist_ok=True)  # Create missing directories

# The context manager closes (and thereby saves) the workbook even if one of the writes fails.
with pd.ExcelWriter(path=output_path, engine='openpyxl', mode='w') as writer:
    df_context_by_year.to_excel(writer, sheet_name='context_by_year')
    df_context_overall.to_excel(writer, sheet_name='context_overall')

In [None]:
################################################################################################################################################################################################