In [None]:
# Task parameters.
# NOTE(review): `upstream` / `product` look like placeholders that a pipeline runner
# overwrites at execution time (a later cell indexes `product['data']`, which would
# fail on None) — confirm against the pipeline definition.
upstream = None
product = None
# Folder holding the letters, relative to the project root (it is joined with PROJ_ROOT below).
data_source = 'data/corpora/rdletters/' 
# Country codes to process; files in `data_source` are matched on the lower-cased code prefix.
countries_list = ['ARM']

This notebook uses an LLM to summarize Regional Directors' Letters and identify key strengths (well-executed areas) and areas for improvement (sections needing further attention or action).

In [None]:
import re
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import unicef_cpe 
# from unicef_cpe.config import DATA_DIR, OUTPUT_DATA_DIR
from unicef_cpe.config import PROJ_ROOT

In [None]:
# Only the country codes listed here are processed, which keeps LLM cost and
# execution time bounded.
COUNTRY_CODE_LIST = countries_list

# Lookup tables: COUNTRIES comes from the project helper and is queried by country
# code below; COUNTRY_CODES is the same mapping with keys and values swapped.
COUNTRIES = unicef_cpe.utils.get_ecaro_countries_mapping(priority=False)
COUNTRY_CODES = dict((name, code) for code, name in COUNTRIES.items())

# Model name and backend type passed to the summarisation helper.
MODEL = 'llama3.2:1b'
API_TYPE = 'ollama'

In [None]:
# upstream_data = {
#     'rdletters': (DATA_DIR / 'corpora' / 'rdletters'), 
#     }

# product = OUTPUT_DATA_DIR.joinpath('rd_letters-strengths-improvement_llama.xlsx')

In [None]:
# Collect one record per letter file found for the selected countries.
# Each record holds the country code/name, the file name, the parent folder name,
# the year parsed from the file name, and the extracted text.
records = []
source_path = PROJ_ROOT / Path(data_source)

for country in tqdm(COUNTRY_CODE_LIST):
    # sorted(): glob order depends on the filesystem; a fixed order makes the
    # "first letter per country/year" selection further down reproducible.
    for file_path in sorted(source_path.glob(f'{country.lower()}*.*')):
        # The year is taken from the first run of digits in the file name.
        year_match = re.search(r'\d+', file_path.name)
        if year_match is None:
            # Previously this crashed with AttributeError on `None.group()`.
            print(f'{file_path} has no year in its name; skipping.')
            continue

        letter_text = unicef_cpe.extraction.extract_text_from_file(file_path)
        # NOTE(review): guards against the extractor returning None as well as
        # blank text — confirm what extract_text_from_file returns on failure.
        if not letter_text or not letter_text.strip():
            print(f'{file_path} is empty.')
            continue

        records.append({
            'country_code': country,
            'country': COUNTRIES.get(country),
            'file_name': file_path.name,
            'file_type': file_path.parent.name,
            'year': int(year_match.group()),
            'text': letter_text,
        })

In [None]:
# Build the letters table. Passing the column list explicitly keeps the schema
# stable even when no file was found: an empty `records` list would otherwise
# give a frame without columns and make the sort below fail with KeyError.
RD_LETTER_COLUMNS = ['country_code', 'country', 'file_name', 'file_type', 'year', 'text']

df_rd_letters = (
    pd.DataFrame(records, columns=RD_LETTER_COLUMNS)
    # New sorted frame instead of inplace=True, so re-running the cell is idempotent.
    .sort_values(['country', 'year'])
)
print(df_rd_letters.head())

In [None]:
# Summarise the first letter of every (country, year) pair into two rows —
# "Strengths" and "Areas for Improvement" — and collect them in `df_feedback`.
#
# The summarisation call and the construction of `df_feedback` had been commented
# out, which left `df_feedback` undefined for the preview/export cells below
# (NameError). They are restored here with guarded parsing.
# NOTE(review): assumes `unicef_cpe.genai.summarise_acomplishments` returns a string
# with '####'-prefixed "Strengths" and "Areas for Improvement" sections, as the
# previously commented-out parsing implied — confirm.
FEEDBACK_COLUMNS = ['country_code', 'country', 'year', 'feedback_title', 'feedback_text']

df_list = []
list_of_countries = [COUNTRIES.get(country) for country in COUNTRY_CODE_LIST]

for country in list_of_countries:
    print(country)
    country_mask = df_rd_letters['country'] == country

    years = sorted(df_rd_letters.loc[country_mask, 'year'].unique())
    print(years)
    for year in years:
        year_mask = df_rd_letters['year'] == year

        try:
            # Only the first letter found for this country/year is summarised.
            # `text` deliberately stays bound after the loop: a later cell reuses it.
            text = df_rd_letters.loc[country_mask & year_mask, 'text'].iloc[0]
        except IndexError:
            # Skip if there's no narrative text for this combination of country and year
            print(f"No narrative text available for {country} in {year}")
            continue

        print(f"Summarizing Feedbacks for {country} in {year}")
        response = unicef_cpe.genai.summarise_acomplishments(text, model=MODEL, api_type=API_TYPE)

        sections = re.split(r'####', response.strip(), maxsplit=2)
        section_title1 = 'Strengths'  # The first section's title
        section_title2 = 'Areas for Improvement'  # The second section's title is always the same
        try:
            # split(..., 1): cut only at the heading, so a later mention of the
            # title inside the section body does not truncate the text.
            section_text1 = sections[1].split(section_title1, 1)[1].strip()
            section_text2 = sections[2].split(section_title2, 1)[1].strip()
        except IndexError:
            # The model did not follow the expected layout; skip instead of crashing.
            print(f"Unexpected response format for {country} in {year}; skipping.")
            continue

        country_code = COUNTRY_CODES.get(country)
        df_list.append([country_code, country, year, section_title1, section_text1])
        df_list.append([country_code, country, year, section_title2, section_text2])

# Always defined, even when nothing was summarised, so the following cells can run.
df_feedback = pd.DataFrame(df_list, columns=FEEDBACK_COLUMNS)

In [None]:
# Quick look at the first feedback rows before exporting.
feedback_preview = df_feedback.head()
print(feedback_preview)

In [None]:
# Write the feedback table to the Excel file named by product['data'], without the row index.
output_path = product['data']
df_feedback.to_excel(output_path, index=False)

In [None]:
################################################################################################################################################################################################

In [None]:
from langchain.schema import SystemMessage, HumanMessage
from pydantic import BaseModel, Field
from typing import Dict
from typing_extensions import Annotated
import json
from typing import Dict
from pydantic import BaseModel
from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama


# Adjusted Schema: Ensure Strengths and Areas_for_Improvement are only Dict[str, str]
# This model is handed to `with_structured_output(...)` further down, so the LLM reply
# is returned as an instance with these two dictionary fields.
# NOTE(review): the class docstring and the Field descriptions are presumably exported
# with the JSON schema that is sent to the model — treat them as prompt text and
# confirm before rewording them.
class EvaluationSummary(BaseModel):
    """Structured summary of strengths and areas for improvement in a UNICEF evaluation."""
    # Both fields are required (`...`): section name -> short description.
    Strengths: Dict[str, str] = Field(
        ..., description="A dictionary where keys are section names and values are concise descriptions of strengths."
    )
    Areas_for_Improvement: Dict[str, str] = Field(
        ..., description="A dictionary where keys are section names and values are concise descriptions of areas needing improvement."
    )

# Adjusted system message with a stronger instruction.
# System prompt for the extraction: it asks for every key point to be sorted into
# "Strengths" / "Areas_for_Improvement" and returned as JSON only.
# The JSON example's code fence is now closed (it was left open, so instructions
# 5-7 read as part of the example) and the stray tab characters after the list
# numbers are replaced by a single space.
system_message = """
You are a UNICEF expert analyzing a text that reports what was done well and what needs improvement in a program or initiative.

### **Instructions**:
1. Extract all key points from the text.
2. Categorize them into:
   - **Strengths**: What was successfully carried out.
   - **Areas for Improvement**: What needs further attention or refinement.
3. Capture all relevant details—do not omit minor but important points.
4. **Return only structured JSON** in this format:
   ```json
   {
       "Strengths": {
           "Action/Aspect 1": "Brief description",
           "Action/Aspect 2": "Brief description"
       },
       "Areas_for_Improvement": {
           "Action/Aspect 1": "Brief description",
           "Action/Aspect 3": "Brief description"
       }
   }
   ```
5. Do not include explanations, introductions, or extra text.
6. Ensure each action/aspect appears only once per category.
7. Extract and categorize the maximum number of relevant points.
"""
# llm = ChatOpenAI(
#             model='gpt-3.5-turbo',
#             temperature=0.0,
#             seed=42
#         )

# Ollama-served 'mistral' chat model with temperature 0 and a fixed seed,
# wrapped so that replies are returned as EvaluationSummary objects.
ollama_chat = ChatOllama(model='mistral', temperature=0.0, seed=42)
llm = ollama_chat.with_structured_output(EvaluationSummary, method="json_schema")

# Send the instructions plus one letter to the model and show the structured reply.
# `text` is the letter left over from the last iteration of the summarisation loop above.
system_prompt = SystemMessage(content=system_message)
letter_prompt = HumanMessage(content=text)
messages = [system_prompt, letter_prompt]

response = llm.invoke(messages)
print(response)


In [None]:
# Display both extracted sections: (strengths, areas for improvement).
(response.Strengths, response.Areas_for_Improvement)

In [None]:
# Number of entries extracted per section: (strengths, areas for improvement).
tuple(len(section) for section in (response.Strengths, response.Areas_for_Improvement))