In [1]:
# %%
import asyncio
import pandas as pd
from tqdm import tqdm
from typing import List, Dict

# Local Imports from your codebase
from summarization import Summarizer
from evaluation_service import EvaluationAgent
from costar_prompt import CostarPrompt
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage

import os
from dotenv import load_dotenv

# Read settings from the .env file (called with override=True)
load_dotenv(override=True)

# Echo the LangSmith settings so a missing value is visible immediately
for _label, _env_key in (
    ("Tracing Enabled", "LANGSMITH_TRACING"),
    ("Project Name", "LANGSMITH_PROJECT"),
):
    print(f"{_label}: {os.getenv(_env_key)}")

# %%
# Define model IDs
BASE_MODEL = "gpt-4.1-nano" # or your specific base model
FT_MODEL = "ft:gpt-4.1-nano-2025-04-14:universiti-malaya:summarizer-07122025:Ck3tnFjB"

# Judge used by evaluate_test_set (which reads the module-level name `evaluator`).
# It has to exist before the experiment cell runs; otherwise a fresh kernel fails
# with NameError only after the summaries have already been generated.
# The arguments match the evaluator built for the "Iteration 2" cell of this notebook.
evaluator = EvaluationAgent(
    task_type="summarization",
    threshold=0.8,
    model_name="gpt-5-mini"
)

# Generic Prompt (Minimalist)
GENERIC_PROMPT = "Summarize the following news article to 3-5 sentences."

Tracing Enabled: None
Project Name: None


In [16]:
# %%
async def run_config_1(article_text: str, config: dict = None):
    """Configuration 1: base model driven by the generic prompt.

    Sends ``GENERIC_PROMPT``, a blank line and the article as one human message
    to ``BASE_MODEL`` (temperature 0.0) and returns the content of the reply.
    ``config`` is forwarded unchanged to ``ainvoke``.
    """
    prompt = HumanMessage(content=f"{GENERIC_PROMPT}\n\n{article_text}")
    chat_model = ChatOpenAI(model=BASE_MODEL, temperature=0.0)
    reply = await chat_model.ainvoke([prompt], config=config)
    return reply.content

async def run_config_2(article_text: str, config: dict = None):
    """Configuration 2: Base Model + CoSTAR Framework.

    Builds a ``Summarizer`` for ``BASE_MODEL`` and returns the result of its
    ``summarize`` method for ``article_text``; ``config`` is forwarded as the
    ``config`` keyword argument.

    ``summarize`` is invoked as a plain (non-awaited) call, so it is run on a
    worker thread here. Calling it directly inside the coroutine would block the
    event loop and serialise the configurations that ``asyncio.gather`` is meant
    to run concurrently.
    """
    summarizer = Summarizer(model_id=BASE_MODEL)
    return await asyncio.to_thread(summarizer.summarize, article_text, config=config)

async def run_config_3(article_text: str, config: dict = None):
    """Configuration 3: fine-tuned model driven by the generic prompt.

    Same request shape as configuration 1, but addressed to ``FT_MODEL``;
    returns the content of the model reply.
    """
    request = [HumanMessage(content=f"{GENERIC_PROMPT}\n\n{article_text}")]
    fine_tuned_model = ChatOpenAI(model=FT_MODEL, temperature=0.0)
    answer = await fine_tuned_model.ainvoke(request, config=config)
    return answer.content

async def run_config_4(article_text: str, config: dict = None):
    """Configuration 4: Proposed System (Fine-Tuned + CoSTAR).

    Builds a ``Summarizer`` for ``FT_MODEL`` and returns the result of its
    ``summarize`` method for ``article_text``; ``config`` is forwarded as the
    ``config`` keyword argument.

    The synchronous ``summarize`` call runs on a worker thread so that it does
    not block the event loop while the other configurations are awaited by
    ``asyncio.gather``.
    """
    summarizer = Summarizer(model_id=FT_MODEL)
    return await asyncio.to_thread(summarizer.summarize, article_text, config=config)

In [5]:
import asyncio
from tqdm import tqdm
from typing import List, Dict
import pandas as pd

async def evaluate_test_set(test_data: List[Dict[str, str]]):
    """Generate and score summaries for every article under all four configurations.

    For each record the four ``run_config_*`` coroutines are awaited together via
    ``asyncio.gather``; each generated summary is then scored with the
    module-level ``evaluator``.

    Args:
        test_data: Records with a ``"text"`` entry (the article) and an optional
            ``"title"`` entry. Records with empty text are skipped.

    Returns:
        pandas.DataFrame with one row per (article, configuration). Columns:
        article_id, title, config, input_text, output_result, factual_fidelity,
        contextual_relevance, pro_tone, exec_quality, logical_flow, avg_score.
        If generation fails for an article, none of its rows are produced. If
        scoring fails for one summary, its row is kept with empty scores so the
        generated text is not lost and can be re-scored later.
    """
    # Imported locally under a private name: other cells of this notebook rebind
    # the global name `tqdm` to a module (`import tqdm.notebook as tqdm`), which
    # would break a bare `tqdm(...)` call if this function ran afterwards.
    from tqdm import tqdm as progress_bar

    # (label stored in the results, coroutine producing the summary)
    config_runners = [
        ("Base_Generic", run_config_1),
        ("Base_CoSTAR", run_config_2),
        ("FT_Generic", run_config_3),
        ("Proposed_System", run_config_4),
    ]
    # (result column, metric name used in the evaluator response)
    metric_columns = [
        ("factual_fidelity", "Factual Fidelity"),
        ("contextual_relevance", "Content Importance & Relevance"),
        ("pro_tone", "Professional Tone & Grammar"),
        ("exec_quality", "Executive Writing Quality"),
        ("logical_flow", "Summary Coherence & Flow"),
    ]

    all_results = []

    for i, data in enumerate(progress_bar(test_data, desc="Evaluating Articles")):
        title = data.get("title", f"Article_{i}")
        article_text = data.get("text", "")

        if not article_text:
            continue

        # Each call carries metadata so the run can be tracked in LangSmith
        try:
            outputs = await asyncio.gather(*[
                runner(article_text, config={"metadata": {"article_id": i, "config": config_name}})
                for config_name, runner in config_runners
            ])
        except Exception as e:
            print(f"‚ö†Ô∏è Error in generation for '{title}': {e}")
            continue

        for (config_name, _), output in zip(config_runners, outputs):
            row = {
                "article_id": i,
                "title": title,
                "config": config_name,
                "input_text": article_text,  # Tracking the source input
                "output_result": output,     # Tracking the summarization output
            }

            # A failing judge call must not discard the rows collected so far
            try:
                eval_res = await evaluator.a_evaluate(
                    generated_text=output,
                    source_context=article_text,
                    section_topic=f"Ablation Test: {config_name} | {title}"
                )
                scores = {
                    column: eval_res['metrics'][metric]['score']
                    for column, metric in metric_columns
                }
                scores["avg_score"] = eval_res['average_score']
            except Exception as e:
                print(f"Warning: evaluation failed for '{title}' [{config_name}]: {e}")
                scores = {column: None for column, _ in metric_columns}
                scores["avg_score"] = None

            row.update(scores)
            all_results.append(row)

    return pd.DataFrame(all_results)

In [6]:

# %% [markdown]
# ## 4. Execution Logic
# Loading the Excel data and preparing the evaluation payload.

# %%
# Load the dataset. The path defaults to the original local location and can be
# overridden with the ABLATION_TEST_DATASET environment variable, so the
# notebook is not tied to a single machine.
file_path = os.getenv(
    "ABLATION_TEST_DATASET",
    r"C:\Users\gooyt\Desktop\ai-news-agent\data\Data\test_dataset_22122025.xlsx",
)
df_test = pd.read_excel(file_path)

# Prepare the test data list: keep the 'title' and 'text' columns, drop rows
# without article text, and turn every remaining row into a dict.
test_data = df_test[['title', 'text']].dropna(subset=['text']).to_dict('records')

print(f"‚úÖ Loaded {len(test_data)} valid articles from Excel for the Ablation Study.")


‚úÖ Loaded 74 valid articles from Excel for the Ablation Study.


In [None]:

# Run the experiment
if len(test_data) > 0:
    results_df = await evaluate_test_set(test_data)
    
    # Export raw results for verification
    results_df.to_csv("ablation_study_results_raw.csv", index=False)
    
    # Aggregate results for Table 1 (Section 4.2)
    table_1 = results_df.groupby("config")[
        ["factual_fidelity", "contextual_relevance", "pro_tone", "logical_flow", "exec_quality", "avg_score"]
    ].mean().reset_index()
    
    # Sort to match the thesis narrative
    config_order = ["Base_Generic", "Base_CoSTAR", "FT_Generic", "Proposed_System"]
    table_1['config'] = pd.Categorical(table_1['config'], categories=config_order, ordered=True)
    table_1 = table_1.sort_values("config")
    
    display(table_1)

In [10]:
# Display the results frame assigned by the experiment cell (results_df).
results_df

Unnamed: 0,article_id,title,config,input_text,output_result,factual_fidelity,contextual_relevance,pro_tone,exec_quality,logical_flow,avg_score
0,0,GENM: To remain listed after Genting falls sho...,Base_Generic,KUALA LUMPUR (Dec 1): Genting Malaysia Bhd (KL...,Genting Malaysia Bhd will remain listed after ...,1.0,1.0,0.9,0.9,0.8,0.92
1,0,GENM: To remain listed after Genting falls sho...,Base_CoSTAR,KUALA LUMPUR (Dec 1): Genting Malaysia Bhd (KL...,Genting Malaysia Bhd (GENM) will remain listed...,1.0,1.0,0.9,0.8,0.8,0.90
2,0,GENM: To remain listed after Genting falls sho...,FT_Generic,KUALA LUMPUR (Dec 1): Genting Malaysia Bhd (KL...,"Genting Malaysia Bhd will remain listed, after...",1.0,1.0,0.9,0.8,0.8,0.90
3,0,GENM: To remain listed after Genting falls sho...,Proposed_System,KUALA LUMPUR (Dec 1): Genting Malaysia Bhd (KL...,"Genting Malaysia Bhd will remain listed, after...",1.0,1.0,1.0,0.8,0.7,0.90
4,1,PESTEC: Names Pee Boon Hooi as new group CFO,Base_Generic,KUALA LUMPUR (Dec 1): Power grid and rail netw...,Pestec International Bhd has appointed Pee Boo...,1.0,1.0,0.9,0.7,0.8,0.88
...,...,...,...,...,...,...,...,...,...,...,...
291,72,PASUKGB: Bags RM64m data centre construction j...,Proposed_System,KUALA LUMPUR (Dec 15): Engineering services pr...,Engineering services provider Pasukhas Group B...,1.0,1.0,0.8,0.8,0.9,0.90
292,73,THPLANT: Names ex-Censof CFO Md Zaini Md Zakar...,Base_Generic,KUALA LUMPUR (Dec 15): TH Plantations Bhd (KL:...,TH Plantations Bhd has appointed Md Zaini Md Z...,1.0,1.0,0.9,0.8,0.8,0.90
293,73,THPLANT: Names ex-Censof CFO Md Zaini Md Zakar...,Base_CoSTAR,KUALA LUMPUR (Dec 15): TH Plantations Bhd (KL:...,TH Plantations Bhd has appointed Md Zaini Md Z...,1.0,1.0,1.0,0.8,0.8,0.92
294,73,THPLANT: Names ex-Censof CFO Md Zaini Md Zakar...,FT_Generic,KUALA LUMPUR (Dec 15): TH Plantations Bhd (KL:...,"TH Plantations Bhd, the plantation arm of Lemb...",1.0,1.0,1.0,0.7,0.8,0.90


In [11]:
from langsmith import Client
import json

# LangSmith client and the project whose runs are exported
client = Client()
project_name = "news-agent"

# Request at most 4000 runs of the project, filtered with execution_order=1
runs = client.list_runs(project_name=project_name, execution_order=1, limit=4000)

# Flatten every run into a plain dict; the "metadata" entry stores run.extra
exported_runs = [
    {
        "run_id": run.id,
        "name": run.name,
        "run_type": run.run_type,
        "start_time": run.start_time,
        "end_time": run.end_time,
        "inputs": run.inputs,
        "outputs": run.outputs,
        "error": run.error,
        "metadata": run.extra,
    }
    for run in runs
]

# default=str stringifies any value that json cannot serialise natively
with open("langsmith_traces_v2.json", "w") as f:
    json.dump(exported_runs, f, indent=2, default=str)


In [7]:
import pandas as pd
import numpy as np

def perform_safe_aggregation(input_file, output_file):
    """Aggregate evaluation metrics per configuration and export them to Excel.

    The input is read with ``pd.read_csv`` when its name ends in ``.csv`` and
    with ``pd.read_excel`` otherwise. The known metric columns that are present
    are coerced to numeric, then mean and standard deviation are computed per
    ``config`` group, the two-level column headers are flattened to
    ``<metric>_<stat>``, selected columns receive report names, and the table
    is written to ``output_file``.

    Problems (unreadable input, missing ``config`` column, no metric column)
    are reported with ``print`` and end the call without writing a file.

    Args:
        input_file: Path of the results file to read.
        output_file: Path of the Excel summary to write.

    Returns:
        None.
    """
    # Load the dataset; the underlying error is shown so the cause is not hidden
    try:
        if str(input_file).lower().endswith(".csv"):
            df = pd.read_csv(input_file)
        else:
            df = pd.read_excel(input_file)
    except Exception as e:
        print(
            f"Error: Could not read {input_file}. Ensure the file exists and is not open. "
            f"({type(e).__name__}: {e})"
        )
        return

    # Define the intended metric columns
    quality_metrics = ["factual_fidelity", "contextual_relevance", "pro_tone", "logical_flow", "exec_quality", "avg_score"]
    efficiency_metrics = ["total_tokens", "total_time_seconds"]

    # Filter for columns that are actually present in the file to avoid KeyError
    active_quality = [c for c in quality_metrics if c in df.columns]
    active_efficiency = [c for c in efficiency_metrics if c in df.columns]

    if "config" not in df.columns:
        print("Error: The 'config' column is missing. Aggregation aborted.")
        return

    # Convert metric columns to numeric; unparseable values become NaN
    for col in active_quality + active_efficiency:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # One aggregated frame per metric family, combined side by side afterwards
    agg_parts = [
        df.groupby("config")[columns].agg(["mean", "std"])
        for columns in (active_quality, active_efficiency)
        if columns
    ]

    if not agg_parts:
        print("No valid metric columns were found for aggregation.")
        return

    # Combine metrics and reset the index for a flat table structure
    agg_df = pd.concat(agg_parts, axis=1).reset_index()

    # Flatten the two-level headers; 'config' has an empty second level and
    # therefore keeps its plain name
    flat_cols = []
    for col in agg_df.columns:
        if isinstance(col, tuple) and col[1]:
            flat_cols.append(f"{col[0]}_{col[1]}")
        else:
            flat_cols.append(col[0] if isinstance(col, tuple) else col)
    agg_df.columns = flat_cols

    # Report names for the final table
    rename_mapping = {
        "config": "Configuration",
        "avg_score_mean": "Avg Score",
        "avg_score_std": "Score SD",
        "total_tokens_mean": "Avg Tokens",
        "total_tokens_std": "Tokens SD",
        "total_time_seconds_mean": "Avg Time (s)",
        "total_time_seconds_std": "Time SD"
    }

    # Only rename columns that were actually generated
    final_rename = {k: v for k, v in rename_mapping.items() if k in agg_df.columns}
    agg_df = agg_df.rename(columns=final_rename)

    # Export results to Excel
    agg_df.to_excel(output_file, index=False)
    print(f"Aggregation successfully completed. Results saved to {output_file}")

# Execute the process: read evaluation_results_recovered_v2.xlsx and write the
# per-configuration summary to evaluation_summary_v3.xlsx.
perform_safe_aggregation("evaluation_results_recovered_v2.xlsx", "evaluation_summary_v3.xlsx")

Aggregation successfully completed. Results saved to evaluation_summary_v3.xlsx


In [20]:
from evaluate import load

# Load the "rouge" metric through evaluate.load and keep it in the global `rouge`.
rouge = load("rouge")


Downloading builder script: 6.14kB [00:00, 6.20MB/s]


In [21]:
# Load the raw ablation results (v2 workbook) into `df` and display the frame.
df = pd.read_excel("ablation_study_results_raw_v2.xlsx")
df

Unnamed: 0,article_id,title,config,input_text,output_result,factual_fidelity,contextual_relevance,pro_tone,exec_quality,logical_flow,avg_score
0,0,GENM: To remain listed after Genting falls sho...,Proposed_System,KUALA LUMPUR (Dec 1): Genting Malaysia Bhd (KL...,"Genting Malaysia Bhd will remain listed, after...",1.0,1.0,1.0,0.8,0.7,0.90
1,0,GENM: To remain listed after Genting falls sho...,Base_CoSTAR,KUALA LUMPUR (Dec 1): Genting Malaysia Bhd (KL...,Genting Malaysia Bhd (GENM) will remain listed...,1.0,1.0,0.9,0.8,0.8,0.90
2,0,GENM: To remain listed after Genting falls sho...,FT_Generic,KUALA LUMPUR (Dec 1): Genting Malaysia Bhd (KL...,"Genting Malaysia Bhd will remain listed, after...",1.0,1.0,0.9,0.8,0.8,0.90
3,0,GENM: To remain listed after Genting falls sho...,Base_Generic,KUALA LUMPUR (Dec 1): Genting Malaysia Bhd (KL...,Genting Malaysia Bhd will remain listed after ...,1.0,1.0,0.9,0.9,0.8,0.92
4,1,PESTEC: Names Pee Boon Hooi as new group CFO,Proposed_System,KUALA LUMPUR (Dec 1): Power grid and rail netw...,Power grid and rail network engineering group ...,1.0,1.0,1.0,0.6,0.7,0.86
...,...,...,...,...,...,...,...,...,...,...,...
291,72,PASUKGB: Bags RM64m data centre construction j...,FT_Generic,KUALA LUMPUR (Dec 15): Engineering services pr...,Engineering services provider Pasukhas Group B...,1.0,1.0,1.0,0.8,0.9,0.94
292,73,THPLANT: Names ex-Censof CFO Md Zaini Md Zakar...,FT_Generic,KUALA LUMPUR (Dec 15): TH Plantations Bhd (KL:...,"TH Plantations Bhd, the plantation arm of Lemb...",1.0,1.0,1.0,0.7,0.8,0.90
293,73,THPLANT: Names ex-Censof CFO Md Zaini Md Zakar...,Proposed_System,KUALA LUMPUR (Dec 15): TH Plantations Bhd (KL:...,"TH Plantations Bhd, the plantation arm of Lemb...",1.0,1.0,1.0,0.7,0.8,0.90
294,73,THPLANT: Names ex-Censof CFO Md Zaini Md Zakar...,Base_Generic,KUALA LUMPUR (Dec 15): TH Plantations Bhd (KL:...,TH Plantations Bhd has appointed Md Zaini Md Z...,1.0,1.0,0.9,0.8,0.8,0.90


In [25]:
import pandas as pd
import evaluate

# Load the evaluation metrics from Hugging Face.
# calculate_nlp_metrics reads both objects through these module-level names.
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

def calculate_nlp_metrics(file_path, output_path):
    """Add per-row ROUGE and BLEU scores to a results file and save it as Excel.

    Every ``output_result`` is scored against its ``input_text`` (the source
    article serves as the reference), using the module-level ``rouge`` and
    ``bleu`` metric objects.

    Args:
        file_path: Results file to read; ``.csv`` files go through
            ``pd.read_csv``, everything else through ``pd.read_excel``.
        output_path: Excel file that receives the frame with the added columns.

    Returns:
        The loaded DataFrame extended with ``rouge1``, ``rouge2``, ``rougeL``
        and ``bleu_score`` columns.

    Raises:
        KeyError: if ``input_text`` or ``output_result`` is missing.
    """
    # Load the existing results
    if str(file_path).lower().endswith(".csv"):
        df = pd.read_csv(file_path)
    else:
        df = pd.read_excel(file_path)

    # Fail early, with a readable message, when a required column is absent
    missing = [c for c in ('input_text', 'output_result') if c not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Both columns are cast to str because they are handed to the metrics as text
    df['input_text'] = df['input_text'].astype(str)
    df['output_result'] = df['output_result'].astype(str)

    # 1. ROUGE: one call over all rows; use_aggregator=False requests per-row scores
    rouge_results = rouge.compute(
        predictions=df['output_result'].tolist(),
        references=df['input_text'].tolist(),
        use_aggregator=False
    )
    for key in ('rouge1', 'rouge2', 'rougeL'):
        df[key] = rouge_results[key]

    # 2. BLEU: computed row by row. A failing row still scores 0.0, but it is
    # recorded and reported so the zero is not mistaken for a genuine score.
    bleu_scores = []
    failed_rows = []
    for row_label, pred, ref in zip(df.index, df['output_result'], df['input_text']):
        try:
            res = bleu.compute(predictions=[pred], references=[[ref]])
            bleu_scores.append(res['bleu'])
        except Exception as e:
            bleu_scores.append(0.0)
            failed_rows.append((row_label, f"{type(e).__name__}: {e}"))

    if failed_rows:
        print(f"Warning: BLEU could not be computed for {len(failed_rows)} row(s); scored as 0.0.")
        for row_label, reason in failed_rows:
            print(f"  row {row_label}: {reason}")

    df['bleu_score'] = bleu_scores

    # Save the updated raw results
    df.to_excel(output_path, index=False)
    print(f"Metrics successfully added and saved to {output_path}")
    return df

# Execute the calculation: score the v2 raw results and keep the returned
# frame in `df_updated`; the file is written to results_with_nlp_metrics.xlsx.
df_updated = calculate_nlp_metrics("ablation_study_results_raw_v2.xlsx", "results_with_nlp_metrics.xlsx")

Downloading builder script: 5.94kB [00:00, 2.57MB/s]
Downloading extra modules: 3.34kB [00:00, 743kB/s]


Metrics successfully added and saved to results_with_nlp_metrics.xlsx


In [6]:
import pandas as pd
import numpy as np

def perform_safe_aggregation(input_file, output_file):
    """
    Summarise evaluation metrics per configuration and write the table to Excel.

    The input is read with ``pd.read_excel`` when its name ends in ``.xlsx`` and
    with ``pd.read_csv`` otherwise. Known metric columns that are present
    (judge scores, ROUGE, BLEU, token and time figures) are coerced to numeric
    and reduced to mean and standard deviation per ``config`` group. Failures
    are reported with ``print`` and end the call early; nothing is returned.
    """
    try:
        # The reader is chosen from the file extension
        reader = pd.read_excel if input_file.endswith('.xlsx') else pd.read_csv
        df = reader(input_file)
    except Exception as e:
        print(f"Error: Could not read {input_file}. {e}")
        return

    if "config" not in df.columns:
        print("Error: The 'config' column is missing. Aggregation aborted.")
        return

    # Candidate columns, grouped by family: quality first, efficiency second
    metric_families = (
        [
            "factual_fidelity", "contextual_relevance", "pro_tone",
            "logical_flow", "exec_quality", "avg_score",
            "rouge1", "rouge2", "rouge3", "rougeL", "bleu_score",
        ],
        ["total_tokens", "total_time_seconds"],
    )
    # Keep only the columns that exist in this file
    present_families = [
        [name for name in family if name in df.columns]
        for family in metric_families
    ]

    # Coerce to numeric so the statistics cannot fail on stray text
    for family in present_families:
        for name in family:
            df[name] = pd.to_numeric(df[name], errors='coerce')

    # One mean/std table per non-empty family
    summaries = [
        df.groupby("config")[family].agg(["mean", "std"])
        for family in present_families
        if family
    ]
    if not summaries:
        print("No valid metric columns were found for aggregation.")
        return

    agg_df = pd.concat(summaries, axis=1).reset_index()

    # Collapse (metric, stat) header pairs into "metric_stat"; a header whose
    # second part is empty (the 'config' column) keeps its plain name
    agg_df.columns = [
        f"{header[0]}_{header[1]}" if isinstance(header, tuple) and header[1]
        else (header[0] if isinstance(header, tuple) else header)
        for header in agg_df.columns
    ]

    # Report names for the thesis table
    report_names = {
        "config": "Configuration",
        "avg_score_mean": "Avg Score",
        "avg_score_std": "Score SD",
        "rouge1_mean": "ROUGE-1 Mean",
        "rouge1_std": "ROUGE-1 SD",
        "rouge2_mean": "ROUGE-2 Mean",
        "rouge2_std": "ROUGE-2 SD",
        "rouge3_mean": "ROUGE-3 Mean",
        "rouge3_std": "ROUGE-3 SD",
        "rougeL_mean": "ROUGE-L Mean",
        "rougeL_std": "ROUGE-L SD",
        "bleu_score_mean": "BLEU Mean",
        "bleu_score_std": "BLEU SD",
        "total_tokens_mean": "Avg Tokens",
        "total_tokens_std": "Tokens SD",
        "total_time_seconds_mean": "Avg Time (s)",
        "total_time_seconds_std": "Time SD"
    }
    applicable = {old: new for old, new in report_names.items() if old in agg_df.columns}
    agg_df = agg_df.rename(columns=applicable)

    # Export results
    agg_df.to_excel(output_file, index=False)
    print(f"Aggregation successfully completed. Results saved to {output_file}")

# Execute the process with your latest results file: reads
# evaluation_results_recovered.xlsx and writes the dated summary workbook.
perform_safe_aggregation("evaluation_results_recovered.xlsx", "evaluation_summary_with_nlp_metrics_v2_25122025.xlsx")

Aggregation successfully completed. Results saved to evaluation_summary_with_nlp_metrics_v2_25122025.xlsx


In [33]:
import pandas as pd

# Read sheet "Sheet2" of the processed LangSmith trace workbook
df = pd.read_excel("processed_langsmith_traces_v2.xlsx", sheet_name="Sheet2")

# Positional selection: begin at row position 2 (the third row) and keep
# every fourth row from there on
FIRST_ROW_POSITION = 2
ROW_STEP = 4
extracted_df = df.iloc[FIRST_ROW_POSITION::ROW_STEP]

# Write the selected rows to their own workbook
extracted_df.to_excel("extracted_rows_exec.xlsx", index=False)

In [None]:
import pandas as pd
import asyncio
import tqdm.notebook as tqdm
from evaluation_service import EvaluationAgent

# 1. Load the existing results
# Sheet "Sheet1" of the workbook is read into the global `df`, which the
# execution lines of this cell pass to recover_reasons.
file_path = "results_with_nlp_metrics.xlsx"
df = pd.read_excel(file_path,sheet_name="Sheet1")

# 2. Initialize the Evaluator
# Use gpt-5-mini as the judge for consistency with previous runs
# recover_reasons reads this object through the module-level name `evaluator`.
evaluator = EvaluationAgent(
    task_type="recovery_summarization", 
    threshold=0.8, 
    model_name="gpt-5-mini"
)

async def recover_reasons(df):
    """
    Re-evaluate rows whose qualitative reasons are missing and return the updated frame.

    A row needs recovery when ``logical_flow_reason`` or ``exec_quality_reason``
    is missing (a reason column that does not exist at all counts as missing for
    every row). Such rows are scored again with the module-level ``evaluator``
    on their existing ``output_result``; the returned 'feedback' texts become
    the reasons and the returned scores replace ``logical_flow`` and
    ``exec_quality`` so that score and reason stem from the same evaluation. If
    the response carries no score for a metric, the existing score is kept.

    Rows whose re-evaluation raises are reported with ``print`` and kept
    unchanged. The input frame is not modified.

    Args:
        df: Results frame with ``output_result``, ``input_text``, ``title``,
            ``article_id``, ``logical_flow`` and ``exec_quality`` columns.

    Returns:
        A new DataFrame built from the (possibly updated) rows, in input order.
    """
    # Imported locally under a private name: the global name `tqdm` is bound to
    # different objects (class vs. module) by different cells of this notebook.
    from tqdm.notebook import tqdm as progress_bar

    flow_metric = 'Summary Coherence & Flow'
    exec_metric = 'Executive Writing Quality'

    # Work on a copy and make sure both reason columns exist
    df = df.copy()
    for reason_column in ('logical_flow_reason', 'exec_quality_reason'):
        if reason_column not in df.columns:
            df[reason_column] = None

    updated_rows = []

    # Count the rows needing recovery (where reasons are missing)
    needs_recovery = df['logical_flow_reason'].isna() | df['exec_quality_reason'].isna()
    print(f"üöÄ Found {int(needs_recovery.sum())} records requiring reason recovery.")

    for index, row in progress_bar(df.iterrows(), total=len(df), desc="Processing Results"):
        # Check if recovery is needed for this specific row
        if pd.isna(row['logical_flow_reason']) or pd.isna(row['exec_quality_reason']):
            try:
                # Re-run evaluation on the existing output_result
                eval_res = await evaluator.a_evaluate(
                    generated_text=row['output_result'],
                    source_context=row['input_text'],
                    section_topic=f"Reason Recovery: {row['title']}"
                )
                flow_result = eval_res['metrics'].get(flow_metric, {})
                exec_result = eval_res['metrics'].get(exec_metric, {})

                # 'feedback' holds the qualitative reasoning; default to ""
                row['logical_flow_reason'] = flow_result.get('feedback', "")
                row['exec_quality_reason'] = exec_result.get('feedback', "")

                # Scores are updated exactly once, falling back to the existing
                # value. An earlier unconditional overwrite used to replace the
                # score with None before this fallback could read it.
                row['logical_flow'] = flow_result.get('score', row['logical_flow'])
                row['exec_quality'] = exec_result.get('score', row['exec_quality'])

            except Exception as e:
                print(f"‚ö†Ô∏è Failed to recover reasons for Article ID {row['article_id']}: {e}")

        updated_rows.append(row)

    return pd.DataFrame(updated_rows)

# 3. Execute and Save
# The top-level `await` relies on the notebook kernel; the recovered frame is
# written to evaluation_results_recovered.xlsx without the index column.
if __name__ == "__main__":
    updated_df = await recover_reasons(df)
    updated_df.to_excel("evaluation_results_recovered.xlsx", index=False)
    print("‚úÖ Recovery complete. Results saved to 'evaluation_results_recovered.xlsx'.")

2025-12-25 20:59:16,713 [INFO] Initialized EvaluationAgent for task: recovery_summarization


üöÄ Found 296 records requiring reason recovery.


Processing Results:   0%|          | 0/296 [00:00<?, ?it/s]

Output()

Output()

# Iteration 2:

In [None]:
import pandas as pd
import asyncio
import tqdm.auto as tqdm
from summarization import Summarizer
from evaluation_service import EvaluationAgent

# 1. Load the existing results
# Only the "FT_Generic" sheet of the recovered workbook is read.
file_path = "evaluation_results_recovered.xlsx"
df = pd.read_excel(file_path,sheet_name="FT_Generic")

# 2. Initialize Services
# FT_MODEL comes from the configuration cell at the top of the notebook.
# perform_iterative_refinement reads `summarizer` and `evaluator` as globals.
summarizer = Summarizer(model_id=FT_MODEL)
evaluator = EvaluationAgent(
    task_type="summarization", 
    threshold=0.8, 
    model_name="gpt-5-mini"
)

async def perform_iterative_refinement(df):
    """
    Refine low-scoring summaries once and record the second-iteration results.

    A row is refined when ``logical_flow`` or ``exec_quality`` is below 0.8. The
    recovered reasons are combined into one feedback string, the module-level
    ``summarizer`` rewrites the draft, and the module-level ``evaluator`` scores
    the rewrite. Results go into ``output_result_v2``, five ``<metric>_v2``
    columns and ``refinement_triggered``.

    Rows that need no refinement, and rows whose refinement or re-evaluation
    raises, carry their first-iteration values into the ``*_v2`` columns with
    ``refinement_triggered`` set to False, so every returned row has the same
    columns. Failures are also reported with ``print``.

    Args:
        df: First-iteration results with the score, reason, ``input_text``,
            ``output_result`` and ``title`` columns.

    Returns:
        A new DataFrame built from the updated rows, in input order.
    """
    # Imported locally under a private name: the global name `tqdm` is bound to
    # different objects (class vs. module) by different cells of this notebook.
    from tqdm.auto import tqdm as progress_bar

    threshold = 0.8
    # result column -> metric name in the evaluator response; the order also
    # fixes the order of the *_v2 columns
    score_metrics = {
        'factual_fidelity': 'Factual Fidelity',
        'contextual_relevance': 'Content Importance & Relevance',
        'pro_tone': 'Professional Tone & Grammar',
        'logical_flow': 'Summary Coherence & Flow',
        'exec_quality': 'Executive Writing Quality',
    }

    def _reason_text(value):
        # A missing reason would otherwise be rendered as the literal "nan"
        return "No feedback recorded" if pd.isna(value) else value

    def _carry_forward(row):
        # Copy the first-iteration values into the v2 columns
        row['output_result_v2'] = row['output_result']
        for column in score_metrics:
            row[f'{column}_v2'] = row[column]
        row['refinement_triggered'] = False

    final_rows = []

    # Failure condition: flow or executive quality below the threshold
    mask = (df['logical_flow'] < threshold) | (df['exec_quality'] < threshold)
    print(f"üöÄ Found {mask.sum()} articles requiring targeted refinement.")

    for index, row in progress_bar(df.iterrows(), total=len(df), desc="Refining Iteration 2"):
        if (row['logical_flow'] < threshold) or (row['exec_quality'] < threshold):
            # Construct feedback using the recovered qualitative reasons
            feedback = (
                f"Logical Flow Feedback: {_reason_text(row['logical_flow_reason'])}. "
                f"Executive Quality Feedback: {_reason_text(row['exec_quality_reason'])}."
            )

            try:
                # Perform Refinement
                refined_output = summarizer.refine_summary(
                    source_text=row['input_text'],
                    current_draft=row['output_result'],
                    improvements=feedback
                )

                # Evaluate the Refined Result (Iteration 2)
                eval_res = await evaluator.a_evaluate(
                    generated_text=refined_output,
                    source_context=row['input_text'],
                    section_topic=f"Iteration 2: {row['title']}"
                )
                metrics = eval_res['metrics']
                v2_scores = {
                    f'{column}_v2': metrics.get(metric, {}).get('score')
                    for column, metric in score_metrics.items()
                }
            except Exception as e:
                print(f"‚ö†Ô∏è Error in refinement for {row['title']}: {e}")
                # Keep the row complete instead of leaving the v2 columns unset
                _carry_forward(row)
            else:
                # Populate New Columns for the Second Iteration
                row['output_result_v2'] = refined_output
                for column_v2, score in v2_scores.items():
                    row[column_v2] = score
                row['refinement_triggered'] = True

            # Pause only after rows that actually made API calls
            await asyncio.sleep(0.5)
        else:
            # No refinement needed: copy current values for consistency
            _carry_forward(row)

        final_rows.append(row)

    return pd.DataFrame(final_rows)

# 3. Execution and Export
# The top-level `await` relies on the notebook kernel; the refined frame is
# written to summarization_iteration_2_results.xlsx without the index column.
if __name__ == "__main__":
    v2_df = await perform_iterative_refinement(df)
    v2_df.to_excel("summarization_iteration_2_results.xlsx", index=False)
    print("‚úÖ Iteration 2 complete. Results saved with tracking columns.")

2025-12-26 02:10:42,967 [INFO] Initialized EvaluationAgent for task: summarization
2025-12-26 02:10:42,973 [INFO] Initialized EvaluationAgent for task: summarization


üöÄ Found 34 articles requiring targeted refinement.


Refining Iteration 2:   0%|          | 0/74 [00:00<?, ?it/s]

Input v1: [SystemMessage(content='You are a Senior Financial News Analyst and Editor at a premier Malaysian investment bank. \n            Your objective is to produce and refine high-fidelity news watch summaries that are factually dense, \n            strategically insightful, and aligned with institutional standards.\n            You MUST treat editorial feedback as MANDATORY correction orders to be strictly followed.', additional_kwargs={}, response_metadata={}), HumanMessage(content="\n        # CONTEXT #\nYou are a high-level Investment Banking Senior Editor. Your role is to REWRITE a sub-par draft to meet the bank's strict publication standards.\n# OBJECTIVE #\nREWRITE the draft below because it failed quality audits. \n            You MUST significantly improve the text to address the specific failures (EDITORIAL FEEDBACK (Instructions) provided):Logical Flow Feedback: The output opens with the primary corporate action (Pee Boon Hooi appointed group CFO, effective Monday), then

Output()

Output()

Output()

In [2]:
# 1. Reload the FT_Generic sheet of the recovered results
file_path = "evaluation_results_recovered.xlsx"
df = pd.read_excel(file_path, sheet_name="FT_Generic")

# 2. Summarizer bound to the fine-tuned model
summarizer = Summarizer(model_id=FT_MODEL)

# Rows where flow or executive quality falls below 0.8; preview the first ones
mask = df[['logical_flow', 'exec_quality']].lt(0.8).any(axis=1)
df.loc[mask].head()

2025-12-26 03:46:24,542 [INFO] Initialized EvaluationAgent for task: summarization


Unnamed: 0,article_id,title,config,input_text,output_result,factual_fidelity,contextual_relevance,pro_tone,exec_quality,logical_flow,avg_score,rouge1,rouge2,rougeL,bleu_score,logical_flow_reason,exec_quality_reason
1,1,PESTEC: Names Pee Boon Hooi as new group CFO,FT_Generic,KUALA LUMPUR (Dec 1): Power grid and rail netw...,Power grid and rail network engineering group ...,1.0,1,0.9,0.7,1.0,0.88,0.45509,0.439759,0.45509,0.108566,The output opens with the primary corporate ac...,The output leads with the key news immediately...
4,4,SUNWAY: To establish RM2b sukuk programme for ...,FT_Generic,KUALA LUMPUR (Dec 1): Sunway Bhd (KL:SUNWAY) s...,Sunway Bhd said its unit Sunway Cochrane Sdn B...,1.0,1,1.0,0.7,0.9,0.86,0.58156,0.564286,0.58156,0.235096,The response opens with the primary corporate ...,The output leads with the key news immediately...
5,5,ASIANPAC: CEO resigns 'to pursue other opportu...,FT_Generic,KUALA LUMPUR (Dec 1): Property developer Asian...,Property developer Asian Pac Holdings Bhd said...,1.0,0,0.8,0.7,0.8,0.6,0.61194,0.593985,0.61194,0.287207,The output correctly leads with the most impor...,Strong lead ‚Äî the opening sentence immediately...
7,7,TALAMT: Shareholders requisition EGM to appoin...,FT_Generic,KUALA LUMPUR (Dec 2): A group of shareholders ...,A group of shareholders holding a combined sta...,1.0,1,0.9,0.7,1.0,0.9,0.615894,0.593333,0.615894,0.304287,The response opens with the primary corporate ...,The lead delivers the key news immediately and...
8,8,MKH: Sued by homeowners in Shah Alam township ...,FT_Generic,KUALA LUMPUR (Dec 2): Property developer MKH B...,Property developer MKH Bhd said 47 homeowners ...,1.0,1,0.8,0.7,0.8,0.82,0.558635,0.54818,0.558635,0.209547,The lead sentence correctly presents the prima...,Strong opening that delivers the key news imme...


In [16]:
# Rows whose executive-quality score is exactly 0.5; the third one is the sample
df_copy = df.loc[df['exec_quality'].eq(0.5)]
sample = df_copy.iloc[2]
sample

article_id                                                             55
title                   SUMI: CoolisT Group ink MOU to develop bio-bas...
config                                                         FT_Generic
input_text              KUALA LUMPUR (Dec 10): Sumisaujana Group Bhd‚Äôs...
output_result           Sumisaujana Group Bhd‚Äôs wholly owned subsidiar...
factual_fidelity                                                      1.0
contextual_relevance                                                    1
pro_tone                                                              1.0
exec_quality                                                          0.5
logical_flow                                                          1.0
avg_score                                                            0.78
rouge1                                                           0.441687
rouge2                                                            0.42394
rougeL                            

In [17]:
def synthesize_feedback(raw_feedback: str, model_id: str = "gpt-4.1-nano") -> str:
    """
    Turn descriptive evaluation feedback into prescriptive editorial instructions.

    The raw feedback is sent to a chat model under a strict "Managing Editor"
    system prompt. That prompt tells the model to drop praise, keep only the
    failures, and phrase each correction as a positive directive command,
    returned as a bulleted list.

    Args:
        raw_feedback: Free-text feedback from the evaluator (one or more
            metric explanations concatenated together).
        model_id: Chat model used for the synthesis.

    Returns:
        The model's reply with surrounding whitespace stripped. An empty
        string when ``raw_feedback`` is ``None`` or blank (nothing to
        synthesize, so no model call is made). If the model call raises, the
        string ``"Error during synthesis: <exception>"`` is returned instead
        of propagating the exception.
    """
    # Nothing to convert: skip the model call entirely rather than asking the
    # model to produce corrections out of an empty prompt.
    if raw_feedback is None or not raw_feedback.strip():
        return ""

    # Initialize a fast, instruction-following model (temperature 0.0)
    llm = ChatOpenAI(model=model_id, temperature=0.0)

    # System Definition: The strict Editor persona
    system_msg = SystemMessage(
        content="""You are a Senior Managing Editor at an investment bank. 
        Your task is to review quality audits of news summaries and extract 
        ONLY the actionable corrections.

        Rules:
        1. Remove all praise, positive reinforcement, or 'passing' remarks.
        2. Identify negative feedback or specific failures only (e.g., missing figures, wrong tone, structure issues)..
        3. **Positive Framing:** You must convert negative constraints into **POSITIVE DIRECTIVE COMMANDS**. Do not use negative language (like `avoid`, `do not`, `don't`, etc).
        4. If the feedback mentions missing data (dates, amounts), explicitly command the writer to retrieve them from the source.
        5. Tone: Strict, direct, and imperative.
        6. Do not add any additional commentary beyond the instructions.
        
        Response:
        A bulleted list of mandatory instructions.
        """
    )

    # The prompt passes the raw G-Eval output
    human_msg = HumanMessage(
        content=f"""
        ### RAW EVALUATION FEEDBACK ###
        {raw_feedback}

        ### INSTRUCTIONS FOR WRITER ###
        Provide the consolidated list of mandatory corrections below:
        """
    )

    # Execute; failures are reported in the returned string (best-effort
    # contract), so callers should check for the "Error during synthesis"
    # prefix before using the result as instructions.
    try:
        response = llm.invoke([system_msg, human_msg])
        return response.content.strip()
    except Exception as e:
        return f"Error during synthesis: {e}"
    
# Smoke-test on the selected sample: merge the two rubric explanations into a
# single space-separated string and print the synthesized instructions.
sample_feedback = " ".join(
    [sample['logical_flow_reason'], sample['exec_quality_reason']]
)
synthesized_instructions = synthesize_feedback(raw_feedback=sample_feedback)
print(synthesized_instructions)

- Expand the summary to meet the 3–6 sentence length requirement, incorporating additional operational details.
- Include the specific date "Dec 10" from the source to provide temporal context.
- Add information on the MOU duration, specifically the two-year period.
- Incorporate the projected financial impact or relevant figures related to the FY ending Dec 31, 2025.
- Retrieve and include any relevant numeric data or percentages from the source to enhance numeric density.


In [18]:
# Refinement pass: hand the source article, the current draft summary and the
# synthesized corrections to the summarizer and keep the revised text.
refined_output = summarizer.refine_summary(
    improvements=synthesized_instructions,
    current_draft=sample['output_result'],
    source_text=sample['input_text'],
)

🔍 Extracted Facts for Refiner:
- On Dec 10, Sumisaujana Group Bhd’s wholly owned subsidiary SumiSaujana TCM Chemicals Sdn Bhd (SSTCM) entered into a memorandum of understanding (MOU) with Zangjiagang CoolisT Life Technology Co Ltd (CoolisT Group) to jointly develop and commercialise bio-based polyols derived from renewable vegetable oils for use in polyurethane foams in CoolisT Group’s furniture and bedding products.  
- The MOU is valid for a period of two years from the date of execution, unless terminated earlier in accordance with its terms.  
- The collaboration aims to accelerate the global adoption of sustainable, bio-based chemical materials that meet international environmental and performance standards, with SumiSaujana expanding its product offerings and technical capabilities.  
- The MOU is not expected to have any material effect on the net assets and gearing of SumiSaujana and its subsidiary for the financial year ending Dec 31, 2025.  
- The partnership supports 

In [19]:
# Display the refined summary returned by the refinement step.
refined_output

'On Dec 10, Sumisaujana Group Bhd’s wholly owned subsidiary SumiSaujana TCM Chemicals Sdn Bhd (SSTCM) entered into a memorandum of understanding (MOU) with Zangjiagang CoolisT Life Technology Co Ltd (CoolisT Group) to jointly develop and commercialise bio-based polyols derived from renewable vegetable oils for use in polyurethane foams in CoolisT Group’s furniture and bedding products. The MOU is valid for a period of two years from the date of execution, unless terminated earlier in accordance with its terms. The collaboration aims to accelerate the global adoption of sustainable, bio-based chemical materials that meet international environmental and performance standards, with SumiSaujana expanding its product offerings and technical capabilities. The MOU is not expected to have any material effect on the net assets and gearing of SumiSaujana and its subsidiary for the financial year ending Dec 31, 2025. (The Edge)'

In [20]:
evaluator1 = EvaluationAgent(
    task_type="recovery_summarization", 
    threshold=0.8, 
    model_name="gpt-5-mini"
)

# Evaluate the Refined Result (Iteration 2)
eval_res = await evaluator1.a_evaluate(
    generated_text=refined_output,
    source_context=sample['input_text'],
    section_topic=f"Iteration 2: {sample['title']}"
)

2025-12-26 03:52:31,079 [INFO] Initialized EvaluationAgent for task: recovery_summarization


Output()

Output()

2025-12-26 03:52:42,816 [INFO] [RECOVERY_SUMMARIZATION] Executive Writing Quality: 0.900 | PASS
2025-12-26 03:52:42,817 [INFO] [RECOVERY_SUMMARIZATION] Summary Coherence & Flow: 1.000 | PASS


In [21]:
# Display the evaluation result returned by the evaluator for the refined summary.
eval_res

{'timestamp': '2025-12-26T03:52:42.816447',
 'section_topic': 'Iteration 2: SUMI: CoolisT Group ink MOU to develop bio-based polyols',
 'metrics': {'Executive Writing Quality': {'score': 0.9,
   'threshold': 0.8,
   'passed': True,
   'feedback': 'The piece opens with the key news immediately and clearly (MOU signed Dec 10), uses impactful, unambiguous sentences, and includes important specifics (two‑year term; no material effect on net assets for FY ending Dec 31, 2025). It meets the 3–6 sentence constraint (four sentences), avoids promotional language and quotes, and reads professional. Minor deduction for an overly long, information‑dense opening sentence and a brief parenthetical source note, but overall highly polished and concise.'},
  'Summary Coherence & Flow': {'score': 1.0,
   'threshold': 0.8,
   'passed': True,
   'feedback': 'The response opens with the primary corporate action (SSTCM entering an MOU with CoolisT) and then follows a clear, logical descent: MOU durati