<a href="https://colab.research.google.com/github/wjleece/AI-Agents-w-HF-Evals/blob/main/AI_Agents_w_Evals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

If you use this code, please cite:

```bibtex
@misc{leece2025dynamicrag,
  title  = {Helping AI Agents and Evaluators Learn Through Dynamic RAG with Human Feedback},
  author = {Bill Leece},
  year   = {2025}
}
```

In [None]:
%pip install anthropic
%pip install -q -U google-generativeai
%pip install fuzzywuzzy

In [None]:
#Setup and Imports
import anthropic
import google.generativeai as gemini
import re
import json
import time
import os
import copy
import glob # For finding files matching a pattern
import uuid # For generating unique learning IDs in RAG
from google.colab import userdata
from google.colab import drive # For Google Drive mounting
from datetime import datetime
from typing import Dict, List, Any, Optional, Union, Tuple
from fuzzywuzzy import process, fuzz



In [None]:
# === How to Obtain Your API Keys ===

# --- Google API Key (for Gemini API / Google AI Studio) ---
# 1. Go to Google AI Studio: https://aistudio.google.com/
# 2. Sign in with your Google account.
# 3. If you're new, you might need to agree to the Terms of Service.
# 4. On the main page, look for a button or link like "Get API key" or "Create API key".
#    This is often found in the left-hand menu or as a prominent button.
# 5. You might be prompted to create a new project or select an existing Google Cloud Project.
#    - If creating a new project, give it a descriptive name.
# 6. Once the project is set up, an API key will be generated for you.
# 7. **Important**: Copy this API key immediately and store it in a safe and private place.
#    You might not be able to see the full key again after closing the dialog.
# 8. This key will be your GOOGLE_API_KEY.

# --- Anthropic API Key (for Claude API) ---
# 1. Go to the Anthropic Console: https://console.anthropic.com/
# 2. Sign up for an account or log in if you already have one.
# 3. Once logged in, you'll see "API Keys" in the main navigation menu on the left side.
#    Click on "API Keys".
# 4. Click the "Create Key" button (usually displayed as a prominent button).
# 5. Give your API key a descriptive name (e.g., "ColabNotebookKey" or "MyProject").
# 6. Click "Create Key".
# 7. **Important**: Your API key will be displayed once. Copy it immediately and store it securely.
#    The key will start with "sk-ant-" followed by additional characters.
#    Anthropic will not show you this key again for security reasons.
# 8. **Billing Required**: To use the API, you'll need to add credits to your account.
#    Go to the "Billing" section in the console to add credits or set up automatic billing.
#    New accounts typically receive some free credits to get started.
# 9. This key will be your ANTHROPIC_API_KEY.
# 10. For detailed API documentation and rate limits, visit: https://docs.anthropic.com/

# === How to Add Your API Keys as Secrets in Google Colab ===
# 1. In your Colab notebook, click on the "Key" icon (🔑) in the left sidebar. This will open the "Secrets" pane.
# 2. Click the "+ Add a new secret" button.
# 3. For the Anthropic API Key:
#    - In the "Name" field, enter: ANTHROPIC_API_KEY
#    - In the "Value" field, paste your actual Anthropic API key that you obtained.
#    - Make sure the "Notebook access" toggle is enabled.
# 4. Click the "+ Add a new secret" button again for the Google API Key.
# 5. For the Google API Key:
#    - In the "Name" field, enter: GOOGLE_API_KEY
#    - In the "Value" field, paste your actual Google API key that you obtained.
#    - Ensure the "Notebook access" toggle is enabled.
# 6. Once both secrets are added, they can be accessed in your code as shown below.
#    You might need to restart your Colab runtime for the new secrets to be available.

# `userdata` is already imported in the "Setup and Imports" cell; re-importing here is harmless.
from google.colab import userdata

# Read both API keys from the Colab Secrets pane, using the secret names described above.
# NOTE(review): `userdata.get` is understood to raise (rather than return None) when a
# secret is missing or notebook access is disabled — confirm against the Colab docs,
# since the optional verification snippet below assumes a falsy return value.
ANTHROPIC_API_KEY = userdata.get('ANTHROPIC_API_KEY')
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

# Now you can use ANTHROPIC_API_KEY and GOOGLE_API_KEY in your code.
# For example, to verify they are loaded (optional, remove or comment out after testing):
# if ANTHROPIC_API_KEY:
#     print("Anthropic Key loaded successfully (first few chars):", ANTHROPIC_API_KEY[:5])
# else:
#     print("Anthropic Key not found. Please check your Colab Secrets.")

# if GOOGLE_API_KEY:
#     print("Google Key loaded successfully (first few chars):", GOOGLE_API_KEY[:5])
# else:
#     print("Google Key not found. Please check your Colab Secrets.")

In [None]:
# Set up the LLM clients and select the models.

# Anthropic is used through an explicit client object; google-generativeai is
# configured once at module level via `gemini.configure`.
anthropic_client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
gemini.configure(api_key=GOOGLE_API_KEY)

# Model identifiers. The names suggest one Anthropic model plus separate Gemini models
# for evaluation and for query classification — how each is used is not shown in this cell.
ANTHROPIC_MODEL_NAME = "claude-3-5-sonnet-latest"
EVAL_MODEL_NAME = "gemini-2.5-flash-preview-05-20" # For the 'best' evals, use "gemini-2.5-pro-preview-05-06" --> this model is slow though, so for the purposes of this demo we'll use gemini-2.5-flash-preview-05-20
CLASSIFIER_MODEL_NAME = "gemini-1.5-flash-latest" # Fast model for question classification / routing

In [None]:
# Google Drive authentication and learnings-folder setup.
#
# 'Learnings' are stored in a folder inside your Google Drive. Either create the
# default folder (My Drive/AI/Knowledgebases) in *your* Drive, or point
# DEFAULT_LEARNINGS_DRIVE_SUBPATH at a folder you have created. The path must match
# the Drive folder structure exactly.

DRIVE_MOUNT_PATH = '/content/drive'

# Mounting is best-effort: a failure is reported but does not stop the notebook.
try:
    drive.mount(DRIVE_MOUNT_PATH)
    print(f"Google Drive mounted successfully at {DRIVE_MOUNT_PATH}.")
except Exception as e:
    print(f"Error mounting Google Drive: {e}. RAG features will not work.")

# Folder (relative to the mount point) that holds the learnings files.
DEFAULT_LEARNINGS_DRIVE_SUBPATH = "My Drive/AI/Knowledgebases"  # Either create this Drive path or map to another that you define
LEARNINGS_DRIVE_BASE_PATH = os.path.join(DRIVE_MOUNT_PATH, DEFAULT_LEARNINGS_DRIVE_SUBPATH)

# Make sure the learnings folder exists, reporting which case applied.
# NOTE(review): this runs even when the mount above failed, in which case the folder
# is created on the local filesystem under DRIVE_MOUNT_PATH rather than on Drive.
if os.path.exists(LEARNINGS_DRIVE_BASE_PATH):
    print(f"Using existing learnings directory: {LEARNINGS_DRIVE_BASE_PATH}")
else:
    try:
        os.makedirs(LEARNINGS_DRIVE_BASE_PATH)
        print(f"Created learnings directory: {LEARNINGS_DRIVE_BASE_PATH}")
    except Exception as e:
        print(f"Error creating learnings directory {LEARNINGS_DRIVE_BASE_PATH}: {e}")

In [None]:
#Specialized System Prompts

# --- Worker AI Prompts ---
# Shared preamble for the worker prompts. It is interpolated into both
# worker_operational_system_prompt and worker_metacognitive_learnings_system_prompt.
# This is a plain (non-f) string, so the literal braces in the JSON example are safe.
# The `CLARIFICATION_REQUESTED:` tag is part of the prompt contract; do not reword it
# without checking whatever code looks for that prefix.
worker_base_instructions = """
You are a helpful customer service assistant for an e-commerce system.
Your overriding goal is to be helpful by answering questions and performing actions as requested by a human user.
When responding to the user, use the conversation context to maintain continuity.
- If a user refers to "my order" or similar, use the context to determine which order they're talking about.
- If they mention "that product" or use other references, check the context to determine what they're referring to.
- Always prioritize recent context over older context when resolving references.

The conversation context will be provided to you with each message. This includes:
- Previous questions and answers
- Recently viewed customers, products, and orders
- Recent actions taken (like creating orders, updating products, etc.)
- Relevant Learnings from a knowledge base (if applicable to the current query type).
- **Crucially, results from any tools you use will also be part of the context provided back to you.**

**YOUR RESPONSE TO THE HUMAN USER:**
Your primary role is to communicate effectively and naturally with the human user.
- After you use tools and get their results (which will be shown to you), your final textual response to the user **must be a friendly, conversational, and easy-to-understand summary.**
- **DO NOT output raw data (like JSON strings or complex lists/dictionaries from tool results) directly in your response to the user.**
- Instead, you must **interpret the tool results** and explain the outcome or provide the requested information in natural language.
- For example, if a tool you used returns information like `{"product_name": "Perplexinator", "inventory_count": 1485}`, your response to the user should be something like: "Currently, we have 1485 Perplexinators in stock." or "The Perplexinator has 1485 units available, would you like to know more?"
- If you perform an action (e.g., creating an order), confirm this action clearly and provide key details in a sentence, for instance: "I've successfully created your order (Order ID: O4) for 10 Widgets."
- Always aim to be helpful, polite, and clear in your language.

REQUESTING CLARIFICATION FROM THE USER:
If you determine that you absolutely need more information from the user to accurately and efficiently fulfill their request or use a tool correctly, you MUST:
1. Formulate a clear, concise question for the user.
2. Prefix your entire response with the exact tag: `CLARIFICATION_REQUESTED:`
   Example: `CLARIFICATION_REQUESTED: To update the order, could you please provide the Order ID?`
3. Do NOT use any tools in the same turn you are requesting clarification. Wait for the user's response.

Keep all other responses friendly, concise, and helpful.
"""

# System prompt for OPERATIONAL queries: the shared worker preamble followed by
# task-specific guidance. Because this is an f-string, worker_base_instructions is
# substituted once, when this cell runs; later edits to that variable require
# re-running this cell to take effect.
worker_operational_system_prompt = f"""
{worker_base_instructions}

Your current task is OPERATIONAL. Focus on understanding user requests related to e-commerce functions (managing orders, products, customers), using the provided tools accurately, and interacting with the data store.
You MUST strictly adhere to any relevant guidelines or rules found in the 'Relevant Learnings from Knowledge Base' section. These learnings represent updated operational procedures and should override your general knowledge if there is a conflict.
Remember to synthesize tool results into a user-friendly textual response. The detailed tool outputs are logged separately for evaluation.
"""

# System prompt for METACOGNITIVE queries (summarizing learnings / explaining reasoning):
# the shared worker preamble followed by task-specific guidance. As an f-string it
# captures worker_base_instructions at the time this cell runs.
worker_metacognitive_learnings_system_prompt = f"""
{worker_base_instructions}

Your current task is METACOGNITIVE: SUMMARIZING LEARNINGS AND EXPLAINING YOUR THINKING.
If the user asks you to "summarize your learnings", "what have you learned", "why did you", "is there a better way to" or similar phrases, your response should be based PRIMARILY on the content provided to you under the heading "Relevant Learnings from Knowledge Base" in your current context.
- List the key principles or pieces of information from these provided learnings.
- Do not confuse these explicit learnings with a general summary of your recent actions or the current state of the data store, unless a learning specifically refers to such an action or state.
- If no specific learnings are provided in your context for this type of query, you can state that no specific new learnings have been highlighted for this interaction.
- Avoid using tools for this type of summarization unless a tool is specifically designed to retrieve or process learnings.
"""

# --- Evaluator AI Prompt (unified but guided by query type information) ---
# A single evaluator prompt is used for every query type; the query type is passed to
# the evaluator alongside the interaction so it can apply the matching criteria.
# NOTE(review): the original comment said this prompt was derived from
# `worker_prompt_update_learning_summary`, which is not defined in the part of the
# notebook reviewed here — confirm that reference is still meaningful.
# NOTE(review): the evaluator's tag "CLARIFICATION NEEDED_EVALUATOR:" (with a space) is
# formatted differently from the worker's `CLARIFICATION_REQUESTED:`; any code that
# detects it must match this exact string.

evaluator_system_prompt = """
You are Google Gemini, an impartial evaluator assessing the quality of responses from an AI assistant to customer service queries.

You will be provided with:
- The user's query and the TYPE of query it was classified as (e.g., OPERATIONAL, METACOGNITIVE_LEARNINGS_SUMMARY).
- The conversation context (including RAG learnings) that was available to the AI assistant.
- **The AI assistant's final user-facing textual response.**
- **A log of tools called by the AI assistant, including their inputs and raw outputs.**
- A snapshot of the 'Data Store State *Before* AI Action'.
- A snapshot of the 'Data Store State *After* AI Action'.
- Details of any clarification questions the AI assistant asked.

Your primary goal is to assess the AI assistant based on the SPECIFIC TASK it was attempting, as indicated by the query type.

For each interaction, evaluate the assistant's response based on:
1.  **Accuracy**:
    * If OPERATIONAL:
        * How correct and factual is the AI's **user-facing textual response**?
        * Does the textual response accurately reflect the outcomes of the **tool calls** and changes in the datastore?
        * Did its actions (tool calls) correctly process information or modify the datastore as intended? Verify against 'Tool Call Log', 'Before' and 'After' states.
    * If METACOGNITIVE_LEARNINGS_SUMMARY: Did the AI accurately summarize the "Relevant Learnings from Knowledge Base" provided in its context? Was the summary faithful to these learnings?
    * Check for new entity IDs and correct updates if applicable to the query type.

2.  **Efficiency**:
    * Did the assistant achieve its goal with minimal clarifying questions?
    * If OPERATIONAL: Were tool calls used appropriately and efficiently? (Refer to 'Tool Call Log')
    * If METACOGNITIVE_LEARNINGS_SUMMARY: Was the summary direct and to the point based on provided learnings?

3.  **Context Awareness**:
    * Did the assistant correctly use the conversation history and entities?
    * Crucially, did the assistant adhere to the task defined by the query type?
    * Did it correctly use any "Relevant Learnings from Knowledge Base" that were pertinent to the query type?
    * For OPERATIONAL tasks, did the user-facing response make sense given the tool outputs and datastore changes?

4.  **Helpfulness & Clarity (of the user-facing response)**:
    * How well did the assistant address the user's needs *for the identified query type* in its textual response?
    * Was the **user-facing response clear, polite, and easy to understand?** Did it avoid jargon or raw data dumps?
    * Did it provide relevant information in a helpful manner?


Score the response on a scale of 1-10 for each criterion, and provide an overall score. Provide detailed reasoning, EXPLICITLY MENTIONING THE QUERY TYPE and referencing the **user-facing text**, the **tool call log**, and **datastore states** as appropriate.
- For OPERATIONAL queries, heavily reference the 'Tool Call Log' and 'Before'/'After' data store states when assessing the underlying actions, and then assess if the user-facing text accurately and clearly conveys this.
- For METACOGNITIVE_LEARNINGS_SUMMARY, heavily reference the "Relevant Learnings from Knowledge Base" that were provided to the worker.

Crucially, verify if the Worker Agent correctly applied any pertinent 'Relevant Learnings from Knowledge Base.' If a relevant learning was provided but ignored or misapplied by the Worker Agent, this should negatively impact the 'Context Awareness' and 'Accuracy' scores significantly.
When evaluating 'Context Awareness,' explicitly state whether the Worker Agent followed or disregarded the provided RAG learnings. Detail any discrepancies.

EVALUATING CLARIFICATION QUESTIONS:
If the worker AI asked for clarification:
- Assess necessity using 'Data Store State *Before* AI Action' and context.
- If necessary and well-phrased, it should NOT negatively impact Efficiency.
- If unnecessary, it SHOULD negatively impact Efficiency.

If you, the evaluator, still have questions, use "CLARIFICATION NEEDED_EVALUATOR:".

DATA STORE CONSISTENCY (Primarily for OPERATIONAL tasks):
When assessing Accuracy for OPERATIONAL tasks, explicitly compare the AI's actions (via tool log and datastore changes) with its claims in the user-facing text.
"""

In [None]:
# Seed records for the in-memory datastore.
# The Storage class deep-copies these dictionaries when an instance is created and
# works on the copies from then on, so the values here act as the initial state.

# Customers, keyed by customer ID ("C<n>").
initial_customers = {
    "C1": {
        "name": "John Doe",
        "email": "john@example.com",
        "phone": "123-456-7890",
    },
    "C2": {
        "name": "Jane Smith",
        "email": "jane@example.com",
        "phone": "987-654-3210",
    },
}

# Products, keyed by product ID ("P<n>").
initial_products = {
    "P1": {
        "name": "Widget A",
        "description": "A simple widget. Very compact.",
        "price": 19.99,
        "inventory_count": 999,
    },
    "P2": {
        "name": "Gadget B",
        "description": "A powerful gadget. It spins.",
        "price": 49.99,
        "inventory_count": 200,
    },
    "P3": {
        "name": "Perplexinator",
        "description": "A perplexing perfunctator",
        "price": 79.99,
        "inventory_count": 1483,
    },
}

# Orders, keyed by order ID ("O<n>"); each record repeats its own ID and
# carries the product name and price alongside the product ID.
initial_orders = {
    "O1": {
        "id": "O1",
        "product_id": "P1",
        "product_name": "Widget A",
        "quantity": 2,
        "price": 19.99,
        "status": "Shipped",
    },
    "O2": {
        "id": "O2",
        "product_id": "P2",
        "product_name": "Gadget B",
        "quantity": 1,
        "price": 49.99,
        "status": "Processing",
    },
}


In [None]:
# Standalone Anthropic Completion Function (for basic tests)
#def get_completion_anthropic_standalone(prompt: str):
#    message = anthropic_client.messages.create(
#        model=ANTHROPIC_MODEL_NAME,
#        max_tokens=2000,
#        temperature=0.0,
#        system=worker_base_instructions,
#        tools=tools_schemas_list,
#        messages=[
#          {"role": "user", "content": prompt}
#        ]
#    )
#    return message.content[0].text

In [None]:
#prompt_test_anthropic = "Hey there, which AI model do you use for answering questions?"
#print(f"Anthropic Standalone Test: {get_completion_anthropic_standalone(prompt_test_anthropic)}")

In [None]:
class Storage:
    """In-memory e-commerce datastore holding customers, products and orders.

    Each instance starts from its own deep copies of the module-level
    ``initial_customers``, ``initial_products`` and ``initial_orders`` seeds, so
    changes made through an instance never touch the seed dictionaries.
    """

    def __init__(self):
        # Deep copies keep the nested record dicts independent of the module-level seeds.
        self.customers, self.products, self.orders = (
            copy.deepcopy(seed)
            for seed in (initial_customers, initial_products, initial_orders)
        )
        print("Storage initialized with deep copies of initial data.")

    def get_full_datastore_copy(self) -> Dict[str, Any]:
        """Return an independent snapshot of the whole datastore.

        Returns:
            A dict with the keys ``"customers"``, ``"products"`` and ``"orders"``,
            each holding a deep copy of the corresponding table; mutating the
            snapshot does not affect this instance.
        """
        snapshot: Dict[str, Any] = {}
        for table in ("customers", "products", "orders"):
            snapshot[table] = copy.deepcopy(getattr(self, table))
        return snapshot

In [None]:
# Tools that the AI Agent can use to satisfy user requests.
#
# Each entry describes one tool with a "name", a natural-language "description" and an
# "input_schema" (a JSON-Schema object listing the accepted properties and which of
# them are required). Every "name" matches a function in the "Tool Functions" cell.
# The descriptions are sent to the model, so they are effectively part of the prompt:
# changing their wording can change agent behavior.
# Presumably this list is passed as the `tools` argument of the Anthropic Messages API
# (the commented-out standalone test does exactly that) — confirm against the caller.

tools_schemas_list = [
    {
        "name": "create_customer",
        "description": "Adds a new customer to the database. Includes customer name, email, and (optional) phone number.",
        "input_schema": {
            "type": "object",
            "properties": {
                "name": {"type": "string", "description": "The name of the customer."},
                "email": {"type": "string", "description": "The email address of the customer."},
                "phone": {"type": "string", "description": "The phone number of the customer (optional)."}
            },
            "required": ["name", "email"]
        }
    },
    {
        "name": "get_customer_info",
        "description": "Retrieves customer information based on their customer ID. Returns the customer's name, email, and (optional) phone number.",
        "input_schema": {
            "type": "object",
            "properties": {
                "customer_id": {"type": "string", "description": "The unique identifier for the customer."}
            },
            "required": ["customer_id"]
        }
    },
    {
        "name": "update_customer",
        "description": "Updates an existing customer's information. Only fields that are provided will be updated; other fields remain unchanged.",
        "input_schema": {
            "type": "object",
            "properties": {
                "customer_id": {"type": "string", "description": "The unique identifier for the customer to update."},
                "name": {"type": "string", "description": "The new name for the customer (optional)."},
                "email": {"type": "string", "description": "The new email address for the customer (optional)."},
                "phone": {"type": "string", "description": "The new phone number for the customer (optional)."}
            },
            "required": ["customer_id"]
        }
    },
    {
        "name": "create_product",
        "description": "Adds a new product to the product database. Includes name, description, price, and initial inventory count.",
        "input_schema": {
            "type": "object",
            "properties": {
                "name": {"type": "string", "description": "The name of the product."},
                "description": {"type": "string", "description": "A description of the product."},
                "price": {"type": "number", "description": "The price of the product."},
                "inventory_count": {"type": "integer", "description": "The amount of the product that is currently in inventory."}
            },
            "required": ["name", "description", "price", "inventory_count"]
        }
    },
    {
        "name": "update_product",
        "description": "Updates an existing product with new information. Only fields that are provided will be updated; other fields remain unchanged.",
        "input_schema": {
            "type": "object",
            "properties": {
                "product_id": {"type": "string", "description": "The unique identifier for the product to update."},
                "name": {"type": "string", "description": "The new name for the product (optional)."},
                "description": {"type": "string", "description": "The new description for the product (optional)."},
                "price": {"type": "number", "description": "The new price for the product (optional)."},
                "inventory_count": {"type": "integer", "description": "The new inventory count for the product (optional)."}
            },
            "required": ["product_id"]
        }
    },
    {
        "name": "get_product_info",
        "description": "Retrieves product information based on product ID or product name (with fuzzy matching for misspellings). Returns product details including name, description, price, and inventory count.",
        "input_schema": {
            "type": "object",
            "properties": {
                "product_id_or_name": {"type": "string", "description": "The product ID or name (can be approximate)."}
            },
            "required": ["product_id_or_name"]
        }
    },
    {
        # Takes no arguments: the schema is an empty object.
        "name": "list_all_products",
        "description": "Lists all available products in the inventory.",
        "input_schema": { "type": "object", "properties": {}, "required": [] }
    },
    {
        # NOTE(review): the description says orders can only be created for in-stock
        # products, but create_order() in the "Tool Functions" cell stores the order and
        # returns status "success" even when inventory is insufficient (it just leaves
        # inventory unchanged). Decide which side reflects the intended behavior.
        # Unlike update_order_status, "status" here is free text with no enum.
        "name": "create_order",
        "description": "Creates an order using the product's current price. Inventory is adjusted if sufficient. Orders can only be created for products that are in stock for fulfillment. Supports specifying products by either ID or name with fuzzy matching for misspellings.",
        "input_schema": {
            "type": "object",
            "properties": {
                "product_id_or_name": {"type": "string", "description": "The ID or name of the product to order (supports fuzzy matching)."},
                "quantity": {"type": "integer", "description": "The quantity of the product in the order. Must be positive."},
                "status": {"type": "string", "description": "The initial status of the order (e.g., 'Processing', 'Shipped')."}
            },
            "required": ["product_id_or_name", "quantity", "status"]
        }
    },
    {
        "name": "get_order_details",
        "description": "Retrieves the details of a specific order based on the order ID. Returns the order ID, product name, quantity, price, and order status.",
        "input_schema": {
            "type": "object",
            "properties": {
                "order_id": {"type": "string", "description": "The unique identifier for the order."}
            },
            "required": ["order_id"]
        }
    },
    {
        # The only tool whose status argument is restricted to a fixed set of values.
        "name": "update_order_status",
        "description": "Updates the status of an order and adjusts inventory accordingly. Changing to \"Shipped\" decreases inventory (if not already decreased by order creation). Changing from \"Shipped\" to \"Returned\" or \"Canceled\" increases inventory. Status can be \"Processing\", \"Shipped\", \"Delivered\", \"Returned\", or \"Canceled\".",
        "input_schema": {
            "type": "object",
            "properties": {
                "order_id": {"type": "string", "description": "The unique identifier for the order."},
                "new_status": {
                    "type": "string",
                    "description": "The new status to set for the order.",
                    "enum": ["Processing", "Shipped", "Delivered", "Returned", "Canceled"]
                }
            },
            "required": ["order_id", "new_status"]
        }
    }
]

# Quick sanity check when the cell runs: report how many tools were declared.
print(f"Defined {len(tools_schemas_list)} tool schemas.")

In [None]:
# Tool Functions

def create_customer(current_storage: Storage, name: str, email: str, phone: Optional[str] = None):
    """Add a customer to the datastore under the next free ``C<n>`` ID.

    The candidate ID starts at ``len(customers) + 1`` and is incremented until it
    does not collide with an existing key.

    Args:
        current_storage: Datastore whose ``customers`` dict is modified in place.
        name: Customer name.
        email: Customer email address.
        phone: Optional phone number; stored as ``None`` when omitted.

    Returns:
        ``{"status": "success", "customer_id": ..., "customer": ...}`` where
        ``customer`` is the stored record itself (not a copy).
    """
    registry = current_storage.customers
    seq = len(registry) + 1
    while f"C{seq}" in registry:
        seq += 1
    new_id = f"C{seq}"

    record = {"name": name, "email": email, "phone": phone}
    registry[new_id] = record
    print(f"[Tool Exec] create_customer: ID {new_id}, Name: {name}")
    return {"status": "success", "customer_id": new_id, "customer": record}

def get_customer_info(current_storage: Storage, customer_id: str):
    """Look up a single customer record by its ID.

    Args:
        current_storage: Datastore whose ``customers`` dict is read.
        customer_id: ID of the customer to fetch.

    Returns:
        ``{"status": "success", "customer_id": ..., "customer": ...}`` when a
        non-empty record exists (``customer`` is the stored dict itself), otherwise
        ``{"status": "error", "message": "Customer not found"}``. A missing ID and an
        empty record are both reported as not found.
    """
    record = current_storage.customers.get(customer_id)
    if not record:
        print(f"[Tool Exec] get_customer_info: ID {customer_id} not found.")
        return {"status": "error", "message": "Customer not found"}

    print(f"[Tool Exec] get_customer_info: ID {customer_id} found.")
    return {"status": "success", "customer_id": customer_id, "customer": record}

def update_customer(current_storage: Storage, customer_id: str, name: Optional[str]=None, email: Optional[str]=None, phone: Optional[str]=None):
    """Update the supplied fields of an existing customer in place.

    Only arguments that are not ``None`` are written, so an empty string is a
    legitimate new value (for example, to blank out a phone number).

    Args:
        current_storage: Datastore whose ``customers`` dict is modified.
        customer_id: ID of the customer to update.
        name: New name, or ``None`` to leave unchanged.
        email: New email, or ``None`` to leave unchanged.
        phone: New phone number, or ``None`` to leave unchanged.

    Returns:
        A dict whose ``status`` is ``"error"`` (unknown ID), ``"warning"`` (no
        field supplied) or ``"success"``. The warning and success results include
        the stored record under ``customer``; success also lists ``updated_fields``
        in the order name, email, phone.
    """
    if customer_id not in current_storage.customers:
        print(f"[Tool Exec] update_customer: Customer ID {customer_id} not found.")
        return {"status": "error", "message": f"Customer with ID {customer_id} not found."}

    customer = current_storage.customers[customer_id]
    updated_fields = []

    # Explicit None checks: falsy-but-present values such as "" must still be applied.
    for field, value in (("name", name), ("email", email), ("phone", phone)):
        if value is None:
            continue
        customer[field] = value
        updated_fields.append(field)

    if updated_fields:
        print(f"[Tool Exec] update_customer: ID {customer_id}, Updated fields: {', '.join(updated_fields)}.\n   New data: {customer}")
        return {"status": "success", "customer_id": customer_id, "customer": customer, "updated_fields": updated_fields}

    print(f"[Tool Exec] update_customer: ID {customer_id}, no fields provided for update.")
    return {"status": "warning", "message": "No fields provided for update.", "customer_id": customer_id, "customer": customer}

def create_product(current_storage: Storage, name: str, description: str, price: float, inventory_count: int):
    """Add a product to the datastore under the next free ``P<n>`` ID.

    The candidate ID starts at ``len(products) + 1`` and is incremented until it
    does not collide with an existing key. A negative ``inventory_count`` is
    stored as 0.

    Args:
        current_storage: Datastore whose ``products`` dict is modified in place.
        name: Product name.
        description: Product description.
        price: Unit price (stored as given).
        inventory_count: Initial stock level; clamped to a minimum of 0.

    Returns:
        ``{"status": "success", "product_id": ..., "product": ...}`` where
        ``product`` is the stored record itself (not a copy).
    """
    catalog = current_storage.products
    seq = len(catalog) + 1
    while f"P{seq}" in catalog:
        seq += 1
    new_id = f"P{seq}"

    record = {
        "name": name,
        "description": description,
        "price": price,
        "inventory_count": max(0, inventory_count),
    }
    catalog[new_id] = record
    print(f"[Tool Exec] create_product: ID {new_id}, Name: {name}")
    return {"status": "success", "product_id": new_id, "product": record}

def update_product(current_storage: Storage, product_id: str, name: Optional[str]=None, description: Optional[str]=None, price: Optional[float]=None, inventory_count: Optional[int]=None):
    """Update the supplied fields of an existing product in place.

    Only arguments that are not ``None`` are written. A negative
    ``inventory_count`` is stored as 0.

    Args:
        current_storage: Datastore whose ``products`` dict is modified.
        product_id: Exact ID of the product to update (no name matching here).
        name: New name, or ``None`` to leave unchanged.
        description: New description, or ``None`` to leave unchanged.
        price: New price, or ``None`` to leave unchanged.
        inventory_count: New stock level, or ``None`` to leave unchanged.

    Returns:
        A dict whose ``status`` is ``"error"`` (unknown ID), ``"warning"`` (no
        field supplied) or ``"success"``. The warning and success results include
        the stored record under ``product``; success also lists ``updated_fields``
        in the order name, description, price, inventory_count.
    """
    if product_id not in current_storage.products:
        print(f"[Tool Exec] update_product: ID {product_id} not found.")
        return {"status": "error", "message": "Product not found"}

    product = current_storage.products[product_id]
    updated_fields = []

    for field, value in (("name", name), ("description", description), ("price", price)):
        if value is not None:
            product[field] = value
            updated_fields.append(field)

    if inventory_count is not None:
        # Inventory can never be stored as a negative number.
        product["inventory_count"] = max(0, inventory_count)
        updated_fields.append("inventory_count")

    if updated_fields:
        print(f"[Tool Exec] update_product: ID {product_id}, Updated fields: {', '.join(updated_fields)}.\n   New data: {product}")
        return {"status": "success", "product_id": product_id, "product": product, "updated_fields": updated_fields}

    print(f"[Tool Exec] update_product: ID {product_id}, no fields provided for update.")
    return {"status": "warning", "message": "No fields provided for update.", "product_id": product_id, "product": product}

def find_product_by_name(current_storage: Storage, product_name: str, min_similarity: int = 70) -> Tuple[Optional[str], Optional[Dict[str, Any]]]:
    """Find the product whose name best fuzzy-matches ``product_name``.

    Names are compared with fuzzywuzzy's ``token_sort_ratio`` scorer and the single
    best candidate is accepted only if its score is at least ``min_similarity``.
    If several products share the winning name, the first one in the products
    dict's iteration order is returned.

    Args:
        current_storage: Datastore whose ``products`` dict is searched.
        product_name: Name (possibly misspelled) to look for.
        min_similarity: Minimum score the best match must reach.

    Returns:
        ``(product_id, product_record)`` for the match — the record is the stored
        dict itself — or ``(None, None)`` when ``product_name`` is empty, there are
        no products, or nothing scores high enough.
    """
    if not product_name:
        return None, None

    catalog = current_storage.products
    candidate_names = [details["name"] for details in catalog.values()]
    if not candidate_names:
        return None, None

    top_hit = process.extractOne(product_name, candidate_names, scorer=fuzz.token_sort_ratio)
    if not top_hit or top_hit[1] < min_similarity:
        return None, None

    # extractOne yields the matched name first; map it back to the first product carrying it.
    winning_name = top_hit[0]
    for pid, details in catalog.items():
        if details["name"] == winning_name:
            return pid, catalog[pid]
    return None, None

def get_product_id(current_storage: Storage, product_identifier: str) -> Optional[str]:
    """Resolve a product ID or (approximate) product name to a stored product ID.

    Resolution order:

    1. exact match against the product IDs;
    2. case-insensitive match against the IDs after trimming surrounding
       whitespace, so ``"p1"`` or ``" P1 "`` resolves to ``"P1"`` instead of being
       treated as a product name;
    3. fuzzy name match via ``find_product_by_name``.

    Args:
        current_storage: Datastore whose ``products`` dict is consulted.
        product_identifier: Product ID or product name supplied by the caller.

    Returns:
        The matching product ID as stored, or ``None`` when nothing matches or the
        identifier is not a non-blank string.
    """
    # Identifiers arrive from model-generated tool input; anything that is not a
    # string cannot be an ID or a name, so report "not found" instead of failing.
    if not isinstance(product_identifier, str):
        return None

    products = current_storage.products
    if product_identifier in products:
        return product_identifier

    normalized = product_identifier.strip().casefold()
    if not normalized:
        return None

    for existing_id in products:
        if isinstance(existing_id, str) and existing_id.casefold() == normalized:
            return existing_id

    # Not an ID in any spelling: fall back to fuzzy matching on product names.
    product_id, _ = find_product_by_name(current_storage, product_identifier)
    return product_id

def get_product_info(current_storage: Storage, product_id_or_name: str):
    """Fetch a product record by ID or by approximate name.

    The identifier is resolved with ``get_product_id``.

    Args:
        current_storage: Datastore whose ``products`` dict is read.
        product_id_or_name: Product ID or product name to look up.

    Returns:
        ``{"status": "success", "product_id": ..., "product": ...}`` when resolved
        (``product`` is the stored dict itself), otherwise
        ``{"status": "error", "message": "Product '<input>' not found"}``.
    """
    pid = get_product_id(current_storage, product_id_or_name)
    if not pid or pid not in current_storage.products:
        print(f"[Tool Exec] get_product_info: Product '{product_id_or_name}' not found.")
        return {"status": "error", "message": f"Product '{product_id_or_name}' not found"}

    print(f"[Tool Exec] get_product_info: Found '{product_id_or_name}' as ID '{pid}'.")
    return {"status": "success", "product_id": pid, "product": current_storage.products[pid]}

def list_all_products(current_storage: Storage):
    """Return every product in the datastore together with a count.

    Args:
        current_storage: Datastore whose ``products`` dict is read.

    Returns:
        ``{"status": "success", "count": <number of products>, "products": ...}``.
        ``products`` is the datastore's own dict, not a copy, so mutating the
        result mutates the store.
    """
    catalog = current_storage.products
    total = len(catalog)
    print(f"[Tool Exec] list_all_products: Found {total} products.")
    return {"status": "success", "count": total, "products": catalog}

def create_order(current_storage: Storage, product_id_or_name: str, quantity: int, status: str):
    """Record a new order at the product's current price, deducting stock when possible.

    The product is resolved with ``get_product_id``. An unknown product or a
    non-positive ``quantity`` returns ``{"status": "error", "message": ...}`` and
    changes nothing.

    Otherwise the order is always stored, under the next free ``O<n>`` ID and
    with the caller-supplied ``status``:

    * if current stock covers ``quantity``, the product's ``inventory_count`` is
      reduced by ``quantity``;
    * if it does not, the order is still stored and inventory is left untouched.

    Args:
        current_storage: Datastore whose ``orders`` (and possibly ``products``)
            are modified in place.
        product_id_or_name: Product ID or product name to order.
        quantity: Number of units; must be positive.
        status: Initial order status, stored as given (not validated here).

    Returns:
        On success, a dict with ``status`` (``"success"``), ``order_id``,
        ``order_details`` (the stored order dict itself), ``remaining_inventory``,
        ``inventory_adjusted_by_creation`` and a human-readable ``message``.
    """
    resolved_id = get_product_id(current_storage, product_id_or_name)
    if not resolved_id:
        print(f"[Tool Exec] create_order: Product '{product_id_or_name}' not found.")
        return {"status": "error", "message": f"Product '{product_id_or_name}' not found."}

    item = current_storage.products[resolved_id]

    if quantity <= 0:
        msg = f"Order quantity must be positive. Requested: {quantity}"
        print(f"[Tool Exec] create_order: {msg}")
        return {"status": "error", "message": msg}

    # Store the order first; inventory is handled afterwards.
    ledger = current_storage.orders
    seq = len(ledger) + 1
    while f"O{seq}" in ledger:
        seq += 1
    new_id = f"O{seq}"

    ledger[new_id] = {
        "id": new_id,
        "product_id": resolved_id,
        "product_name": item["name"],
        "quantity": quantity,
        "price": item["price"],
        "status": status,  # stored exactly as supplied by the agent
    }
    print(f"[Tool Exec] create_order: Order entry {new_id} created with status '{status}'.")

    headline = f"Order {new_id} ({item['name']}) created with status '{status}'. "
    stock_deducted = not (item["inventory_count"] < quantity)
    if stock_deducted:
        item["inventory_count"] = max(0, item["inventory_count"] - quantity)
        outcome = headline + f"Inventory sufficient and deducted. New inventory: {item['inventory_count']}."
    else:
        # Inventory stays as it was; the order remains stored with the given status.
        outcome = headline + (
            f"Insufficient inventory for fulfillment. Available: {item['inventory_count']}, Requested: {quantity}. "
            "Inventory not adjusted."
        )
    print(f"[Tool Exec] create_order: {outcome}")

    return {
        "status": "success",
        "order_id": new_id,
        "order_details": ledger[new_id],
        "remaining_inventory": item["inventory_count"],
        "inventory_adjusted_by_creation": stock_deducted,
        "message": outcome,
    }

def get_order_details(current_storage: Storage, order_id: str):
    """Look up a single order by its ID.

    Args:
        current_storage: Store whose ``orders`` mapping is searched.
        order_id: Key of the order to fetch.

    Returns:
        ``{"status": "success", "order_details": <order>}`` when a truthy
        entry exists for ``order_id``; otherwise
        ``{"status": "error", "message": "Order not found"}``.
    """
    found_order = current_storage.orders.get(order_id)

    # Guard clause: a missing (or falsy) entry is reported as "not found".
    if not found_order:
        print(f"[Tool Exec] get_order_details: Order {order_id} not found.")
        return {"status": "error", "message": "Order not found"}

    print(f"[Tool Exec] get_order_details: Order {order_id} found.")
    return {"status": "success", "order_details": found_order}

def update_order_status(current_storage: Storage, order_id: str, new_status: str):
    """Change an order's status and keep product inventory in step.

    Moving an order *to* "Shipped" deducts the order quantity from the
    product's ``inventory_count`` (and is refused when stock is too low);
    moving an order *away from* "Shipped" adds the quantity back. Any other
    transition only rewrites the status.

    NOTE(review): create_order also deducts stock at creation time when
    inventory is sufficient, so shipping such an order deducts again here --
    confirm this is the intended accounting.

    Args:
        current_storage: Store exposing ``orders`` and ``products`` mappings.
        order_id: Key of the order to update.
        new_status: Status to assign.

    Returns:
        A dict whose ``status`` is "success", "unchanged" (the order already
        has ``new_status``) or "error" (unknown order, unknown product, or
        not enough stock to ship).
    """
    if order_id not in current_storage.orders:
        print(f"[Tool Exec] update_order_status: Order {order_id} not found.")
        return {"status": "error", "message": "Order not found"}

    target_order = current_storage.orders[order_id]
    product_id = target_order["product_id"]
    units = target_order["quantity"]

    # Defensive check: the order may reference a product that no longer exists.
    if product_id not in current_storage.products:
        print(f"[Tool Exec] update_order_status: Product ID {product_id} for order {order_id} not found in products list!")
        return {"status": "error", "message": f"Product {product_id} for order {order_id} not found, cannot update status or inventory."}

    stocked_item = current_storage.products[product_id]
    previous_status = target_order["status"]

    if previous_status == new_status:
        print(f"[Tool Exec] update_order_status: Status for {order_id} already {new_status}.")
        return {
            "status": "unchanged",
            "message": f"Status is already {new_status}",
            "order_details": target_order,
            "current_inventory": stocked_item["inventory_count"],
        }

    # The statuses are known to differ from here on, so at most one of these
    # two flags can be true.
    becoming_shipped = new_status == "Shipped"
    leaving_shipped = previous_status == "Shipped"
    stock_touched = False

    if becoming_shipped:
        on_hand = stocked_item["inventory_count"]
        if on_hand < units:
            print(f"[Tool Exec] update_order_status: Insufficient inv for {product_id} to ship order {order_id}. Available: {on_hand}")
            return {"status": "error", "message": f"Insufficient inventory to change status to Shipped. Available: {on_hand}"}
        stocked_item["inventory_count"] = max(0, on_hand - units)
        stock_touched = True
    elif leaving_shipped:
        # e.g. Returned or Canceled after shipping: put the units back.
        stocked_item["inventory_count"] += units
        stock_touched = True

    target_order["status"] = new_status
    print(f"[Tool Exec] update_order_status: Order {order_id} to {new_status}. Inv for {product_id} is {stocked_item['inventory_count']}. Adjusted: {stock_touched}")
    return {
        "status": "success",
        "order_id": order_id,
        "order_details": target_order,
        "current_inventory": stocked_item["inventory_count"],
        "inventory_adjusted_by_this_update": stock_touched,
    }

# Notebook progress marker: printed once the statements above in this cell
# (the tool function definitions) have executed.
print("Tool functions defined.")

In [None]:
class QueryClassifier:
    """Classifies user queries into routing categories using an LLM.

    The LLM is prompted to answer with a bare category name. Because models
    frequently decorate such answers (different casing, markdown emphasis,
    a leading "Classification:" label, trailing punctuation), the reply is
    matched case-insensitively by looking for the category names inside it
    instead of requiring an exact string match.
    """

    # Categories the classifier may return, and the fallback used whenever
    # the LLM reply cannot be interpreted or the call fails.
    VALID_CATEGORIES = ("OPERATIONAL", "METACOGNITIVE_LEARNINGS_SUMMARY")
    DEFAULT_CATEGORY = "OPERATIONAL"

    def __init__(self, llm_client):
        """Store the LLM client and build the classification prompt template.

        Args:
            llm_client: Object exposing ``generate_content(prompt)`` whose
                result has a ``text`` attribute.
        """
        self.llm_client = llm_client
        self.classification_prompt_template = """
Classify the following user query into one of these categories: OPERATIONAL, METACOGNITIVE_LEARNINGS_SUMMARY.
Return ONLY the category name.

OPERATIONAL queries are about performing e-commerce tasks, like asking about products, creating orders, or updating customer information.
Examples of OPERATIONAL:
- "Show me all shoes."
- "What's the price of P1?"
- "Create an order for 2 widgets."
- "Update my address."

METACOGNITIVE_LEARNINGS_SUMMARY queries are about the AI's own learning process or knowledge derived from feedback.
Examples of METACOGNITIVE_LEARNINGS_SUMMARY:
- "Summarize your learnings."
- "What have you learned recently?"
- "Tell me about your new knowledge."
- "Why did you do that in the last turn?"
- "Is there a better way to handle X?"

User Query: "{user_message}"
Classification:"""
        print("QueryClassifier initialized with LLM client.")

    def classify(self, user_message: str) -> str:
        """Classify ``user_message`` and return the category name.

        Args:
            user_message: Raw text of the user's query.

        Returns:
            One of ``VALID_CATEGORIES``. Falls back to ``DEFAULT_CATEGORY``
            when the LLM call fails, or when the reply names none of the
            categories or more than one of them.
        """
        prompt = self.classification_prompt_template.format(user_message=user_message)
        try:
            response = self.llm_client.generate_content(prompt)
            raw_answer = response.text.strip()
        except Exception as e:
            print(f"[QueryClassifier Error] Failed to classify query using LLM: {e}. Defaulting to OPERATIONAL.")
            return self.DEFAULT_CATEGORY

        # Neither category name is a substring of the other, so a plain
        # containment test is unambiguous. Exactly one hit is required so a
        # reply that mentions both categories is not guessed at.
        normalized_answer = raw_answer.upper()
        mentioned = [category for category in self.VALID_CATEGORIES if category in normalized_answer]
        if len(mentioned) == 1:
            return mentioned[0]

        print(f"[QueryClassifier Warning] LLM returned unexpected classification: '{raw_answer}'. Defaulting to OPERATIONAL.")
        return self.DEFAULT_CATEGORY

In [None]:
class ConversationManager:
    """Manages conversation history and context data.

    ``messages`` holds the chat history as ``{"role", "content"}`` dicts and
    ``context_data`` tracks recently touched customers, products and orders
    plus the most recent action.
    """

    # Keys of ``context_data`` that hold per-entity dictionaries. Only these
    # may be written through ``update_entity_in_context``; ``last_action`` is
    # managed exclusively by ``set_last_action``.
    ENTITY_TYPES = ("customers", "products", "orders")

    def __init__(self):
        self.messages: List[Dict[str, Any]] = []
        self.context_data: Dict[str, Any] = {
            "customers": {}, "products": {}, "orders": {}, "last_action": None
        }
        print("ConversationManager initialized.")

    def add_user_message(self, message: str) -> None:
        """Append a user turn to the history."""
        self.messages.append({"role": "user", "content": message})

    def add_assistant_message(self, message_content: Union[str, List[Dict[str, Any]]], query_type: str) -> None:
        """Append an assistant turn to the history.

        Plain-text content is prefixed with ``[query_type]: ``; any other
        content (e.g. a list of content blocks) is stored unchanged.
        """
        if isinstance(message_content, str):
            content_to_log = f"[{query_type}]: {message_content}"
        else:
            content_to_log = message_content
        self.messages.append({"role": "assistant", "content": content_to_log})

    def update_entity_in_context(self, entity_type: str, entity_id: str, data: Any) -> None:
        """Record ``data`` for ``entity_id`` under one of ``ENTITY_TYPES``.

        Other entity types are ignored. This includes "last_action", which
        previously raised ``TypeError`` while it was still ``None`` and
        otherwise corrupted the last-action record.
        """
        if entity_type not in self.ENTITY_TYPES:
            print(f"[CM_Context Warning] Ignoring update for unsupported entity type: {entity_type}")
            return
        self.context_data[entity_type][entity_id] = data
        print(f"[CM_Context Updated] Entity: {entity_type}, ID: {entity_id}")

    def set_last_action(self, action_type: str, action_details: Any) -> None:
        """Remember the most recent action together with a timestamp."""
        self.context_data["last_action"] = {
            "type": action_type,
            "details": action_details,
            "timestamp": datetime.now().isoformat()
        }
        print(f"[CM_Context Updated] Last Action: {action_type}")

    def get_full_conversation_for_api(self) -> List[Dict[str, Any]]:
        """Return a shallow copy of the message history."""
        return self.messages.copy()

    def get_context_summary(self) -> str:
        """Return a newline-separated, human-readable summary of the context.

        Returns:
            One line per non-empty context section, or a fixed placeholder
            sentence when nothing has been recorded yet.
        """
        section_labels = (
            ("customers", "Recent customers"),
            ("products", "Recent products"),
            ("orders", "Recent orders"),
        )
        summary_parts = [
            f"{label}: {list(self.context_data[key].keys())}"
            for key, label in section_labels
            if self.context_data[key]
        ]
        if self.context_data["last_action"]:
            summary_parts.append(f"Last action type: {self.context_data['last_action']['type']}")
        # A real newline separates the lines; the previous "\\n" literal put a
        # visible backslash-n into the summary text.
        return "\n".join(summary_parts) if summary_parts else "No specific context items set yet."


In [None]:
class ToolExecutor:
    """Dispatches tool calls by name to the registered tool functions."""

    def __init__(self, available_tools_dict: Dict[str, callable]):
        # Mapping of tool name -> function taking (storage, **tool_input).
        self.available_tools = available_tools_dict
        print("ToolExecutor initialized.")

    def execute_tool(self, tool_name: str, tool_input: Dict[str, Any], storage_instance: Storage) -> Dict[str, Any]:
        """Run the named tool against ``storage_instance``.

        Args:
            tool_name: Key into the registered tool mapping.
            tool_input: Keyword arguments forwarded to the tool function.
            storage_instance: Passed to the tool as its first positional argument.

        Returns:
            Whatever the tool returns, or an ``{"status": "error", ...}`` dict
            when the tool is unknown or raises.
        """
        # Unknown tool: report and bail out before attempting any call.
        if tool_name not in self.available_tools:
            print(f"--- [ToolExecutor Error] Tool {tool_name} not found. ---")
            return {"status": "error", "message": f"Tool {tool_name} not found."}

        try:
            handler = self.available_tools[tool_name]
            outcome = handler(storage_instance, **tool_input)
            rendered = json.dumps(outcome, indent=2, default=str)
            print(f"--- [ToolExecutor] Result for {tool_name}: {rendered} ---")
            return outcome
        except Exception as exc:
            # Any failure in the tool (or in rendering its result) becomes an
            # error payload; the traceback is printed for debugging.
            print(f"--- [ToolExecutor Error] executing {tool_name}: {exc} ---")
            import traceback
            traceback.print_exc()
            return {"status": "error", "message": f"Error executing tool {tool_name}: {str(exc)}"}

In [None]:
class KnowledgeManager:
    def __init__(self, base_path: str, drive_mount_path: str, default_subpath: str, evaluator_llm_instance):
        """Set up storage paths and LLM handles, then load saved learnings.

        Args:
            base_path: Directory in which ``learnings_*.json`` files live.
            drive_mount_path: Mount point checked (and mounted if needed)
                before the learnings directory is accessed.
            default_subpath: Stored as ``default_drive_subpath``.
            evaluator_llm_instance: LLM used to synthesize learnings.
        """
        # --- Storage locations ---
        self.base_drive_path = base_path
        self.drive_mount_path = drive_mount_path
        self.default_drive_subpath = default_subpath

        # --- LLM handles ---
        self.evaluator_llm = evaluator_llm_instance
        # Use a separate, fast model for the retrieval task
        self.retrieval_llm = gemini.GenerativeModel(model_name=CLASSIFIER_MODEL_NAME)

        # --- Session state ---
        # Nothing has changed yet; persistence is skipped until this is set.
        self.learnings_updated_this_session_flag: bool = False
        initial_learnings: List[Dict] = self._load_initial_learnings_from_drive()
        self.active_learnings_cache: List[Dict] = initial_learnings

        print(f"KnowledgeManager initialized. Loaded {len(initial_learnings)} initial learnings from {self.base_drive_path}.")

    def _mount_drive_if_needed(self):
        if not os.path.exists(self.drive_mount_path) or not os.listdir(self.drive_mount_path):
            try:
                drive.mount(self.drive_mount_path, force_remount=True)
                print("Drive mounted by KnowledgeManager.", flush=True)
            except Exception as e:
                print(f"KM: Error mounting Drive: {e}.", flush=True)

    def _initialize_learnings_path(self):
        if not os.path.exists(self.base_drive_path):
            try:
                os.makedirs(self.base_drive_path)
                print(f"KM: Created learnings directory: {self.base_drive_path}", flush=True)
            except Exception as e:
                print(f"KM: Error creating learnings directory {self.base_drive_path}: {e}", flush=True)

    def _get_latest_learnings_filepath(self) -> Optional[str]:
        self._mount_drive_if_needed()
        self._initialize_learnings_path()
        if not os.path.isdir(self.base_drive_path):
            print("KM: Base drive path for learnings is not a directory or does not exist.", flush=True)
            return None
        try:
            list_of_files = glob.glob(os.path.join(self.base_drive_path, 'learnings_*.json'))
            if not list_of_files:
                return None
            return max(list_of_files, key=os.path.getctime)
        except Exception as e:
            print(f"KM: Error accessing learnings files in {self.base_drive_path}: {e}", flush=True)
            return None


    def _read_learnings_from_file(self, filepath: str) -> List[Dict]:
        if not filepath or not os.path.exists(filepath):
            print(f"KM: Learnings filepath not provided or does not exist: {filepath}", flush=True)
            return []
        try:
            with open(filepath, 'r') as f:
                learnings_list = json.load(f)
            # Basic validation: ensure it's a list of dictionaries
            if isinstance(learnings_list, list) and all(isinstance(item, dict) for item in learnings_list):
                return learnings_list
            else:
                print(f"KM: Learnings file {filepath} is not in the expected format (list of dicts).", flush=True)
                return []
        except json.JSONDecodeError as e:
            print(f"KM: Error decoding JSON from {filepath}: {e}", flush=True)
            return []
        except Exception as e:
            print(f"KM: Error reading learnings file {filepath}: {e}", flush=True)
            return []

    def _load_initial_learnings_from_drive(self) -> List[Dict]:
        print("KM: Attempting to load initial learnings from Drive...", flush=True)
        latest_filepath = self._get_latest_learnings_filepath()
        if latest_filepath:
            print(f"KM: Loading initial learnings from: {latest_filepath}", flush=True)
            learnings = self._read_learnings_from_file(latest_filepath)
            print(f"KM: Successfully loaded {len(learnings)} learnings.", flush=True)
            return learnings
        print("KM: No existing learnings file found or accessible for initial load.", flush=True)
        return []

    def persist_active_learnings(self):
        print("KM: Checking if learnings need to be persisted to Google Drive...", flush=True)
        self._mount_drive_if_needed()
        self._initialize_learnings_path()
        if not os.path.isdir(self.base_drive_path):
            print("KM CRITICAL: Learnings directory not available for persistence.", flush=True)
            return
        if not self.active_learnings_cache:
            print("KM: Active learnings cache is empty. Nothing to persist.", flush=True)
            return
        if not self.learnings_updated_this_session_flag:
            print("KM: Learnings cache has not been updated this session. Nothing to persist.", flush=True)
            return

        print(f"KM: Persisting {len(self.active_learnings_cache)} learnings to Google Drive (this may take a moment)...", flush=True)
        timestamp_str = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        new_filepath = os.path.join(self.base_drive_path, f'learnings_{timestamp_str}.json')
        try:
            with open(new_filepath, 'w') as f:
                json.dump(self.active_learnings_cache, f, indent=4)
            print(f"KM: Successfully persisted learnings to: {new_filepath}", flush=True)
            self.learnings_updated_this_session_flag = False
        except Exception as e:
            print(f"KM: Error persisting active learnings to {new_filepath}: {e}", flush=True)

    def _get_semantically_relevant_learning_ids(self, query: str, learnings_to_search: List[Dict], count: int) -> List[str]:
        """
        Uses an LLM to find the most semantically relevant learnings for a given query.
        """
        if not learnings_to_search:
            return []

        # Format the learnings for the LLM prompt
        formatted_learnings_for_prompt = ""
        for learning in learnings_to_search:
            # Provide the core information for the LLM to judge relevance
            statement = learning.get('final_learning_statement', '')
            learning_id = learning.get('learning_id', 'N/A')
            formatted_learnings_for_prompt += f"ID: {learning_id}\\nStatement: {statement}\\n---\\n"

        # Create the prompt for the retrieval LLM
        retrieval_prompt = f"""
          You are a relevance-ranking assistant. Your task is to identify the most relevant knowledge base 'learnings' for the given user query.
          Analyze the user query and the list of available learnings below. The learnings consist of an ID and a statement.
          Return a comma-separated list of the IDs of the top {count} most semantically relevant learnings.
          Do not explain your reasoning. Return only the comma-separated IDs.

          **User Query:**
          "{query}"

          **Available Learnings:**
          {formatted_learnings_for_prompt}

          **Relevant Learning IDs:**
          """
        try:
            response = self.retrieval_llm.generate_content(retrieval_prompt)
            # Clean up the response and split by comma to get the IDs
            relevant_ids = [id.strip() for id in response.text.strip().split(',') if id.strip()]
            print(f"KM (Semantic Search): Identified relevant learning IDs: {relevant_ids}")
            return relevant_ids
        except Exception as e:
            print(f"KM (Semantic Search) Error: Could not retrieve relevant learning IDs. Error: {e}")
            return []


    def get_relevant_learnings_for_prompt(self, query: str, query_type: str,
                                           recipient_role: Optional[str] = None,
                                           count: int = 5) -> str:
        if not self.active_learnings_cache:
            return "No specific relevant learnings from knowledge base provided for this query."

        # Filter learnings based on the recipient's role (Agent or Evaluator)
        eligible_learnings: List[Dict] = []
        if recipient_role == "Agent":
            eligible_learnings = [
                entry for entry in self.active_learnings_cache
                if entry.get("learning_target") == "AgentAndEvaluator"
            ]
        elif recipient_role == "Evaluator":
            eligible_learnings = [
                entry for entry in self.active_learnings_cache
                if entry.get("learning_target") in ["AgentAndEvaluator", "EvaluatorOnly"]
            ]
        else: # Default if role is None or unexpected
            print(f"KM (Info): recipient_role is '{recipient_role}'. Defaulting to 'AgentAndEvaluator' learnings for RAG.")
            eligible_learnings = [
                entry for entry in self.active_learnings_cache
                if entry.get("learning_target") == "AgentAndEvaluator"
            ]

        if not eligible_learnings:
            return "No specific relevant learnings from knowledge base found for this query based on role."

        # If the query is to summarize learnings, we don't need semantic search.
        # Just return the most recent ones.
        if query_type == "METACOGNITIVE_LEARNINGS_SUMMARY":
            learnings_to_consider = sorted(eligible_learnings, key=lambda x: x.get('timestamp_created', ''), reverse=True)[:count]
        else:
            # Use the new LLM-powered semantic search to find the best learnings
            relevant_ids = self._get_semantically_relevant_learning_ids(query, eligible_learnings, count)
            # Create a map for quick lookups
            learnings_by_id = {learning['learning_id']: learning for learning in eligible_learnings}
            # Retrieve the full learning objects in the order the LLM returned them
            learnings_to_consider = [learnings_by_id[id] for id in relevant_ids if id in learnings_by_id]


        formatted_learnings = [
            f"- Learning (ID: {entry.get('learning_id', 'N/A')[:8]}, Target: {entry.get('learning_target', 'N/A')}): {entry.get('final_learning_statement', str(entry))}"
            for entry in learnings_to_consider
        ]

        if not formatted_learnings:
            return "No specific relevant learnings from knowledge base found for this query after all filters."

        return "\\nRelevant Learnings from Knowledge Base (In-Session Cache):\\n" + "\\n".join(formatted_learnings)


    def synthesize_and_store_learning(self, human_feedback_text: str, user_query_context: str, turn_context_summary: str, learning_target: str):
        print(f'--- KM: Processing New Learning Candidate (Target: {learning_target}): "{human_feedback_text}" ---', flush=True)

        current_feedback_to_process = human_feedback_text
        attempt_count = 0
        max_attempts = 3

        while attempt_count < max_attempts:
            attempt_count += 1
            print(f"KM: Learning Synthesis Attempt {attempt_count}/{max_attempts}...", flush=True)
            print(f"KM: Synthesizing learning with LLM ({EVAL_MODEL_NAME}) (this may take a moment)...", flush=True)

            evaluator_task_prompt_parts = [
                "You are an AI assistant helping to maintain a knowledge base of 'learnings' from human feedback.",
                f"The human feedback is targeted towards: {learning_target}.",
                f'New Human Feedback to process: "{current_feedback_to_process}"',
                f'Original User Query that led to this feedback: "{user_query_context}"',
                f'General Conversation Context when feedback was given: "{turn_context_summary}"',
                "Existing ACTIVE learnings (sample of last 3, if any):" + "".join([f"  - (ID: {entry.get('learning_id','N/A')[:6]}, Target: {entry.get('learning_target', 'N/A')}) {entry.get('final_learning_statement', '')[:100]}..." for entry in self.active_learnings_cache[-3:]]) if self.active_learnings_cache else "  - No existing learnings in cache.",
                "Your Tasks:",
                "1. Analyze the 'New Human Feedback'.",
                "2. Check for CONFLICT or significant REDUNDANCY with existing learnings. Consider general knowledge principles and the stated target of the learnings.",
                "3. If the feedback is new, valuable, non-conflicting, and non-redundant, synthesize it into a concise, actionable 'Finalized Learning Statement'. This statement should be generalizable if possible.",
                "Output Format Instructions:",
                "- If suitable for storing: `FINALIZED_LEARNING: [synthesized statement]`",
                "- If it conflicts: `CONFLICT_DETECTED: [Explanation of the conflict, and if possible, reference key phrases or IDs of conflicting existing learnings]. Proposed statement if you tried to rephrase: [rephrased statement, or original if no rephrase attempt]`",
                "- If it's redundant: `REDUNDANT_LEARNING: [Explanation of redundancy, and if possible, reference key phrases or IDs of the existing learning it's redundant with]. Proposed statement if you tried to rephrase: [rephrased statement, or original if no rephrase attempt]`",
                "- If not actionable/too vague: `NOT_ACTIONABLE: [Explanation]`",
                "Ensure your entire response strictly follows one of these prefixed formats."
            ]
            synthesis_prompt = "\\n".join(evaluator_task_prompt_parts)

            try:
                synthesis_response_obj = self.evaluator_llm.generate_content(synthesis_prompt)
                evaluator_synthesis_text = synthesis_response_obj.text.strip()
                print(f"KM: Gemini Learning Synthesis Raw Response:\\n{evaluator_synthesis_text}")

                final_statement = None
                conflict_explanation = None
                redundant_explanation = None
                not_actionable_explanation = None

                if evaluator_synthesis_text.startswith("FINALIZED_LEARNING:"):
                    final_statement = evaluator_synthesis_text.replace("FINALIZED_LEARNING:", "", 1).strip()
                elif evaluator_synthesis_text.startswith("CONFLICT_DETECTED:"):
                    conflict_explanation = evaluator_synthesis_text.replace("CONFLICT_DETECTED:", "", 1).strip()
                elif evaluator_synthesis_text.startswith("REDUNDANT_LEARNING:"):
                    redundant_explanation = evaluator_synthesis_text.replace("REDUNDANT_LEARNING:", "", 1).strip()
                elif evaluator_synthesis_text.startswith("NOT_ACTIONABLE:"):
                    not_actionable_explanation = evaluator_synthesis_text.replace("NOT_ACTIONABLE:", "", 1).strip()
                else:
                    print("KM: Gemini learning synthesis response format unexpected. Defaulting to not actionable.", flush=True)
                    not_actionable_explanation = f"Response format error: {evaluator_synthesis_text}"

                if final_statement:
                    self.active_learnings_cache.append({
                        "learning_id": str(uuid.uuid4()),
                        "timestamp_created": datetime.now().isoformat(),
                        "original_human_input": human_feedback_text,
                        "processed_human_input": current_feedback_to_process,
                        "final_learning_statement": final_statement,
                        # No longer need to store keywords
                        "status": "active",
                        "learning_target": learning_target
                    })
                    self.learnings_updated_this_session_flag = True
                    print(f"KM: Stored new learning. Cache size: {len(self.active_learnings_cache)}", flush=True)
                    return

                elif conflict_explanation:
                    print(f"KM: Learning conflict detected by Gemini: {conflict_explanation}", flush=True)
                    if attempt_count < max_attempts:
                        print("KM: --- CONFLICT RESOLUTION ---", flush=True)
                        print(f"Original feedback: '{human_feedback_text}'", flush=True)
                        print(f"Feedback being processed: '{current_feedback_to_process}'", flush=True)
                        user_choice = input("Conflict detected. (M)odify your feedback, (S)kip storing, or (P)roceed with current version for resynthesis? [M/S/P]: ").strip().upper()
                        if user_choice == 'M':
                            new_feedback = input("Enter your modified feedback: ").strip()
                            if new_feedback:
                                current_feedback_to_process = new_feedback
                                print("KM: Retrying synthesis with modified feedback.", flush=True)
                                continue
                            else:
                                print("KM: No modification provided. Skipping.", flush=True)
                                return
                        elif user_choice == 'P':
                            print("KM: User chose to proceed. Retrying synthesis with current feedback version.", flush=True)
                            continue
                        else:
                            print("KM: Skipping this learning due to unresolved conflict.", flush=True)
                            return
                    else:
                        print("KM: Max attempts reached for conflict resolution. Skipping this learning.", flush=True)
                        return

                elif redundant_explanation:
                    print(f"KM: Learning deemed redundant by Gemini: {redundant_explanation}", flush=True)
                    user_choice_redundant = input("This learning seems redundant. (S)kip storing, or (F)orce store anyway? [S/F]: ").strip().upper()
                    if user_choice_redundant == 'F':
                        proposed_statement_match = re.search(r"Proposed statement.*?:\\s*(.*)", redundant_explanation, re.IGNORECASE)
                        if proposed_statement_match and proposed_statement_match.group(1).strip():
                            forced_statement = proposed_statement_match.group(1).strip()
                            print(f"KM: Using LLM's proposed statement due to Force: '{forced_statement}'", flush=True)
                        else:
                            forced_statement = current_feedback_to_process
                            print(f"KM: No specific proposed statement from LLM. Using current feedback for Force: '{forced_statement}'", flush=True)

                        self.active_learnings_cache.append({
                            "learning_id": str(uuid.uuid4()),
                            "timestamp_created": datetime.now().isoformat(),
                            "original_human_input": human_feedback_text,
                            "processed_human_input": current_feedback_to_process,
                            "final_learning_statement": forced_statement,
                            # No longer need to store keywords
                            "status": "active_forced_redundancy",
                            "learning_target": learning_target,
                            "notes": f"Forced storage despite redundancy. Original LLM note: {redundant_explanation}"
                        })
                        self.learnings_updated_this_session_flag = True
                        print(f"KM: Stored learning (forced despite redundancy). Cache size: {len(self.active_learnings_cache)}", flush=True)
                        return
                    else:
                        print("KM: Skipping redundant learning.", flush=True)
                        return

                elif not_actionable_explanation:
                    print(f"KM: Learning deemed not actionable by Gemini: {not_actionable_explanation}", flush=True)
                    print("KM: Skipping this learning.", flush=True)
                    return

                else:
                    print("KM: Synthesis resulted in an unhandled state. Skipping.", flush=True)
                    return

            except Exception as e:
                print(f"KM: Error during learning synthesis attempt {attempt_count}: {e}", flush=True)
                import traceback; traceback.print_exc()
                if attempt_count >= max_attempts:
                    print("KM: Max attempts reached due to errors. Skipping this learning.", flush=True)
                    return
                time.sleep(1) # Brief pause before retrying on general error

        print("KM: Could not synthesize learning after maximum attempts. Skipping.", flush=True)

In [None]:
class WorkerAgentHandler:
    """Runs the worker LLM request / tool-execution loop for one turn.

    Holds the LLM client, the tool schemas offered to the model, the executor
    that runs a requested tool, and the storage instance handed to every tool.
    """

    # Keys looked for in a tool result, mapped to the entity bucket used when
    # the entity is recorded in the conversation context. Insertion order
    # matters: the first key present with a truthy value wins.
    _ENTITY_TYPE_MAP = {
        "order_details": "orders", "order_id": "orders",
        "product": "products", "product_id": "products",
        "customer": "customers", "customer_id": "customers",
    }

    def __init__(self, llm_client, tool_schemas: List[Dict], tool_executor: ToolExecutor, storage_instance: Storage):
        self.llm_client = llm_client
        self.tool_schemas = tool_schemas
        self.tool_executor = tool_executor
        self.storage = storage_instance
        print("WorkerAgentHandler initialized.")

    def _execute_llm_interaction_loop(self, system_prompt: str, messages_for_api: List[Dict[str, Any]], query_type: str, conversation_manager: ConversationManager) -> Tuple[str, List[Dict]]:
        """Call the worker LLM, executing requested tools until it gives a final answer.

        Args:
            system_prompt: System prompt sent with every LLM call in this loop.
            messages_for_api: Message history; mutated in place (assistant
                responses and tool results are appended to it).
            query_type: Tools are only offered, and only executed, when this is
                "OPERATIONAL"; any other type gets a single LLM call.
            conversation_manager: Receives entity/last-action updates derived
                from each tool result.

        Returns:
            A tuple of (response text, log of executed tool calls). The text is
            the model's final text, a "CLARIFICATION_REQUESTED:..." message, an
            error message if the LLM call failed, or a placeholder when no text
            was produced.
        """
        is_operational = query_type == "OPERATIONAL"
        tools_for_this_call = self.tool_schemas if is_operational else []
        # Up to 10 tool-use round trips for operational queries, a single call otherwise.
        max_iterations = 10 if is_operational else 1

        executed_tool_calls_log: List[Dict] = []
        current_text_response = ""

        for i in range(max_iterations):
            print(f"--- WorkerLLM Calling Anthropic (Iter {i+1}/{max_iterations}, QType: {query_type}) ---")
            try:
                response = self.llm_client.messages.create(
                    model=ANTHROPIC_MODEL_NAME,
                    max_tokens=4000,
                    temperature=0.0,
                    system=system_prompt,
                    tools=tools_for_this_call,
                    messages=messages_for_api
                )
            except Exception as e:
                error_message = f"Error communicating with Worker LLM: {e}"
                print(f"LLM API Error: {e}")
                return error_message, executed_tool_calls_log  # Return error and any logs so far

            assistant_response_blocks = response.content
            # The raw assistant blocks (text and tool_use) go into the history so
            # that the tool results appended below refer to calls the API has seen.
            messages_for_api.append({"role": "assistant", "content": assistant_response_blocks})

            current_text_response = " ".join(
                block.text for block in assistant_response_blocks if block.type == "text"
            ).strip()

            if current_text_response.startswith("CLARIFICATION_REQUESTED:"):
                # NOTE(review): if the same response also carried tool_use blocks,
                # they are now in the history without matching tool_result blocks;
                # the next API call may be rejected for that - confirm.
                return current_text_response, executed_tool_calls_log

            tool_calls_to_process = [block for block in assistant_response_blocks if block.type == "tool_use"]

            if not tool_calls_to_process or not is_operational:
                # Nothing to execute: this is the model's final answer for the loop.
                final_response_text = current_text_response if current_text_response else "Worker AI provided no text content in its final turn."
                return final_response_text, executed_tool_calls_log

            # Content of the next "user" message: one tool_result per tool_use block.
            tool_results_for_next_llm_call_content = []

            for tool_use_block in tool_calls_to_process:
                tool_name, tool_input, tool_use_id = tool_use_block.name, tool_use_block.input, tool_use_block.id
                print(f"WorkerLLM: Requesting Tool Call: {tool_name}, Input: {tool_input}")

                tool_result_data = self.tool_executor.execute_tool(tool_name, tool_input, self.storage)

                # Deep copies keep the log stable if input/result are mutated later.
                executed_tool_calls_log.append({
                    "tool_name": tool_name,
                    "tool_input": copy.deepcopy(tool_input),
                    "tool_output": copy.deepcopy(tool_result_data)
                })

                self._update_context_from_tool_result(tool_name, tool_input, tool_result_data, conversation_manager)

                tool_results_for_next_llm_call_content.append({
                    "type": "tool_result",
                    "tool_use_id": tool_use_id,
                    "content": json.dumps(tool_result_data) if isinstance(tool_result_data, dict) else str(tool_result_data)
                })

            # tool_calls_to_process was non-empty, so there is at least one result.
            messages_for_api.append({"role": "user", "content": tool_results_for_next_llm_call_content})

        # Loop exhausted: the model was still requesting tools on the last iteration.
        final_response_text = current_text_response if current_text_response else "Worker AI reached max tool iterations without a final text response."
        return final_response_text, executed_tool_calls_log

    def _update_context_from_tool_result(self, tool_name: str, tool_input: Any, tool_result_data: Any, conversation_manager: ConversationManager) -> None:
        """Record the entity touched by a tool call, and the call itself, in the conversation context.

        Non-dict tool results carry no inspectable keys, so for them only the
        last action is recorded (with status "unknown_status") instead of
        raising on `.get` / `in`.
        """
        result_status = "unknown_status"

        if isinstance(tool_result_data, dict):
            result_status = tool_result_data.get("status", "unknown_status")

            found_entity_type = "unknown"
            found_entity_id = "unknown_id"
            found_entity_data = tool_result_data

            for key, etype in self._ENTITY_TYPE_MAP.items():
                value = tool_result_data.get(key)
                if not value:
                    continue
                found_entity_type = etype
                id_key = etype[:-1] + "_id"  # "orders" -> "order_id"
                if isinstance(value, dict) and ("id" in value or id_key in value):
                    # Nested entity record, e.g. order_details carrying its own id.
                    found_entity_id = value.get("id") or value.get(id_key)
                    found_entity_data = value
                elif isinstance(tool_result_data.get(id_key), str):
                    # Id given directly at the top level of the result.
                    found_entity_id = tool_result_data.get(id_key)
                break  # Take first match for simplicity

            # Fall back to any top-level id key (covers non-string ids as well).
            if found_entity_id == "unknown_id":
                for id_key in ("order_id", "product_id", "customer_id"):
                    if id_key in tool_result_data:
                        found_entity_id = tool_result_data[id_key]
                        break

            if found_entity_id != "unknown_id":
                conversation_manager.update_entity_in_context(
                    entity_type=found_entity_type,
                    entity_id=found_entity_id,
                    data=found_entity_data
                )

        conversation_manager.set_last_action(f"tool_{tool_name}_Anthropic", {"input": tool_input, "result_summary": result_status})

In [None]:
class ResponseEvaluator:
    """Asks an evaluator LLM to assess a worker-agent turn and extracts its overall score."""

    # Tried in order, most specific first; each captures a one- or two-digit score.
    _SCORE_PATTERNS = (
        r"Overall Score\s*:\s*(\d{1,2})(?:/10)?",  # "Overall Score : X" or "Overall Score : X/10"
        r"Overall Score\s*is\s*(\d{1,2})(?:/10)?",
        r"Overall\s*:\s*(\d{1,2})(?:/10)?",
        r"Score\s*:\s*(\d{1,2})(?:/10)?",  # Generic score if "Overall" is missed but clearly formatted
        r"\b(\d{1,2})/10\b",  # Bare "X/10"
    )

    # Tool outputs without a "status" key are truncated to this many characters.
    _MAX_OUTPUT_SUMMARY_CHARS = 200

    def __init__(self, evaluator_llm_instance):
        self.evaluator_llm = evaluator_llm_instance
        print("ResponseEvaluator initialized.")

    def evaluate_turn(self, user_message: str, query_type: str, worker_response_text: str,
                      context_summary: str, rag_learnings_provided: str,
                      clarification_interactions: Optional[List[Dict]],
                      initial_datastore_state: Dict[str, Any],
                      final_datastore_state: Dict[str, Any],
                      executed_tool_calls_log: List[Dict]) -> Dict[str, Any]:
        """Build the evaluation prompt, call the evaluator LLM and return its assessment.

        Args:
            user_message: The user's original query.
            query_type: Classified query type, echoed into the prompt and result.
            worker_response_text: Final text the worker AI gave the user.
            context_summary: Context summary that was given to the worker.
            rag_learnings_provided: Learnings text to show the evaluator.
            clarification_interactions: Optional list of dicts with
                'agent_question' / 'user_answer' keys.
            initial_datastore_state: Data store snapshot before the worker acted.
            final_datastore_state: Data store snapshot after the worker acted.
            executed_tool_calls_log: Dicts with 'tool_name', 'tool_input' and
                'tool_output' keys, one per executed tool call.

        Returns:
            A dict with 'anthropic_score', 'full_evaluation',
            'clarification_details_evaluator', 'query_type_evaluated' and
            'raw_evaluation_text'. If the LLM call fails, the score is 0 and an
            additional 'error' key holds the exception text.
        """
        # Real newlines here: a doubled backslash would put the two characters
        # "\" and "n" into the prompt instead of a line break.
        initial_ds_prompt = f"Data Store State *Before* AI Action:\n{json.dumps(initial_datastore_state, indent=2, default=str)}"
        final_ds_prompt = f"Data Store State *After* AI Action:\n{json.dumps(final_datastore_state, indent=2, default=str)}"

        clarification_info_prompt = "No worker AI clarifications this turn."
        if clarification_interactions:
            clar_summary = [f"  Q: '{c.get('agent_question', 'N/A')}' -> User A: '{c.get('user_answer', 'N/A')}'" for c in clarification_interactions]
            clarification_info_prompt = "Worker AI Clarification Interactions:\n" + "\n".join(clar_summary)

        tool_log_prompt = self._format_tool_log(executed_tool_calls_log)

        eval_content_prompt = f"""
            User query: {user_message}
            Classified Query Type: {query_type}

            Context provided to assistant (summary):
            {context_summary}

            Relevant RAG Learnings provided to assistant:
            {rag_learnings_provided}

            {initial_ds_prompt}

            {tool_log_prompt}

            {final_ds_prompt}

            {clarification_info_prompt}

            Worker AI (Claude) final textual response to user:
            {worker_response_text}
            ---
            INSTRUCTIONS FOR EVALUATOR (You are Gemini {EVAL_MODEL_NAME}):
            Based on your system prompt (which emphasizes impartiality and detailed assessment criteria) and the classified query type ({query_type}), please evaluate the AI assistant's response.
            - If OPERATIONAL, focus on tool use accuracy (refer to 'Worker AI Tool Calls Executed'), data store changes (Before vs. After), and whether the final user-facing textual response accurately and clearly conveys these actions and their outcomes.
            - If METACOGNITIVE_LEARNINGS_SUMMARY, focus on whether the AI accurately summarized the 'Relevant RAG Learnings' it was provided.
            Provide detailed reasoning for scores for each criterion (Accuracy, Efficiency, Context Awareness, Helpfulness & Clarity) and an **Overall Score**. All scores should be an integer from 1 to 10.
            Format the Overall Score clearly, for example: "Overall Score: X/10".
            Explicitly reference the tool call log and datastore states when assessing operational tasks.
            """
        try:
            gemini_response_obj = self.evaluator_llm.generate_content(eval_content_prompt)
            evaluation_text = gemini_response_obj.text

            score = self._extract_score(evaluation_text)

            return {
                "anthropic_score": score,
                "full_evaluation": evaluation_text,
                "clarification_details_evaluator": {"used": False},
                "query_type_evaluated": query_type,
                "raw_evaluation_text": evaluation_text
            }
        except Exception as e:
            # Best-effort: an evaluator failure is reported in the result, not raised.
            print(f"Evaluator: Error during Gemini evaluation: {e}")
            import traceback; traceback.print_exc()
            return {
                "error": str(e),
                "anthropic_score": 0,
                "full_evaluation": f"Evaluation failed: {e}",
                "clarification_details_evaluator": {},
                "query_type_evaluated": query_type,
                "raw_evaluation_text": f"Evaluation Error: {e}"
            }

    def _format_tool_log(self, executed_tool_calls_log: List[Dict]) -> str:
        """Render the executed tool calls as a prompt section, one entry per call."""
        if not executed_tool_calls_log:
            return "No tools were executed by the Worker AI this turn."

        formatted_tool_calls = [
            f"  Tool Call {i+1}:\n"
            f"    Name: {call.get('tool_name')}\n"
            f"    Input: {json.dumps(call.get('tool_input'), default=str)}\n"
            f"    Output Summary (from tool perspective): {self._summarize_tool_output(call.get('tool_output', {}))}"
            for i, call in enumerate(executed_tool_calls_log)
        ]
        return "Worker AI Tool Calls Executed This Turn:\n" + "\n".join(formatted_tool_calls)

    def _summarize_tool_output(self, tool_output: Any) -> str:
        """Return a short string form of one tool output.

        Dicts with a "status" key are reduced to status/message/order_id/product_id;
        anything else is stringified and truncated.
        """
        if isinstance(tool_output, dict) and "status" in tool_output:
            summary_dict = {
                key: tool_output[key]
                for key in ("status", "message", "order_id", "product_id")
                if key in tool_output
            }
            return json.dumps(summary_dict, default=str)

        if isinstance(tool_output, dict):
            output_summary_str = json.dumps(tool_output, indent=1, default=str)
        else:
            output_summary_str = str(tool_output)

        if len(output_summary_str) > self._MAX_OUTPUT_SUMMARY_CHARS:
            output_summary_str = output_summary_str[:self._MAX_OUTPUT_SUMMARY_CHARS] + "..."
        return output_summary_str

    def _extract_score(self, evaluation_text: str) -> int:
        """Extract the 0-10 overall score from the evaluator's text.

        The first "Overall Score" section (up to a blank line, a "---" line, or
        the end of the text) is searched first. If it yields nothing - for
        example because "Overall Score" was only a heading - the whole text is
        searched with the same patterns.

        Returns:
            The extracted score, or 0 when no in-range score can be found.
        """
        if not evaluation_text:
            return 0

        candidate_texts = []
        overall_score_section_match = re.search(
            r"(Overall Score.*?)(?:\n\n|\n---|$)", evaluation_text, re.IGNORECASE | re.DOTALL
        )
        if overall_score_section_match:
            candidate_texts.append(overall_score_section_match.group(1))
        candidate_texts.append(evaluation_text)

        for text_to_search in candidate_texts:
            score_value = self._first_valid_score(text_to_search)
            if score_value is not None:
                return score_value
        return 0

    def _first_valid_score(self, text_to_search: str) -> Optional[int]:
        """Return the first in-range (0-10) score found by the patterns, else None."""
        for p_str in self._SCORE_PATTERNS:
            match = re.search(p_str, text_to_search, re.IGNORECASE)
            if match:
                score_value = int(match.group(1))
                if 0 <= score_value <= 10:
                    return score_value
        return None

In [None]:
class AgentOrchestrator:
    """Wires together query classification, the worker agent, the evaluator and the knowledge manager.

    A turn is driven in phases by the caller: `run_agent_turn`, optional
    feedback on the worker, `run_evaluation_turn`, optional feedback on the
    evaluation.
    """

    # Exact strings that mean "no learnings" when returned by
    # knowledge_manager.get_relevant_learnings_for_prompt.
    _NO_LEARNINGS_MESSAGES = (
        "No specific relevant learnings from knowledge base provided for this query after all filters.",
        "No specific relevant learnings from knowledge base found for this query based on role.",
        "No specific relevant learnings from knowledge base provided for this query.",
    )

    def __init__(self):
        self.classifier_llm_client = gemini.GenerativeModel(model_name=CLASSIFIER_MODEL_NAME)
        self.query_classifier = QueryClassifier(llm_client=self.classifier_llm_client)
        self.storage = Storage()
        self.conversation_manager = ConversationManager()
        self.tool_functions_map = {
            "create_customer": create_customer,
            "get_customer_info": get_customer_info,
            "update_customer": update_customer,
            "create_product": create_product,
            "update_product": update_product,
            "get_product_info": get_product_info,
            "list_all_products": list_all_products,
            "create_order": create_order,
            "get_order_details": get_order_details,
            "update_order_status": update_order_status,
        }
        self.tool_executor = ToolExecutor(self.tool_functions_map)

        knowledge_synthesis_llm = gemini.GenerativeModel(model_name=EVAL_MODEL_NAME)
        self.knowledge_manager = KnowledgeManager(LEARNINGS_DRIVE_BASE_PATH, DRIVE_MOUNT_PATH, DEFAULT_LEARNINGS_DRIVE_SUBPATH, knowledge_synthesis_llm)

        self.worker_agent_handler = WorkerAgentHandler(anthropic_client, tools_schemas_list, self.tool_executor, self.storage)

        main_evaluator_llm = gemini.GenerativeModel(model_name=EVAL_MODEL_NAME, system_instruction=evaluator_system_prompt)
        self.response_evaluator = ResponseEvaluator(evaluator_llm_instance=main_evaluator_llm)

        self.evaluation_results_log: List[Dict] = []
        print("AgentOrchestrator initialized.")

    @classmethod
    def _has_relevant_learnings(cls, learnings_text: str) -> bool:
        """Return True unless the text is one of the known "no learnings" messages."""
        return learnings_text not in cls._NO_LEARNINGS_MESSAGES

    @staticmethod
    def _actionable_feedback(human_feedback: Optional[str]) -> str:
        """Return the stripped feedback, or "" when it is empty, whitespace or 'skip'."""
        feedback_text = (human_feedback or "").strip()
        return "" if feedback_text.lower() == "skip" else feedback_text

    def _synthesize_and_persist_learning(self, feedback_text: str, original_user_query: str,
                                         feedback_context: str, learning_target: str) -> None:
        """Hand feedback to the knowledge manager and persist if it flagged an update."""
        self.knowledge_manager.synthesize_and_store_learning(
            feedback_text,
            original_user_query,
            feedback_context,
            learning_target=learning_target
        )
        if self.knowledge_manager.learnings_updated_this_session_flag:
            self.knowledge_manager.persist_active_learnings()

    def _handle_worker_clarification_interaction(self, agent_question_text: str, system_prompt: str,
                                                current_turn_history: List[Dict], query_type: str,
                                                conversation_manager: ConversationManager,
                                                max_attempts: int = 2) -> Tuple[str, List[Dict], List[Dict]]:
        """Relay the worker's clarification questions to the user via `input()`.

        Each answer is appended to `current_turn_history` (mutated in place) and
        the worker loop is re-run, for at most `max_attempts` rounds.

        Returns:
            (final worker response text, list of question/answer dicts, tool
            calls executed during the clarification phase). After the last
            attempt the text may still be a clarification request.
        """
        clarification_interactions = []
        response_text = agent_question_text
        executed_tool_calls_log_clarification_phase: List[Dict] = []

        for _ in range(max_attempts):
            actual_question = response_text.split("CLARIFICATION_REQUESTED:", 1)[-1].strip() if "CLARIFICATION_REQUESTED:" in response_text else response_text
            print(f"--- Worker AI requests clarification: {actual_question} ---", flush=True)

            try:
                user_clarification = input("Your response to Worker AI: ").strip()
            except EOFError:
                print("EOFError encountered during input. Assuming no user clarification.", flush=True)
                user_clarification = "(User provided no further input due to EOF)"

            # An empty answer is replaced by a placeholder so the history gets real text.
            if not user_clarification:
                user_clarification = "(User provided no further input)"

            clarification_interactions.append({"agent_question": actual_question, "user_answer": user_clarification})
            current_turn_history.append({"role": "user", "content": user_clarification})

            response_text, tools_log_this_iteration = self.worker_agent_handler._execute_llm_interaction_loop(
                system_prompt, current_turn_history, query_type, conversation_manager
            )
            executed_tool_calls_log_clarification_phase.extend(tools_log_this_iteration)

            if not response_text.startswith("CLARIFICATION_REQUESTED:"):
                return response_text, clarification_interactions, executed_tool_calls_log_clarification_phase

        print("Max clarification attempts reached for worker AI.", flush=True)
        return response_text, clarification_interactions, executed_tool_calls_log_clarification_phase

    def run_agent_turn(self, user_message: str) -> Dict[str, Any]:
        """Classify the query, run the worker agent (with any clarification rounds) and return turn data.

        Returns:
            A dict with the keys 'user_message', 'query_type',
            'anthropic_response', 'executed_tool_calls',
            'context_summary_for_worker', 'initial_datastore_state',
            'final_datastore_state' and 'clarification_interactions'.
        """
        print("\n--- Orchestrator: Starting Agent Turn ---", flush=True)
        self.conversation_manager.add_user_message(user_message)
        print("Orchestrator: Classifying query...", flush=True)
        query_type = self.query_classifier.classify(user_message)
        print(f"--- Orchestrator: Classified Query Type: {query_type} ---", flush=True)

        context_summary_for_worker = self.conversation_manager.get_context_summary()
        print("Orchestrator: Retrieving RAG learnings for Worker Agent...", flush=True)
        rag_learnings_for_worker = self.knowledge_manager.get_relevant_learnings_for_prompt(
            user_message, query_type, recipient_role="Agent"
        )
        if self._has_relevant_learnings(rag_learnings_for_worker):
            print(f"Orchestrator: RAG for Worker:\n{rag_learnings_for_worker}")
        else:
            print("Orchestrator: No specific RAG learnings found for Worker Agent for this query.", flush=True)

        base_system_prompt = (
            worker_operational_system_prompt if query_type == 'OPERATIONAL'
            else worker_metacognitive_learnings_system_prompt
        )
        # Real newlines here: a doubled backslash would put the two characters
        # "\" and "n" into the system prompt instead of blank lines.
        full_worker_prompt = (
            f"{base_system_prompt}\n\n"
            f"Conversation Context Summary (recent entities and last action):\n{context_summary_for_worker}\n\n"
            f"{rag_learnings_for_worker}"
        )

        # Snapshot before the worker acts so the evaluator can compare before/after.
        initial_datastore_state = self.storage.get_full_datastore_copy()
        current_turn_processing_history = self.conversation_manager.get_full_conversation_for_api()

        print("Orchestrator: Worker Agent processing request (this may involve LLM calls and tool use)...", flush=True)
        worker_response_text, executed_tool_calls_log = self.worker_agent_handler._execute_llm_interaction_loop(
            full_worker_prompt, current_turn_processing_history, query_type, self.conversation_manager
        )

        clarification_interactions = []
        if worker_response_text.startswith("CLARIFICATION_REQUESTED:"):
            print("Orchestrator: Worker Agent requested clarification. Initiating clarification loop...", flush=True)
            worker_response_text, clarification_interactions, tools_during_clarif = self._handle_worker_clarification_interaction(
                worker_response_text, full_worker_prompt, current_turn_processing_history, query_type, self.conversation_manager
            )
            executed_tool_calls_log.extend(tools_during_clarif)

        self.conversation_manager.add_assistant_message(worker_response_text, query_type)
        final_datastore_state = self.storage.get_full_datastore_copy()
        print("--- Orchestrator: Agent Turn Completed ---", flush=True)

        return {
            "user_message": user_message,
            "query_type": query_type,
            "anthropic_response": worker_response_text,
            "executed_tool_calls": executed_tool_calls_log,
            "context_summary_for_worker": context_summary_for_worker,
            "initial_datastore_state": initial_datastore_state,
            "final_datastore_state": final_datastore_state,
            "clarification_interactions": clarification_interactions
        }

    def handle_feedback_on_worker_response(self, original_user_query: str,
                                           context_summary_for_worker: str,
                                           human_feedback_on_worker: str):
        """Turn human feedback on the worker's response into a stored learning, unless it is empty or 'skip'."""
        print("--- Orchestrator: Processing Feedback on Worker AI's Response ---", flush=True)
        feedback_text = self._actionable_feedback(human_feedback_on_worker)
        if feedback_text:
            self._synthesize_and_persist_learning(
                feedback_text,
                original_user_query,
                context_summary_for_worker,
                learning_target="AgentAndEvaluator"
            )
        else:
            print("Orchestrator: No feedback provided for worker response or 'skip' entered.", flush=True)
        print("--- Orchestrator: Worker Feedback Processing Completed ---", flush=True)

    def run_evaluation_turn(self, agent_turn_data: Dict[str, Any]) -> Dict[str, Any]:
        """Evaluate a completed agent turn and append the outcome to the evaluation log.

        Args:
            agent_turn_data: The dict returned by `run_agent_turn`; every key it
                produces is required here.

        Returns:
            The evaluation result dict from the response evaluator.
        """
        print("\n--- Orchestrator: Starting Evaluation Turn ---", flush=True)
        user_message = agent_turn_data["user_message"]
        query_type = agent_turn_data["query_type"]
        worker_response_text = agent_turn_data["anthropic_response"]
        context_summary_for_worker = agent_turn_data["context_summary_for_worker"]
        clarification_interactions = agent_turn_data["clarification_interactions"]
        initial_datastore_state = agent_turn_data["initial_datastore_state"]
        final_datastore_state = agent_turn_data["final_datastore_state"]
        executed_tool_calls_log = agent_turn_data["executed_tool_calls"]

        print("Orchestrator: Retrieving RAG learnings for AI Evaluator...", flush=True)
        rag_learnings_for_evaluator = self.knowledge_manager.get_relevant_learnings_for_prompt(
            user_message, query_type, recipient_role="Evaluator"
        )
        if self._has_relevant_learnings(rag_learnings_for_evaluator):
            print(f"Orchestrator: RAG for Evaluator:\n{rag_learnings_for_evaluator}")
        else:
            print("Orchestrator: No specific RAG learnings found for AI Evaluator for this query.", flush=True)

        print(f"Orchestrator: Performing AI evaluation with Gemini ({EVAL_MODEL_NAME}) (this may take a moment)...", flush=True)
        evaluation_result = self.response_evaluator.evaluate_turn(
            user_message, query_type, worker_response_text,
            context_summary_for_worker,
            rag_learnings_for_evaluator,
            clarification_interactions,
            initial_datastore_state,
            final_datastore_state,
            executed_tool_calls_log
        )

        # Deep copy so later mutation of the turn's tool log cannot alter the record.
        self.evaluation_results_log.append({
            "user_message": user_message,
            "query_type": query_type,
            "worker_response": worker_response_text,
            "tool_calls": copy.deepcopy(executed_tool_calls_log),
            "evaluation_details": evaluation_result
        })
        print("--- Orchestrator: Evaluation Turn Completed ---", flush=True)
        return evaluation_result

    def handle_feedback_on_evaluation(self, original_user_query: str, worker_response_summary: str,
                                      evaluation_text_summary: str, human_feedback_on_evaluator: str):
        """Turn human feedback on the evaluator's assessment into an evaluator-only learning, unless it is empty or 'skip'."""
        print("--- Orchestrator: Processing Feedback on AI Evaluator's Assessment ---", flush=True)
        feedback_text = self._actionable_feedback(human_feedback_on_evaluator)
        if feedback_text:
            # Only short excerpts of the response and evaluation are included as context.
            eval_feedback_context = (
                f"Feedback is on an evaluation. Original query: '{original_user_query}'. "
                f"Worker response (summary): '{worker_response_summary[:100]}...'. "
                f"Evaluation (summary): '{evaluation_text_summary[:150]}...'"
            )
            self._synthesize_and_persist_learning(
                feedback_text,
                original_user_query,
                eval_feedback_context,
                learning_target="EvaluatorOnly"
            )
        else:
            print("Orchestrator: No feedback provided for evaluator or 'skip' entered.", flush=True)
        print("--- Orchestrator: Evaluator Feedback Processing Completed ---", flush=True)

    def get_evaluation_log(self) -> List[Dict]:
        """Return the list of evaluation records accumulated by `run_evaluation_turn`."""
        return self.evaluation_results_log

    def persist_learnings_on_exit(self):
        """Persist learnings if the knowledge manager's updated flag is set; otherwise just report."""
        if self.knowledge_manager.learnings_updated_this_session_flag:
            print("\nOrchestrator: Persisting any remaining updated learnings on exit (this may take a moment)...", flush=True)
            self.knowledge_manager.persist_active_learnings()
        else:
            print("\nOrchestrator: No new learnings to persist on exit.", flush=True)

In [None]:
def _print_final_evaluation_summary(results_log: List[Dict]) -> None:
    """Print one score line per logged turn, then the session's average and total score."""
    print("\n" + "=" * 30 + " FINAL EVALUATION SUMMARY " + "=" * 30)
    total_score, num_q_evaluated = 0, 0

    if not results_log:
        print("No queries were processed and evaluated in this session.")
    else:
        for turn_number, turn_data in enumerate(results_log, start=1):
            user_msg = turn_data.get('user_message', 'N/A')
            q_type = turn_data.get('query_type', 'N/A')
            eval_details = turn_data.get('evaluation_details', {})
            label = f"Q{turn_number}: '{user_msg}' (Type: {q_type})"

            if not isinstance(eval_details, dict):
                print(f"{label} -> No valid evaluation details or score logged.")
            elif eval_details.get("error"):
                print(f"{label} -> Evaluation Error: {eval_details.get('error')}")
            else:
                score = eval_details.get('anthropic_score', 0)
                # A logged None / non-numeric score must not crash the summary;
                # report it as 0 and leave it out of the average.
                if not isinstance(score, (int, float)):
                    score = 0
                if score > 0:  # Only positive scores count towards the average
                    total_score += score
                    num_q_evaluated += 1
                print(f"{label} -> Score: {score}")

    if num_q_evaluated > 0:
        print(f"\nAverage Score over {num_q_evaluated} successfully scored queries: {total_score / num_q_evaluated:.2f}")
    else:
        print("\nNo queries were successfully scored to calculate an average score.")
    print(f"Total aggregate score (sum of successfully extracted scores) for the session: {total_score}")
    print("=" * 70)
    print("Execution Finished.")


def main():
    """Run the interactive agent loop, then print a summary of the session's evaluations.

    Each iteration reads a query from stdin, runs the worker turn, asks for
    feedback on the worker's response, runs the evaluation turn, and asks for
    feedback on the evaluator's assessment. Typing 'quit', end-of-input,
    Ctrl-C, or a ``SystemExit`` ends the loop after asking the orchestrator to
    persist learnings. Any other exception is reported with a traceback and
    the loop continues with the next query.
    """
    print("\nStarting Main Execution with REFACTORED Agent System...")
    print("=" * 70)
    orchestrator = AgentOrchestrator()

    while True:
        try:
            print("\n" + "=" * 70)
            print("AWAITING USER INPUT...")
            user_query = input("Enter query (or 'quit' to exit): ").strip()

            if user_query.lower() == 'quit':
                orchestrator.persist_learnings_on_exit()
                print("Exiting system.")
                print("=" * 70)
                break
            if not user_query:
                continue

            print(f"\n>>> User Query: '{user_query}'")

            # === AGENT ACTION PHASE ===
            agent_turn_data = orchestrator.run_agent_turn(user_query)

            worker_response_text = agent_turn_data.get('anthropic_response', "No worker response found.")
            query_type_from_turn = agent_turn_data.get('query_type', "N/A")
            context_for_worker_feedback = agent_turn_data.get('context_summary_for_worker', "N/A")

            print(f"\n--- Worker AI Final Response (Type: {query_type_from_turn}) ---")
            print(worker_response_text)
            print("--- End of Worker AI Response ---")

            # === FEEDBACK ON AGENT PHASE ===
            print("\nREQUESTING FEEDBACK ON WORKER AI:")
            try:
                human_feedback_on_worker = input("Orchestrator: Feedback on Worker AI's response? (type or 'skip'): ").strip()
                orchestrator.handle_feedback_on_worker_response(
                    original_user_query=user_query,
                    context_summary_for_worker=context_for_worker_feedback,
                    human_feedback_on_worker=human_feedback_on_worker
                )
            except EOFError:
                # No more input for feedback: skip it but still evaluate this turn.
                print("Orchestrator: Skipping feedback on worker response (EOF).")

            # === EVALUATION PHASE ===
            evaluation_details = orchestrator.run_evaluation_turn(agent_turn_data)

            raw_eval_text_for_feedback = "No evaluation text."  # Default for feedback context
            print("\n--- AI Evaluator's Assessment ---")
            if evaluation_details and not evaluation_details.get("error"):
                raw_eval_text_for_feedback = evaluation_details.get("raw_evaluation_text", "No raw evaluation text found.")
                print(raw_eval_text_for_feedback)
            elif evaluation_details and evaluation_details.get("error"):
                error_message = evaluation_details.get('error', 'Unknown evaluation error.')
                raw_eval_text_for_feedback = f"Evaluation Error: {error_message}"
                print(raw_eval_text_for_feedback)
            else:
                print("No evaluation details found for this turn.")
            print("--- End of AI Evaluator's Assessment ---")

            # === FEEDBACK ON EVALUATION PHASE ===
            print("\nREQUESTING FEEDBACK ON AI EVALUATOR:")
            try:
                human_feedback_on_evaluator = input("Orchestrator: Feedback on Evaluator's assessment? (type or 'skip'): ").strip()
                orchestrator.handle_feedback_on_evaluation(
                    original_user_query=user_query,
                    worker_response_summary=worker_response_text,
                    evaluation_text_summary=raw_eval_text_for_feedback,
                    human_feedback_on_evaluator=human_feedback_on_evaluator
                )
            except EOFError:
                print("Orchestrator: Skipping feedback on evaluator assessment (EOF).")

        except SystemExit:
            print("\nSystem exit requested.")
            orchestrator.persist_learnings_on_exit()
            break
        except EOFError:
            print("\nEOF encountered. Exiting gracefully.")
            orchestrator.persist_learnings_on_exit()
            break
        except KeyboardInterrupt:
            print("\nKeyboard interrupt detected. Exiting.")
            orchestrator.persist_learnings_on_exit()
            break
        except Exception as e:
            # Top-level boundary: report the failure and keep the session alive.
            # NOTE: learnings are deliberately not persisted here; revisit if
            # losing them after an unknown error becomes a problem.
            print(f"\nCRITICAL ERROR in main loop: {e}")
            import traceback
            traceback.print_exc()

    # --- Final Evaluation Summary ---
    _print_final_evaluation_summary(orchestrator.get_evaluation_log())

# To run:
# main()

In [None]:
""" Sample queries:
* Show me all the products available
* I'd like to order 25 Perplexinators, please
* Show me the status of my order
* (If the order is not in Shipped state, then) Please ship my order now
* How many Perplexinators are now left in stock?
* Add a new customer: Bill Leece, bill.leece@mail.com, +1.222.333.4444
* Change bill.leece@mail.com's phone number to +1.999.888.7777
* Add a new product: Gizmo X, description: A fancy gizmo, price: 29.99, inventory: 50
* Update Gizzmo's price to 99.99 #Note the misspelling of 'Gizmo'
* When was the last time the Toronto Maple Leafs won the Stanley Cup?
* I need to update our insurance company about the total value of products that we have in stock. Calculate that value and help me draft a short email to Clark Kent of SuperInsurance with those details
* Summarize your learnings from our recent interactions.
"""

# Start the interactive loop only when this code runs as the top-level program
# (a notebook cell or a script); importing it as a module will not launch it.
if __name__ == "__main__":
    main()