In [None]:
from rich.console import Console
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
import anthropic

In [None]:
# Load API keys from a local .env file; override=True lets values from .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# Orchestrator client (used by loop()).
openai = OpenAI()

# The OpenAI client falls back to OPENAI_API_KEY whenever api_key is None, so a
# missing GROQ_API_KEY would silently send the OpenAI key to Groq's endpoint.
# Fail fast with a clear message instead.
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError("GROQ_API_KEY is not set; add it to your environment or .env file.")
groq = OpenAI(api_key=groq_api_key, base_url="https://api.groq.com/openai/v1")

# Evaluator client (used by evaluate_prompt() when claude_activated is True).
claude = anthropic.Anthropic()

# Local Ollama server exposed through its OpenAI-compatible endpoint. A
# placeholder key is supplied when OLLAMA_API_KEY is unset so the client never
# falls back to the real OpenAI key.
ollama = OpenAI(api_key=os.getenv("OLLAMA_API_KEY") or "ollama", base_url="http://localhost:11434/v1")

# Rich console used by show() for formatted output.
console = Console()

In [None]:
def show(text):
    """Display ``text`` through the Rich console, falling back to plain ``print``.

    Any exception raised while printing through ``console`` is caught and the
    same text is written with the built-in ``print`` instead, so displaying a
    message never interrupts the run.
    """
    try:
        console.print(text)
    except Exception:
        # Best-effort display: fall back to plain output when Rich fails.
        print(text)
    return None

In [None]:
# Parallel histories shared across cells: evaluate_prompt() appends one entry
# to each list per evaluation; select_best() and get_scores_report() read them.
prompts = []
scores = []

In [None]:
# System prompt for the orchestrating agent driven by loop().
# The tool signatures described here must match the JSON tool schemas and the
# Python functions dispatched by handle_tool_calls(): rewrite_prompt takes
# `prompt` (not `input_prompt`), otherwise the call fails on an unexpected
# keyword argument.
system_message = """
You are PromptOptimizer, a tool-using agent that improves prompts through an iterative loop.
Mission:
- Given a bad_prompt, produce an improved prompt.
- First, evaluate the given prompt.
- Immediately after, rewrite the prompt.
- Then, continue the loop.
- You MUST run exactly 5 iterations, without counting the first one.
- Each iteration is: REWRITE (Using Tools)→ EVALUATE (using tools) → IMPROVE.
- Keep the user's original intent. Do not change the task, only how it is requested.

Available tools:
1) rewrite_prompt(prompt, score, feedback)
   - Generates a candidate rewritten prompt.
   - The parameters are: prompt, score, feedback.
2) evaluate_prompt(prompt)
   - Returns a checklist-based score and diagnostics.
   - The only parameter is the prompt.
3) select_best()
   - Selects the best prompt across iterations.
   - This does not receive any parameter.

Hard rules:
- You MUST use tools. Do not do the rewrite or scoring “in your head”.
- If critical info is missing, make minimal assumptions.
- Do not ask the user questions unless explicitly allowed.
- Avoid vague language. Replace subjective words with measurable constraints.
- Always specify the expected output format inside the final prompt.
- Do not reveal chain-of-thought. Only output the rewrite and the score.

Stop condition:
- After exactly 5 iterations, call select_best() and output the best prompt.
"""

In [None]:
# Sample low-quality prompt handed to the optimizer as the user message.
# NOTE(review): the wording (typo included) looks intentional, since this is the
# "bad prompt" the agent is asked to improve - kept verbatim.
user_prompt = "I don't how how to start with AI. Help me"

In [None]:
def get_scores_report() -> str:
    """Build a Rich-markup report of every evaluated prompt and its score.

    Walks the module-level ``prompts`` and ``scores`` histories in parallel.
    The first entry is labelled with the original ``user_prompt``; later
    entries are labelled with their iteration index and prompt text. Each
    score is wrapped in a colour tag: red below 60, yellow from 60 up to (but
    excluding) 80, green from 80 upwards.

    Returns:
        The report as a single string containing Rich console markup; an empty
        string when nothing has been evaluated yet.
    """
    report_parts = []
    for index, (prompt, score) in enumerate(zip(prompts, scores)):
        # Rich styles are space-separated words ("bold red"). The previous
        # underscore form ("bold_red") is not a valid style, so the tags were
        # ignored and the scores were rendered without colour.
        if score < 60:
            color = "bold red"
        elif score < 80:
            color = "bold yellow"
        else:
            color = "bold green"

        if index == 0:
            header = f"Initial prompt: {user_prompt}\n"
        else:
            header = f"Iteration {index}: New Prompt: {prompt}. -> "

        report_parts.append(f"{header}[{color}]  Score: {score}[/{color}]\n\n\n")
    return "".join(report_parts)

In [None]:
# Claude answers in free text, so the first JSON object has to be pulled out of
# the surrounding prose / code fences.
def extract_first_json_object(text: str) -> dict:
    """Return the first valid JSON object embedded in ``text``.

    Every ``{`` is tried in order as the start of a JSON document; the first
    position that decodes successfully wins. Text after the decoded object
    (explanations, closing code fences, further objects, stray braces) is
    ignored, unlike a first-``{`` / last-``}`` slice which breaks as soon as
    the trailing text contains a brace.

    Args:
        text: Raw model output that may contain a JSON object.

    Returns:
        The decoded object as a ``dict``.

    Raises:
        ValueError: If no position in ``text`` starts a valid JSON object.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one document starting at `start` and tolerates
            # trailing data; starting on "{" guarantees the result is a dict.
            parsed, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not a JSON object here (e.g. "{placeholder}"); try the next brace.
            start = text.find("{", start + 1)
            continue
        return parsed
    raise ValueError("No JSON object found in the text")

# Selects the evaluator backend: True -> Anthropic (claude client),
# False -> local Ollama model through the OpenAI-compatible client.
claude_activated = True

def evaluate_prompt(prompt: str) -> dict:
    """Score ``prompt`` with an LLM evaluator and record it in the history.

    The evaluator is asked to return JSON with ``prompt``, ``score`` and
    ``feedback``. The prompt and its score are appended to the module-level
    ``prompts`` / ``scores`` lists and echoed through ``show``.

    Args:
        prompt: The prompt text to evaluate.

    Returns:
        The evaluator's parsed JSON response, unmodified.

    Raises:
        ValueError: If the evaluator response contains no valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    system_prompt = """You are an expert Prompt Evaluator (Prompt Critic).

Your job is to evaluate the quality of a given prompt.
You must be strict, practical, and specific.

You will receive:
- prompt: the prompt to evaluate

Your evaluation must judge whether the prompt, as written, would reliably produce high-quality outputs from an LLM.

Rules:
1) DO NOT rewrite the prompt.
2) DO NOT invent external context.
3) Evaluate ONLY what is explicitly present in the prompt.
4) If critical information is missing, list it in the feedback.
5) Do not show step-by-step reasoning. Provide only clear conclusions.
6) You must evaluate between 1 and 100 where below 60 is bad. Between 60 and 80 is so so. Above 80 is excellent.

Evaluation criteria: Objective & task definition, sufficient context, Output format specified, Quality criteria (definition of done), Ambiguity handling, Robustness / expected consistency, Safety / hallucination prevention (when relevant), Efficiency / signal-to-noise ratio

Required output:
Return ONLY valid JSON with this exact structure:
    - prompt: the prompt that you are evaluating
    - score: the score of the prompt
    - feedback: the feedback of the prompt
example: 
{
    "prompt": "prompt",
    "score": 0,
    "feedback": "feedback"
}
    """
    user_message = {"role": "user", "content": prompt}

    if claude_activated:
        # The Anthropic Messages API takes system instructions through the
        # top-level `system` parameter; sending them as an "assistant" turn
        # presents the instructions as something the model itself said.
        response = claude.messages.create(
            model="claude-opus-4-6",
            max_tokens=1000,
            system=system_prompt,
            messages=[user_message],
        )
        content = extract_first_json_object(response.content[0].text)
    else:
        response = ollama.chat.completions.create(
            model="deepseek-r1:1.5b",
            max_tokens=1000,
            messages=[{"role": "system", "content": system_prompt}, user_message],
            response_format={"type": "json_object"},
        )
        content = json.loads(response.choices[0].message.content)

    raw_score = content.get("score", content.get("Score", 0))
    # Models sometimes return the score as a string ("75"). Keep numeric scores
    # as-is and convert the rest, so later comparisons and max() over the
    # history never mix str and numbers.
    if isinstance(raw_score, (int, float)):
        score = raw_score
    else:
        try:
            score = float(raw_score)
        except (TypeError, ValueError):
            score = 0

    prompts.append(prompt)
    scores.append(score)
    show(f"I received the prompt: {prompt}. The score is: {score}\n\n")
    return content

In [None]:
def select_best():
    """Return the highest-scoring prompt recorded so far.

    Reads the module-level ``scores`` and ``prompts`` histories. When several
    entries share the top score, the earliest one is returned.

    Returns:
        ``None`` when no score has been recorded; otherwise a dict with the
        keys ``best_prompt`` and ``score``.
    """
    if not scores:
        return None

    # Pair each score with its prompt and pick the pair with the largest score.
    top_score, top_prompt = max(zip(scores, prompts), key=lambda pair: pair[0])
    show(f"The best prompt is: {top_prompt}. The score is: {top_score}\n\n")
    return dict(best_prompt=top_prompt, score=top_score)

In [None]:
def rewrite_prompt(prompt: str, score: str, feedback: str) -> str:
    """Ask the Groq-hosted model for an improved version of ``prompt``.

    Args:
        prompt: The prompt to rewrite.
        score: The score previously assigned to ``prompt``.
        feedback: The evaluator feedback to address in the rewrite.

    Returns:
        The raw message content of the model reply; the model is instructed to
        answer with a JSON object holding a ``new_prompt`` field.
    """
    instructions = """You are a prompt writer, you will receive a prompt, a score and a feedback.
    You will rewrite the prompt to improve it by 2 points in the score.
    You need to output a json with the following fields:
    - new_prompt: the rewritten prompt
    """
    request_text = f"Prompt: {prompt}\nScore: {score}\nFeedback: {feedback}"
    conversation = [
        dict(role="system", content=instructions),
        dict(role="user", content=request_text),
    ]
    completion = groq.chat.completions.create(
        model="openai/gpt-oss-20b",
        messages=conversation,
        response_format={"type": "json_object"},
    )
    # Trace what was requested (printed once the model has answered).
    print(f"I Have to rewrite this prompt: {prompt}. \nThe score given by an evaluator is: {score}. \nThe feedback provider is: {feedback}. \n\n")
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Function-calling schema advertised to the orchestrator for evaluate_prompt().
evaluate_prompt_json = dict(
    name="evaluate_prompt",
    description="Evaluate a prompt and return a json with the score and the feedback. Return the evaluation as structured JSON",
    parameters=dict(
        type="object",
        properties=dict(
            prompt=dict(type="string", description="The prompt to evaluate"),
        ),
        required=["prompt"],
        additionalProperties=False,
    ),
)

In [None]:
# Function-calling schema advertised to the orchestrator for rewrite_prompt().
# The property names mirror the keyword arguments of the Python function.
rewrite_prompt_json = dict(
    name="rewrite_prompt",
    description="Rewrite a prompt to improve it",
    parameters=dict(
        type="object",
        properties=dict(
            prompt=dict(type="string", description="The prompt to rewrite"),
            score=dict(type="string", description="The score of the prompt"),
            feedback=dict(type="string", description="The feedback of the prompt"),
        ),
        required=["prompt", "score", "feedback"],
        additionalProperties=False,
    ),
)

In [None]:
# Function-calling schema advertised to the orchestrator for select_best(),
# which takes no arguments.
select_best_json = dict(
    name="select_best",
    description="Select the best prompt from the history",
    parameters=dict(
        type="object",
        properties={},
        additionalProperties=False,
    ),
)

In [None]:
# Tool list passed to the chat-completions call: each schema is wrapped in the
# {"type": "function", "function": ...} envelope.
tools = [
    {"type": "function", "function": schema}
    for schema in (evaluate_prompt_json, rewrite_prompt_json, select_best_json)
]

In [None]:
def handle_tool_calls(tool_calls):
    """Execute the tool calls requested by the model and build the replies.

    Only functions whose names are advertised in the module-level ``tools``
    list can be dispatched, so the model cannot invoke arbitrary globals.
    Malformed arguments, unknown tools and bad keyword arguments are reported
    back to the model as an ``{"error": ...}`` payload instead of raising, so
    every tool call still gets a matching tool message and the model can
    correct itself. Errors from the underlying API clients are not caught.

    Args:
        tool_calls: Tool-call objects exposing ``id``, ``function.name`` and
            ``function.arguments`` (a JSON string).

    Returns:
        A list of ``{"role": "tool", "content": ..., "tool_call_id": ...}``
        messages, one per tool call, in the same order.
    """
    advertised = {
        spec.get("function", {}).get("name")
        for spec in globals().get("tools", [])
    }
    results = []
    for tool_call in tool_calls:
        tool_name = tool_call.function.name
        try:
            # An empty argument string is treated as "no arguments".
            arguments = json.loads(tool_call.function.arguments or "{}")
            tool = globals().get(tool_name) if tool_name in advertised else None
            if not callable(tool):
                raise LookupError(f"Unknown tool: {tool_name}")
            result = tool(**arguments)
        except (ValueError, TypeError, LookupError) as error:
            # ValueError covers invalid JSON (arguments or evaluator output);
            # TypeError covers unexpected/missing keyword arguments.
            result = {"error": f"{type(error).__name__}: {error}"}
        # Tools that already return a JSON string (rewrite_prompt) are passed
        # through untouched to avoid encoding the payload twice.
        content = result if isinstance(result, str) else json.dumps(result, default=str)
        results.append({"role": "tool", "content": content, "tool_call_id": tool_call.id})
    return results

In [None]:
def loop(messages, max_turns=30):
    """Run the agent until the model stops requesting tools.

    Each turn sends the conversation to the orchestrator model. When the model
    asks for tools, they are executed and their results are appended to
    ``messages``; otherwise the final assistant message is appended and its
    text is returned.

    Args:
        messages: Conversation history; mutated in place with every assistant
            message and tool result.
        max_turns: Upper bound on model round-trips, guarding against an
            agent that never stops calling tools.

    Returns:
        The text content of the final assistant message.

    Raises:
        RuntimeError: If the model is still requesting tools after
            ``max_turns`` round-trips.
    """
    for _ in range(max_turns):
        response = openai.chat.completions.create(model="gpt-5.2", messages=messages, tools=tools, reasoning_effort="none")
        choice = response.choices[0]
        message = choice.message
        if choice.finish_reason == "tool_calls":
            results = handle_tool_calls(message.tool_calls)
            messages.append(message)
            messages.extend(results)
            continue
        # Final answer: keep it in the history and hand it back to the caller
        # instead of discarding it.
        messages.append(message)
        return message.content
    raise RuntimeError(f"Agent did not finish within {max_turns} turns")

# Seed the conversation with the agent instructions and the prompt to improve,
# then start the optimisation loop.
messages = [
    dict(role="system", content=system_message),
    dict(role="user", content=user_prompt),
]
loop(messages)