# Agent Workflows and Recipes

## Setup and Utils

In [None]:
import asyncio
import json
import os
from typing import Literal

import together
from pydantic import BaseModel, Field, ValidationError
from together import AsyncTogether, Together

# Prefer an API key supplied through the environment so a real key never has
# to be written into the notebook; "abc" is kept as the placeholder fallback.
_TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY", "abc")

# Synchronous and asynchronous clients shared by the helpers below.
client = Together(api_key=_TOGETHER_API_KEY)
async_client = AsyncTogether(api_key=_TOGETHER_API_KEY)

In [None]:
def run_llm(user_prompt : str, model : str, system_prompt : str = None):
    """Send one chat-completion request and return the text of the first choice.

    When `system_prompt` is truthy it is sent as a system message ahead of the
    user message. The request uses temperature 0.7 and max_tokens 4000.
    """
    system_turn = [{"role": "system", "content": system_prompt}] if system_prompt else []
    conversation = system_turn + [{"role": "user", "content": user_prompt}]

    completion = client.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=0.7,
        max_tokens=4000,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# The function below will call the reference LLMs in parallel
async def run_llm_parallel(user_prompt : str, model : str, system_prompt : str = None):
    """Run a single asynchronous LLM call, retrying when rate limited.

    Args:
        user_prompt: Content of the user message.
        model: Name of the model to query.
        system_prompt: Optional system message placed before the user message.

    Returns:
        The text content of the first choice in the completion.

    Raises:
        together.error.RateLimitError: If every attempt (after waits of 1, 2
            and 4 seconds) is rate limited.
    """
    # The messages do not change between attempts, so build them once.
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": user_prompt})

    last_error = None
    for sleep_time in [1, 2, 4]:
        try:
            response = await async_client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0.7,
                max_tokens=2000,
            )
        except together.error.RateLimitError as e:
            print(e)
            last_error = e
            await asyncio.sleep(sleep_time)
        else:
            return response.choices[0].message.content

    # All attempts were rate limited: surface the real cause instead of
    # failing on an unbound `response` variable.
    raise last_error

['The capital of the United States of America (USA) is Washington, D.C. (short for District of Columbia).',
 'The capital of the United States of America (USA) is Washington, D.C. (short for District of Columbia).']

In [None]:
def JSON_llm(user_prompt : str, schema, system_prompt : str = None, model : str = "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"):
    """Run an LLM call in JSON mode and return the decoded JSON reply.

    Args:
        user_prompt: Content of the user message.
        schema: Class exposing `model_json_schema()`; its JSON schema is sent
            as the requested response format.
        system_prompt: Optional system message placed before the user message.
        model: Name of the model to query.

    Returns:
        The decoded JSON reply, or None when the reply could not be parsed
        (the error is printed in that case).
    """
    try:
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})

        messages.append({"role": "user", "content": user_prompt})

        extract = client.chat.completions.create(
            messages=messages,
            model=model,
            response_format={
                "type": "json_object",
                "schema": schema.model_json_schema(),
            },
        )
        return json.loads(extract.choices[0].message.content)

    # json.loads signals malformed output with JSONDecodeError, which the
    # ValidationError handler alone would never catch.
    except (json.JSONDecodeError, ValidationError) as e:
        error_message = f"Failed to parse JSON: {e}"
        print(error_message)
        return None

## Prompt Chaining Recipe
A simple snippet of serial prompt chaining.

In [None]:
def serial_chain_workflow(input_query: str, prompt_chain : list[str], model: str = "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo") -> list[str]:
    """Run a serial chain of LLM calls to address the `input_query`.

    Each prompt in `prompt_chain` is sent together with the output of the
    previous step (the first step receives `input_query`).

    Args:
        input_query: The initial input fed to the first prompt.
        prompt_chain: Prompts applied one after another.
        model: Name of the model used for every step.

    Returns:
        The response of every step, in order. Empty when `prompt_chain` is empty.
    """
    response_chain = []
    response = input_query
    for i, prompt in enumerate(prompt_chain):
        print(f"Step {i+1}")
        response = run_llm(f"{prompt}\nInput:\n{response}", model=model)
        response_chain.append(response)
        print(f"{response}\n")
    return response_chain

# Toy Example

# Word problem that is pushed through the three-step chain below.
question = (
    "Weng earns $12 an hour for babysitting. "
    "Yesterday, she just did 50 minutes of babysitting. "
    "How much did she earn?"
)

# Each step receives the previous step's output as its input.
prompt_chain = [
    "Given the math problem, ONLY extract any relevant numerical information and how it can be used.",
    "Given the numberical information extracted, ONLY express the steps you would take to solve the problem.",
    "Given the steps, express the final answer to the problem.",
]

responses = serial_chain_workflow(question, prompt_chain)

# The output of the last step is the final answer.
final_answer = responses[-1]


Step 1
Relevant numerical information: 
- $12 (hourly wage)
- 50 minutes (time worked, can be converted to hours for calculation: 50 / 60 = 5/6 hour)

Step 2
1. Convert 50 minutes to hours: 50 / 60 = 5/6 hour.
2. Multiply the hourly wage by the time worked in hours: $12 * (5/6).
3. Calculate the result of the multiplication to find the earnings.

Step 3
To find the earnings, we need to perform the multiplication of $12 and 5/6.

First, convert the fraction to a decimal: 5/6 ≈ 0.83

Then, multiply $12 by 0.83: 
$12 * 0.83 ≈ $9.96

So, the earnings are approximately $9.96.



## Routing
A simple snippet of the conditional routing workflow.

In [None]:
def router_workflow(input_query: str, routes : dict[str, str]) -> str:
    """Pick the best route for `input_query` and return that model's response.

    Args:
        input_query: The user prompt/query to answer.
        routes: Maps each option (used as the model name for the final call)
            to a description of what the option is best suited for.

    Returns:
        The response from the model named by the selected route.

    Raises:
        ValueError: If `routes` is empty or the router does not return one of
            the keys of `routes`.
    """
    if not routes:
        raise ValueError("`routes` must contain at least one option.")

    ROUTER_PROMPT = """Given a user prompt/query: {user_query}, select the best option out of the following routes:
    {routes}. Answer only in JSON format."""

    # Create a schema from the routes dictionary
    class Schema(BaseModel):
        # Only the keys of `routes` are acceptable values.
        route: Literal[tuple(routes.keys())]

        reason: str = Field(
            description="Short one-liner explanation why this route was selected for the task in the prompt/query."
        )

    # Call LLM to select route
    selected_route = JSON_llm(ROUTER_PROMPT.format(user_query=input_query, routes=routes), Schema)

    # JSON_llm returns None on a parse failure, and the decoded JSON is not
    # validated against Schema, so check the selection before using it.
    if not isinstance(selected_route, dict) or selected_route.get("route") not in routes:
        raise ValueError(f"Router did not return a valid route: {selected_route!r}")

    print(f"Selcted route:{selected_route['route']}\nReason: {selected_route['reason']}\n")

    # Use LLM on selected route.
    # Could also have different prompts that need to be used for each route.
    response = run_llm(user_prompt= input_query, model = selected_route['route'])
    print(f"Response: {response}\n")

    return response

In [None]:
# One prompt per kind of task the router should be able to tell apart.
prompt_list = [
    "Produce python snippet to check to see if a number is prime or not.",
    "Plan and provide a short itenary for a 2 week vacation in Europe.",
    "Write a short story about a dragon and a knight.",
]

# Candidate model names mapped to the description shown to the router.
model_routes = {}
model_routes["Qwen/Qwen2.5-Coder-32B-Instruct"] = "Best model choice for code generation tasks."
model_routes["Gryphe/MythoMax-L2-13b"] = "Best model choice for story-telling, role-playing and fantasy tasks."
model_routes["Qwen/QwQ-32B-Preview"] = "Best model for reasoning, planning and muilti-step tasks"

for i, prompt in enumerate(prompt_list):
    print(f"Task {i+1}: {prompt}\n")
    print("==" * 20)
    router_workflow(prompt, model_routes)


Task 1: Produce python snippet to check to see if a number is prime or not.

Seelction route:Qwen/Qwen2.5-Coder-32B-Instruct
 Reason: The task requires generating a Python code snippet to check if a number is prime or not, which falls under code generation tasks.

Response: Certainly! Below is a Python function that checks whether a given number is prime or not:

```python
def is_prime(n):
    """Check if a number is prime."""
    if n <= 1:
        return False
    if n <= 3:
        return True
    if n % 2 == 0 or n % 3 == 0:
        return False
    i = 5
    while i * i <= n:
        if n % i == 0 or n % (i + 2) == 0:
            return False
        i += 6
    return True

# Example usage:
number = 29
if is_prime(number):
    print(f"{number} is a prime number.")
else:
    print(f"{number} is not a prime number.")
```

### Explanation:
1. **Initial Checks**: 
   - Numbers less than or equal to 1 are not prime.
   - Numbers 2 and 3 are prime.
   
2. **Divisibility Check**:
   - If

## Parallel Recipe
A simple snippet of parallel agent workflow.

In [None]:
async def parallel_workflow(prompt : str, proposer_models : list[str], aggregator_model : str, aggregator_prompt: str) -> tuple[str, list[str]]:
    """Query several proposer models concurrently and aggregate their answers.

    Args:
        prompt: The user prompt sent to every proposer and to the aggregator.
        proposer_models: Names of the models queried concurrently.
        aggregator_model: Name of the model that synthesizes the final answer.
        aggregator_prompt: System prompt for the aggregator; the numbered
            proposer responses are appended to it.

    Returns:
        A `(final_output, proposed_responses)` pair: the aggregator's answer
        and the list of intermediate proposer responses.
    """
    # Gather intermediate responses from proposer models
    proposed_responses = await asyncio.gather(
        *[run_llm_parallel(prompt, model) for model in proposer_models]
    )

    # Number the proposals so the aggregator can tell them apart.
    numbered_responses = "\n".join(
        f"{i+1}. {str(element)}" for i, element in enumerate(proposed_responses)
    )

    # Aggregate responses using an aggregator model
    final_output = run_llm(
        user_prompt=prompt,
        model=aggregator_model,
        system_prompt=aggregator_prompt + "\n" + numbered_responses,
    )

    return final_output, proposed_responses


In [None]:
reference_models = [
    "microsoft/WizardLM-2-8x22B",
    "Qwen/Qwen2.5-72B-Instruct-Turbo",
    "google/gemma-2-27b-it",
    "meta-llama/Llama-3.3-70B-Instruct-Turbo",
]

user_prompt = """Jenna and her mother picked some apples from their apple farm. 
Jenna picked half as many apples as her mom. If her mom got 20 apples, how many apples did they both pick?"""

aggregator_model = "deepseek-ai/DeepSeek-V3"

aggregator_system_prompt = """You have been provided with a set of responses from various open-source models to the latest user query.
Your task is to synthesize these responses into a single, high-quality response. It is crucial to critically evaluate the information
provided in these responses, recognizing that some of it may be biased or incorrect. Your response should not simply replicate the
given answers but should offer a refined, accurate, and comprehensive reply to the instruction. Ensure your response is well-structured,
coherent, and adheres to the highest standards of accuracy and reliability.

Responses from models:"""

answer, intermediate_reponses = await parallel_workflow(prompt = user_prompt, 
                                                        proposer_models = reference_models, 
                                                        aggregator_model = aggregator_model, 
                                                        aggregator_prompt = aggregator_system_prompt)

In [None]:
# Show every proposer's answer, each followed by a blank line.
for i, response in enumerate(intermediate_reponses):
    print(f"Intermetidate Response {i+1}:\n\n{response}", end="\n\n")

Intermetidate Response 1:

 Let's think step by step.To solve the problem, we need to determine the total number of apples picked by Jenna and her mother combined. We are given two pieces of information:

1. Jenna's mother picked 20 apples.
2. Jenna picked half as many apples as her mother.

Let's break down the solution process:

Step 1: Determine the number of apples Jenna's mother picked.
- According to the information provided, Jenna's mother picked 20 apples.

Step 2: Calculate the number of apples Jenna picked.
- Since Jenna picked half as many apples as her mother, we need to find half of the mother's count.
- Half of 20 apples is calculated by dividing 20 by 2, which gives us 10 apples.
- Therefore, Jenna picked 10 apples.

Step 3: Find the total number of apples they both picked.
- To find the total, we add the number of apples picked by Jenna to the number of apples picked by her mother.
- Adding Jenna's 10 apples to her mother's 20 apples gives us a total of 30 apples.

Step

In [None]:
# Show the aggregator's answer followed by a blank line.
print(f"Final Answer: {answer}", end="\n\n")

Final Answer: To determine the total number of apples Jenna and her mother picked together, follow these steps:

1. **Determine the number of apples Jenna's mother picked:**
   - Jenna's mother picked **20 apples**.

2. **Calculate the number of apples Jenna picked:**
   - Jenna picked half as many apples as her mother.
   - Half of 20 is \( \frac{1}{2} \times 20 = 10 \) apples.
   - So, Jenna picked **10 apples**.

3. **Find the total number of apples they both picked:**
   - Add the number of apples picked by Jenna and her mother: \( 20 + 10 = 30 \) apples.

**Final Answer:** Jenna and her mother picked a total of **30 apples**.



## Orchestrator Agent Workflow
A simple snippet of the parallel orchestrator-worker agent workflow.

In [None]:
from pydantic import BaseModel, Field
from typing import Literal, List

# Prompt template for the orchestrator call; `{task}` is the only placeholder.
ORCHESTRATOR_PROMPT = """
Analyze this task and break it down into 2-3 distinct approaches:

Task: {task}

Provide an Analysis:

Explain your understanding of the task and which variations would be valuable.
Focus on how each approach serves different aspects of the task.

Along with the analysis, provide 2-3 approaches to tackle the task, each with a brief description:

Formal style: Write technically and precisely, focusing on detailed specifications
Conversational style: Write in a friendly and engaging way that connects with the reader
Hybrid style: Tell a story that includes technical details, combining emotional elements with specifications

Return only JSON output.
"""

# Prompt template for each worker call; placeholders are `{original_task}`,
# `{task_type}` and `{task_description}`.
WORKER_PROMPT = """
Generate content based on:
Task: {original_task}
Style: {task_type}
Guidelines: {task_description}

Return only your response:
[Your content here, maintaining the specified style and fully addressing requirements.]
"""

# Example task text for the orchestrator workflow.
task = """Write a product description for a new eco-friendly water bottle.
The target_audience is environmentally conscious millennials and key product features are: plastic-free, insulated, lifetime warranty
"""

In [None]:
# Schema for a single sub-task entry in the orchestrator's JSON output.
# (Documented with comments rather than a docstring so the generated JSON
# schema is left unchanged.)
class Task(BaseModel):
    # Style of the sub-task; restricted to these three literal values.
    type: Literal["formal", "conversational", "hybrid"]
    # Text passed to the worker prompt as its guidelines.
    description: str

# Schema for the orchestrator's JSON output: an analysis plus the sub-tasks.
# (Documented with comments rather than a docstring so the generated JSON
# schema is left unchanged.)
class TaskList(BaseModel):
    # The orchestrator's explanation of the task.
    analysis: str
    # Sub-tasks to hand to the workers; defaults to an empty list. The
    # "required" marker (`...`) contradicted the default factory and was dropped.
    tasks: List[Task] = Field(default_factory=list)

async def orchestrator_workflow(task : str, orchestrator_prompt : str, worker_prompt : str, worker_model : str = "meta-llama/Llama-3.3-70B-Instruct-Turbo"):
    """Use an orchestrator model to split a task into sub-tasks and run a worker on each.

    Args:
        task: The original task description.
        orchestrator_prompt: Template with a `{task}` placeholder.
        worker_prompt: Template with `{original_task}`, `{task_type}` and
            `{task_description}` placeholders.
        worker_model: Name of the model used for every worker call.

    Returns:
        A `(tasks, responses)` pair: the sub-tasks proposed by the
        orchestrator and the worker responses, in the same order.

    Raises:
        ValueError: If the orchestrator reply could not be parsed as JSON.
    """
    # Use orchestrator model to break the task up into sub-tasks
    orchestrator_response = JSON_llm(orchestrator_prompt.format(task=task), schema=TaskList)

    # JSON_llm returns None when the reply is not parseable; fail clearly
    # instead of with a "NoneType is not subscriptable" error.
    if orchestrator_response is None:
        raise ValueError("Orchestrator did not return parseable JSON.")

    # Parse orchestrator response
    analysis = orchestrator_response["analysis"]
    # `tasks` has a default in the schema, so the reply may omit it.
    tasks = orchestrator_response.get("tasks", [])

    print("\n=== ORCHESTRATOR OUTPUT ===")
    print(f"\nANALYSIS:\n{analysis}")
    print(f"\nTASKS:\n{json.dumps(tasks, indent=2)}")

    # One worker call per sub-task, all using the same worker model.
    worker_calls = [
        run_llm_parallel(
            user_prompt=worker_prompt.format(
                original_task=task,
                task_type=task_info['type'],
                task_description=task_info['description'],
            ),
            model=worker_model,
        )
        for task_info in tasks
    ]

    # Gather intermediate responses from worker models
    worker_responses = await asyncio.gather(*worker_calls)
    return tasks, worker_responses

In [None]:
task = """Write a product description for a new eco-friendly water bottle. 
The target_audience is environmentally conscious millennials and key product features are: plastic-free, insulated, lifetime warranty
"""

tasks, worker_resp = await orchestrator_workflow(task, orchestrator_prompt=ORCHESTRATOR_PROMPT, worker_prompt=WORKER_PROMPT)


=== ORCHESTRATOR OUTPUT ===

ANALYSIS:
The task requires writing a product description for an eco-friendly water bottle targeting environmentally conscious millennials. The key features to highlight are the plastic-free material, insulated design, and lifetime warranty. A valuable product description should effectively communicate these features while resonating with the target audience.

TASKS:
[
  {
    "type": "formal",
    "description": "Write a technically precise product description focusing on detailed specifications, such as the materials used, insulation technology, and warranty terms. This approach serves the task by providing a clear understanding of the product's features and benefits."
  },
  {
    "type": "conversational",
    "description": "Write a friendly and engaging product description that connects with the reader on an emotional level. This approach serves the task by building a relationship with the target audience and highlighting the product's eco-friendly as

In [None]:
# Show each worker's output under a header naming its style.
for task_info, response in zip(tasks, worker_resp):
    print(f"\n=== WORKER RESULT ({task_info['type']}) ===\n{response}", end="\n\n")


=== WORKER RESULT (formal) ===
Introduction to the HydraGreen Water Bottle

We are pleased to introduce the HydraGreen water bottle, a revolutionary, eco-friendly hydration solution designed specifically for environmentally conscious millennials. This premium product boasts a plastic-free construction, advanced insulation technology, and a comprehensive lifetime warranty, ensuring a superior user experience while minimizing its ecological footprint.

Materials and Construction

The HydraGreen water bottle is crafted from high-quality, BPA-free stainless steel (18/8 food-grade) and features a durable, non-toxic silicone sleeve. The bottle's body is constructed using a proprietary double-walled insulation process, which provides exceptional thermal retention while maintaining a slim, ergonomic design. The lid is manufactured from a sustainable, plant-based polymer, further reducing the product's reliance on petroleum-derived materials.

Insulation Technology

The HydraGreen water bottle

## Loop Optimizer Agent Workflow
A simple snippet of the looping generator-evaluator workflow.

In [None]:
# Coding task for the generator-evaluator loop; the text starts and ends with
# a newline.
task = "\n".join([
    "",
    "Implement a Stack with:",
    "1. push(x)",
    "2. pop()",
    "3. getMin()",
    "All operations should be O(1).",
    "",
])

In [None]:
# Instruction text for the generator model; it asks for a "Thoughts:" section
# followed by a "Response:" section.
GENERATOR_PROMPT = """
Your goal is to complete the task based on <user input>. If there are feedback 
from your previous generations, you should reflect on them to improve your solution

Output your answer concisely in the following format: 

Thoughts:
[Your understanding of the task and feedback and how you plan to improve]

Response:
[Your code implementation here]
"""

def generate(task: str, generator_prompt: str, context: str = "", model: str = "Qwen/Qwen2.5-Coder-32B-Instruct") -> str:
    """Generate a solution for `task`, optionally taking earlier feedback into account.

    Args:
        task: The task to solve.
        generator_prompt: Instructions placed at the start of the prompt.
        context: Optional text (e.g. previous attempts and feedback) inserted
            between the instructions and the task.
        model: Name of the model used for generation.

    Returns:
        The raw model response as a single string.
    """
    full_prompt = f"{generator_prompt}\n{context}\nTask: {task}" if context else f"{generator_prompt}\nTask: {task}"

    response = run_llm(full_prompt, model=model)

    print("\n=== GENERATION START ===")
    print(f"Output:\n{response}\n")
    print("=== GENERATION END ===\n")

    return response

# Instruction text for the evaluator model; it lists the three review criteria
# and asks for JSON-only output.
EVALUATOR_PROMPT = """
Evaluate this following code implementation for:
1. code correctness
2. time complexity
3. style and best practices

You should be evaluating only and not attemping to solve the task.

Only output "PASS" if all criteria are met and you have no further suggestions for improvements.

Provide detailed feedback if there are areas that need improvement. You should specify what needs improvement and why.

Only output JSON.
"""

def evaluate(task : str, evaluator_prompt : str, generated_content: str, schema) -> tuple[str, str]:
    """Ask the evaluator model to judge `generated_content` against `task`.

    Returns the `(evaluation, feedback)` pair read from the JSON reply and
    prints both between start/end markers.
    """
    prompt_sections = (
        evaluator_prompt,
        f"Original task: {task}",
        f"Content to evaluate: {generated_content}",
    )
    verdict = JSON_llm("\n".join(prompt_sections), schema)

    status, notes = verdict["evaluation"], verdict["feedback"]

    report_lines = (
        "=== EVALUATION START ===",
        f"Status: {status}",
        f"Feedback: {notes}",
        "=== EVALUATION END ===\n",
    )
    for line in report_lines:
        print(line)

    return status, notes



In [None]:
def loop_workflow(task: str, evaluator_prompt: str, generator_prompt: str) -> str:
    """Keep generating and evaluating until the evaluator passes the last generated response.

    Args:
        task: The task to solve.
        evaluator_prompt: Instructions for the evaluator model.
        generator_prompt: Instructions for the generator model.

    Returns:
        The first generated response the evaluator marks as "PASS".
    """
    # Build a schema for the evaluation
    class Evaluation(BaseModel):
        evaluation: Literal["PASS", "NEEDS_IMPROVEMENT", "FAIL"]
        feedback: str

    # Store previous responses from generator
    memory = []

    # Generate initial response
    response = generate(task, generator_prompt)
    memory.append(response)

    # While the generated response is not passing, keep generating and evaluating
    while True:
        evaluation, feedback = evaluate(task, evaluator_prompt, response, Evaluation)
        # Terminating condition
        if evaluation == "PASS":
            return response

        # Add current response and feedback to context and generate a new response
        context = "\n".join([
            "Previous attempts:",
            *[f"- {m}" for m in memory],
            f"\nFeedback: {feedback}"
        ])

        # Argument order must match generate(task, generator_prompt, context);
        # passing them swapped puts the instructions where the task belongs.
        response = generate(task, generator_prompt, context)
        memory.append(response)

In [None]:
# Run the generator-evaluator loop on `task` with the two prompts; the call's
# return value is not assigned to a variable here.
loop_workflow(task, EVALUATOR_PROMPT, GENERATOR_PROMPT)


=== GENERATION START ===
Output:
Thoughts:
To implement a stack with the operations `push(x)`, `pop()`, and `getMin()` all in O(1) time complexity, we can use two stacks. One stack will store the actual stack elements, and the other will store the minimum values. The minimum stack will help us keep track of the minimum element efficiently.

Response:
```python
class MinStack:
    def __init__(self):
        self.stack = []
        self.min_stack = []

    def push(self, x: int) -> None:
        self.stack.append(x)
        if not self.min_stack or x <= self.min_stack[-1]:
            self.min_stack.append(x)

    def pop(self) -> None:
        if self.stack:
            x = self.stack.pop()
            if x == self.min_stack[-1]:
                self.min_stack.pop()

    def top(self) -> int:
        if self.stack:
            return self.stack[-1]
        raise IndexError("Stack is empty")

    def getMin(self) -> int:
        if self.min_stack:
            return self.min_stack[-1]


'Thoughts:\nThe current implementation is mostly correct and handles edge cases by returning `None` for empty stack scenarios. However, we can further improve the code by:\n1. Ensuring consistent method naming (e.g., `getMin` to `get_min` for consistency with `get_top_element`).\n2. Adding type hints and docstrings for better readability and maintainability.\n3. Ensuring that the implementation adheres to best practices by handling empty stack scenarios gracefully and providing clear documentation.\n\nResponse:\n```python\nclass MinStack:\n    def __init__(self):\n        """Initialize the stack and the minimum stack."""\n        self.stack = []\n        self.min_stack = []\n\n    def push(self, x: int) -> None:\n        """Push element x onto the stack.\n\n        Args:\n            x (int): The element to be pushed onto the stack.\n        """\n        self.stack.append(x)\n        if not self.min_stack or x <= self.min_stack[-1]:\n            self.min_stack.append(x)\n\n    def pop(