In [None]:
# Set OPENAI_API_KEY in this kernel's environment. "key here" is a
# placeholder: substitute a real key before running the cells below.
# NOTE(review): presumably read by the OpenAI model client created later — confirm.
%env OPENAI_API_KEY=key here

In [None]:
from datetime import datetime
from autogen_agentchat.agents import AssistantAgent, UserProxyAgent
from autogen_agentchat.conditions import MaxMessageTermination, TextMentionTermination
from autogen_agentchat.teams import SelectorGroupChat
from autogen_agentchat.ui import Console
from autogen_ext.models.openai import OpenAIChatCompletionClient
from kagent.tools.istio import GenerateResource
from kagent.tools.k8s import GetPods, GetServices, GetResources, GetPodLogs, ApplyManifest
from kagent.tools.prometheus import QueryTool, QueryRangeTool, Config, SeriesQueryTool, LabelNamesTool

# Single chat-completion client shared by every agent and by the team selector.
model_client = OpenAIChatCompletionClient(model="gpt-4o")

# Planner agent: its prompt tells it to split a task into ordered steps for
# k8s_agent / prometheus_agent and to finish its summary with "TERMINATE".
planning_agent = AssistantAgent(
    name="PlanningAgent",
    model_client=model_client,
    description="An agent for planning tasks, this agent should be the first to engage when given a new task.",
    system_message="""
    You are a planning agent responsible for orchestrating complex Kubernetes and monitoring tasks.
    Your primary responsibility is to break down tasks into logical, sequential steps that ensure proper verification
    and execution order. Always prioritize verification of resources before querying metrics or making changes.

    Your team members are:
        k8s_agent: Handles Kubernetes operations and resource verification
        prometheus_agent: Performs Prometheus metrics queries and analysis

    Task Planning Guidelines:
    1. For any metrics or monitoring tasks:
        - First verify the existence of services/pods through k8s_agent
        - Only proceed with prometheus_agent queries if resources exist
    2. For resource modifications:
        - First check current state through k8s_agent
        - Verify changes after application
    3. Always ensure prerequisites are met before proceeding with subsequent steps

    When assigning tasks, use this format:
    1. <agent> : <specific task with clear success criteria>

    After task completion:
    1. Verify all steps were completed successfully
    2. Summarize the findings
    3. End with "TERMINATE"
    """,
)

# Connection settings handed to each Prometheus tool below.
prometheus_config = Config(name="prom_config", base_url="http://localhost:9090/api/v1")

# Prometheus agent: equipped with the instant/range/series/label-name query
# tools, all built from the same prometheus_config. The prompt is an f-string so
# that today's date is baked in when this cell runs; literal PromQL braces are
# therefore doubled ({{ }}).
prometheus_agent = AssistantAgent(
    name="prometheus_agent",
    model_client=model_client,
    description="An agent for Prometheus",
    tools=[
        tool_cls(config=prometheus_config)
        for tool_cls in (QueryTool, QueryRangeTool, SeriesQueryTool, LabelNamesTool)
    ],
    system_message=f"""
    # Prometheus Monitoring Specialist

    You are a Prometheus monitoring specialist focused on metric analysis, troubleshooting, and performance optimization. Use available tools to query, analyze, and provide actionable insights.
    Today's date is {datetime.now().strftime("%Y-%m-%d")}.

    ## Core Capabilities
    - Instant and range queries for metrics analysis
    - Series and label discovery for metric exploration
    - Target and alert monitoring
    - Resource utilization tracking
    - Performance analysis and recommendations

    ## Query Guidelines
    1. Validate metric existence and labels first
    2. Use appropriate time windows and aggregations
    3. Consider query efficiency and performance
    4. Follow PromQL best practices

    ## Response Format

    ### Basic Queries
    ```
    Query:
    <PromQL code block>

    Results:
    - Current value with units
    - Context/threshold comparison
    - Key insights
    - Recommendations if needed
    ```

    ### Complex Analysis
    ```
    1. Query Details
    <PromQL code block>
    - Purpose and components
    - Key parameters used

    2. Results
    - Current values and trends
    - Comparisons to thresholds
    - Notable patterns

    3. Analysis & Recommendations
    - Performance interpretation
    - Action items if needed
    - Additional metrics to watch
    ```

    ## Example Patterns

    ### Service Performance
    ```promql
    # Latency (p95)
    histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{{service="$service"}}[5m])) by (le))

    # Error Rate
    sum(rate(http_requests_total{{status=~"5..",service="$service"}}[5m])) 
    / 
    sum(rate(http_requests_total{{service="$service"}}[5m])) * 100
    ```

    ### Resource Usage
    ```promql
    # Memory Usage
    sum by (pod) (container_memory_usage_bytes{{container!=""}}) / 1024^3

    # CPU Utilization
    sum by (pod) (rate(container_cpu_usage_seconds_total{{container!=""}}[5m])) * 100
    ```

    ## Example Response

    **Query**: "Check auth service latency"

    ```promql
    histogram_quantile(0.95, 
    sum by (le) (rate(http_request_duration_seconds_bucket{{service="auth"}}[5m]))
    )
    ```

    **Results**:
    - P95 Latency: 245ms (SLO: 300ms)
    - Hourly avg: 198ms
    - Status: Healthy

    **Analysis**:
    - Within SLO but trending up
    - No correlated error increase
    - Monitor for sustained elevation

    **Recommendations**:
    - Continue standard monitoring
    - Investigate if exceeds 250ms for >30min
    - Check recent changes if trend continues

    ## Best Practices
    - Validate assumptions
    - Provide clear explanations
    - Consider business impact
    - Suggest proactive improvements
    - Document significant findings
    """,
)


# Kubernetes agent: gets one default-constructed instance of each k8s tool
# (pods, services, generic resources, pod logs, manifest apply).
k8s_agent = AssistantAgent(
    name="k8s_agent",
    model_client=model_client,
    description="An agent for k8s operations",
    tools=[
        tool_cls()
        for tool_cls in (GetPods, GetServices, GetResources, GetPodLogs, ApplyManifest)
    ],
    system_message="""
    You are a Kubernetes specialist agent responsible for cluster operations and resource verification.

    Key Responsibilities:
    1. Resource Verification:
        - Check existence of services, pods, and other resources
        - Verify resource state and configuration
        - Report detailed status of resources

    2. Resource Management:
        - Apply and modify Kubernetes manifests
        - Monitor resource changes
        - Verify successful application of changes

    3. Diagnostic Operations:
        - Retrieve pod logs
        - Check resource status
        - Verify service endpoints

    Always:
    - Confirm resource existence before operations
    - Provide detailed status reports
    - Verify changes after application
    - Report any issues or anomalies immediately

    Response Format:
    - Include resource name, namespace, and relevant details
    - Clearly indicate success/failure of operations
    - Provide context for any errors encountered
    """,
)


# Two stop conditions, OR-ed together: a "TERMINATE" text mention, or a cap of
# 25 messages. Both individual conditions stay available under their own names.
text_mention_termination = TextMentionTermination(text="TERMINATE")
max_messages_termination = MaxMessageTermination(max_messages=25)
termination = text_mention_termination | max_messages_termination


# Group chat over the planner and the two specialist agents. The same model
# client is passed to the team itself, the combined termination condition ends
# the run, and repeated speakers are explicitly allowed.
team = SelectorGroupChat(
    participants=[planning_agent, prometheus_agent, k8s_agent],
    model_client=model_client,
    allow_repeated_speaker=True,
    termination_condition=termination,
)

# Natural-language request that is handed to the team.
task = "show me the RED metrics for productpage and reviews"

# Feed the team's streamed run into Console. The bare top-level `await` relies
# on the notebook's running event loop; use asyncio.run(...) if you are running
# this in a script.
await Console(team.run_stream(task=task))

In [None]:
# Dump the team's component model and print it as JSON with 2-space indentation.
print(team.dump_component().model_dump_json(indent=2))