diff --git a/AGENTS.md b/AGENTS.md index e0a8b07..0f5c174 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -64,6 +64,7 @@ The handler filters tools dynamically based on `GetMyPermissions` from Sysdig Se | `troubleshoot_kubernetes_list_top_cpu_consumed_by_workload` | `tool_troubleshoot_kubernetes_list_top_cpu_consumed_by_workload.go` | Identifies the Kubernetes workloads (all containers) consuming the most CPU (in cores). | `promql.exec` | "Show the top 10 workloads consuming the most CPU in cluster 'production'" | | `troubleshoot_kubernetes_list_top_cpu_consumed_by_container` | `tool_troubleshoot_kubernetes_list_top_cpu_consumed_by_container.go` | Identifies the Kubernetes containers consuming the most CPU (in cores). | `promql.exec` | "Show the top 10 containers consuming the most CPU in cluster 'production'" | | `troubleshoot_kubernetes_list_top_memory_consumed_by_workload` | `tool_troubleshoot_kubernetes_list_top_memory_consumed_by_workload.go` | Lists memory-intensive workloads (all containers). | `promql.exec` | "Show the top 10 workloads consuming the most memory in cluster 'production'" | +| `troubleshoot_kubernetes_list_top_memory_consumed_by_container` | `tool_troubleshoot_kubernetes_list_top_memory_consumed_by_container.go` | Lists memory-intensive containers. | `promql.exec` | "Show the top 10 containers consuming the most memory in cluster 'production'" | Every tool has a companion `_test.go` file that exercises request validation, permission metadata, and Sysdig client calls through mocks. Note that if you add more tools you need to also update this file to reflect that. 
diff --git a/README.md b/README.md index 1b65d62..7d421e4 100644 --- a/README.md +++ b/README.md @@ -183,6 +183,11 @@ The server dynamically filters the available tools based on the permissions asso - **Required Permission**: `promql.exec` - **Sample Prompt**: "Show the top 10 workloads consuming the most memory in cluster 'production'" +- **`troubleshoot_kubernetes_list_top_memory_consumed_by_container`** + - **Description**: Lists memory-intensive containers. + - **Required Permission**: `promql.exec` + - **Sample Prompt**: "Show the top 10 containers consuming the most memory in cluster 'production'" + ## Requirements - [Go](https://go.dev/doc/install) 1.25 or higher (if running without Docker). diff --git a/cmd/server/main.go b/cmd/server/main.go index 45d2812..401e0dc 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -110,6 +110,7 @@ func setupHandler(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *mcp tools.NewTroubleshootKubernetesListTopCPUConsumedByContainer(sysdigClient), tools.NewTroubleshootKubernetesListUnderutilizedPodsByMemoryQuota(sysdigClient), tools.NewTroubleshootKubernetesListTopMemoryConsumedByWorkload(sysdigClient), + tools.NewTroubleshootKubernetesListTopMemoryConsumedByContainer(sysdigClient), ) return handler } diff --git a/internal/infra/mcp/tools/tool_troubleshoot_kubernetes_list_top_memory_consumed_by_container.go b/internal/infra/mcp/tools/tool_troubleshoot_kubernetes_list_top_memory_consumed_by_container.go new file mode 100644 index 0000000..6cda76e --- /dev/null +++ b/internal/infra/mcp/tools/tool_troubleshoot_kubernetes_list_top_memory_consumed_by_container.go @@ -0,0 +1,96 @@ +package tools + +import ( + "context" + "encoding/json" + "fmt" + "io" + "strings" + + "github.com/mark3labs/mcp-go/mcp" + "github.com/mark3labs/mcp-go/server" + "github.com/sysdiglabs/sysdig-mcp-server/internal/infra/sysdig" +) + +type TroubleshootKubernetesListTopMemoryConsumedByContainer struct { + SysdigClient 
sysdig.ExtendedClientWithResponsesInterface +} + +func NewTroubleshootKubernetesListTopMemoryConsumedByContainer(sysdigClient sysdig.ExtendedClientWithResponsesInterface) *TroubleshootKubernetesListTopMemoryConsumedByContainer { + return &TroubleshootKubernetesListTopMemoryConsumedByContainer{ + SysdigClient: sysdigClient, + } +} + +func (t *TroubleshootKubernetesListTopMemoryConsumedByContainer) RegisterInServer(s *server.MCPServer) { + tool := mcp.NewTool("troubleshoot_kubernetes_list_top_memory_consumed_by_container", + mcp.WithDescription("Lists memory-intensive containers."), + mcp.WithString("cluster_name", mcp.Description("The name of the cluster to filter by.")), + mcp.WithString("namespace_name", mcp.Description("The name of the namespace to filter by.")), + mcp.WithString("workload_type", mcp.Description("The type of the workload to filter by.")), + mcp.WithString("workload_name", mcp.Description("The name of the workload to filter by.")), + mcp.WithNumber("limit", + mcp.Description("Maximum number of containers to return."), + mcp.DefaultNumber(20), + ), + mcp.WithOutputSchema[map[string]any](), + WithRequiredPermissions(), // FIXME(fede): Add the required permissions. It should be `promql.exec` but somehow the token does not have that permission even if you are able to execute queries. 
+ ) + s.AddTool(tool, t.handle) +} + +func (t *TroubleshootKubernetesListTopMemoryConsumedByContainer) handle(ctx context.Context, request mcp.CallToolRequest) (*mcp.CallToolResult, error) { + clusterName := mcp.ParseString(request, "cluster_name", "") + namespaceName := mcp.ParseString(request, "namespace_name", "") + workloadType := mcp.ParseString(request, "workload_type", "") + workloadName := mcp.ParseString(request, "workload_name", "") + limit := mcp.ParseInt(request, "limit", 20) + + query := buildTopMemoryConsumedByContainerQuery(clusterName, namespaceName, workloadType, workloadName, limit) + + limitQuery := sysdig.LimitQuery(limit) + params := &sysdig.GetQueryV1Params{ + Query: query, + Limit: &limitQuery, + } + + httpResp, err := t.SysdigClient.GetQueryV1(ctx, params) + if err != nil { + return mcp.NewToolResultErrorFromErr("failed to get container list", err), nil + } + + if httpResp.StatusCode != 200 { + bodyBytes, _ := io.ReadAll(httpResp.Body) + return mcp.NewToolResultErrorf("failed to get container list: status code %d, body: %s", httpResp.StatusCode, string(bodyBytes)), nil + } + + var queryResponse sysdig.QueryResponseV1 + if err := json.NewDecoder(httpResp.Body).Decode(&queryResponse); err != nil { + return mcp.NewToolResultErrorFromErr("failed to decode response", err), nil + } + + return mcp.NewToolResultJSON(queryResponse) +} + +func buildTopMemoryConsumedByContainerQuery(clusterName, namespaceName, workloadType, workloadName string, limit int) string { + filters := []string{} + if clusterName != "" { + filters = append(filters, fmt.Sprintf(`kube_cluster_name="%s"`, clusterName)) + } + if namespaceName != "" { + filters = append(filters, fmt.Sprintf(`kube_namespace_name="%s"`, namespaceName)) + } + if workloadType != "" { + filters = append(filters, fmt.Sprintf(`kube_workload_type="%s"`, workloadType)) + } + if workloadName != "" { + filters = append(filters, fmt.Sprintf(`kube_workload_name="%s"`, workloadName)) + } + + filterString := "" + 
// buildTopMemoryConsumedByContainerQuery returns the PromQL expression that
// selects the top `limit` containers by sysdig_container_memory_used_bytes,
// summed per cluster, namespace, workload type, workload name and container.
//
// Each non-empty argument adds an equality matcher on its label, in the fixed
// order cluster, namespace, workload type, workload name. When all four are
// empty the metric is queried without a selector.
//
// Label values are quoted with %q so that embedded double quotes and
// backslashes are escaped instead of terminating the string literal early,
// which would produce an invalid (or attacker-shaped) selector. For values
// without special characters the output is the same as plain "value" quoting.
func buildTopMemoryConsumedByContainerQuery(clusterName, namespaceName, workloadType, workloadName string, limit int) string {
	labels := [...]struct{ name, value string }{
		{"kube_cluster_name", clusterName},
		{"kube_namespace_name", namespaceName},
		{"kube_workload_type", workloadType},
		{"kube_workload_name", workloadName},
	}

	matchers := make([]string, 0, len(labels))
	for _, l := range labels {
		if l.value == "" {
			continue
		}
		matchers = append(matchers, fmt.Sprintf("%s=%q", l.name, l.value))
	}

	selector := ""
	if len(matchers) > 0 {
		selector = "{" + strings.Join(matchers, ", ") + "}"
	}

	return fmt.Sprintf(`topk(%d, sum by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, container_label_io_kubernetes_container_name) (sysdig_container_memory_used_bytes%s))`, limit, selector)
}
{ + DescribeTable("it succeeds", func(ctx context.Context, toolName string, request mcp.CallToolRequest, expectedParamsRequested sysdig.GetQueryV1Params) { + mockSysdig.EXPECT().GetQueryV1(gomock.Any(), &expectedParamsRequested).Return(&http.Response{ + StatusCode: http.StatusOK, + Body: io.NopCloser(bytes.NewBufferString(`{"status":"success"}`)), + }, nil) + + serverTool := mcpServer.GetTool(toolName) + result, err := serverTool.Handler(ctx, request) + Expect(err).NotTo(HaveOccurred()) + + resultData, ok := result.Content[0].(mcp.TextContent) + Expect(ok).To(BeTrue()) + Expect(resultData.Text).To(MatchJSON(`{"status":"success"}`)) + }, + Entry(nil, + "troubleshoot_kubernetes_list_top_memory_consumed_by_container", + mcp.CallToolRequest{ + Params: mcp.CallToolParams{ + Name: "troubleshoot_kubernetes_list_top_memory_consumed_by_container", + Arguments: map[string]any{}, + }, + }, + sysdig.GetQueryV1Params{ + Query: `topk(20, sum by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, container_label_io_kubernetes_container_name) (sysdig_container_memory_used_bytes))`, + Limit: toPtr(sysdig.LimitQuery(20)), + }, + ), + Entry(nil, + "troubleshoot_kubernetes_list_top_memory_consumed_by_container", + mcp.CallToolRequest{ + Params: mcp.CallToolParams{ + Name: "troubleshoot_kubernetes_list_top_memory_consumed_by_container", + Arguments: map[string]any{ + "cluster_name": "prod", + "namespace_name": "default", + "limit": 10, + }, + }, + }, + sysdig.GetQueryV1Params{ + Query: `topk(10, sum by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, container_label_io_kubernetes_container_name) (sysdig_container_memory_used_bytes{kube_cluster_name="prod", kube_namespace_name="default"}))`, + Limit: toPtr(sysdig.LimitQuery(10)), + }, + ), + Entry(nil, + "troubleshoot_kubernetes_list_top_memory_consumed_by_container", + mcp.CallToolRequest{ + Params: mcp.CallToolParams{ + Name: 
"troubleshoot_kubernetes_list_top_memory_consumed_by_container", + Arguments: map[string]any{ + "cluster_name": "prod", + "namespace_name": "default", + "workload_name": "api", + "workload_type": "deployment", + "limit": 5, + }, + }, + }, + sysdig.GetQueryV1Params{ + Query: `topk(5, sum by (kube_cluster_name, kube_namespace_name, kube_workload_type, kube_workload_name, container_label_io_kubernetes_container_name) (sysdig_container_memory_used_bytes{kube_cluster_name="prod", kube_namespace_name="default", kube_workload_type="deployment", kube_workload_name="api"}))`, + Limit: toPtr(sysdig.LimitQuery(5)), + }, + ), + ) + }) +}) + +func toPtr[T any](v T) *T { + return &v +}