diff --git a/redisdb/assets/monitors/cmd_latency.json b/redisdb/assets/monitors/cmd_latency.json new file mode 100644 index 0000000000000..0702bf43a8d22 --- /dev/null +++ b/redisdb/assets/monitors/cmd_latency.json @@ -0,0 +1,35 @@ +{ + "version": 2, + "created_at": "2025-05-27", + "last_updated_at": "2025-05-27", + "title": "Command latency is high", + "description": "Redis is built for speed, and command latency is a key performance indicator. This monitor tracks the 95th percentile of command latency to catch slowdowns early, helping prevent cascading delays across your systems.", + "definition": { + "name": "[Redis] High command latency (p95)", + "type": "query alert", + "query": "avg(last_5m):avg:redis.slowlog.micros.95percentile{*} > 20000", + "message": "## 🚨 What’s happening\n\nHigh Redis command latency detected (p95 > 20ms for 5 minutes).\n\nRedis is experiencing elevated command latency, which suggests that operations are not responding within expected thresholds. This could be caused by internal contention, blocked commands, slow clients, or downstream pressure from connected services.\n\n---\n\n## 📈 Impact\n\nIncreased command latency can lead to:\n\n- Slower application performance and timeouts\n- Delayed cache reads/writes\n- Poor user experience in latency-sensitive applications\n- Potential cascading effects on dependent systems\n\n---\n\n## 🛠️ Runbook\n\n### Initial Troubleshooting Steps\n\n1. **Identify the affected Redis node**.\n2. Go to [**Redis integration metrics**](https://app.datadoghq.com/monitors/manage?filter=redis) in Datadog.\n3. Review these metrics:\n - `redis.net.latency_ms.p99`\n - `redis.commands.per_sec`\n - `redis.clients.blocked`\n - Host-level CPU/memory/disk metrics\n4. Check for slow logs or blocked clients.\n5. 
Ensure no network congestion or saturation between Redis and calling services.\n\n---\n\n### Cause and Resolution\n\nCause | Resolution\n------|-----------\nCommand backlog or slow queries | Investigate slow logs and blocked clients.\nHigh memory or CPU pressure | Scale the node or optimize Redis configuration.\nNetwork degradation | Check latency and packet loss metrics.\nMisbehaving client | Identify traffic spike source or connection issues.\n\n---\n\n### 👥 Who should be notified?\n\nPlease route to the appropriate team: \n`@slack-yourteam-alerts`\n", + "tags": [ + "integration:redisdb" + ], + "options": { + "thresholds": { + "critical": 20000 + }, + "notify_audit": false, + "evaluation_delay": 0, + "require_full_window": false, + "include_tags": true, + "silenced": {}, + "avalanche_window": 20, + "on_missing_data": "default" + }, + "priority": null, + "restriction_policy": { + "bindings": [] + } + }, + "tags": [ + "integration:redis" + ] +} diff --git a/redisdb/assets/monitors/error.json b/redisdb/assets/monitors/error.json new file mode 100644 index 0000000000000..7090c963aea6c --- /dev/null +++ b/redisdb/assets/monitors/error.json @@ -0,0 +1,36 @@ +{ + "version": 2, + "created_at": "2025-05-27", + "last_updated_at": "2025-05-27", + "title": "Error rate is elevated", + "description": "This monitor tracks Redis command errors per second. It triggers when the error rate exceeds 5/sec, which may indicate misconfigurations, failed AUTH attempts, or denied commands.", + "definition": { + "id": 173301042, + "name": "[Redis] Error rate elevated", + "type": "query alert", + "query": "avg(last_5m):avg:redis.errors.per_sec{integration:redis} > 5", + "message": "## 🚨 What’s happening\n\nHigh Redis error rate detected (more than 5 errors per second over 5 minutes).\n\nRedis is returning errors for client commands at an elevated rate. 
This may signal misconfigured applications, authentication issues, or denied commands — all of which can disrupt application logic and degrade system reliability.\n\n---\n\n## 📈 Impact\n\nIf left unresolved, a high error rate can lead to:\n\n- Failed read/write operations from your application\n- Increased latency due to retries and fallbacks\n- Broken user experiences or data inconsistencies\n- Cascading failures in downstream systems\n\n---\n\n## 🛠️ Runbook\n\n### Initial Troubleshooting Steps\n\n1. Check recent error trends in Redis error metrics.\n2. Review logs or dashboards for:\n - AUTH failures\n - Invalid or restricted commands\n - Redis ACL or role changes\n3. Cross-reference `redis.commands.*` and `redis.clients.*` to pinpoint problematic patterns.\n4. Validate that client applications are using correct credentials and command syntax.\n\n---\n\n### Cause and Resolution\n\nCause | Resolution\n------|-----------\nFailed AUTH attempts | Ensure clients use correct credentials and Redis ACL settings.\nApplication bug or misconfiguration | Fix command usage or update client libraries.\nSecurity policy changes | Roll back or refine ACL/role settings.\nCommand misuse by external systems | Audit external integrations or rate-limit where needed.\n\n---\n\n### 👥 Who should be notified?\n\nPlease notify the appropriate team: \n`@slack-yourteam-alerts`\n", + "tags": [ + "integration:redis" + ], + "options": { + "thresholds": { + "critical": 5 + }, + "notify_audit": false, + "evaluation_delay": 300, + "require_full_window": true, + "include_tags": true, + "silenced": {}, + "avalanche_window": 20, + "on_missing_data": "default" + }, + "priority": null, + "restriction_policy": { + "bindings": [] + } + }, + "tags": [ + "integration:redis" + ] +} diff --git a/redisdb/manifest.json b/redisdb/manifest.json index 713fd096f17c9..ba24818a056af 100644 --- a/redisdb/manifest.json +++ b/redisdb/manifest.json @@ -61,7 +61,9 @@ "redis": 
"assets/dashboards/overview.json" }, "monitors": { - "Memory consumption is high": "assets/monitors/high_mem.json" + "Memory consumption is high": "assets/monitors/high_mem.json", + "Command latency is high": "assets/monitors/cmd_latency.json", + "Error rate is elevated": "assets/monitors/error.json" }, "saved_views": { "error_warning_status": "assets/saved_views/error_warning_status.json", @@ -70,4 +72,4 @@ "redis_processes": "assets/saved_views/redis_processes.json" } } -} \ No newline at end of file +}