Rules+Alert config w/ Alertmanager & cleanup

- Recording rules and alert example for normal and HA - Alertmanager for normal and HA - Fixed up HA to match the normal (non-HA) setup in a bunch of ways: scrape_configs, database (timescale -> postgres), password, db args -> connstring
timescale · May 18, 2022 · 73bae80 · 73bae80
1 parent 022ccba
commit 73bae80
Show file tree

Hide file tree

Showing 7 changed files with 86 additions and 24 deletions.
diff --git a/docker-compose/alerts.yml b/docker-compose/alerts.yml
@@ -0,0 +1,10 @@
+groups:
+  - name: alerts
+    rules:
+      - alert: Watchdog
+        annotations:
+          description: >
+            This Watchdog alert is meant to ensure that the entire alerting
+            pipeline is functional. It is always firing in normal operation.
+          summary: Alerting Watchdog
+        expr: vector(1)  # constant expression, so the alert is always firing
diff --git a/docker-compose/docker-compose.yaml b/docker-compose/docker-compose.yaml
@@ -18,6 +18,11 @@ services:
     volumes:
       - ${PWD}/prometheus.yml:/etc/prometheus/prometheus.yml
 
+  alertmanager:
+    image: "prom/alertmanager:latest"
+    ports:
+      - "9093:9093/tcp"  # host:container, Alertmanager's default port
+
   promscale:
     image: timescale/promscale:latest
     ports:
@@ -26,12 +31,17 @@ services:
     restart: on-failure
     depends_on:
       - db
+    volumes:  # rules configuration plus the rule files it references
+      - "${PWD}/promscale_prometheus.yml:/prometheus.yml"
+      - "${PWD}/rules.yml:/rules.yml"
+      - "${PWD}/alerts.yml:/alerts.yml"
     environment:
       PROMSCALE_WEB_TELEMETRY_PATH: /metrics-text
       PROMSCALE_DB_URI: postgres://postgres:password@db:5432/postgres?sslmode=allow
       PROMSCALE_TRACING_OTLP_SERVER_ADDRESS: ":9202"
       PROMSCALE_TELEMETRY_TRACE_JAEGER_ENDPOINT: "http://otel-collector:14268/api/traces"
       PROMSCALE_TELEMETRY_TRACE_SAMPLING_RATIO: "0.1"
+      PROMSCALE_METRICS_RULES_CONFIG_FILE: /prometheus.yml  # container path of the mounted promscale_prometheus.yml
 
   otel-collector:
     platform: linux/amd64

diff --git a/docker-compose/high-availability/docker-compose.yaml b/docker-compose/high-availability/docker-compose.yaml
@@ -6,9 +6,8 @@ services:
     ports:
       - 5432:5432/tcp
     environment:
-      POSTGRES_PASSWORD: postgres
+      POSTGRES_PASSWORD: password
       POSTGRES_USER: postgres
-      POSTGRES_DB: timescale
 
   prometheus1:
     image: prom/prometheus:latest
@@ -24,6 +23,11 @@ services:
     volumes:
       - ./prometheus2.yml:/etc/prometheus/prometheus.yml:ro
 
+  alertmanager:
+    image: "prom/alertmanager:latest"
+    ports:
+      - "9093:9093/tcp"  # host:container, Alertmanager's default port
+
   promscale-connector1:
     image: timescale/promscale:latest
     ports:
@@ -32,29 +36,35 @@ services:
     depends_on:
       - db
       - prometheus1
+    volumes:  # host paths are relative to this compose file, not the caller's $PWD
+      - ../promscale_prometheus.yml:/prometheus.yml
+      - ../rules.yml:/rules.yml
+      - ../alerts.yml:/alerts.yml
     environment:
       PROMSCALE_METRICS_HIGH_AVAILABILITY: true
-      PROMSCALE_DB_CONNECT_RETRIES: 10
-      PROMSCALE_DB_HOST: db
-      PROMSCALE_DB_PASSWORD: postgres
+      PROMSCALE_DB_URI: postgres://postgres:password@db:5432/postgres?sslmode=allow
       PROMSCALE_WEB_TELEMETRY_PATH: /metrics-text
-      PROMSCALE_DB_SSL_MODE: allow
+      PROMSCALE_METRICS_RULES_CONFIG_FILE: /prometheus.yml  # container path of the mounted promscale_prometheus.yml
 
   promscale-connector2:
     image: timescale/promscale:latest
     ports:
       - 9202:9201/tcp
-    build:
-      context: .
     restart: on-failure
     depends_on:
       - db
       - prometheus2
+    volumes:  # host paths are relative to this compose file, not the caller's $PWD
+      - ../promscale_prometheus.yml:/prometheus.yml
+      - ../rules.yml:/rules.yml
+      - ../alerts.yml:/alerts.yml
     environment:
       PROMSCALE_METRICS_HIGH_AVAILABILITY: true
-      PROMSCALE_DB_CONNECT_RETRIES: 10
-      PROMSCALE_DB_HOST: db
-      PROMSCALE_DB_PASSWORD: postgres
+      PROMSCALE_DB_URI: postgres://postgres:password@db:5432/postgres?sslmode=allow
       PROMSCALE_WEB_TELEMETRY_PATH: /metrics-text
-      PROMSCALE_DB_SSL_MODE: allow
+      PROMSCALE_METRICS_RULES_CONFIG_FILE: /prometheus.yml  # container path of the mounted promscale_prometheus.yml
 
+  node_exporter:  # target of the node-exporter scrape job in prometheus1.yml / prometheus2.yml
+    image: "quay.io/prometheus/node-exporter"
+    ports:
+      - '9100:9100'  # host:container
diff --git a/docker-compose/high-availability/prometheus1.yml b/docker-compose/high-availability/prometheus1.yml
@@ -25,10 +25,15 @@ remote_read:
 # Here it's Prometheus itself.
 scrape_configs:
   # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
-  - job_name: 'prometheus'
-
-    # metrics_path defaults to '/metrics'
-    # scheme defaults to 'http'.
-
+  - job_name: prometheus
     static_configs:
       - targets: ['localhost:9090']
+  - job_name: node-exporter
+    static_configs:
+      - targets:
+          - "node_exporter:9100"
+  # Both connectors expose metrics on /metrics-text (PROMSCALE_WEB_TELEMETRY_PATH).
+  - job_name: promscale
+    metrics_path: "/metrics-text"
+    static_configs:
+      - targets: ["promscale-connector1:9201", "promscale-connector2:9201"]
diff --git a/docker-compose/high-availability/prometheus2.yml b/docker-compose/high-availability/prometheus2.yml
@@ -21,14 +21,18 @@ remote_write:
 remote_read:
     - url: "http://promscale-connector2:9201/read"
 
-# A scrape configuration containing exactly one endpoint to scrape:
-# Here it's Prometheus itself.
 scrape_configs:
   # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
-  - job_name: 'prometheus'
-
-    # metrics_path defaults to '/metrics'
-    # scheme defaults to 'http'.
-
+  - job_name: prometheus
     static_configs:
       - targets: ['localhost:9090']
+  - job_name: node-exporter
+    static_configs:
+      - targets:
+          - "node_exporter:9100"
+  # Both connectors expose metrics on /metrics-text (PROMSCALE_WEB_TELEMETRY_PATH).
+  - job_name: promscale
+    metrics_path: "/metrics-text"
+    static_configs:
+      - targets: ["promscale-connector1:9201", "promscale-connector2:9201"]
+
diff --git a/docker-compose/promscale_prometheus.yml b/docker-compose/promscale_prometheus.yml
@@ -0,0 +1,16 @@
+# Rule files to load: one with recording rules, one with alerts.
+rule_files:
+  - "rules.yml"
+  - "alerts.yml"
+
+# Alert delivery: relabeling applied to alerts, then the Alertmanager targets.
+alerting:
+  alert_relabel_configs:
+    # Sets the env label to "production" on alerts.
+    - action: replace
+      target_label: env
+      replacement: production
+  alertmanagers:
+    - static_configs:
+        # 9093 is Alertmanager's default port.
+        - targets: ["alertmanager:9093"]
diff --git a/docker-compose/rules.yml b/docker-compose/rules.yml
@@ -0,0 +1,7 @@
+groups:
+  - name: rules
+    rules:  # recording rules: each stores the result of expr under the record name
+      - record: instance_cpu:node_cpu_seconds_not_idle:rate5m
+        expr: >
+          sum(rate(node_cpu_seconds_total{mode!="idle"}[5m]))
+          without (mode,cpu)