Skip to content

Commit

Permalink
Merge pull request #72 from technicalpickles/revisit-prometheus
Browse files Browse the repository at this point in the history
Revisit prometheus
  • Loading branch information
technicalpickles committed Nov 28, 2018
2 parents c1f43ed + 41cf019 commit f0304a5
Show file tree
Hide file tree
Showing 7 changed files with 150 additions and 2 deletions.
2 changes: 2 additions & 0 deletions .env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ DNSIMPLE_EMAIL=test@example.com
EXTERNAL_HOSTNAME=example.com
INTERNAL_HOSTNAME=example.local
HOMEASSISTANT_PROMETHEUS_BEARER_TOKEN=12234
ALERTMANAGER_SLACK_CHANNEL=ops
ALERTMANAGER_SLACK_WEBHOOK_URL=https://hooks.slack.com/services/blah/blah/blah
12 changes: 12 additions & 0 deletions alertmanager/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Alertmanager image whose Slack receiver settings are rendered into
# /etc/alertmanager/config.yml at build time.
#
# Build args:
#   slack_channel      replaces %%slack_channel%% in config.yml
#   slack_webhook_url  replaces %%slack_webhook_url%% in config.yml
#
# NOTE(review): build args are visible in `docker history` and the rendered
# config is stored in an image layer, so treat the built image as containing
# the webhook URL. Supplying the config at runtime would avoid that.

# Pinned so rebuilds are reproducible; bump deliberately.
FROM prom/alertmanager:v0.15.3

ARG slack_channel
ARG slack_webhook_url

COPY config.yml /etc/alertmanager/config.yml

# Fail the build when an arg is missing instead of silently writing an empty
# value into the config. '=' is the sed delimiter because the webhook URL
# contains '/'.
RUN : "${slack_channel:?build arg slack_channel is required}" \
    && : "${slack_webhook_url:?build arg slack_webhook_url is required}" \
    && sed -i \
        -e "s=%%slack_channel%%=$slack_channel=" \
        -e "s=%%slack_webhook_url%%=$slack_webhook_url=" \
        /etc/alertmanager/config.yml
11 changes: 11 additions & 0 deletions alertmanager/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Alertmanager routing: every alert is delivered to the single Slack receiver.
# The %%...%% placeholders are substituted by sed in alertmanager/Dockerfile.
route:
  receiver: 'slack'

receivers:
  - name: 'slack'
    slack_configs:
      - api_url: '%%slack_webhook_url%%'
        channel: '#%%slack_channel%%'
        username: 'Prometheus'
        # Message body is the description annotation common to the alerts.
        text: "{{ .CommonAnnotations.description }}"
        # Also notify when an alert clears.
        send_resolved: true
61 changes: 60 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,39 @@ services:
- prometheus-data:/prometheus-data
network_mode: "host"

# Kept for reference only: node-exporter runs directly on the host OS instead of in a container.
#node-exporter:
# image: prom/node-exporter
# restart: always
# user: root
# privileged: true
# ports:
# - "9100:9100"
# volumes:
# - "/proc:/host/proc:ro"
# - "/sys:/host/sys:ro"
# - "/:/host:ro,rslave"
# network_mode: "host"
# pid: "host"
# command:
# - "--path.procfs=/host/proc"
# - "--path.sysfs=/host/sys"
# #- '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host/var/lib/snapd/hostfs/var/lib/lxcfs|host/var/lib/snapd)($$|/)'
# #- '--collector.filesystem.ignored-fs-types=^(aufs|autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|efivarfs|fusectl|hugetlbfs|lxcfs|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|squashfs|sysfs|tmpfs|tracefs)$$'

# cAdvisor: container metrics endpoint, scraped by the 'cadvisor' job in
# prometheus.yml on localhost:8080.
cadvisor:
  # Pinned so a re-pull cannot silently change the running version.
  image: google/cadvisor:v0.32.0
  restart: always
  privileged: true
  # Host networking already makes port 8080 reachable on the host; a `ports:`
  # mapping must not be combined with network_mode: host, so none is declared.
  network_mode: "host"
  volumes:
    - "/:/rootfs:ro"
    - "/var/run:/var/run:rw"
    - "/sys:/sys:ro"
    # NOTE(review): snap-installed Docker path mapped onto /var/lib/docker;
    # confirm this is the directory cAdvisor is meant to read.
    - "/var/snap/docker/current/run/docker:/var/lib/docker:ro"

influxdb:
image: influxdb:1.4
container_name: influxdb
Expand All @@ -134,11 +167,37 @@ services:
- 3000:3000
volumes:
- grafana-data:/var/lib/grafana
command: --config /etc/grafana/grafana.ini
- ./grafana/datasources:/etc/grafana/datasources
- ./grafana/dashboards:/etc/grafana/dashboards
- ./grafana/setup.sh:/setup.sh
entrypoint: /setup.sh
environment:
- GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin}
- GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}
- GF_USERS_ALLOW_SIGN_UP=false
depends_on:
- influxdb
- prometheus
restart: always
network_mode: host

# Alertmanager, built from ./alertmanager so the Slack settings are passed in
# as build args and rendered into its config at image build time.
alertmanager:
  restart: always
  network_mode: host
  build:
    context: ./alertmanager/
    args:
      - "slack_channel=$ALERTMANAGER_SLACK_CHANNEL"
      - "slack_webhook_url=$ALERTMANAGER_SLACK_WEBHOOK_URL"
  # Alternatives kept for debugging: stock image, shell entrypoint, and a
  # bind-mounted config directory.
  #image: prom/alertmanager
  #entrypoint: /bin/sh
  #volumes:
  #  - ./alertmanager/:/etc/alertmanager/
  command:
    - "--config.file=/etc/alertmanager/config.yml"
    - "--storage.path=/alertmanager"
  expose:
    - 9093

airconnect:
restart: always
Expand Down
5 changes: 4 additions & 1 deletion prometheus/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Prometheus image with its scrape config and alert rules baked in.
#
# Build args:
#   bearer_token  replaces %%bearer_token%% in prometheus.yml
#
# NOTE(review): build args are visible in `docker history` and the rendered
# config is stored in an image layer, so treat the built image as containing
# the token.

# Pinned so rebuilds are reproducible; bump deliberately.
FROM prom/prometheus:v2.5.0

ARG bearer_token

# prometheus.yml is copied only to the path rendered by sed below. The former
# extra copy to /etc/prometheus/prometheus/prometheus.yml left an unrendered
# template (placeholder intact) in the image.
COPY prometheus.yml /etc/prometheus/prometheus.yml
COPY alert.rules /etc/prometheus/alert.rules

# '/' is the sed delimiter here, so the token must not contain '/'.
RUN sed -i -e "s/%%bearer_token%%/$bearer_token/" \
    /etc/prometheus/prometheus.yml
40 changes: 40 additions & 0 deletions prometheus/alert.rules
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Prometheus alerting rules, referenced from rule_files in prometheus.yml.
groups:
  # Scrape-target availability.
  - name: targets
    rules:
      # Fires when a target has reported up == 0 for 30s.
      - alert: monitor_service_down
        expr: up == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: "Monitor service non-operational"
          description: "Service {{ $labels.instance }} is down."

  # Host resource pressure, computed from node_* metrics.
  - name: host
    rules:
      # 1-minute load average above 1.5 for 30s.
      - alert: high_cpu_load
        expr: node_load1 > 1.5
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: "Server under high load"
          description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."

      # Memory in use (total minus free, buffers and cached) above 85%.
      - alert: high_memory_load
        expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: "Server memory is almost full"
          description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."

      # Used space above 85%; only filesystems with fstype="aufs" are matched.
      - alert: high_storage_load
        expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: "Server storage is almost full"
          description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
21 changes: 21 additions & 0 deletions prometheus/prometheus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ global:
external_labels:
monitor: 'codelab-monitor'

# Alerting rule files to load; the relative path is resolved against the
# directory of this config file.
rule_files:
  - alert.rules

# Scrape configurations: Prometheus itself, followed by the other local
# targets defined below.
scrape_configs:
Expand All @@ -26,3 +29,21 @@ scrape_configs:
scheme: http
static_configs:
- targets: ['localhost:8123']

# node-exporter target on localhost:9100.
- job_name: 'node-exporter'
  static_configs:
    - targets:
        - 'localhost:9100'

# cAdvisor target on localhost:8080, scraped every 5s instead of the global
# interval.
- job_name: 'cadvisor'
  scrape_interval: 5s
  static_configs:
    - targets:
        - 'localhost:8080'

# Send fired alerts to the Alertmanager at localhost:9093 over plain HTTP.
alerting:
  alertmanagers:
    - scheme: http
      static_configs:
        - targets:
            - 'localhost:9093'
            # Service-name form, kept for reference:
            #- 'alertmanager:9093'

0 comments on commit f0304a5

Please sign in to comment.