Skip to content

Commit

Permalink
Performance Grafana alerts
Browse files Browse the repository at this point in the history
Bug: T156245

Introduces a Nagios adapter for Grafana alerts and
sets one up for WebPageTest.

Change-Id: Ieb902f45b9b3e54e7e21a8bae461df58f3d09f7a
  • Loading branch information
Gilles Dubuc authored and filippog committed Mar 24, 2017
1 parent c055f48 commit 401973a
Show file tree
Hide file tree
Showing 9 changed files with 128 additions and 0 deletions.
1 change: 1 addition & 0 deletions modules/icinga/manifests/ircbot.pp
Expand Up @@ -13,6 +13,7 @@
'/var/log/icinga/irc-analytics.log' => '#wikimedia-analytics',
'/var/log/icinga/irc-ores.log' => '#wikimedia-ai',
'/var/log/icinga/irc-interactive.log' => '#wikimedia-interactive',
'/var/log/icinga/irc-performance.log' => '#wikimedia-perf-bots',
}
$ircecho_nick = 'icinga-wm'
$ircecho_server = 'chat.freenode.net'
Expand Down
8 changes: 8 additions & 0 deletions modules/icinga/manifests/monitor/performance.pp
@@ -0,0 +1,8 @@
# == Class: icinga::monitor::performance
#
# Declares the Icinga check that forwards Grafana alerts from the
# 'db/webpagetest-alerts' dashboard, with notifications routed to the
# 'performance' contact group.
class icinga::monitor::performance {
    monitoring::grafana_alert { 'db/webpagetest-alerts': contact_group => 'performance' }
}
40 changes: 40 additions & 0 deletions modules/monitoring/manifests/grafana_alert.pp
@@ -0,0 +1,40 @@
# == Define: monitoring::grafana_alert
#
# Provisions an Icinga check that "forwards" Grafana alerts
# for a given dashboard.
#
# === Parameters
#
# [*dashboard*]
#   Grafana dashboard URI. For example: 'db/webpagetest-alerts'.
#   Defaults to the resource title.
#
# [*ensure*]
#   Passed through to the underlying monitoring::service resource.
#   Defaults to present.
#
# [*grafana_url*]
#   URL of Grafana.
#   Defaults to 'https://grafana.wikimedia.org'.
#
# [*contact_group*]
#   Icinga contact group that should receive alerts.
#   Defaults to 'admins'.
#
# === Examples
#
#   # Emit a critical if any Grafana alert on the db/webpagetest-alerts
#   # dashboard is in "alerting" state.
#   monitoring::grafana_alert { 'db/webpagetest-alerts':
#       contact_group => 'performance',
#   }
#
define monitoring::grafana_alert(
    $dashboard     = $title,
    $ensure        = present,
    $grafana_url   = 'https://grafana.wikimedia.org',
    $contact_group = 'admins'
) {
    # Use $dashboard (not $title) so that an explicitly passed dashboard
    # URI is honoured; with the default it is identical to the title.
    monitoring::service { $title:
        ensure        => $ensure,
        description   => "${grafana_url}/dashboard/${dashboard} grafana alert",
        check_command => "check_grafana_alert!${dashboard}!${grafana_url}",
        contact_group => $contact_group,
    }
}
@@ -0,0 +1,4 @@
# Check command wrapping the check_grafana_alert.py plugin.
# Both arguments are passed through positionally to the script:
#   $ARG1$ - first positional argument
#   $ARG2$ - second positional argument
define command{
command_name check_grafana_alert
command_line $USER1$/check_grafana_alert.py $ARG1$ $ARG2$
}
56 changes: 56 additions & 0 deletions modules/nagios_common/files/check_commands/check_grafana_alert.py
@@ -0,0 +1,56 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
check_grafana_alert
~~~~~~~~~~~~~~~~~~~~~~~~
Checks a Grafana dashboard and generates CRITICAL states if
it has Grafana alerts in "alerting" state.
Usage:
check_grafana_alert DASHBOARD_URI GRAFANA_URL
Positional arguments:
DASHBOARD_URI Grafana dashboard URI
GRAFANA_URL URL of grafana
"""
from __future__ import print_function

import sys
reload(sys)
sys.setdefaultencoding("utf-8")

import argparse
import json
import urllib2


# Command line: two required positional arguments, the dashboard URI to
# filter on and the base URL of the Grafana instance to query.
ap = argparse.ArgumentParser(description='Grafana dashboard alert')
ap.add_argument('dashboard', help='dashboard URI')
ap.add_argument('grafana_url', help="URL of grafana")
args = ap.parse_args()

# Names of the alerts on the requested dashboard that are currently firing.
alerting_names = []

try:
    url = args.grafana_url + '/api/alerts'
    response = urllib2.urlopen(url)
    try:
        data = json.load(response)
    finally:
        # Always release the HTTP connection, even if the JSON is invalid.
        response.close()

    for record in data:
        if 'dashboardUri' in record and record['dashboardUri'] == args.dashboard:
            if 'state' in record and record['state'] == 'alerting' and 'name' in record:
                alerting_names.append(record['name'])
except Exception as e:
    # Format the exception itself: most exception types (URLError,
    # ValueError, socket errors, ...) have no ``msg`` attribute, and
    # reading it would raise AttributeError from inside this handler
    # instead of reporting UNKNOWN.
    print('UNKNOWN: failed to check %s/dashboard/%s due to exception: %s' % (
        args.grafana_url, args.dashboard, e))
    sys.exit(3)

# The status line goes to stdout: that is the stream the monitoring
# system captures as the plugin output. Exit codes follow the plugin
# convention used here: 0 = OK, 2 = CRITICAL, 3 = UNKNOWN.
if len(alerting_names) > 0:
    print('CRITICAL: %s/dashboard/%s is alerting: %s.' % (
        args.grafana_url, args.dashboard, ', '.join(alerting_names)))
    sys.exit(2)
else:
    print('OK: %s/dashboard/%s is not alerting.' % (
        args.grafana_url, args.dashboard))
    sys.exit(0)
6 changes: 6 additions & 0 deletions modules/nagios_common/files/contactgroups.cfg
Expand Up @@ -95,3 +95,9 @@ define contactgroup {
contactgroup_name wikitech-static
contactgroup_members admins
}

# Contact group used for performance alerts.
# NOTE(review): contactgroup_members lists other contact *groups*; if
# performance-team and irc-performance are individual contacts, the
# `members` directive is needed instead - confirm against their definitions.
define contactgroup {
contactgroup_name performance
contactgroup_members performance-team,irc-performance
}

1 change: 1 addition & 0 deletions modules/nagios_common/manifests/commands.pp
Expand Up @@ -44,6 +44,7 @@
'check_all_memcached.php',
'check_bgp',
'check_dsh_groups',
'check_grafana_alert',
'check_graphite',
'check_graphite_freshness',
'check_ifstatus_nomon',
Expand Down
11 changes: 11 additions & 0 deletions modules/nagios_common/templates/notification_commands.cfg.erb
Expand Up @@ -144,3 +144,14 @@ define command{
command_name notify-service-by-irc-ores
command_line echo "$NOTIFICATIONTYPE$ - $SERVICEDESC$ on $HOSTNAME$ is $SERVICESTATE$: $SERVICEOUTPUT$ $SERVICEACKAUTHOR$ $SERVICEACKCOMMENT$" >> <%= @irc_dir_path %>/irc-ores.log
}

# IRC output for performance
# Host notification: appends a one-line summary (notification type, host
# alias, host state, check output, ack author/comment) to irc-performance.log
# under the configured IRC log directory.
define command{
command_name notify-host-by-irc-performance
command_line echo "$NOTIFICATIONTYPE$ - Host $HOSTALIAS$ is $HOSTSTATE$: $HOSTOUTPUT$ $HOSTACKAUTHOR$ $HOSTACKCOMMENT$" >> <%= @irc_dir_path %>/irc-performance.log
}

# Service notification: appends a one-line summary (notification type,
# service description, host name, service state, check output, ack
# author/comment) to irc-performance.log under the configured IRC log directory.
define command{
command_name notify-service-by-irc-performance
command_line echo "$NOTIFICATIONTYPE$ - $SERVICEDESC$ on $HOSTNAME$ is $SERVICESTATE$: $SERVICEOUTPUT$ $SERVICEACKAUTHOR$ $SERVICEACKCOMMENT$" >> <%= @irc_dir_path %>/irc-performance.log
}
1 change: 1 addition & 0 deletions modules/role/manifests/icinga.pp
Expand Up @@ -26,6 +26,7 @@
include icinga::monitor::commons
include icinga::monitor::elasticsearch
include icinga::monitor::wdqs
include icinga::monitor::performance
include icinga::event_handlers::raid

include role::authdns::monitoring
Expand Down

0 comments on commit 401973a

Please sign in to comment.