Skip to content

Commit

Permalink
[Platform] New Health check for Replication status in 2dc setup #4843
Browse files Browse the repository at this point in the history
Summary:
Add ability to configure alerts to be triggered based off of prometheus queries.

Also add a UI for the replication lag alert. It can be turned on/off at runtime, and the threshold (in ms) is configurable by the user. The default is 3 minutes (180000 ms).

Test Plan:
Use a proxy metric (cpu_utime) to test the end-to-end flow including UI + triggering alert + resolving alert (through flipping the ">" to "<" in the query) + verifying corresponding emails are sent if the alert is triggered + resolved/not sent if the setting is disabled in the UI.

```
---------- MESSAGE FOLLOWS ----------
Content-Type: multipart/alternative;
 boundary="===============1491229132144892902=="
MIME-Version: 1.0
Subject: Yugabyte Platform Alert - <[admin][admin]>
From:
To: daniel@yugabyte.com
X-Peer: 127.0.0.1

--===============1491229132144892902==
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit

Replication Lag Alert for daniel-test-backup-alert-oct-20-4 is firing.
--===============1491229132144892902==--
------------ END MESSAGE ------------
```

{F14427}

{F14428}

{F14429}

Reviewers: arnav, wesley, spotachev, sanketh, andrew

Reviewed By: sanketh, andrew

Subscribers: andrew, sshevchenko, jenkins-bot

Differential Revision: https://phabricator.dev.yugabyte.com/D9726
  • Loading branch information
daniel-yb committed Nov 12, 2020
1 parent bc8d4b5 commit 12a69e4
Show file tree
Hide file tree
Showing 22 changed files with 996 additions and 61 deletions.
36 changes: 23 additions & 13 deletions managed/devops/bin/cluster_health.py
Expand Up @@ -499,15 +499,25 @@ def send_email(subject, msg, sender, destination):
s.quit()


def send_alert_email(customer_tag, task_info_json, destination):
if task_info_json['alertname'] == 'Backup failure':
task_type = task_info_json['task_type']
target_type = task_info_json['target_type']
target_name = task_info_json['target_name']
task_info = task_info_json['task_info']
subject = "Yugabyte Platform Alert - <{}>".format(customer_tag)
msg_content = "{} {} failed for {}.\n\nTask Info: {}"\
def send_alert_email(customer_tag, alert_info_json, destination):
is_valid = True
subject = "Yugabyte Platform Alert - <{}>".format(customer_tag)
if 'alert_name' in alert_info_json and alert_info_json['alert_name'] == 'Backup failure':
task_type = alert_info_json['task_type']
target_type = alert_info_json['target_type']
target_name = alert_info_json['target_name']
task_info = alert_info_json['task_info']
msg_content = "{} {} failed for {}.\n\nTask Info: {}" \
.format(task_type, target_type, target_name, task_info)
elif 'alert_name' in alert_info_json:
alert_name = alert_info_json['alert_name']
alert_state = alert_info_json['state']
universe_name = alert_info_json['universe_name']
msg_content = "{} for {} is {}.".format(alert_name, universe_name, alert_state)
else:
logging.error("Invalid alert_info_json")
is_valid = False
if is_valid:
sender = EMAIL_FROM
msg = MIMEMultipart('alternative')
msg['Subject'] = subject
Expand Down Expand Up @@ -762,13 +772,13 @@ def main():
help='Only report nodes with errors')
parser.add_argument('--send_notification', action="store_true",
help='Whether this is to alert to notify on or not')
parser.add_argument('--task_info', type=str, default=None, required=False,
parser.add_argument('--alert_info', type=str, default=None, required=False,
help='JSON serialized payload of backups that have failed')
args = parser.parse_args()
if args.send_notification and args.task_info is not None:
task_info_json = json.loads(args.task_info)
send_alert_email(args.customer_tag, task_info_json, args.destination)
print(task_info_json)
if args.send_notification and args.alert_info is not None:
alert_info_json = json.loads(args.alert_info)
send_alert_email(args.customer_tag, alert_info_json, args.destination)
print(alert_info_json)
elif args.cluster_payload is not None and args.universe_name is not None:
universe = UniverseDefinition(args.cluster_payload)
coordinator = CheckCoordinator(args.retry_interval_secs)
Expand Down
3 changes: 3 additions & 0 deletions managed/src/main/java/Module.java
Expand Up @@ -5,6 +5,7 @@
import com.yugabyte.yw.cloud.AWSInitializer;
import com.yugabyte.yw.commissioner.HealthChecker;
import com.yugabyte.yw.commissioner.CallHome;
import com.yugabyte.yw.commissioner.QueryAlerts;
import com.yugabyte.yw.commissioner.SetUniverseKey;
import com.yugabyte.yw.common.*;
import com.yugabyte.yw.controllers.PlatformHttpActionAdapter;
Expand Down Expand Up @@ -78,6 +79,8 @@ public void configure() {
bind(SetUniverseKey.class).asEagerSingleton();
bind(CustomerTaskManager.class).asEagerSingleton();
bind(YamlWrapper.class).asEagerSingleton();
bind(AlertManager.class).asEagerSingleton();
bind(QueryAlerts.class).asEagerSingleton();

final CallbackController callbackController = new CallbackController();
callbackController.setDefaultUrl(config.getString("yb.url", ""));
Expand Down
Expand Up @@ -15,7 +15,10 @@
import com.yugabyte.yw.common.HealthManager;
import com.yugabyte.yw.common.ShellProcessHandler;
import com.yugabyte.yw.forms.CustomerRegisterFormData;
import com.yugabyte.yw.models.*;
import com.yugabyte.yw.models.Customer;
import com.yugabyte.yw.models.CustomerConfig;
import com.yugabyte.yw.models.CustomerTask;
import com.yugabyte.yw.models.Universe;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -175,7 +178,7 @@ public void sendNotification() {
.put("task_type", task.getType().name())
.put("target_type", task.getTarget().name())
.put("target_name", task.getNotificationTargetName())
.put("task_info", taskInfo);
.put("alert_info", taskInfo);
String customerTag = String.format("[%s][%s]", customer.name, customer.code);
List<String> destinations = new ArrayList<>();
String ybEmail = appConfig.getString("yb.health.default_email", null);
Expand Down
Expand Up @@ -119,10 +119,9 @@ private void initialize() {
);

try {
healthMetric = Gauge.build(kUnivMetricName, "Boolean result of health checks").
labelNames(kUnivUUIDLabel, kUnivNameLabel, kNodeLabel, kCheckLabel).
register(this.promRegistry);

healthMetric = Gauge.build(kUnivMetricName, "Boolean result of health checks")
.labelNames(kUnivUUIDLabel, kUnivNameLabel, kNodeLabel, kCheckLabel)
.register(this.promRegistry);
} catch (IllegalArgumentException e) {
LOG.warn("Failed to build prometheus gauge for name: " + kUnivMetricName);
}
Expand Down
127 changes: 127 additions & 0 deletions managed/src/main/java/com/yugabyte/yw/commissioner/QueryAlerts.java
@@ -0,0 +1,127 @@
/*
* Copyright 2020 YugaByte, Inc. and Contributors
*
* Licensed under the Polyform Free Trial License 1.0.0 (the "License"); you
* may not use this file except in compliance with the License. You
* may obtain a copy of the License at
*
* https://github.com/YugaByte/yugabyte-db/blob/master/licenses/POLYFORM-FREE-TRIAL-LICENSE-1.0.0.txt
*/

package com.yugabyte.yw.commissioner;

import akka.actor.ActorSystem;
import com.google.common.annotations.VisibleForTesting;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.yugabyte.yw.common.AlertManager;
import com.yugabyte.yw.metrics.MetricQueryHelper;
import com.yugabyte.yw.models.*;

import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.concurrent.ExecutionContext;
import scala.concurrent.duration.Duration;

@Singleton
public class QueryAlerts {
public static final Logger LOG = LoggerFactory.getLogger(QueryAlerts.class);

private AtomicBoolean running = new AtomicBoolean(false);

private final ActorSystem actorSystem;

private final ExecutionContext executionContext;

private final MetricQueryHelper queryHelper;

private final AlertManager alertManager;

private final int YB_QUERY_ALERTS_INTERVAL = 1;

@Inject
public QueryAlerts(
ExecutionContext executionContext,
ActorSystem actorSystem,
AlertManager alertManager,
MetricQueryHelper queryHelper
) {
this.actorSystem = actorSystem;
this.executionContext = executionContext;
this.queryHelper = queryHelper;
this.alertManager = alertManager;
this.initialize();
}

public void setRunningState(AtomicBoolean state) {
this.running = state;
}

private void initialize() {
this.actorSystem.scheduler().schedule(
Duration.create(0, TimeUnit.MINUTES),
Duration.create(YB_QUERY_ALERTS_INTERVAL, TimeUnit.MINUTES),
this::scheduleRunner,
this.executionContext
);
}

public Set<Alert> processAlertDefinitions(UUID customerUUID) {
Set<Alert> alertsStillActive = new HashSet<>();
AlertDefinition.listActive(customerUUID).forEach(definition -> {
if (!queryHelper.queryDirect(definition.query).isEmpty()) {
Universe universe = Universe.get(definition.universeUUID);
Alert existingAlert = Alert.getActiveCustomerAlert(customerUUID, definition.uuid);
// Create an alert to activate if it doesn't exist already
if (existingAlert == null) {
Alert.create(
customerUUID,
definition.universeUUID,
Alert.TargetType.UniverseType,
"CUSTOMER_ALERT",
"Error",
String.format("%s for %s is firing", definition.name, universe.name),
definition.isActive,
definition.uuid
);
} else {
alertsStillActive.add(existingAlert);
}
}
});

return alertsStillActive;
}

@VisibleForTesting
void scheduleRunner() {
if (running.compareAndSet(false, true)) {
try {
Set<Alert> alertsStillActive = new HashSet<>();

// Pick up all alerts still active + create new alerts
Customer.getAll().forEach(c -> alertsStillActive.addAll(processAlertDefinitions(c.uuid)));

// Pick up all created alerts that are waiting to be activated
Set<Alert> alertsToTransition = new HashSet<>(Alert.listToActivate());

// Pick up all alerts that should be resolved internally but are currently active
Customer.getAll().forEach(c ->
Alert.listActiveCustomerAlerts(c.uuid).forEach(alert -> {
if (!alertsStillActive.contains(alert)) alertsToTransition.add(alert);
}));

// Trigger alert transitions
alertsToTransition.forEach(alertManager::transitionAlert);
} catch (Exception e) {
LOG.error("Error querying for alerts", e);
}

running.set(false);
}
}
}
114 changes: 114 additions & 0 deletions managed/src/main/java/com/yugabyte/yw/common/AlertManager.java
@@ -0,0 +1,114 @@
/*
* Copyright 2020 YugaByte, Inc. and Contributors
*
* Licensed under the Polyform Free Trial License 1.0.0 (the "License"); you
* may not use this file except in compliance with the License. You
* may obtain a copy of the License at
*
* https://github.com/YugaByte/yugabyte-db/blob/master/licenses/POLYFORM-FREE-TRIAL-LICENSE-1.0.0.txt
*/

package com.yugabyte.yw.common;

import com.fasterxml.jackson.databind.node.ObjectNode;
import com.yugabyte.yw.forms.CustomerRegisterFormData;
import com.yugabyte.yw.models.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import play.libs.Json;
import play.Configuration;

import javax.inject.Inject;
import javax.inject.Singleton;
import java.util.ArrayList;
import java.util.List;

@Singleton
public class AlertManager {
@Inject
HealthManager healthManager;

@Inject
Configuration appConfig;


public static final Logger LOG = LoggerFactory.getLogger(AlertManager.class);

public void sendEmail(Alert alert, String state) {
if (!alert.sendEmail) {
return;
}

AlertDefinition definition = AlertDefinition.get(alert.definitionUUID);
Universe universe = Universe.get(definition.universeUUID);
ObjectNode alertData = Json.newObject()
.put("alert_name", definition.name)
.put("state", state)
.put("universe_name", universe.name);
Customer customer = Customer.get(alert.customerUUID);
String customerTag = String.format("[%s][%s]", customer.name, customer.code);
List<String> destinations = new ArrayList<>();
String ybEmail = appConfig.getString("yb.health.default_email", null);
CustomerConfig config = CustomerConfig.getAlertConfig(customer.uuid);
CustomerRegisterFormData.AlertingData alertingData =
Json.fromJson(config.data, CustomerRegisterFormData.AlertingData.class);
if (alertingData.sendAlertsToYb && ybEmail != null && !ybEmail.isEmpty()) {
destinations.add(ybEmail);
}

if (alertingData.alertingEmail != null && !alertingData.alertingEmail.isEmpty()) {
destinations.add(alertingData.alertingEmail);
}

// Skip sending email if there aren't any destinations to send it to
if (destinations.isEmpty()) {
return;
}

CustomerConfig smtpConfig = CustomerConfig.getSmtpConfig(customer.uuid);
CustomerRegisterFormData.SmtpData smtpData = null;
if (smtpConfig != null) {
smtpData = Json.fromJson(smtpConfig.data, CustomerRegisterFormData.SmtpData.class);
}

healthManager.runCommand(
customerTag,
String.join(",", destinations),
smtpData,
alertData
);
}

/**
* A method to run a state transition for a given alert
*
* @param alert the alert to transition states on
* @return the alert in a new state
*/
public Alert transitionAlert(Alert alert) {
try {
switch (alert.state) {
case CREATED:
LOG.info("Transitioning alert {} to active", alert.uuid);
sendEmail(alert, "firing");
alert.state = Alert.State.ACTIVE;
break;
case ACTIVE:
LOG.info("Transitioning alert {} to resolved", alert.uuid);
sendEmail(alert, "resolved");
alert.state = Alert.State.RESOLVED;
break;
case RESOLVED:
LOG.info("Transitioning alert {} to resolved", alert.uuid);
alert.state = Alert.State.RESOLVED;
break;
}

alert.save();
} catch (Exception e) {
LOG.error("Error transitioning alert state for alert {}", alert.uuid, e);
}

return alert;
}
}
Expand Up @@ -174,7 +174,7 @@ public ShellProcessHandler.ShellResponse runCommand(

if (isTaskNotification) {
commandArgs.add("--send_notification");
commandArgs.add("--task_info");
commandArgs.add("--alert_info");
commandArgs.add(Json.stringify(taskInfo));
}

Expand Down

0 comments on commit 12a69e4

Please sign in to comment.