Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

30s down-moratorium before allowing suspension #14455

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -12,6 +12,10 @@
public class ApplicationInstanceId {
public static final ApplicationInstanceId CONFIG_SERVER = new ApplicationInstanceId("zone-config-servers");
public static final ApplicationInstanceId CONTROLLER = new ApplicationInstanceId("controller");
// Unfortunately, for config server host the ApplicationInstanceId is: configserver-host:prod:cd-us-central-1:default
public boolean isConfigServerHost() { return id.startsWith("configserver-host:"); }
public static final ApplicationInstanceId CONTROLLER_HOST = new ApplicationInstanceId("controller-host:prod:default:default");
public boolean isTenantHost() { return id.startsWith("tenant-host:"); }

private final String id;

Expand Down
Expand Up @@ -10,7 +10,7 @@
* @author bjorncs
*/
// TODO: Remove this and use ApplicationId instead (if you need it for the JSON stuff move it to that layer and don't let it leak)
public class ApplicationInstanceReference {
public class ApplicationInstanceReference implements Comparable<ApplicationInstanceReference> {

private final TenantId tenantId;
private final ApplicationInstanceId applicationInstanceId;
Expand Down Expand Up @@ -42,6 +42,11 @@ public String asString() {
return tenantId.s() + ":" + applicationInstanceId.s();
}

@Override
public int compareTo(ApplicationInstanceReference o) {
return this.asString().compareTo(o.asString());
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
Expand Down
Expand Up @@ -12,6 +12,9 @@ public class ClusterId {

public static final ClusterId CONFIG_SERVER = new ClusterId("zone-config-servers");
public static final ClusterId CONTROLLER = new ClusterId("controller");
public static final ClusterId CONFIG_SERVER_HOST = new ClusterId("configserver-host");
public static final ClusterId CONTROLLER_HOST = new ClusterId("controller-host");
public static final ClusterId TENANT_HOST = new ClusterId("tenant-host");

private final String id;

Expand Down
Expand Up @@ -7,6 +7,7 @@
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.function.Predicate;

/**
* Represents a collection of service instances that together make up a service with a single cluster id.
Expand Down Expand Up @@ -51,24 +52,53 @@ public ApplicationInstance getApplicationInstance() {
return applicationInstance.get();
}

public boolean isConfigServerClusterLike() {
// config server
if (Objects.equals(applicationInstance.map(ApplicationInstance::tenantId), Optional.of(TenantId.HOSTED_VESPA)) &&
Objects.equals(applicationInstance.map(ApplicationInstance::applicationInstanceId), Optional.of(ApplicationInstanceId.CONFIG_SERVER)) &&
Objects.equals(clusterId, ClusterId.CONFIG_SERVER) &&
Objects.equals(serviceType, ServiceType.CONFIG_SERVER)) {
return true;
}
public boolean isConfigServerLike() {
return isConfigServer() || isController();
}

// controller
if (Objects.equals(applicationInstance.map(ApplicationInstance::tenantId), Optional.of(TenantId.HOSTED_VESPA)) &&
Objects.equals(applicationInstance.map(ApplicationInstance::applicationInstanceId), Optional.of(ApplicationInstanceId.CONTROLLER)) &&
public boolean isController() {
return isHostedVespaApplicationWithId(ApplicationInstanceId.CONTROLLER) &&
Objects.equals(clusterId, ClusterId.CONTROLLER) &&
Objects.equals(serviceType, ServiceType.CONTROLLER)) {
return true;
}
Objects.equals(serviceType, ServiceType.CONTROLLER);
}

/** Is a config server (and not controller!) */
public boolean isConfigServer() {
return isHostedVespaApplicationWithId(ApplicationInstanceId.CONFIG_SERVER) &&
Objects.equals(clusterId, ClusterId.CONFIG_SERVER) &&
Objects.equals(serviceType, ServiceType.CONFIG_SERVER);
}

public boolean isConfigServerHost() {
return isHostedVespaApplicationWithPredicate(ApplicationInstanceId::isConfigServerHost) &&
Objects.equals(clusterId, ClusterId.CONFIG_SERVER_HOST) &&
Objects.equals(serviceType, ServiceType.HOST_ADMIN);
}

public boolean isControllerHost() {
return isHostedVespaApplicationWithId(ApplicationInstanceId.CONTROLLER_HOST) &&
Objects.equals(clusterId, ClusterId.CONTROLLER_HOST) &&
Objects.equals(serviceType, ServiceType.HOST_ADMIN);
}

public boolean isTenantHost() {
return isHostedVespaApplicationWithPredicate(ApplicationInstanceId::isTenantHost) &&
Objects.equals(clusterId, ClusterId.TENANT_HOST) &&
Objects.equals(serviceType, ServiceType.HOST_ADMIN);
}

private boolean isHostedVespaApplicationWithId(ApplicationInstanceId id) {
return isHostedVespaTenant() &&
applicationInstance.map(app -> Objects.equals(app.applicationInstanceId(), id)).orElse(false);
}

private boolean isHostedVespaApplicationWithPredicate(Predicate<ApplicationInstanceId> predicate) {
return isHostedVespaTenant() &&
applicationInstance.map(app -> predicate.test(app.applicationInstanceId())).orElse(false);
}

return false;
private boolean isHostedVespaTenant() {
return applicationInstance.map(a -> Objects.equals(a.tenantId(), TenantId.HOSTED_VESPA)).orElse(false);
}

@Override
Expand Down
Expand Up @@ -66,6 +66,27 @@ public String toString() {
'}';
}

/**
* Get a name that can be used in e.g. config server logs that makes it easy to understand which
* service instance this is.
*/
public String descriptiveName() {
if (getServiceCluster().isController() || getServiceCluster().isConfigServer()) {
return getHostnamePrefix();
} else if (getServiceCluster().isControllerHost() || getServiceCluster().isConfigServerHost()) {
return "host-admin on " + getHostnamePrefix();
} else if (getServiceCluster().isTenantHost()) {
return "host-admin on " + hostName.s();
} else {
return configId.s();
}
}

private String getHostnamePrefix() {
int dotIndex = hostName.s().indexOf('.');
return dotIndex == -1 ? hostName().s() : hostName.s().substring(0, dotIndex);
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
Expand Down
Expand Up @@ -29,6 +29,7 @@
import com.yahoo.vespa.orchestrator.policy.HostedVespaClusterPolicy;
import com.yahoo.vespa.orchestrator.policy.HostedVespaPolicy;
import com.yahoo.vespa.orchestrator.policy.Policy;
import com.yahoo.vespa.orchestrator.policy.SuspensionReasons;
import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus;
import com.yahoo.vespa.orchestrator.status.ApplicationLock;
import com.yahoo.vespa.orchestrator.status.HostInfo;
Expand Down Expand Up @@ -76,13 +77,13 @@ public OrchestratorImpl(ClusterControllerClientFactory clusterControllerClientFa
{
this(new HostedVespaPolicy(new HostedVespaClusterPolicy(),
clusterControllerClientFactory,
new ApplicationApiFactory(configServerConfig.zookeeperserver().size())),
new ApplicationApiFactory(configServerConfig.zookeeperserver().size(), Clock.systemUTC())),
clusterControllerClientFactory,
statusService,
serviceMonitor,
orchestratorConfig.serviceMonitorConvergenceLatencySeconds(),
Clock.systemUTC(),
new ApplicationApiFactory(configServerConfig.zookeeperserver().size()),
new ApplicationApiFactory(configServerConfig.zookeeperserver().size(), Clock.systemUTC()),
flagSource);
}

Expand Down Expand Up @@ -227,6 +228,7 @@ public void acquirePermissionToRemove(HostName hostName) throws OrchestrationExc
void suspendGroup(OrchestratorContext context, NodeGroup nodeGroup) throws HostStateChangeDeniedException {
ApplicationInstanceReference applicationReference = nodeGroup.getApplicationReference();

final SuspensionReasons suspensionReasons;
try (ApplicationLock lock = statusService.lockApplication(context, applicationReference)) {
ApplicationInstanceStatus appStatus = lock.getApplicationInstanceStatus();
if (appStatus == ApplicationInstanceStatus.ALLOWED_TO_BE_DOWN) {
Expand All @@ -235,8 +237,10 @@ void suspendGroup(OrchestratorContext context, NodeGroup nodeGroup) throws HostS

ApplicationApi applicationApi = applicationApiFactory.create(
nodeGroup, lock, clusterControllerClientFactory);
policy.grantSuspensionRequest(context.createSubcontextWithinLock(), applicationApi);
suspensionReasons = policy.grantSuspensionRequest(context.createSubcontextWithinLock(), applicationApi);
}

suspensionReasons.makeLogMessage().ifPresent(log::info);
}

@Override
Expand Down
Expand Up @@ -4,21 +4,26 @@
import com.yahoo.vespa.orchestrator.controller.ClusterControllerClientFactory;
import com.yahoo.vespa.orchestrator.status.ApplicationLock;

import java.time.Clock;

/**
* @author mpolden
*/
public class ApplicationApiFactory {

private final int numberOfConfigServers;
private final Clock clock;

public ApplicationApiFactory(int numberOfConfigServers) {
public ApplicationApiFactory(int numberOfConfigServers, Clock clock) {
this.numberOfConfigServers = numberOfConfigServers;
this.clock = clock;
}

public ApplicationApi create(NodeGroup nodeGroup,
ApplicationLock lock,
ClusterControllerClientFactory clusterControllerClientFactory) {
return new ApplicationApiImpl(nodeGroup, lock, clusterControllerClientFactory, numberOfConfigServers);
return new ApplicationApiImpl(nodeGroup, lock, clusterControllerClientFactory,
numberOfConfigServers, clock);
}

}
Expand Up @@ -14,6 +14,7 @@
import com.yahoo.vespa.orchestrator.status.HostStatus;
import com.yahoo.vespa.orchestrator.status.ApplicationLock;

import java.time.Clock;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
Expand All @@ -33,16 +34,18 @@ public class ApplicationApiImpl implements ApplicationApi {
private final ApplicationInstance applicationInstance;
private final NodeGroup nodeGroup;
private final ApplicationLock lock;
private final Clock clock;
private final List<ClusterApi> clusterInOrder;
private final HostInfos hostInfos;

public ApplicationApiImpl(NodeGroup nodeGroup,
ApplicationLock lock,
ClusterControllerClientFactory clusterControllerClientFactory,
int numberOfConfigServers) {
int numberOfConfigServers, Clock clock) {
this.applicationInstance = nodeGroup.getApplication();
this.nodeGroup = nodeGroup;
this.lock = lock;
this.clock = clock;
Collection<HostName> hosts = getHostsUsedByApplicationInstance(applicationInstance);
this.hostInfos = lock.getHostInfos();
this.clusterInOrder = makeClustersInOrder(nodeGroup, hostInfos, clusterControllerClientFactory, numberOfConfigServers);
Expand Down Expand Up @@ -122,7 +125,8 @@ private List<ClusterApi> makeClustersInOrder(NodeGroup nodeGroup,
nodeGroup,
hostInfos,
clusterControllerClientFactory,
numberOfConfigServers))
numberOfConfigServers,
clock))
.sorted(ApplicationApiImpl::compareClusters)
.collect(Collectors.toList());
}
Expand Down
Expand Up @@ -3,6 +3,7 @@

import com.yahoo.vespa.applicationmodel.ClusterId;
import com.yahoo.vespa.applicationmodel.ServiceType;
import com.yahoo.vespa.orchestrator.policy.SuspensionReasons;

import java.util.Optional;

Expand All @@ -17,7 +18,8 @@ public interface ClusterApi {
ServiceType serviceType();
boolean isStorageCluster();

boolean noServicesInGroupIsUp();
/** Returns the reasons no services are up in the implied group, or empty if some services are up. */
Optional<SuspensionReasons> reasonsForNoServicesInGroupIsUp();
boolean noServicesOutsideGroupIsDown();

int percentageOfServicesDown();
Expand Down
Expand Up @@ -8,9 +8,13 @@
import com.yahoo.vespa.applicationmodel.ServiceStatus;
import com.yahoo.vespa.applicationmodel.ServiceType;
import com.yahoo.vespa.orchestrator.controller.ClusterControllerClientFactory;
import com.yahoo.vespa.orchestrator.policy.SuspensionReasons;
import com.yahoo.vespa.orchestrator.status.HostInfos;
import com.yahoo.vespa.orchestrator.status.HostStatus;

import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
Expand All @@ -24,12 +28,14 @@
* @author hakonhall
*/
class ClusterApiImpl implements ClusterApi {
static final Duration downMoratorium = Duration.ofSeconds(30);

private final ApplicationApi applicationApi;
private final ServiceCluster serviceCluster;
private final NodeGroup nodeGroup;
private final HostInfos hostInfos;
private final ClusterControllerClientFactory clusterControllerClientFactory;
private final Clock clock;
private final Set<ServiceInstance> servicesInGroup;
private final Set<ServiceInstance> servicesDownInGroup;
private final Set<ServiceInstance> servicesNotInGroup;
Expand All @@ -53,12 +59,14 @@ public ClusterApiImpl(ApplicationApi applicationApi,
NodeGroup nodeGroup,
HostInfos hostInfos,
ClusterControllerClientFactory clusterControllerClientFactory,
int numberOfConfigServers) {
int numberOfConfigServers,
Clock clock) {
this.applicationApi = applicationApi;
this.serviceCluster = serviceCluster;
this.nodeGroup = nodeGroup;
this.hostInfos = hostInfos;
this.clusterControllerClientFactory = clusterControllerClientFactory;
this.clock = clock;

Map<Boolean, Set<ServiceInstance>> serviceInstancesByLocality =
serviceCluster.serviceInstances().stream()
Expand All @@ -73,7 +81,7 @@ public ClusterApiImpl(ApplicationApi applicationApi,
servicesDownAndNotInGroup = servicesNotInGroup.stream().filter(this::serviceEffectivelyDown).collect(Collectors.toSet());

int serviceInstances = serviceCluster.serviceInstances().size();
if (serviceCluster.isConfigServerClusterLike() && serviceInstances < numberOfConfigServers) {
if (serviceCluster.isConfigServerLike() && serviceInstances < numberOfConfigServers) {
missingServices = numberOfConfigServers - serviceInstances;
descriptionOfMissingServices = missingServices + " missing config server" + (missingServices > 1 ? "s" : "");
} else {
Expand Down Expand Up @@ -108,8 +116,36 @@ public ApplicationApi getApplication() {
}

@Override
public boolean noServicesInGroupIsUp() {
return servicesDownInGroup.size() == servicesInGroup.size();
public Optional<SuspensionReasons> reasonsForNoServicesInGroupIsUp() {
SuspensionReasons reasons = new SuspensionReasons();

for (ServiceInstance service : servicesInGroup) {
if (hostStatus(service.hostName()).isSuspended()) {
reasons.mergeWith(SuspensionReasons.nothingNoteworthy());
continue;
}

if (service.serviceStatus() == ServiceStatus.DOWN) {
Optional<Instant> since = service.serviceStatusInfo().since();
if (since.isEmpty()) {
reasons.mergeWith(SuspensionReasons.isDown(service));
continue;
}

// Make sure services truly are down for some period of time before we allow suspension.
// On the other hand, a service coming down and up repeatedly should probably
// also be allowed... difficult without keeping track of history in a better way.
final Duration downDuration = Duration.between(since.get(), clock.instant());
if (downDuration.compareTo(downMoratorium) > 0) {
reasons.mergeWith(SuspensionReasons.downSince(service, since.get(), downDuration));
continue;
}
}

return Optional.empty();
}

return Optional.of(reasons);
}

int missingServices() { return missingServices; }
Expand Down
Expand Up @@ -8,8 +8,10 @@ public interface ClusterPolicy {
* There's an implicit group of nodes known to clusterApi. This method answers whether
* it would be fine, just looking at this cluster (and disregarding Cluster Controller/storage
* which is handled separately), to allow all services on all the nodes in the group to go down.
*
* @return notable reasons for why this group is fine with going down.
*/
void verifyGroupGoingDownIsFine(ClusterApi clusterApi) throws HostStateChangeDeniedException;
SuspensionReasons verifyGroupGoingDownIsFine(ClusterApi clusterApi) throws HostStateChangeDeniedException;

/**
* There's an implicit group of nodes known to clusterApi. This method answers whether
Expand Down