From dee99336dda831e1fa69e7db0cb5bf01c8f480f7 Mon Sep 17 00:00:00 2001 From: Duo Zhang Date: Tue, 9 Jun 2020 11:07:16 +0800 Subject: [PATCH] HBASE-24117 Shutdown AssignmentManager before ProcedureExecutor may cause SCP to accidentally skip assigning a region (#1865) Signed-off-by: Michael Stack --- .../main/java/org/apache/hadoop/hbase/master/HMaster.java | 7 +++++-- .../hbase/master/procedure/ServerCrashProcedure.java | 7 +++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index e9b22a68b307..b9b70769b3c4 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -1464,6 +1464,11 @@ protected void stopServiceThreads() { LOG.debug("Stopping service threads"); + // stop procedure executor prior to other services such as server manager and assignment + // manager, as these services are important for some running procedures. See HBASE-24117 for + // example. + stopProcedureExecutor(); + if (this.quotaManager != null) { this.quotaManager.stop(); } @@ -1478,8 +1483,6 @@ protected void stopServiceThreads() { this.assignmentManager.stop(); } - stopProcedureExecutor(); - if (this.walManager != null) { this.walManager.stop(); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index 916acfee3d57..a2e756306155 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -24,6 +24,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import org.apache.hadoop.hbase.DoNotRetryIOException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfoBuilder; @@ -473,6 +474,12 @@ private void assignRegions(MasterProcedureEnv env, List regions) thr // updated the region location, or even finished itself, so the region is no longer on us // any more, we should not try to assign it again. Please see HBASE-23594 for more details. if (!serverName.equals(regionNode.getRegionLocation())) { + // See HBASE-24117, though we have already changed the shutdown order, it is still worth + // double checking here to confirm that we do not skip assignment incorrectly. + if (!am.isRunning()) { + throw new DoNotRetryIOException( + "AssignmentManager has been stopped, can not process assignment any more"); + } LOG.info("{} found a region {} which is no longer on us {}, give up assigning...", this, regionNode, serverName); continue;