From 899422be44b4b2c5fbd3b9fa79ba782d965ac0d8 Mon Sep 17 00:00:00 2001 From: BukrosSzabolcs Date: Fri, 3 Jul 2020 16:43:41 +0200 Subject: [PATCH] HBASE-24562: Stabilize master startup with meta replicas enabled (#1997) Signed-off-by: Wellington Chevreuil --- .../apache/hadoop/hbase/master/HMaster.java | 6 +- .../hbase/master/MasterMetaBootstrap.java | 4 +- .../master/assignment/AssignmentManager.java | 34 +++++++- .../hbase/client/TestMetaWithReplicas.java | 77 +++++++++++++++++++ 4 files changed, 115 insertions(+), 6 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index e87560c8c308..0dc29974710d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -1131,7 +1131,11 @@ private void finishActiveMasterInitialization(MonitoredTask status) assignmentManager.checkIfShouldMoveSystemRegionAsync(); status.setStatus("Assign meta replicas"); MasterMetaBootstrap metaBootstrap = createMetaBootstrap(); - metaBootstrap.assignMetaReplicas(); + try { + metaBootstrap.assignMetaReplicas(); + } catch (IOException | KeeperException e){ + LOG.error("Assigning meta replica failed: ", e); + } status.setStatus("Starting quota manager"); initQuotaManager(); if (QuotaUtil.isQuotaEnabled(conf)) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java index e57817e573ac..1cf6cf1be101 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java @@ -81,9 +81,9 @@ void assignMetaReplicas() // down hosting server which calls AM#stop. if (metaState != null && metaState.getServerName() != null) { // Try to retain old assignment. - assignmentManager.assign(hri, metaState.getServerName()); + assignmentManager.assignAsync(hri, metaState.getServerName()); } else { - assignmentManager.assign(hri); + assignmentManager.assignAsync(hri); } } unassignExcessMetaReplica(numReplicas); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index 9a5796fa5b08..8afe691e7c80 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -589,9 +589,9 @@ private void preTransitCheck(RegionStateNode regionNode, RegionState.State[] exp } } - // TODO: Need an async version of this for hbck2. - public long assign(RegionInfo regionInfo, ServerName sn) throws IOException { - // TODO: should we use getRegionStateNode? + private TransitRegionStateProcedure createAssignProcedure(RegionInfo regionInfo, ServerName sn) + throws IOException { + // TODO: should we use getRegionStateNode? RegionStateNode regionNode = regionStates.getOrCreateRegionStateNode(regionInfo); TransitRegionStateProcedure proc; regionNode.lock(); @@ -602,6 +602,12 @@ public long assign(RegionInfo regionInfo, ServerName sn) throws IOException { } finally { regionNode.unlock(); } + return proc; + } + + // TODO: Need an async version of this for hbck2. + public long assign(RegionInfo regionInfo, ServerName sn) throws IOException { + TransitRegionStateProcedure proc = createAssignProcedure(regionInfo, sn); ProcedureSyncWait.submitAndWaitProcedure(master.getMasterProcedureExecutor(), proc); return proc.getProcId(); } @@ -610,6 +616,28 @@ public long assign(RegionInfo regionInfo) throws IOException { return assign(regionInfo, null); } + /** + * Submits a procedure that assigns a region to a target server without waiting for it to finish + * @param regionInfo the region we would like to assign + * @param sn target server name + * @return submitProcedure + * @throws IOException if preTransitCheck fails + */ + public Future assignAsync(RegionInfo regionInfo, ServerName sn) throws IOException { + TransitRegionStateProcedure proc = createAssignProcedure(regionInfo, sn); + return ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc); + } + + /** + * Submits a procedure that assigns a region without waiting for it to finish + * @param regionInfo the region we would like to assign + * @return submitProcedure + * @throws IOException if preTransitCheck fails + */ + public Future assignAsync(RegionInfo regionInfo) throws IOException { + return assignAsync(regionInfo, null); + } + public long unassign(RegionInfo regionInfo) throws IOException { RegionStateNode regionNode = regionStates.getRegionStateNode(regionInfo); if (regionNode == null) { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java index ddf568817d12..cbfac2875456 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestMetaWithReplicas.java @@ -23,12 +23,14 @@ import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +import java.io.IOException; import java.util.Arrays; import java.util.Collection; import java.util.EnumSet; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.concurrent.Future; import java.util.concurrent.atomic.AtomicBoolean; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.Abortable; @@ -41,8 +43,12 @@ import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.TableNotFoundException; +import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.assignment.AssignmentManager; import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil; +import org.apache.hadoop.hbase.master.assignment.RegionStateNode; +import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure; import org.apache.hadoop.hbase.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.regionserver.StorefileRefresherChore; import org.apache.hadoop.hbase.testclassification.LargeTests; @@ -52,7 +58,9 @@ import org.apache.hadoop.hbase.zookeeper.ZKUtil; import org.apache.hadoop.hbase.zookeeper.ZKWatcher; import org.apache.hadoop.hbase.zookeeper.ZNodePaths; +import org.apache.zookeeper.KeeperException; import org.junit.After; +import org.junit.Assert; import org.junit.Before; import org.junit.ClassRule; import org.junit.Rule; @@ -350,4 +358,73 @@ public void testShutdownOfReplicaHolder() throws Exception { assertNotEquals(3, i); } } + + @Test + public void testFailedReplicaAssigment() throws InterruptedException, IOException { + //using our rigged master, to force a failed meta replica assignment + TEST_UTIL.getMiniHBaseCluster().getConfiguration().setClass(HConstants.MASTER_IMPL, + BrokenMetaReplicaMaster.class, HMaster.class); + TEST_UTIL.getMiniHBaseCluster().stopMaster(0).join(); + HMaster newMaster = TEST_UTIL.getMiniHBaseCluster().startMaster().getMaster(); + //waiting for master to come up + TEST_UTIL.waitFor(30000, () -> newMaster.isInitialized()); + TEST_UTIL.getMiniHBaseCluster().getConfiguration().unset(HConstants.MASTER_IMPL); + + AssignmentManager am = newMaster.getAssignmentManager(); + //showing one of the replicas got assigned + RegionInfo metaReplicaHri = RegionReplicaUtil.getRegionInfoForReplica( + RegionInfoBuilder.FIRST_META_REGIONINFO, 1); + TEST_UTIL.waitFor(30000, () -> + am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri) != null && + am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri).getRegionLocation() + != null); + RegionStateNode metaReplicaRegionNode = + am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri); + Assert.assertNotNull(metaReplicaRegionNode.getRegionLocation()); + //showing one of the replicas failed to be assigned + RegionInfo metaReplicaHri2 = RegionReplicaUtil.getRegionInfoForReplica( + RegionInfoBuilder.FIRST_META_REGIONINFO, 2); + RegionStateNode metaReplicaRegionNode2 = + am.getRegionStates().getOrCreateRegionStateNode(metaReplicaHri2); + Assert.assertNull(metaReplicaRegionNode2.getRegionLocation()); + + //showing master is active and running + Assert.assertFalse(newMaster.isStopping()); + Assert.assertFalse(newMaster.isStopped()); + Assert.assertTrue(newMaster.isActiveMaster()); + } + + public static class BrokenTransitRegionStateProcedure extends TransitRegionStateProcedure { + protected BrokenTransitRegionStateProcedure() { + //super(env, hri, assignCandidate, forceNewPlan, type); + super(null, null, null, false,TransitionType.ASSIGN); + } + } + + public static class BrokenMetaReplicaMaster extends HMaster{ + public BrokenMetaReplicaMaster(final Configuration conf) throws IOException, KeeperException { + super(conf); + } + + @Override + public AssignmentManager createAssignmentManager(MasterServices master) { + return new BrokenMasterMetaAssignmentManager(master); + } + } + + public static class BrokenMasterMetaAssignmentManager extends AssignmentManager{ + MasterServices master; + public BrokenMasterMetaAssignmentManager(final MasterServices master) { + super(master); + this.master = master; + } + + public Future assignAsync(RegionInfo regionInfo, ServerName sn) throws IOException { + RegionStateNode regionNode = getRegionStates().getOrCreateRegionStateNode(regionInfo); + if (regionNode.getRegionInfo().getReplicaId() == 2) { + regionNode.setProcedure(new BrokenTransitRegionStateProcedure()); + } + return super.assignAsync(regionInfo, sn); + } + } }