HDFS observer namenode implementation
This adds a new type of namenode: the observer. An observer is like a standby
NN (in fact they share most of the code), except that it participates in
neither NN failover (i.e., it is not part of the HA setup) nor checkpointing.

An observer is specified through configuration. First, it needs to be
added to the config dfs.ha.namenodes, just like a normal namenode,
together with the other per-namenode configs such as dfs.namenode.rpc-address,
dfs.namenode.http-address, etc. Second, it needs to be listed in a new
config, dfs.ha.observer.namenodes, which differentiates it from the ordinary
active/standby namenodes. A configuration sketch follows.
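
As an illustration only, a minimal server-side setup might look like the
sketch below; the nameservice "ns1", the namenode IDs "nn1"/"nn2"/"nn3", and
the host/port values are placeholders, not part of this patch:

  import org.apache.hadoop.conf.Configuration;

  public class ObserverConfigSketch {
    public static void main(String[] args) {
      Configuration conf = new Configuration();
      // Register every namenode, including the observer "nn3".
      conf.set("dfs.nameservices", "ns1");
      conf.set("dfs.ha.namenodes.ns1", "nn1,nn2,nn3");
      // New key added by this patch: mark "nn3" as an observer.
      conf.set("dfs.ha.observer.namenodes.ns1", "nn3");
      // Per-namenode addresses are configured as usual.
      conf.set("dfs.namenode.rpc-address.ns1.nn3", "observer-host:8020");
      conf.set("dfs.namenode.http-address.ns1.nn3", "observer-host:50070");
    }
  }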

An observer can be used to serve read-only requests from HDFS clients
when the following two conditions are satisfied (a client-side sketch
follows the list):

  1. the config dfs.client.failover.proxy.provider.<nameservice> is
     set to org.apache.hadoop.hdfs.server.namenode.ha.StaleReadProxyProvider.
  2. the config dfs.client.enable.stale-read is set to true
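
For illustration, a minimal client-side sketch, assuming the nameservice is
named "ns1"; the defaultFS value and the path listed are placeholders:

  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.FileSystem;
  import org.apache.hadoop.fs.Path;

  public class StaleReadClientSketch {
    public static void main(String[] args) throws Exception {
      Configuration conf = new Configuration();
      conf.set("fs.defaultFS", "hdfs://ns1");
      // Condition 1: use the stale-read-aware proxy provider for "ns1".
      conf.set("dfs.client.failover.proxy.provider.ns1",
          "org.apache.hadoop.hdfs.server.namenode.ha.StaleReadProxyProvider");
      // Condition 2: opt in to stale reads.
      conf.setBoolean("dfs.client.enable.stale-read", true);
      // Read-only calls such as listing may now be served by an observer.
      try (FileSystem fs = FileSystem.get(conf)) {
        fs.listStatus(new Path("/"));
      }
    }
  }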

This also changes the way edit logs are loaded by the standby/observer NNs.
Instead of loading them all at once, the new implementation loads them
one batch at a time (the default batch size is 10K edits) over multiple
iterations, waiting a short amount of time between iterations (the default
wait is 100ms). This ensures the global lock is not held for too long while
loading edits; otherwise, RPC processing time would suffer. A simplified
sketch of the batched loop follows.
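
The following is only a sketch of the batching idea, not the patch's actual
tailing code; the class and method names (BatchedEditTailer, fetchNextBatch,
applyBatchUnderLock) are hypothetical:

  import java.util.List;

  // Apply edits in bounded batches, pausing between batches so the
  // namespace lock is released periodically and RPC handlers can run.
  public abstract class BatchedEditTailer {
    private static final int BATCH_SIZE = 10_000;             // default: 10K edits
    private static final long SLEEP_BETWEEN_BATCHES_MS = 100; // default: 100ms

    public void tailEdits() throws InterruptedException {
      while (true) {
        List<Object> batch = fetchNextBatch(BATCH_SIZE);
        if (batch.isEmpty()) {
          break;
        }
        // The global lock is held only for the duration of one batch.
        applyBatchUnderLock(batch);
        // Let queued RPCs acquire the lock before the next batch.
        Thread.sleep(SLEEP_BETWEEN_BATCHES_MS);
      }
    }

    protected abstract List<Object> fetchNextBatch(int maxEdits);
    protected abstract void applyBatchUnderLock(List<Object> batch);
  }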

This patch does not include a mechanism for clients to bound the staleness
using the journal transaction ID: excluding this allows us to deploy the
observer more easily. More specifically, the deployment involves:

  1. restarting all datanodes with the updated configs. No binary change on
     datanodes is required.
  2. bootstrapping and starting the observer namenode with the updated
     configs. Existing namenodes do not need to change.

Future tasks:

  1. allow clients to set a time-based bound on observer staleness
     (e.g., 2min). If for some reason the lag in edit tailing exceeds the
     bound, the client-side proxy provider will fail all RPCs over to the
     active namenode.
  2. use the journal transaction ID to enforce a bound on staleness. The ID
     can be embedded in the RPC header.
  3. allow new standby/observer to be deployed without datanode restart.
sunchao committed Dec 4, 2017
1 parent c7527fa commit ff29e13
Showing 33 changed files with 1,578 additions and 187 deletions.
@@ -435,4 +435,9 @@ public void close() throws IOException {
public ConnectionId getConnectionId() {
return RPC.getConnectionIdForProxy(proxyDescriptor.getProxy());
}

@VisibleForTesting
public FailoverProxyProvider getProxyProvider() {
return proxyDescriptor.fpp;
}
}
@@ -669,7 +669,8 @@ public RetryAction shouldRetry(Exception e, int retries,
e instanceof UnknownHostException ||
e instanceof StandbyException ||
e instanceof ConnectTimeoutException ||
isWrappedStandbyException(e)) {
isWrappedStandbyException(e) ||
isStandbyException(e)) {
return new RetryAction(RetryAction.RetryDecision.FAILOVER_AND_RETRY,
getFailoverOrRetrySleepTime(failovers));
} else if (e instanceof RetriableException
@@ -734,4 +735,18 @@ static RetriableException getWrappedRetriableException(Exception e) {
return unwrapped instanceof RetriableException ?
(RetriableException) unwrapped : null;
}

private static boolean isStandbyException(Exception ex) {
Throwable cause = ex.getCause();
if (cause != null) {
Throwable cause2 = cause.getCause();
if (cause2 instanceof RemoteException) {
RemoteException remoteException = (RemoteException)cause2;
IOException unwrapRemoteException =
remoteException.unwrapRemoteException();
return unwrapRemoteException instanceof StandbyException;
}
}
return false;
}
}
@@ -39,6 +39,7 @@
import org.apache.hadoop.ipc.RPC.RpcKind;
import org.apache.hadoop.ipc.Server.AuthProtocol;
import org.apache.hadoop.ipc.protobuf.IpcConnectionContextProtos.IpcConnectionContextProto;
import org.apache.hadoop.ipc.protobuf.RpcHeaderProtos;
import org.apache.hadoop.ipc.protobuf.RpcHeaderProtos.RpcRequestHeaderProto;
import org.apache.hadoop.ipc.protobuf.RpcHeaderProtos.RpcRequestHeaderProto.OperationProto;
import org.apache.hadoop.ipc.protobuf.RpcHeaderProtos.RpcResponseHeaderProto;
@@ -18,6 +18,7 @@
package org.apache.hadoop.hdfs;

import static org.apache.hadoop.hdfs.client.HdfsClientConfigKeys.DFS_HA_NAMENODES_KEY_PREFIX;
import static org.apache.hadoop.hdfs.client.HdfsClientConfigKeys.DFS_HA_OBSERVER_NAMENODES_KEY_PREFIX;
import static org.apache.hadoop.hdfs.client.HdfsClientConfigKeys.DFS_NAMESERVICES;

import java.io.IOException;
@@ -128,18 +129,52 @@ public static Collection<String> getNameServiceIds(Configuration conf) {

/**
* Namenode HighAvailability related configuration.
* Returns collection of namenode Ids from the configuration. One logical id
* for each namenode in the in the HA setup.
* Returns collection of namenode Ids (including observer NNs) from the configuration.
* One logical id for each namenode in the HA setup.
*
* @param conf configuration
* @param nsId the nameservice ID to look at, or null for non-federated
* @return collection of namenode Ids
* @param nsId the nameservice ID to look at, or null for
* non-federated
* @return collection of namenode Ids, including observer namenodes.
*/
public static Collection<String> getNameNodeIds(Configuration conf, String nsId) {
String key = addSuffix(DFS_HA_NAMENODES_KEY_PREFIX, nsId);
return conf.getTrimmedStringCollection(key);
}

/**
* Returns collection of observer namenode Ids from the configuration.
* One logical id for each observer in the HA setup.
*
* @param conf configuration
* @param nsId the nameservice ID to look at, or null for non-federated
* @return collection of observer namenode Ids
*/
public static Collection<String> getObserverNameNodeIds(Configuration conf, String nsId) {
String key = addSuffix(DFS_HA_OBSERVER_NAMENODES_KEY_PREFIX, nsId);
return conf.getTrimmedStringCollection(key);
}

/**
* Namenode HighAvailability related configuration.
* Returns collection of namenode Ids from the configuration, excluding observer namenodes.
* One logical id for each namenode in the HA setup.
*
* @param conf configuration
* @param nsId the nameservice ID to look at, or null for non-federated
* @return collection of namenode Ids, excluding observer namenodes.
*/
public static Collection<String> getNameNodeIdsExcludingObservers(
Configuration conf, String nsId) {
Collection<String> allNNIds = getNameNodeIds(conf, nsId);
Collection<String> observerNNIds = getObserverNameNodeIds(conf, nsId);
if (!allNNIds.containsAll(observerNNIds)) {
throw new IllegalArgumentException("Observer NameNodes should be part of all NameNodes");
}
allNNIds.removeAll(observerNNIds);
return allNNIds;
}

/** Add non empty and non null suffix to a key */
static String addSuffix(String key, String suffix) {
if (suffix == null || suffix.isEmpty()) {
@@ -152,15 +187,30 @@ static String addSuffix(String key, String suffix) {

/**
* Returns list of InetSocketAddress corresponding to HA NN RPC addresses from
* the configuration.
* the configuration. Note this does not include the NN RPC addresses from
* observer namenodes.
*
* @param conf configuration
* @return list of InetSocketAddresses
* @return list of InetSocketAddresse, not including those of observer NNs.
*/
public static Map<String, Map<String, InetSocketAddress>> getHaNnRpcAddresses(
Configuration conf) {
return DFSUtilClient.getAddresses(conf, null,
HdfsClientConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY);
HdfsClientConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY);
}

/**
* Returns list of InetSocketAddress corresponding to observer namenode RPC
* addresses from the configuration.
*
* @param conf configuration
* @return list of InetSocketAddresses for observer namenodes
*/
public static Map<String, Map<String, InetSocketAddress>> getObserverRpcAddresses(
Configuration conf) {
// Observer namenodes share the same RPC address key with ordinary namenodes
return DFSUtilClient.getObserverAddresses(conf, null,
HdfsClientConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY);
}

/**
@@ -320,6 +370,15 @@ static String concatSuffixes(String... suffixes) {
return Joiner.on(".").skipNulls().join(suffixes);
}

/**
* Enum for selecting different types of namenodes (e.g., namenode, observers)
*/
enum NameNodeType {

[inline review comment on this line]

xkrogen (Dec 7, 2017): Can we instead leverage HAState / HAServiceState here?

sunchao (author, Dec 8, 2017): We might. But I think it is a bit odd to use HAState or HAServiceState here. Also not sure how to represent the ALL case. Having an enum here is for convenience. It should be OK as long as we don't expose it to other classes.

NAMENODE,
OBSERVER,
ALL
}

/**
* Returns the configured address for all NameNodes in the cluster.
* @param conf configuration
@@ -329,12 +388,34 @@ static Map<String, Map<String, InetSocketAddress>> getAddresses(
*/
static Map<String, Map<String, InetSocketAddress>> getAddresses(
Configuration conf, String defaultAddress, String... keys) {
return getAddressesInternal(conf, defaultAddress, NameNodeType.NAMENODE, keys);
}

/**
* Returns the configured address for all Observer NameNodes in the cluster.
* @param conf configuration
* @param defaultAddress default address to return in case key is not found.
* @param keys Set of keys to look for in the order of preference
* @return a map(nameserviceId to map(namenodeId to InetSocketAddress))
*/
static Map<String, Map<String, InetSocketAddress>> getObserverAddresses(
Configuration conf, String defaultAddress, String... keys) {
return getAddressesInternal(conf, defaultAddress, NameNodeType.OBSERVER, keys);
}

/**
* Internal function to get configured addresses for a particular namenode
* type 'type'.
*/
static Map<String, Map<String, InetSocketAddress>> getAddressesInternal(
Configuration conf, String defaultAddress, NameNodeType type, String... keys) {
Collection<String> nameserviceIds = getNameServiceIds(conf);
return getAddressesForNsIds(conf, nameserviceIds, defaultAddress, keys);
return getAddressesForNsIdsInternal(conf, nameserviceIds, defaultAddress, type, keys);
}

/**
* Returns the configured address for all NameNodes in the cluster.
* Returns the configured address for both ordinary (active/standby) namenodes
* and observer namenodes in the cluster.
* @param conf configuration
* @param defaultAddress default address to return in case key is not found.
* @param keys Set of keys to look for in the order of preference
@@ -344,12 +425,23 @@ static Map<String, Map<String, InetSocketAddress>> getAddresses(
static Map<String, Map<String, InetSocketAddress>> getAddressesForNsIds(
Configuration conf, Collection<String> nsIds, String defaultAddress,
String... keys) {
return getAddressesForNsIdsInternal(
conf, nsIds, defaultAddress, NameNodeType.ALL, keys);
}

/**
* Internal function to get configured addresses for namenodes of type 'type' and
* nameservices with IDs 'nsIds'.
*/
static Map<String, Map<String, InetSocketAddress>> getAddressesForNsIdsInternal(
Configuration conf, Collection<String> nsIds, String defaultAddress,
NameNodeType type, String... keys) {
// Look for configurations of the form <key>[.<nameserviceId>][.<namenodeId>]
// across all of the configured nameservices and namenodes.
Map<String, Map<String, InetSocketAddress>> ret = Maps.newLinkedHashMap();
for (String nsId : emptyAsSingletonNull(nsIds)) {
Map<String, InetSocketAddress> isas =
getAddressesForNameserviceId(conf, nsId, defaultAddress, keys);
getAddressesForNameserviceIdInternal(conf, nsId, defaultAddress, type, keys);
if (!isas.isEmpty()) {
ret.put(nsId, isas);
}
@@ -359,7 +451,27 @@ static Map<String, Map<String, InetSocketAddress>> getAddressesForNsIds(

static Map<String, InetSocketAddress> getAddressesForNameserviceId(
Configuration conf, String nsId, String defaultValue, String... keys) {
Collection<String> nnIds = getNameNodeIds(conf, nsId);
return getAddressesForNameserviceIdInternal(
conf, nsId, defaultValue, NameNodeType.NAMENODE, keys);
}

private static Map<String, InetSocketAddress> getAddressesForNameserviceIdInternal(
Configuration conf, String nsId, String defaultValue,
NameNodeType type, String... keys) {
Collection<String> nnIds;
switch (type) {
case NAMENODE:
nnIds = getNameNodeIdsExcludingObservers(conf, nsId);
break;
case OBSERVER:
nnIds = getObserverNameNodeIds(conf, nsId);
break;
case ALL:
nnIds = getNameNodeIds(conf, nsId);
break;
default:
throw new IllegalArgumentException("Invalid namenode type: " + type.name());
}
Map<String, InetSocketAddress> ret = Maps.newHashMap();
for (String nnId : emptyAsSingletonNull(nnIds)) {
String suffix = concatSuffixes(nsId, nnId);
@@ -71,6 +71,7 @@ public interface HdfsClientConfigKeys {
int DFS_NAMENODE_HTTPS_PORT_DEFAULT = 50470;
String DFS_NAMENODE_HTTPS_ADDRESS_KEY = "dfs.namenode.https-address";
String DFS_HA_NAMENODES_KEY_PREFIX = "dfs.ha.namenodes";
String DFS_HA_OBSERVER_NAMENODES_KEY_PREFIX = "dfs.ha.observer.namenodes";
String DFS_WEBHDFS_ENABLED_KEY = "dfs.webhdfs.enabled";
boolean DFS_WEBHDFS_ENABLED_DEFAULT = true;
String DFS_NAMENODE_HTTP_PORT_KEY = "dfs.http.port";
@@ -117,6 +118,9 @@ public interface HdfsClientConfigKeys {
String DFS_CLIENT_DATANODE_RESTART_TIMEOUT_KEY =
"dfs.client.datanode-restart.timeout";
long DFS_CLIENT_DATANODE_RESTART_TIMEOUT_DEFAULT = 30;
String DFS_CLIENT_ENABLE_STALE_READ =
"dfs.client.enable.stale-read";
boolean DFS_CLIENT_ENABLE_STALE_READ_DEFAULT = false;
// Much code in hdfs is not yet updated to use these keys.
// the initial delay (unit is ms) for locateFollowingBlock, the delay time
// will increase exponentially(double) for each retry.

8 comments on commit ff29e13

@xkrogen commented on ff29e13 Dec 7, 2017

This looks great overall Chao, thanks for the effort. I reviewed the commit except for tests. I left inline comments and have a few high-level ones:

  • For the batch size of edit log transactions, it would probably be better to have it be time-based rather than number-based. Ultimately the administrator configuring this option is interested in bounding the latency increase caused by the write lock hold. I guess the current way is simpler and consistent with things like the batching of lock hold in getContentSummary but it seems more friendly to an administrator to use a time-based bound.
  • Right now there is just a global config to allow or disallow stale reads. This is probably fine for a first pass but ideally there would at least be a way to support (a) dynamically enabling/disabling, (b) initializing two FileSystem objects, one of which has the stale reads enabled and one of which has it disabled. This would give applications a lot more ability for control with small extra effort.
  • With multiple Standbys, normally everyone except the ANN creates checkpoints to make sure that they have a fairly recent fsimage persisted, and one SbNN uploads to the ANN. If the observer doesn't checkpoint, on restart it will be very out of date, probably enough so that edits don't go back that far, meaning an administrator would need to manually copy over a recent fsimage. I think this is something we need to consider further. Should standbys upload checkpoints to observers?
  • Biggest comment is that right now observers are treated very differently from the other NameNodes, e.g. an entirely separate configuration, you cannot transition between an observer and an active or standby, etc. I don't really see a reason to have this artificial restriction - why can't an observer node become an active or standby? I think ideally the Observer state would just be another point in the state machine of Active-Standby-Observer. This would require a little extra handling in the proxy to be able to dynamically determine which NameNodes are observers vs. standbys but seems more robust.

@sunchao (author) commented on ff29e13 Dec 8, 2017

Thanks @xkrogen. These comments are super helpful!

> For the batch size of edit log transactions, it would probably be better to have it be time-based rather than number-based. Ultimately the administrator configuring this option is interested in bounding the latency increase caused by the write lock hold. I guess the current way is simpler and consistent with things like the batching of lock hold in getContentSummary but it seems more friendly to an administrator to use a time-based bound.

I chose to use batch size simply because it is easier to implement, and I agree that a time-based config may be more useful to an administrator. Will think about how to implement that.

> Right now there is just a global config to allow or disallow stale reads. This is probably fine for a first pass but ideally there would at least be a way to support (a) dynamically enabling/disabling, (b) initializing two FileSystem objects, one of which has the stale reads enabled and one of which has it disabled. This would give applications a lot more ability for control with small extra effort.

Yes, this makes sense. The original patch in HDFS-10702 allows dynamic configuration. I changed it to make integration easier with applications such as Hive and Presto: no code change is required on their side. We can add a dynamic way to enable stale reads in the future.

> With multiple Standbys, normally everyone except the ANN creates checkpoints to make sure that they have a fairly recent fsimage persisted, and one SbNN uploads to the ANN. If the observer doesn't checkpoint, on restart it will be very out of date, probably enough so that edits don't go back that far, meaning an administrator would need to manually copy over a recent fsimage. I think this is something we need to consider further. Should standbys upload checkpoints to observers?

You're right. With the current implementation, an administrator needs to bootstrap an observer to bring it up. Uploading checkpoints from the standby is a good suggestion. Do you think it can be done after phase 0?

> Biggest comment is that right now observers are treated very differently from the other NameNodes, e.g. an entirely separate configuration, you cannot transition between an observer and an active or standby, etc. I don't really see a reason to have this artificial restriction - why can't an observer node become an active or standby? I think ideally the Observer state would just be another point in the state machine of Active-Standby-Observer. This would require a little extra handling in the proxy to be able to dynamically determine which NameNodes are observers vs. standbys but seems more robust.

Hmm... the original intention was to exclude the observer from failover. One benefit is that, as I mentioned in one of the comments above, we might not need to take the lock in EditLogTailer#doTailEdits.

Do you think, besides the lock issue, that it is safe to let the observer participate in failover? I'm open to suggestions. :) In fact, having the Active-Standby-Observer state transition may make the implementation simpler.

@xkrogen commented on ff29e13 Dec 8, 2017

  • For the time-based config I think that's probably fine as a follow-on. The batch approach is much simpler for now and is probably 90% as good.
  • Same comment for the dynamic configuration of stale reads.
  • With the current code, I think having administrators being required to bootstrap would probably be fine for phase 0.

I think it is safe to let observer participate in failover, yeah. It should be maintaining all of the same state as a standby. This is a little tricky in combination with the last bullet above, though... Which nodes have up-to-date checkpoints will be dependent on transient state. It may be necessary to properly implement standbys uploading checkpoints to observers if we go down this route.

I agree that having the observer participate in the normal state transition should make implementation easier/cleaner overall and that is part of my motivation for it. It will put a little more complex logic into the proxy provider, but I think that is a good place to concentrate all of the changes.

I will look more at the locking needed to allow the observer to participate in failover. I suspect that we will need to add another lock, or perhaps we can reuse the cpLock, to make things work correctly. Need to study up on HDFS-2693.

@sunchao (author) commented on ff29e13 Dec 8, 2017

Cool. I'll work on revising the patch to allow the observer to participate in failover, and will also address the above comments. Please let me know your findings on the locking issue.

@xkrogen commented on ff29e13 Dec 9, 2017

The locking is interesting. With the current behavior of not allowing failovers, things should be fine; there's some unprotected access to the lastAppliedTxId in FSImage, but since the edit log tailer is the only thing that will touch the txID, we should be fine.

However, once we let the observer participate in failover, we will need to worry about getting this correct. AFAICT the only reason we need synchronization over this whole thing is to make sure we get our lastAppliedTxId set correctly, don't re-apply a tx that has already been applied, etc. So it's all about getting the lastAppliedTxId consistent. Right now we only update the FSImage's lastAppliedTxId in the FSImage#loadEdits() layer, but there's no reason we couldn't pass a reference to FSImage down into FSEditLogLoader#loadEditRecords(). Basically we need to make sure the FSImage has the correct lastAppliedTxId value before releasing the lock. We'll need to add some additional logic to handle the concurrent failover case, like making the write lock acquisition interruptible, but it should all be doable. I can put together a patch for this and have you take a look.
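
A minimal sketch of the idea above, with hypothetical names (BatchedEditApplier, applyBatch; this is not the actual FSImage/FSEditLogLoader code): advance lastAppliedTxId under the same lock that guards edit application, and take the lock interruptibly so a concurrent failover can break in:

  import java.util.List;
  import java.util.concurrent.locks.ReentrantReadWriteLock;

  class BatchedEditApplier {
    private final ReentrantReadWriteLock fsLock = new ReentrantReadWriteLock();
    private volatile long lastAppliedTxId;

    void applyBatch(List<Long> txIds) throws InterruptedException {
      // Interruptible acquisition: a failover can cancel a long tailing pass.
      fsLock.writeLock().lockInterruptibly();
      try {
        for (long txId : txIds) {
          // ... apply the edit for txId (omitted) ...
          // Advance lastAppliedTxId before the lock is released, so nothing
          // ever sees applied edits without the updated transaction id.
          lastAppliedTxId = txId;
        }
      } finally {
        fsLock.writeLock().unlock();
      }
    }
  }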

@sunchao (author) commented on ff29e13 Dec 9, 2017

Sure. I can take a look at the patch once you have it. Meanwhile I'll also try to understand the lock issue on my side.

There are a few other issues that I want to mention:

  1. By "let the observer participate in failover", do you mean that a observer can directly transition to active? or it has to transition to standby first? We may want to avoid the case where the only observer transitioned to active and suddenly all read requests go to active.
  2. Since this work depends on HDFS-6440 and it is only available in 3.0, I wonder if you guys can help to push this into 2.x branches.

@xkrogen

For #1, that's a very interesting point. For each client, as soon as it tries to do a write or a read from the ANN, a client-side failover will be triggered and subsequent requests will go to the correct place. However, clients which are only doing stale reads from a standby could continue to perform those reads from the new ANN for some time. I think it is worth allowing such behavior, but we should attempt to determine how bad the real-world impact will be. Allowing only certain transitions makes things operationally more complex and is a somewhat arbitrary restriction, so I would rather not go that way unless we see evidence that it is necessary.

For the 2.x branches, yes, we would be interested in porting it. But we can focus on trunk for now.

@sunchao (author)

For #1, we may not want unexpected results: e.g., after the failover, the active may no longer be available, or it may have transitioned to a normal standby and thus be unavailable for stale reading. If all read traffic suddenly goes to the active, it may have a negative impact on its performance.
