
Commit d36b61e

Jackie Zhang authored and yhuang-db committed
[SPARK-51940][SS] Add interface for managing streaming checkpoint metadata
### What changes were proposed in this pull request?

Minor refactor to introduce an interface for accessing the metadata (e.g. offset / commit logs) in a streaming checkpoint.

### Why are the changes needed?

To standardize the access pattern.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

This is a pure refactoring; existing tests should suffice.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes apache#50746 from jackierwzhang/spark-51940-checkpoint-metadata-interface.

Authored-by: Jackie Zhang <ruowang.zhang+data@databricks.com>
Signed-off-by: Jungtaek Lim <kabhwan.opensource@gmail.com>
1 parent 5c0e61d commit d36b61e
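To illustrate the standardized access pattern, here is a minimal sketch based on the new `StreamingQueryCheckpointMetadata` class introduced in this diff. The SparkSession and checkpoint path below are placeholders, and the class lives in Spark's internal `execution.streaming` package, so this is not public API:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.streaming.StreamingQueryCheckpointMetadata

val spark = SparkSession.builder().getOrCreate()
val checkpointLocation = "/tmp/checkpoints/my-query"  // placeholder checkpoint root

// One object now hands out the offset log, commit log, and stream metadata,
// instead of each caller constructing OffsetSeqLog / CommitLog from hand-built paths.
val checkpointMetadata = new StreamingQueryCheckpointMetadata(spark, checkpointLocation)

// Latest batch id written to the offset write-ahead log, if any.
val lastOffsetBatch: Option[Long] = checkpointMetadata.offsetLog.getLatest().map(_._1)

// Latest batch id whose output was durably committed to the sink, if any.
val lastCommittedBatch: Option[Long] = checkpointMetadata.commitLog.getLatest().map(_._1)
```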

7 files changed (+157 -49 lines changed)


sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/state/StateDataSource.scala

Lines changed: 4 additions & 6 deletions
@@ -33,8 +33,8 @@ import org.apache.spark.sql.execution.datasources.v2.state.StateSourceOptions.{J
 import org.apache.spark.sql.execution.datasources.v2.state.StateSourceOptions.JoinSideValues.JoinSideValues
 import org.apache.spark.sql.execution.datasources.v2.state.metadata.{StateMetadataPartitionReader, StateMetadataTableEntry}
 import org.apache.spark.sql.execution.datasources.v2.state.utils.SchemaUtil
-import org.apache.spark.sql.execution.streaming.{CommitLog, OffsetSeqLog, OffsetSeqMetadata, TimerStateUtils, TransformWithStateOperatorProperties, TransformWithStateVariableInfo}
-import org.apache.spark.sql.execution.streaming.StreamingCheckpointConstants.{DIR_NAME_COMMITS, DIR_NAME_OFFSETS, DIR_NAME_STATE}
+import org.apache.spark.sql.execution.streaming.{OffsetSeqMetadata, StreamingQueryCheckpointMetadata, TimerStateUtils, TransformWithStateOperatorProperties, TransformWithStateVariableInfo}
+import org.apache.spark.sql.execution.streaming.StreamingCheckpointConstants.DIR_NAME_STATE
 import org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinHelper.{LeftSide, RightSide}
 import org.apache.spark.sql.execution.streaming.state.{InMemoryStateSchemaProvider, KeyStateEncoderSpec, NoPrefixKeyStateEncoderSpec, PrefixKeyScanStateEncoderSpec, StateSchemaCompatibilityChecker, StateSchemaMetadata, StateSchemaProvider, StateStore, StateStoreColFamilySchema, StateStoreConf, StateStoreId, StateStoreProviderId}
 import org.apache.spark.sql.sources.DataSourceRegister
@@ -122,8 +122,7 @@ class StateDataSource extends TableProvider with DataSourceRegister with Logging
   override def supportsExternalMetadata(): Boolean = false

   private def buildStateStoreConf(checkpointLocation: String, batchId: Long): StateStoreConf = {
-    val offsetLog = new OffsetSeqLog(session,
-      new Path(checkpointLocation, DIR_NAME_OFFSETS).toString)
+    val offsetLog = new StreamingQueryCheckpointMetadata(session, checkpointLocation).offsetLog
     offsetLog.get(batchId) match {
       case Some(value) =>
         val metadata = value.metadata.getOrElse(
@@ -548,8 +547,7 @@ object StateSourceOptions extends DataSourceOptions {
   }

   private def getLastCommittedBatch(session: SparkSession, checkpointLocation: String): Long = {
-    val commitLog = new CommitLog(session,
-      new Path(checkpointLocation, DIR_NAME_COMMITS).toString)
+    val commitLog = new StreamingQueryCheckpointMetadata(session, checkpointLocation).commitLog
     commitLog.getLatest() match {
       case Some((lastId, _)) => lastId
       case None => throw StateDataSourceErrors.committedBatchUnavailable(checkpointLocation)

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AsyncProgressTrackingMicroBatchExecution.scala

Lines changed: 15 additions & 10 deletions
@@ -83,16 +83,21 @@ class AsyncProgressTrackingMicroBatchExecution(
     }
   })

-  override val offsetLog = new AsyncOffsetSeqLog(
-    sparkSession,
-    checkpointFile("offsets"),
-    asyncWritesExecutorService,
-    asyncProgressTrackingCheckpointingIntervalMs,
-    clock = triggerClock
-  )
-
-  override val commitLog =
-    new AsyncCommitLog(sparkSession, checkpointFile("commits"), asyncWritesExecutorService)
+  /**
+   * Manages the metadata from this checkpoint location with async write operations.
+   */
+  private val asyncCheckpointMetadata =
+    new AsyncStreamingQueryCheckpointMetadata(
+      sparkSessionForStream,
+      resolvedCheckpointRoot,
+      asyncWritesExecutorService,
+      asyncProgressTrackingCheckpointingIntervalMs,
+      triggerClock
+    )
+
+  override lazy val offsetLog: AsyncOffsetSeqLog = asyncCheckpointMetadata.offsetLog
+
+  override lazy val commitLog: AsyncCommitLog = asyncCheckpointMetadata.commitLog

   // perform quick validation to fail faster
   validateAndGetTrigger()
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/AsyncStreamingQueryCheckpointMetadata.scala

Lines changed: 55 additions & 0 deletions

@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.streaming
+
+import java.util.concurrent.ThreadPoolExecutor
+
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.util.Clock
+
+/**
+ * A version of [[StreamingQueryCheckpointMetadata]] that supports async state checkpointing.
+ *
+ * @param sparkSession Spark session
+ * @param resolvedCheckpointRoot The resolved checkpoint root path
+ * @param asyncWritesExecutorService The executor service for async writes
+ * @param asyncProgressTrackingCheckpointingIntervalMs The interval for async progress
+ * @param triggerClock The clock to use for trigger time
+ */
+class AsyncStreamingQueryCheckpointMetadata(
+    sparkSession: SparkSession,
+    resolvedCheckpointRoot: String,
+    asyncWritesExecutorService: ThreadPoolExecutor,
+    asyncProgressTrackingCheckpointingIntervalMs: Long,
+    triggerClock: Clock)
+  extends StreamingQueryCheckpointMetadata(sparkSession, resolvedCheckpointRoot) {
+
+  override lazy val offsetLog = new AsyncOffsetSeqLog(
+    sparkSession,
+    checkpointFile(StreamingCheckpointConstants.DIR_NAME_OFFSETS),
+    asyncWritesExecutorService,
+    asyncProgressTrackingCheckpointingIntervalMs,
+    clock = triggerClock
+  )
+
+  override lazy val commitLog = new AsyncCommitLog(
+    sparkSession,
+    checkpointFile(StreamingCheckpointConstants.DIR_NAME_COMMITS),
+    asyncWritesExecutorService
+  )
+
+}

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamExecution.scala

Lines changed: 14 additions & 28 deletions
@@ -151,16 +151,20 @@ abstract class StreamExecution(
   */
  protected def sources: Seq[SparkDataStream]

-  /** Metadata associated with the whole query */
-  protected val streamMetadata: StreamMetadata = {
-    val metadataPath = new Path(checkpointFile("metadata"))
-    val hadoopConf = sparkSession.sessionState.newHadoopConf()
-    StreamMetadata.read(metadataPath, hadoopConf).getOrElse {
-      val newMetadata = new StreamMetadata(UUID.randomUUID.toString)
-      StreamMetadata.write(newMetadata, metadataPath, hadoopConf)
-      newMetadata
-    }
-  }
+  /** Isolated spark session to run the batches with. */
+  protected val sparkSessionForStream: SparkSession = sparkSession.cloneSession()
+
+  /**
+   * Manages the metadata from this checkpoint location.
+   */
+  protected val checkpointMetadata =
+    new StreamingQueryCheckpointMetadata(sparkSessionForStream, resolvedCheckpointRoot)
+
+  private val streamMetadata: StreamMetadata = checkpointMetadata.streamMetadata
+
+  lazy val offsetLog: OffsetSeqLog = checkpointMetadata.offsetLog
+
+  lazy val commitLog: CommitLog = checkpointMetadata.commitLog

  /**
   * A map of current watermarks, keyed by the position of the watermark operator in the
@@ -209,9 +213,6 @@ abstract class StreamExecution(
  lazy val streamMetrics = new MetricsReporter(
    this, s"spark.streaming.${Option(name).getOrElse(id)}")

-  /** Isolated spark session to run the batches with. */
-  protected val sparkSessionForStream = sparkSession.cloneSession()
-
  /**
   * The thread that runs the micro-batches of this stream. Note that this thread must be
   * [[org.apache.spark.util.UninterruptibleThread]] to workaround KAFKA-1894: interrupting a
@@ -227,21 +228,6 @@ abstract class StreamExecution(
    }
  }

-  /**
-   * A write-ahead-log that records the offsets that are present in each batch. In order to ensure
-   * that a given batch will always consist of the same data, we write to this log *before* any
-   * processing is done. Thus, the Nth record in this log indicated data that is currently being
-   * processed and the N-1th entry indicates which offsets have been durably committed to the sink.
-   */
-  val offsetLog = new OffsetSeqLog(sparkSession, checkpointFile("offsets"))
-
-  /**
-   * A log that records the batch ids that have completed. This is used to check if a batch was
-   * fully processed, and its output was committed to the sink, hence no need to process it again.
-   * This is used (for instance) during restart, to help identify which batch to run next.
-   */
-  val commitLog = new CommitLog(sparkSession, checkpointFile("commits"))
-
  /** Whether all fields of the query have been initialized */
  private def isInitialized: Boolean = state.get != INITIALIZING

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingCheckpointConstants.scala

Lines changed: 1 addition & 0 deletions
@@ -21,4 +21,5 @@ object StreamingCheckpointConstants {
   val DIR_NAME_COMMITS = "commits"
   val DIR_NAME_OFFSETS = "offsets"
   val DIR_NAME_STATE = "state"
+  val DIR_NAME_METADATA = "metadata"
 }
sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/StreamingQueryCheckpointMetadata.scala

Lines changed: 64 additions & 0 deletions

@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.streaming
+
+import java.util.UUID
+
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.sql.SparkSession
+
+/**
+ * An interface for accessing the checkpoint metadata associated with a streaming query.
+ * @param sparkSession Spark session
+ * @param resolvedCheckpointRoot The resolved checkpoint root path
+ */
+class StreamingQueryCheckpointMetadata(sparkSession: SparkSession, resolvedCheckpointRoot: String) {
+
+  /**
+   * A write-ahead-log that records the offsets that are present in each batch. In order to ensure
+   * that a given batch will always consist of the same data, we write to this log *before* any
+   * processing is done. Thus, the Nth record in this log indicated data that is currently being
+   * processed and the N-1th entry indicates which offsets have been durably committed to the sink.
+   */
+  lazy val offsetLog =
+    new OffsetSeqLog(sparkSession, checkpointFile(StreamingCheckpointConstants.DIR_NAME_OFFSETS))
+
+  /**
+   * A log that records the batch ids that have completed. This is used to check if a batch was
+   * fully processed, and its output was committed to the sink, hence no need to process it again.
+   * This is used (for instance) during restart, to help identify which batch to run next.
+   */
+  lazy val commitLog =
+    new CommitLog(sparkSession, checkpointFile(StreamingCheckpointConstants.DIR_NAME_COMMITS))
+
+  /** Metadata associated with the whole query */
+  final lazy val streamMetadata: StreamMetadata = {
+    val metadataPath = new Path(checkpointFile(StreamingCheckpointConstants.DIR_NAME_METADATA))
+    val hadoopConf = sparkSession.sessionState.newHadoopConf()
+    StreamMetadata.read(metadataPath, hadoopConf).getOrElse {
+      val newMetadata = new StreamMetadata(UUID.randomUUID.toString)
+      StreamMetadata.write(newMetadata, metadataPath, hadoopConf)
+      newMetadata
+    }
+  }
+
+  /** Returns the path of a file with `name` in the checkpoint directory. */
+  final protected def checkpointFile(name: String): String =
+    new Path(new Path(resolvedCheckpointRoot), name).toString
+
+}
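As a further hedged sketch (not part of the diff), the same interface also exposes the query-level metadata; `streamMetadata` is created lazily, writing a new file with a random UUID under `<checkpoint>/metadata` if none exists yet:

```scala
// Hypothetical read of the persisted query id through the new interface;
// `spark` and `checkpointLocation` are the placeholders from the earlier sketch.
val queryId: String =
  new StreamingQueryCheckpointMetadata(spark, checkpointLocation).streamMetadata.id
```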

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/state/OperatorStateMetadata.scala

Lines changed: 4 additions & 5 deletions
@@ -31,9 +31,9 @@ import org.json4s.jackson.Serialization
 import org.apache.spark.internal.{Logging, LogKeys, MDC}
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.execution.datasources.v2.state.StateDataSourceErrors
-import org.apache.spark.sql.execution.streaming.{CheckpointFileManager, CommitLog, MetadataVersionUtil, OffsetSeqLog, StateStoreWriter}
+import org.apache.spark.sql.execution.streaming.{CheckpointFileManager, CommitLog, MetadataVersionUtil, StateStoreWriter, StreamingQueryCheckpointMetadata}
 import org.apache.spark.sql.execution.streaming.CheckpointFileManager.CancellableFSDataOutputStream
-import org.apache.spark.sql.execution.streaming.StreamingCheckpointConstants.{DIR_NAME_COMMITS, DIR_NAME_OFFSETS}
+import org.apache.spark.sql.execution.streaming.StreamingCheckpointConstants.DIR_NAME_OFFSETS
 import org.apache.spark.sql.execution.streaming.state.OperatorStateMetadataUtils.{OperatorStateMetadataReader, OperatorStateMetadataWriter}

 /**
@@ -172,14 +172,13 @@ object OperatorStateMetadataUtils extends Logging {
   }

   def getLastOffsetBatch(session: SparkSession, checkpointLocation: String): Long = {
-    val offsetLog = new OffsetSeqLog(session,
-      new Path(checkpointLocation, DIR_NAME_OFFSETS).toString)
+    val offsetLog = new StreamingQueryCheckpointMetadata(session, checkpointLocation).offsetLog
     offsetLog.getLatest().map(_._1).getOrElse(throw
       StateDataSourceErrors.offsetLogUnavailable(0, checkpointLocation))
   }

   def getLastCommittedBatch(session: SparkSession, checkpointLocation: String): Option[Long] = {
-    val commitLog = new CommitLog(session, new Path(checkpointLocation, DIR_NAME_COMMITS).toString)
+    val commitLog = new StreamingQueryCheckpointMetadata(session, checkpointLocation).commitLog
     commitLog.getLatest().map(_._1)
   }
 }
