OpenZFS 9962 - zil_commit should omit cache thrash
As a result of the changes made in 8585, an excessive number of vdev
flush commands can be issued under some workloads.

Specifically, when the workload consists mostly of async write activity,
interspersed with some sync write and/or fsync activity, we can end up
issuing more flush commands to the underlying storage than are actually
necessary. These extra flush commands hurt the pool: write latency
increases and overall throughput decreases.

Currently, any time an lwb completes, the vdev(s) written to as a result
of that lwb will be issued a flush command. The intention is to ensure
the data written to that vdev is on stable storage before communicating
to any waiting threads that their data is safe on disk.

The problem with this scheme is that sometimes an lwb will not have any
threads waiting for it to complete. This can occur when async activity
gets "converted" to sync requests as a result of calling the
zil_async_to_sync() function via zil_commit_impl(). When this occurs,
the current code may issue many lwbs that have no waiters associated
with them, resulting in many flush commands, potentially to the same
vdev(s).

For example, given a pool with a single vdev, and a single fsync() call
that results in 10 lwbs being written out (e.g. due to other async
writes), 10 flush commands will be issued to that single vdev (one after
each lwb write completes). Ideally, we'd issue only a single flush
command to that vdev, after all 10 lwb writes have completed.

Further, and most importantly as it pertains to this change, since flush
commands are often very costly to the performance of the pool's
underlying storage, unnecessarily issuing these flush commands can
degrade the performance of the lwb writes themselves. Thus, we need to
avoid issuing flush commands when possible, in order to achieve the best
possible performance from the pool's underlying storage.

This change attempts to address this problem by changing the ZIL's logic
to only issue a vdev flush command when it detects an lwb that has a
thread waiting for it to complete. When an lwb does not have threads
waiting for it, the responsibility of issuing the flush command to the
vdevs involved with that lwb's write is passed on to the "next" lwb.
Only once a write for an lwb with waiters completes do we issue the
vdev flush command(s). As a result, when we issue the flush(es), we
issue them to the vdevs involved with that specific lwb's write, but
potentially also to vdevs involved with "previous" lwb writes (i.e.
if the previous lwbs did not have waiters associated with them).

Thus, in our prior example with 10 lwbs, only once the last lwb
completes (the lwb containing the waiter for the thread that called
fsync) do we issue the vdev flush command; all of the other lwbs will
find they have no waiters, so they pass the responsibility of the flush
to the "next" lwb (until reaching the last lwb, which has the waiter).

Porting Notes:
* Reconciled conflicts with the fastwrite feature.

Authored by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Joshua M. Clulow <josh@sysmgr.org>
Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>

OpenZFS-issue: https://www.illumos.org/issues/9962
OpenZFS-commit: openzfs/openzfs@545190c6
prakashsurya authored and behlendorf committed Dec 5, 2018
1 parent b53cb02 commit 61e145a
Showing 6 changed files with 206 additions and 78 deletions.
4 changes: 1 addition & 3 deletions cmd/ztest/ztest.c
@@ -2157,6 +2157,7 @@ zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = {
* ZIL get_data callbacks
*/

/* ARGSUSED */
static void
ztest_get_done(zgd_t *zgd, int error)
{
@@ -2169,9 +2170,6 @@ ztest_get_done(zgd_t *zgd, int error)
ztest_range_unlock((rl_t *)zgd->zgd_lr);
ztest_object_unlock(zd, object);

if (error == 0 && zgd->zgd_bp)
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);

umem_free(zgd, sizeof (*zgd));
}

14 changes: 8 additions & 6 deletions include/sys/zil_impl.h
@@ -47,10 +47,11 @@ extern "C" {
* via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must
* be held when making this transition.
*
* After the lwb's zio completes, and the vdev's are flushed, the lwb
* will transition into the "done" state via zil_lwb_write_done(). When
* transitioning from "issued" to "done", the zilog's "zl_lock" must be
* held, *not* the "zl_issuer_lock".
* After the lwb's write zio completes, it transitions into the "write
* done" state via zil_lwb_write_done(); and then into the "flush done"
* state via zil_lwb_flush_vdevs_done(). When transitioning from
* "issued" to "write done", and then from "write done" to "flush done",
* the zilog's "zl_lock" must be held, *not* the "zl_issuer_lock".
*
* The zilog's "zl_issuer_lock" can become heavily contended in certain
* workloads, so we specifically avoid acquiring that lock when
@@ -67,13 +68,14 @@
* "zl_issuer_lock" will prevent a concurrent thread from transitioning
* that lwb to the "issued" state. Likewise, if an lwb is already in the
* "issued" state, holding the "zl_lock" will prevent a concurrent
* thread from transitioning that lwb to the "done" state.
* thread from transitioning that lwb to the "write done" state.
*/
typedef enum {
LWB_STATE_CLOSED,
LWB_STATE_OPENED,
LWB_STATE_ISSUED,
LWB_STATE_DONE,
LWB_STATE_WRITE_DONE,
LWB_STATE_FLUSH_DONE,
LWB_NUM_STATES
} lwb_state_t;

34 changes: 26 additions & 8 deletions module/zfs/dmu.c
@@ -1757,6 +1757,15 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
dmu_sync_arg_t *dsa = varg;
dbuf_dirty_record_t *dr = dsa->dsa_dr;
dmu_buf_impl_t *db = dr->dr_dbuf;
zgd_t *zgd = dsa->dsa_zgd;

/*
* Record the vdev(s) backing this blkptr so they can be flushed after
* the writes for the lwb have completed.
*/
if (zio->io_error == 0) {
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
}

mutex_enter(&db->db_mtx);
ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
@@ -1806,14 +1815,23 @@ dmu_sync_late_arrival_done(zio_t *zio)
{
blkptr_t *bp = zio->io_bp;
dmu_sync_arg_t *dsa = zio->io_private;
ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);

if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
ASSERT(zio->io_bp->blk_birth == zio->io_txg);
ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
zgd_t *zgd = dsa->dsa_zgd;

if (zio->io_error == 0) {
/*
* Record the vdev(s) backing this blkptr so they can be
* flushed after the writes for the lwb have completed.
*/
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);

if (!BP_IS_HOLE(bp)) {
ASSERTV(blkptr_t *bp_orig = &zio->io_bp_orig);
ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
ASSERT(zio->io_bp->blk_birth == zio->io_txg);
ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
}
}

dmu_tx_commit(dsa->dsa_tx);
10 changes: 2 additions & 8 deletions module/zfs/zfs_vnops.c
@@ -976,6 +976,7 @@ zfs_iput_async(struct inode *ip)
iput(ip);
}

/* ARGSUSED */
void
zfs_get_done(zgd_t *zgd, int error)
{
@@ -992,9 +993,6 @@ zfs_get_done(zgd_t *zgd, int error)
*/
zfs_iput_async(ZTOI(zp));

if (error == 0 && zgd->zgd_bp)
zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);

kmem_free(zgd, sizeof (zgd_t));
}

@@ -1118,11 +1116,7 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
* TX_WRITE2 relies on the data previously
* written by the TX_WRITE that caused
* EALREADY. We zero out the BP because
* it is the old, currently-on-disk BP,
* so there's no need to zio_flush() its
* vdevs (flushing would needlesly hurt
* performance, and doesn't work on
* indirect vdevs).
* it is the old, currently-on-disk BP.
*/
zgd->zgd_bp = NULL;
BP_ZERO(bp);
