From 90ff732358390ac395999577b703dcb4d3e0df59 Mon Sep 17 00:00:00 2001 From: Rob N Date: Thu, 21 Mar 2024 04:08:50 +1100 Subject: [PATCH 01/34] freebsd: fix missing headers in distribution tarball arc_os.h and freebsd_event.h aren't included in release tarballs, so the build fails on FreeBSD. This fixes it. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Tony Hutter Reviewed-by: Alexander Motin Reviewed-by: Tino Reichardt Signed-off-by: Rob Norris Closes #15963 --- include/os/freebsd/Makefile.am | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am index 9819e534b7f6..551f75f42a20 100644 --- a/include/os/freebsd/Makefile.am +++ b/include/os/freebsd/Makefile.am @@ -80,7 +80,9 @@ noinst_HEADERS = \ %D%/spl/sys/zmod.h \ %D%/spl/sys/zone.h \ \ + %D%/zfs/sys/arc_os.h \ %D%/zfs/sys/freebsd_crypto.h \ + %D%/zfs/sys/freebsd_event.h \ %D%/zfs/sys/vdev_os.h \ %D%/zfs/sys/zfs_bootenv_os.h \ %D%/zfs/sys/zfs_context_os.h \ From ef08a4d4065d21414d7fedccac20da6bfda4dfd0 Mon Sep 17 00:00:00 2001 From: Rob N Date: Thu, 21 Mar 2024 10:46:15 +1100 Subject: [PATCH 02/34] Linux 6.8 compat: use splice_copy_file_range() for fallback Linux 6.8 removes generic_copy_file_range(), which had been reduced to a simple wrapper around splice_copy_file_range(). Detect that function directly and use it if generic_ is not available. Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Tony Hutter Reviewed by: Brian Behlendorf Signed-off-by: Rob Norris Closes #15930 Closes #15931 --- config/kernel-vfs-file_range.m4 | 27 +++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ module/os/linux/zfs/zpl_file_range.c | 16 ++++++++++++++-- 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/config/kernel-vfs-file_range.m4 b/config/kernel-vfs-file_range.m4 index cc96404d8bbe..8a5cbe2eeeed 100644 --- a/config/kernel-vfs-file_range.m4 +++ b/config/kernel-vfs-file_range.m4 @@ -16,6 +16,9 @@ dnl # dnl # 5.3: VFS copy_file_range() expected to do its own fallback, dnl # generic_copy_file_range() added to support it dnl # +dnl # 6.8: generic_copy_file_range() removed, replaced by +dnl # splice_copy_file_range() +dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE], [ ZFS_LINUX_TEST_SRC([vfs_copy_file_range], [ #include @@ -72,6 +75,30 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE], [ ]) ]) +AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE], [ + ZFS_LINUX_TEST_SRC([splice_copy_file_range], [ + #include + ], [ + struct file *src_file __attribute__ ((unused)) = NULL; + loff_t src_off __attribute__ ((unused)) = 0; + struct file *dst_file __attribute__ ((unused)) = NULL; + loff_t dst_off __attribute__ ((unused)) = 0; + size_t len __attribute__ ((unused)) = 0; + splice_copy_file_range(src_file, src_off, dst_file, dst_off, + len); + ]) +]) +AC_DEFUN([ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE], [ + AC_MSG_CHECKING([whether splice_copy_file_range() is available]) + ZFS_LINUX_TEST_RESULT([splice_copy_file_range], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_VFS_SPLICE_COPY_FILE_RANGE, 1, + [splice_copy_file_range() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE], [ ZFS_LINUX_TEST_SRC([vfs_clone_file_range], [ #include diff --git a/config/kernel.m4 b/config/kernel.m4 index e3f8645774c5..1d0c5a27fc7f 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -118,6 +118,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_VFS_IOV_ITER ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE 
ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE + ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE @@ -266,6 +267,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_VFS_IOV_ITER ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE + ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 3065d54fa9da..64728fdb1187 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -26,6 +26,9 @@ #include #endif #include +#ifdef HAVE_VFS_SPLICE_COPY_FILE_RANGE +#include +#endif #include #include #include @@ -102,7 +105,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, ret = zpl_clone_file_range_impl(src_file, src_off, dst_file, dst_off, len); -#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE +#if defined(HAVE_VFS_GENERIC_COPY_FILE_RANGE) /* * Since Linux 5.3 the filesystem driver is responsible for executing * an appropriate fallback, and a generic fallback function is provided. @@ -111,6 +114,15 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, ret == -EAGAIN) ret = generic_copy_file_range(src_file, src_off, dst_file, dst_off, len, flags); +#elif defined(HAVE_VFS_SPLICE_COPY_FILE_RANGE) + /* + * Since 6.8 the fallback function is called splice_copy_file_range + * and has a slightly different signature. + */ + if (ret == -EOPNOTSUPP || ret == -EINVAL || ret == -EXDEV || + ret == -EAGAIN) + ret = splice_copy_file_range(src_file, src_off, dst_file, + dst_off, len); #else /* * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal @@ -118,7 +130,7 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, */ if (ret == -EINVAL || ret == -EXDEV || ret == -EAGAIN) ret = -EOPNOTSUPP; -#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ +#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE || HAVE_VFS_SPLICE_COPY_FILE_RANGE */ return (ret); } From 45e23abed55cc1c7216e98df28f1b6c6f172b790 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 20 Mar 2024 20:22:36 -0400 Subject: [PATCH 03/34] Update resume token at object receive. Before this change the resume token was updated only on data receive. Usually that is enough to resume replication without much overlap. But we've got a report of a curious case, where the replication source was traversed with a recursive grep, which, with atime enabled, modified every object without modifying any data. It produced several gigabytes of replication traffic without a single data write and so without a single resume point. While the resume token was not designed to resume from an object, I've found that the send implementation always sends the object record before any data. So by requesting resume from offset 0 we are effectively resuming from the object, followed (or not) by the data at offset 0, just as we need it. Reviewed-by: Allan Jude Reviewed-by: Paul Dagnelie Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc.
Closes #15927 --- module/zfs/dmu_recv.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 54aa60259ea1..2cf10909738b 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -2110,6 +2110,16 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, dmu_buf_rele(db, FTAG); dnode_rele(dn, FTAG); } + + /* + * If the receive fails, we want the resume stream to start with the + * same record that we last successfully received. There is no way to + * request resume from the object record, but we can benefit from the + * fact that sender always sends object record before anything else, + * after which it will "resend" data at offset 0 and resume normally. + */ + save_resume_state(rwa, drro->drr_object, 0, tx); + dmu_tx_commit(tx); return (0); From c9d8f6c59a268f65075bb9e510a58b1eec8015f7 Mon Sep 17 00:00:00 2001 From: Cameron Harr Date: Thu, 21 Mar 2024 09:00:29 -0700 Subject: [PATCH 04/34] Fix option string, adding -e and fixing order The recently added '-e' option (PR #15769) missed adding the new option in the online `zpool status` help command. This adds the options and reorders a couple of the other options that were not listed alphabetically. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Cameron Harr Closes #16008 --- cmd/zpool/zpool_main.c | 39 +++++++++++++++++++-------------------- man/man8/zpool-status.8 | 18 +++++++++--------- 2 files changed, 28 insertions(+), 29 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 0783271f4734..987d44062865 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -413,7 +413,7 @@ get_usage(zpool_help_t idx) "[ ...]\n")); case HELP_STATUS: return (gettext("\tstatus [--power] [-c [script1,script2,...]] " - "[-igLpPstvxD] [-T d|u] [pool] ... \n" + "[-DegiLpPstvx] [-T d|u] [pool] ...\n" "\t [interval [count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" @@ -9177,22 +9177,22 @@ status_callback(zpool_handle_t *zhp, void *data) } /* - * zpool status [-c [script1,script2,...]] [-igLpPstvx] [--power] [-T d|u] ... + * zpool status [-c [script1,script2,...]] [-DegiLpPstvx] [--power] [-T d|u] ... * [pool] [interval [count]] * * -c CMD For each vdev, run command CMD + * -D Display dedup status (undocumented) * -e Display only unhealthy vdevs - * -i Display vdev initialization status. * -g Display guid for individual vdev name. + * -i Display vdev initialization status. * -L Follow links when resolving vdev path name. * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -s Display slow IOs column. - * -v Display complete error logs - * -x Display only pools with potential problems - * -D Display dedup status (undocumented) * -t Display vdev TRIM status. * -T Display a timestamp in date(1) or Unix format + * -v Display complete error logs + * -x Display only pools with potential problems * --power Display vdev enclosure slot power status * * Describes the health status of all pools or some subset. 
@@ -9213,7 +9213,7 @@ zpool_do_status(int argc, char **argv) }; /* check options */ - while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options, + while ((c = getopt_long(argc, argv, "c:DegiLpPstT:vx", long_options, NULL)) != -1) { switch (c) { case 'c': @@ -9240,15 +9240,18 @@ zpool_do_status(int argc, char **argv) } cmd = optarg; break; + case 'D': + cb.cb_dedup_stats = B_TRUE; + break; case 'e': cb.cb_print_unhealthy = B_TRUE; break; - case 'i': - cb.cb_print_vdev_init = B_TRUE; - break; case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; + case 'i': + cb.cb_print_vdev_init = B_TRUE; + break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; @@ -9261,21 +9264,18 @@ zpool_do_status(int argc, char **argv) case 's': cb.cb_print_slow_ios = B_TRUE; break; - case 'v': - cb.cb_verbose = B_TRUE; - break; - case 'x': - cb.cb_explain = B_TRUE; - break; - case 'D': - cb.cb_dedup_stats = B_TRUE; - break; case 't': cb.cb_print_vdev_trim = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; + case 'v': + cb.cb_verbose = B_TRUE; + break; + case 'x': + cb.cb_explain = B_TRUE; + break; case POWER_OPT: cb.cb_print_power = B_TRUE; break; @@ -9315,7 +9315,6 @@ zpool_do_status(int argc, char **argv) if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); - if (argc == 0 && cb.cb_count == 0) (void) fprintf(stderr, gettext("no pools available\n")); else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 24ad6e643cae..bbe7a45aa0c6 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -36,7 +36,7 @@ .Sh SYNOPSIS .Nm zpool .Cm status -.Op Fl DeigLpPstvx +.Op Fl DegiLpPstvx .Op Fl T Sy u Ns | Ns Sy d .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … .Oo Ar pool Oc Ns … @@ -69,14 +69,20 @@ See the option of .Nm zpool Cm iostat for complete details. +.It Fl D +Display a histogram of deduplication statistics, showing the allocated +.Pq physically present on disk +and referenced +.Pq logically referenced in the pool +block counts and sizes by reference count. .It Fl e Only show unhealthy vdevs (not-ONLINE or with errors). -.It Fl i -Display vdev initialization status. .It Fl g Display vdev GUIDs instead of the normal device names These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. +.It Fl i +Display vdev initialization status. .It Fl L Display real paths for vdevs resolving all symbolic links. This can be used to look up the current block device name regardless of the @@ -90,12 +96,6 @@ the path. This can be used in conjunction with the .Fl L flag. -.It Fl D -Display a histogram of deduplication statistics, showing the allocated -.Pq physically present on disk -and referenced -.Pq logically referenced in the pool -block counts and sizes by reference count. .It Fl s Display the number of leaf vdev slow I/O operations. This is the number of I/O operations that didn't complete in From 5c4a4f82c850be6540076ff794d25defd826dddf Mon Sep 17 00:00:00 2001 From: Rob N Date: Fri, 22 Mar 2024 06:10:04 +1100 Subject: [PATCH 05/34] zio: update ZIO type x stage documentation - add column for TRIM ZIOs - remove R from ZIO_STAGE_ISSUE_ASYNC, never happened - remove I from ZIO_STAGE_VDEV_IO_DONE, never happened Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Reviewed by: Brian Behlendorf Signed-off-by: Rob Norris Closes #15959 --- include/sys/zio_impl.h | 56 +++++++++++++++++++++------------------- man/man8/zpool-events.8 | 57 +++++++++++++++++++++-------------------- 2 files changed, 58 insertions(+), 55 deletions(-) diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h index febe0a87b428..1c0a44059d24 100644 --- a/include/sys/zio_impl.h +++ b/include/sys/zio_impl.h @@ -25,6 +25,7 @@ /* * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2024, Klara Inc. */ #ifndef _ZIO_IMPL_H @@ -39,7 +40,7 @@ extern "C" { * * The ZFS I/O pipeline is comprised of various stages which are defined * in the zio_stage enum below. The individual stages are used to construct - * these basic I/O operations: Read, Write, Free, Claim, and Ioctl. + * these basic I/O operations: Read, Write, Free, Claim, Ioctl and Trim. * * I/O operations: (XXX - provide detail for each of the operations) * @@ -48,6 +49,7 @@ extern "C" { * Free: * Claim: * Ioctl: + * Trim: * * Although the most common pipeline are used by the basic I/O operations * above, there are some helper pipelines (one could consider them @@ -120,43 +122,43 @@ extern "C" { * zio pipeline stage definitions */ enum zio_stage { - ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */ + ZIO_STAGE_OPEN = 1 << 0, /* RWFCIT */ - ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */ - ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */ - ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */ - ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */ - ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */ + ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R----- */ + ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W---- */ + ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F--- */ + ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* -WF--T */ + ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W---- */ - ZIO_STAGE_ENCRYPT = 1 << 6, /* -W--- */ - ZIO_STAGE_CHECKSUM_GENERATE = 1 << 7, /* -W--- */ + ZIO_STAGE_ENCRYPT = 1 << 6, /* -W---- */ + ZIO_STAGE_CHECKSUM_GENERATE = 1 << 7, /* -W---- */ - ZIO_STAGE_NOP_WRITE = 1 << 8, /* -W--- */ + ZIO_STAGE_NOP_WRITE = 1 << 8, /* -W---- */ - ZIO_STAGE_BRT_FREE = 1 << 9, /* --F-- */ + ZIO_STAGE_BRT_FREE = 1 << 9, /* --F--- */ - ZIO_STAGE_DDT_READ_START = 1 << 10, /* R---- */ - ZIO_STAGE_DDT_READ_DONE = 1 << 11, /* R---- */ - ZIO_STAGE_DDT_WRITE = 1 << 12, /* -W--- */ - ZIO_STAGE_DDT_FREE = 1 << 13, /* --F-- */ + ZIO_STAGE_DDT_READ_START = 1 << 10, /* R----- */ + ZIO_STAGE_DDT_READ_DONE = 1 << 11, /* R----- */ + ZIO_STAGE_DDT_WRITE = 1 << 12, /* -W---- */ + ZIO_STAGE_DDT_FREE = 1 << 13, /* --F--- */ - ZIO_STAGE_GANG_ASSEMBLE = 1 << 14, /* RWFC- */ - ZIO_STAGE_GANG_ISSUE = 1 << 15, /* RWFC- */ + ZIO_STAGE_GANG_ASSEMBLE = 1 << 14, /* RWFC-- */ + ZIO_STAGE_GANG_ISSUE = 1 << 15, /* RWFC-- */ - ZIO_STAGE_DVA_THROTTLE = 1 << 16, /* -W--- */ - ZIO_STAGE_DVA_ALLOCATE = 1 << 17, /* -W--- */ - ZIO_STAGE_DVA_FREE = 1 << 18, /* --F-- */ - ZIO_STAGE_DVA_CLAIM = 1 << 19, /* ---C- */ + ZIO_STAGE_DVA_THROTTLE = 1 << 16, /* -W---- */ + ZIO_STAGE_DVA_ALLOCATE = 1 << 17, /* -W---- */ + ZIO_STAGE_DVA_FREE = 1 << 18, /* --F--- */ + ZIO_STAGE_DVA_CLAIM = 1 << 19, /* ---C-- */ - ZIO_STAGE_READY = 1 << 20, /* RWFCI */ + ZIO_STAGE_READY = 1 << 20, /* RWFCIT */ - ZIO_STAGE_VDEV_IO_START = 1 << 21, /* RW--I */ - ZIO_STAGE_VDEV_IO_DONE = 1 << 22, /* RW--I */ - ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--I */ + ZIO_STAGE_VDEV_IO_START = 1 << 21, /* RW--IT */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 22, /* RW---T */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--IT */ - ZIO_STAGE_CHECKSUM_VERIFY = 1 << 24, /* R---- */ 
+ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 24, /* R----- */ - ZIO_STAGE_DONE = 1 << 25 /* RWFCI */ + ZIO_STAGE_DONE = 1 << 25 /* RWFCIT */ }; #define ZIO_ROOT_PIPELINE \ diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index e1436f6ded57..a7a9e33442da 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -25,8 +25,9 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2024, Klara Inc. .\" -.Dd July 11, 2023 +.Dd February 28, 2024 .Dt ZPOOL-EVENTS 8 .Os . @@ -363,7 +364,7 @@ that is, the bits set in the good data which are cleared in the bad data. .Sh I/O STAGES The ZFS I/O pipeline is comprised of various stages which are defined below. The individual stages are used to construct these basic I/O -operations: Read, Write, Free, Claim, and Ioctl. +operations: Read, Write, Free, Claim, Ioctl and Trim. These stages may be set on an event to describe the life cycle of a given I/O request. .Pp @@ -372,43 +373,43 @@ tab(:); l l l . Stage:Bit Mask:Operations _:_:_ -ZIO_STAGE_OPEN:0x00000001:RWFCI +ZIO_STAGE_OPEN:0x00000001:RWFCIT -ZIO_STAGE_READ_BP_INIT:0x00000002:R---- -ZIO_STAGE_WRITE_BP_INIT:0x00000004:-W--- -ZIO_STAGE_FREE_BP_INIT:0x00000008:--F-- -ZIO_STAGE_ISSUE_ASYNC:0x00000010:RWF-- -ZIO_STAGE_WRITE_COMPRESS:0x00000020:-W--- +ZIO_STAGE_READ_BP_INIT:0x00000002:R----- +ZIO_STAGE_WRITE_BP_INIT:0x00000004:-W---- +ZIO_STAGE_FREE_BP_INIT:0x00000008:--F--- +ZIO_STAGE_ISSUE_ASYNC:0x00000010:-WF--T +ZIO_STAGE_WRITE_COMPRESS:0x00000020:-W---- -ZIO_STAGE_ENCRYPT:0x00000040:-W--- -ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W--- +ZIO_STAGE_ENCRYPT:0x00000040:-W---- +ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W---- -ZIO_STAGE_NOP_WRITE:0x00000100:-W--- +ZIO_STAGE_NOP_WRITE:0x00000100:-W---- -ZIO_STAGE_BRT_FREE:0x00000200:--F-- +ZIO_STAGE_BRT_FREE:0x00000200:--F--- -ZIO_STAGE_DDT_READ_START:0x00000400:R---- -ZIO_STAGE_DDT_READ_DONE:0x00000800:R---- -ZIO_STAGE_DDT_WRITE:0x00001000:-W--- -ZIO_STAGE_DDT_FREE:0x00002000:--F-- +ZIO_STAGE_DDT_READ_START:0x00000400:R----- +ZIO_STAGE_DDT_READ_DONE:0x00000800:R----- +ZIO_STAGE_DDT_WRITE:0x00001000:-W---- +ZIO_STAGE_DDT_FREE:0x00002000:--F--- -ZIO_STAGE_GANG_ASSEMBLE:0x00004000:RWFC- -ZIO_STAGE_GANG_ISSUE:0x00008000:RWFC- +ZIO_STAGE_GANG_ASSEMBLE:0x00004000:RWFC-- +ZIO_STAGE_GANG_ISSUE:0x00008000:RWFC-- -ZIO_STAGE_DVA_THROTTLE:0x00010000:-W--- -ZIO_STAGE_DVA_ALLOCATE:0x00020000:-W--- -ZIO_STAGE_DVA_FREE:0x00040000:--F-- -ZIO_STAGE_DVA_CLAIM:0x00080000:---C- +ZIO_STAGE_DVA_THROTTLE:0x00010000:-W---- +ZIO_STAGE_DVA_ALLOCATE:0x00020000:-W---- +ZIO_STAGE_DVA_FREE:0x00040000:--F--- +ZIO_STAGE_DVA_CLAIM:0x00080000:---C-- -ZIO_STAGE_READY:0x00100000:RWFCI +ZIO_STAGE_READY:0x00100000:RWFCIT -ZIO_STAGE_VDEV_IO_START:0x00200000:RW--I -ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW--I -ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--I +ZIO_STAGE_VDEV_IO_START:0x00200000:RW--IT +ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW---T +ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--IT -ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R---- +ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R----- -ZIO_STAGE_DONE:0x02000000:RWFCI +ZIO_STAGE_DONE:0x02000000:RWFCIT .TE . .Sh I/O FLAGS From 2c01cae8b9faca5766629aa45b2bfabaeae92e4d Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 21 Mar 2024 18:42:21 -0400 Subject: [PATCH 06/34] BRT: Change brt_pending_tree sorting order It does not look important how exactly brt_pending_tree is sorted. 
When cloning a large file, it is quite likely that all of its blocks have identical physical birth times, so comparing them first provides no useful entropy while accessing an additional cache line. In most cases the combination of vdev and offset provides a unique result and the physical birth time comparison is not even needed. Meanwhile, when traversing the tree inside brt_pending_apply(), it can be beneficial for dbuf cache and CPU cache hits to group processing by vdev and so by the per-VDEV BRT ZAPs. Reviewed-by: Rob Norris Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15954 --- module/zfs/brt.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 225ddaca1e54..3d565cd1397c 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -1420,13 +1420,14 @@ brt_pending_entry_compare(const void *x1, const void *x2) const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp; int cmp; - cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2)); + cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), + DVA_GET_VDEV(&bp2->blk_dva[0])); if (cmp == 0) { - cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), - DVA_GET_VDEV(&bp2->blk_dva[0])); - if (cmp == 0) { - cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), - DVA_GET_OFFSET(&bp2->blk_dva[0])); + cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), + DVA_GET_OFFSET(&bp2->blk_dva[0])); + if (unlikely(cmp == 0)) { + cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), + BP_PHYSICAL_BIRTH(bp2)); } } From f1b368359b3970f7995a6dcb088fdadb31840f4d Mon Sep 17 00:00:00 2001 From: Fabian-Gruenbichler Date: Fri, 22 Mar 2024 00:38:24 +0100 Subject: [PATCH 07/34] udev: correctly handle partition #16 and later MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a zvol has more than 15 partitions, the minor device number exhausts the slot count reserved for partitions next to the zvol itself. As a result, the minor number cannot be used to determine the partition number for the higher partitions, and doing so results in wrongly named symlinks being generated by udev. Since the partition number is encoded in the block device name anyway, let's just extract it from there instead.
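For illustration, here is a minimal sketch of the name-based approach taken in the diff below (assuming the usual ZVOL_MINORS of 16, so a zvol reserves minors 0-15 for itself and its first 15 partitions; dev_name and zvol_name are the existing variables in zvol_id.c):

    /*
     * "zd0p15" -> minor 15, and 15 % ZVOL_MINORS == 15 is still correct,
     * but "zd0p16" gets a minor outside the reserved range, so the modulo
     * math names the wrong partition.  The kernel device name always
     * encodes the partition number, so parse that instead.
     */
    const char *part = strrchr(dev_name, 'p');  /* "zd0p16" -> "p16" */
    if (part != NULL)
        sprintf(zvol_name + strlen(zvol_name), "-part%s", part + 1);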
Reviewed-by: Tony Hutter Reviewed by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Fabian Grünbichler Closes #15904 Closes #15970 --- udev/zvol_id.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/udev/zvol_id.c b/udev/zvol_id.c index 5960b978787a..609349594767 100644 --- a/udev/zvol_id.c +++ b/udev/zvol_id.c @@ -51,7 +51,7 @@ const char *__asan_default_options(void) { int main(int argc, const char *const *argv) { - if (argc != 2) { + if (argc != 2 || strncmp(argv[1], "/dev/zd", 7) != 0) { fprintf(stderr, "usage: %s /dev/zdX\n", argv[0]); return (1); } @@ -72,9 +72,10 @@ main(int argc, const char *const *argv) return (1); } - unsigned int dev_part = minor(sb.st_rdev) % ZVOL_MINORS; - if (dev_part != 0) - sprintf(zvol_name + strlen(zvol_name), "-part%u", dev_part); + const char *dev_part = strrchr(dev_name, 'p'); + if (dev_part != NULL) { + sprintf(zvol_name + strlen(zvol_name), "-part%s", dev_part + 1); + } for (size_t i = 0; i < strlen(zvol_name); ++i) if (isblank(zvol_name[i])) From c28f94f32ef0f104b731be0e44c5e61bbdf3b9b7 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 21 Mar 2024 19:43:53 -0400 Subject: [PATCH 08/34] ZAP: Some cleanups/micro-optimizations - Remove custom zap_memset(), use regular memset(). - Use PANIC() instead of opaque cmn_err(CE_PANIC). - Provide entry parameter to zap_leaf_rehash_entry(). - Reduce branching in zap_leaf_array_create() inner loop. - Remove signedness where it should not be. Should be no function changes. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15976 --- include/sys/zap_leaf.h | 8 ++--- module/zfs/zap_leaf.c | 77 +++++++++++++++++++----------------------- 2 files changed, 38 insertions(+), 47 deletions(-) diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h index ebc67c2bf465..d563edd7ba59 100644 --- a/include/sys/zap_leaf.h +++ b/include/sys/zap_leaf.h @@ -47,7 +47,7 @@ struct zap_stats; * entries - header space (2*chunksize) */ #define ZAP_LEAF_NUMCHUNKS_BS(bs) \ - (((1<<(bs)) - 2*ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \ + (((1U << (bs)) - 2 * ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \ ZAP_LEAF_CHUNKSIZE - 2) #define ZAP_LEAF_NUMCHUNKS(l) (ZAP_LEAF_NUMCHUNKS_BS(((l)->l_bs))) @@ -80,7 +80,7 @@ struct zap_stats; * chunks per entry (3). 
*/ #define ZAP_LEAF_HASH_SHIFT_BS(bs) ((bs) - 5) -#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1 << ZAP_LEAF_HASH_SHIFT_BS(bs)) +#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1U << ZAP_LEAF_HASH_SHIFT_BS(bs)) #define ZAP_LEAF_HASH_SHIFT(l) (ZAP_LEAF_HASH_SHIFT_BS(((l)->l_bs))) #define ZAP_LEAF_HASH_NUMENTRIES(l) (ZAP_LEAF_HASH_NUMENTRIES_BS(((l)->l_bs))) @@ -163,7 +163,7 @@ typedef struct zap_leaf { dmu_buf_user_t l_dbu; krwlock_t l_rwlock; uint64_t l_blkid; /* 1< #include -static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); +static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, + uint16_t entry); #define CHAIN_END 0xffff /* end of the chunk chain */ @@ -52,16 +53,6 @@ static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); #define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)]) -static void -zap_memset(void *a, int c, size_t n) -{ - char *cp = a; - char *cpend = cp + n; - - while (cp < cpend) - *cp++ = c; -} - static void stv(int len, void *addr, uint64_t value) { @@ -79,7 +70,7 @@ stv(int len, void *addr, uint64_t value) *(uint64_t *)addr = value; return; default: - cmn_err(CE_PANIC, "bad int len %d", len); + PANIC("bad int len %d", len); } } @@ -96,13 +87,13 @@ ldv(int len, const void *addr) case 8: return (*(uint64_t *)addr); default: - cmn_err(CE_PANIC, "bad int len %d", len); + PANIC("bad int len %d", len); } return (0xFEEDFACEDEADBEEFULL); } void -zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) +zap_leaf_byteswap(zap_leaf_phys_t *buf, size_t size) { zap_leaf_t l; dmu_buf_t l_dbuf; @@ -119,10 +110,10 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) + for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); struct zap_leaf_entry *le; @@ -160,11 +151,11 @@ void zap_leaf_init(zap_leaf_t *l, boolean_t sort) { l->l_bs = highbit64(l->l_dbuf->db_size) - 1; - zap_memset(&zap_leaf_phys(l)->l_hdr, 0, + memset(&zap_leaf_phys(l)->l_hdr, 0, sizeof (struct zap_leaf_header)); - zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; } @@ -185,7 +176,7 @@ zap_leaf_chunk_alloc(zap_leaf_t *l) { ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); - int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; + uint_t chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); @@ -223,28 +214,29 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf, { uint16_t chunk_head; uint16_t *chunkp = &chunk_head; - int byten = 0; + int byten = integer_size; uint64_t value = 0; int shift = (integer_size - 1) * 8; int len = num_integers; ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN); + if (len > 0) + value = ldv(integer_size, buf); while (len > 0) { uint16_t chunk = zap_leaf_chunk_alloc(l); struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; la->la_type = ZAP_CHUNK_ARRAY; for (int i = 0; i < 
ZAP_LEAF_ARRAY_BYTES; i++) { - if (byten == 0) - value = ldv(integer_size, buf); la->la_array[i] = value >> shift; value <<= 8; - if (++byten == integer_size) { - byten = 0; - buf += integer_size; + if (--byten == 0) { if (--len == 0) break; + byten = integer_size; + buf += integer_size; + value = ldv(integer_size, buf); } } @@ -264,7 +256,7 @@ zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) *chunkp = CHAIN_END; while (chunk != CHAIN_END) { - int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; + uint_t nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, ZAP_CHUNK_ARRAY); zap_leaf_chunk_free(l, chunk); @@ -333,7 +325,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, static boolean_t zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, - int chunk, int array_numints) + uint_t chunk, int array_numints) { int bseen = 0; @@ -562,7 +554,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, uint64_t valuelen = integer_size * num_integers; - int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * + uint_t numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) return (SET_ERROR(E2BIG)); @@ -624,7 +616,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, /* link it into the hash chain */ /* XXX if we did the search above, we could just use that */ - uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk); + uint16_t *chunkp = zap_leaf_rehash_entry(l, le, chunk); zap_leaf_phys(l)->l_hdr.lh_nentries++; @@ -687,9 +679,8 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, */ static uint16_t * -zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) +zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, uint16_t entry) { - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); struct zap_leaf_entry *le2; uint16_t *chunkp; @@ -722,7 +713,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) &ZAP_LEAF_CHUNK(nl, nchunk).l_array; struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int nextchunk = la->la_next; + uint_t nextchunk = la->la_next; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); @@ -739,7 +730,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) } static void -zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) +zap_leaf_transfer_entry(zap_leaf_t *l, uint_t entry, zap_leaf_t *nl) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); @@ -748,7 +739,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk); *nle = *le; /* structure assignment */ - (void) zap_leaf_rehash_entry(nl, chunk); + (void) zap_leaf_rehash_entry(nl, nle, chunk); nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); nle->le_value_chunk = @@ -766,7 +757,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) { - int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; + uint_t bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* set new prefix and prefix_len */ zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1; @@ -777,7 +768,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* break existing hash chains */ - 
zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); if (sort) @@ -792,7 +783,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) * but this accesses memory more sequentially, and when we're * called, the block is usually pretty full. */ - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); if (le->le_type != ZAP_CHUNK_ENTRY) continue; @@ -800,14 +791,14 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) if (le->le_hash & (1ULL << bit)) zap_leaf_transfer_entry(l, i, nl); else - (void) zap_leaf_rehash_entry(l, i); + (void) zap_leaf_rehash_entry(l, le, i); } } void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) { - int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + uint_t n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - zap_leaf_phys(l)->l_hdr.lh_prefix_len; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_leafs_with_2n_pointers[n]++; @@ -823,9 +814,9 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_blocks_n_tenths_full[n]++; - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { - int nentries = 0; - int chunk = zap_leaf_phys(l)->l_hash[i]; + for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { + uint_t nentries = 0; + uint_t chunk = zap_leaf_phys(l)->l_hash[i]; while (chunk != CHAIN_END) { struct zap_leaf_entry *le = From 102b468b5e190973fbaee6fe682727eb33079811 Mon Sep 17 00:00:00 2001 From: Robert Evans Date: Mon, 25 Mar 2024 17:56:49 -0400 Subject: [PATCH 09/34] Fix corruption caused by mmap flushing problems 1) Make mmap flushes synchronous. Linux may skip flushing dirty pages already in writeback unless data-integrity sync is requested. 2) Change zfs_putpage to use TXG_WAIT. Otherwise dirty pages may be skipped due to DMU pushing back on TX assign. 3) Add missing mmap flush when doing block cloning. 4) While here, pass errors from putpage to writepage/writepages. This change fixes corruption edge cases, but unfortunately adds synchronous ZIL flushes for dirty mmap pages to llseek and bclone operations. It may be possible to avoid these sync writes later but would need more tricky refactoring of the writeback code. 
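To make point 2 concrete, here is a condensed sketch of the transaction-assignment pattern being changed (variable declarations omitted; the real context is the zfs_putpage() hunk below):

    /*
     * Old: TXG_NOWAIT may fail with ERESTART when the DMU pushes back,
     * after which the page was redirtied and skipped for this writeback
     * pass.
     */
    err = dmu_tx_assign(tx, TXG_NOWAIT);
    if (err != 0) {
        if (err == ERESTART)
            dmu_tx_wait(tx);
        dmu_tx_abort(tx);
        /* redirty the page and give up on it for now */
    }

    /*
     * New: TXG_WAIT blocks until the transaction can be assigned, so the
     * page is always written out once we get this far.
     */
    err = dmu_tx_assign(tx, TXG_WAIT);
    if (err != 0)
        dmu_tx_abort(tx);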
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Robert Evans Closes #15933 Closes #16019 --- module/os/linux/zfs/zfs_vnops_os.c | 5 +---- module/os/linux/zfs/zpl_file.c | 8 ++++---- module/zfs/zfs_vnops.c | 6 +++++- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index a32307c39331..1cecad9f7755 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -3795,11 +3795,8 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); - err = dmu_tx_assign(tx, TXG_NOWAIT); + err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { - if (err == ERESTART) - dmu_tx_wait(tx); - dmu_tx_abort(tx); #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO filemap_dirty_folio(page_mapping(pp), page_folio(pp)); diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 3caa0fc6c214..9dec52215c7c 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -720,23 +720,23 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) { boolean_t *for_sync = data; fstrans_cookie_t cookie; + int ret; ASSERT(PageLocked(pp)); ASSERT(!PageWriteback(pp)); cookie = spl_fstrans_mark(); - (void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync); + ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync); spl_fstrans_unmark(cookie); - return (0); + return (ret); } #ifdef HAVE_WRITEPAGE_T_FOLIO static int zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data) { - (void) zpl_putpage(&pp->page, wbc, data); - return (0); + return (zpl_putpage(&pp->page, wbc, data)); } #endif diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 7f39ad6fc775..babb07ca25a9 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -123,7 +123,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) /* Flush any mmap()'d data to disk */ if (zn_has_cached_data(zp, 0, file_sz - 1)) - zn_flush_cached_data(zp, B_FALSE); + zn_flush_cached_data(zp, B_TRUE); lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER); error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); @@ -1187,6 +1187,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, } } + /* Flush any mmap()'d data to disk */ + if (zn_has_cached_data(inzp, inoff, inoff + len - 1)) + zn_flush_cached_data(inzp, B_TRUE); + /* * Maintain predictable lock order. */ From bf8f72359d1bf0cdb6a4b31ccfc7bbef0f948ca4 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 25 Mar 2024 17:58:04 -0400 Subject: [PATCH 10/34] BRT: Skip duplicate BRT prefetches If there is a pending entry for this block, then we've already issued BRT prefetch for it within this TXG, so don't do it again. BRT vdev lookup and following zap_prefetch_uint64() call can be pretty expensive and should be avoided when not necessary. Reviewed-by: Pawel Jakub Dawidek Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15941 --- module/zfs/brt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 3d565cd1397c..7ddec0b4b9bb 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -1472,10 +1472,10 @@ brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) kmem_cache_free(brt_pending_entry_cache, newbpe); } else { ASSERT(bpe == NULL); - } - /* Prefetch BRT entry, as we will need it in the syncing context. */ - brt_prefetch(brt, bp); + /* Prefetch BRT entry for the syncing context. */ + brt_prefetch(brt, bp); + } } void From 80cc516295fef1a429542fcfeea369c6bbb85ce4 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 25 Mar 2024 17:58:50 -0400 Subject: [PATCH 11/34] ZAP: Massively switch to _by_dnode() interfaces Before this change ZAP called dnode_hold() for almost every block access, which was clearly visible in the profiler under heavy load, such as BRT. This patch makes it always hold the dnode reference between zap_lockdir() and zap_unlockdir(), which allows most of the dnode operations in between to be avoided. It also adds several new _by_dnode() APIs to ZAP and uses them in the BRT code, and adds a dmu_prefetch_by_dnode() variant that is used in the ZAP code. After this there remains only one call to dmu_buf_dnode_enter(), which seems to be unneeded. So remove the call and the functions. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15951 --- include/sys/dmu.h | 4 +- include/sys/zap.h | 8 ++ include/sys/zap_impl.h | 1 + module/zfs/brt.c | 72 +++----------- module/zfs/dbuf.c | 15 --- module/zfs/dmu.c | 18 +++- module/zfs/dmu_recv.c | 7 +- module/zfs/zap.c | 43 ++++----- module/zfs/zap_micro.c | 206 +++++++++++++++++++++++++++++------------ 9 files changed, 202 insertions(+), 172 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 921f51f27a20..b5fed64da4ad 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -752,8 +752,6 @@ void dmu_buf_sub_user_size(dmu_buf_t *db, uint64_t nsub); void *dmu_buf_get_user(dmu_buf_t *db); objset_t *dmu_buf_get_objset(dmu_buf_t *db); -dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); -void dmu_buf_dnode_exit(dmu_buf_t *db); /* Block until any in-progress dmu buf user evictions complete. */ void dmu_buf_user_evict_wait(void); @@ -902,6 +900,8 @@ extern uint_t zfs_max_recordsize; */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); +void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, + uint64_t len, enum zio_priority pri); void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri); typedef struct dmu_object_info { diff --git a/include/sys/zap.h b/include/sys/zap.h index 308a7c7284d7..96ddcc324b65 100644 --- a/include/sys/zap.h +++ b/include/sys/zap.h @@ -253,6 +253,9 @@ int zap_add_by_dnode(dnode_t *dn, const char *key, int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); /* * Set the attribute with the given name to the given value.
If an @@ -267,6 +270,9 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* * Get the length (in integers) and the integer size of the specified @@ -292,6 +298,8 @@ int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx); int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx); +int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, dmu_tx_t *tx); /* * Returns (in *count) the number of attributes in the specified zap diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index 74853f5faceb..2959aa9b2ca4 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -145,6 +145,7 @@ typedef struct zap { dmu_buf_user_t zap_dbu; objset_t *zap_objset; uint64_t zap_object; + dnode_t *zap_dnode; struct dmu_buf *zap_dbuf; krwlock_t zap_rwlock; boolean_t zap_ismicro; diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 7ddec0b4b9bb..5e10df9dfe56 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -955,52 +955,10 @@ brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) if (mos_entries == 0) return; - BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu", - (u_longlong_t)mos_entries, (u_longlong_t)vdevid, - (u_longlong_t)bre->bre_offset); (void) zap_prefetch_uint64(brt->brt_mos, mos_entries, (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS); } -static int -brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) -{ - int error; - - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - ASSERT(bre->bre_refcount > 0); - - error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries, - (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1, - sizeof (bre->bre_refcount), &bre->bre_refcount, tx); - BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu " - "error=%d", (u_longlong_t)brtvd->bv_mos_entries, - (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, - (u_longlong_t)bre->bre_refcount, error); - - return (error); -} - -static int -brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) -{ - int error; - - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - ASSERT0(bre->bre_refcount); - - error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries, - (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx); - BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu " - "error=%d", (u_longlong_t)brtvd->bv_mos_entries, - (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, - (u_longlong_t)bre->bre_refcount, error); - - return (error); -} - /* * Return TRUE if we _can_ have BRT entry for this bp. 
It might be false * positive, but gives us quick answer if we should look into BRT, which @@ -1559,24 +1517,16 @@ brt_pending_apply(spa_t *spa, uint64_t txg) } static void -brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) +brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) { - - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - if (bre->bre_refcount == 0) { - int error; - - error = brt_entry_remove(brt, brtvd, bre, tx); - ASSERT(error == 0 || error == ENOENT); - /* - * If error == ENOENT then zfs_clone_range() was done from a - * removed (but opened) file (open(), unlink()). - */ - ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT); + int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset, + BRT_KEY_WORDS, tx); + VERIFY(error == 0 || error == ENOENT); } else { - VERIFY0(brt_entry_update(brt, brtvd, bre, tx)); + VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset, + BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), + &bre->bre_refcount, tx)); } } @@ -1585,6 +1535,7 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx) { brt_vdev_t *brtvd; brt_entry_t *bre; + dnode_t *dn; uint64_t vdevid; void *c; @@ -1608,14 +1559,19 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx) if (brtvd->bv_mos_brtvdev == 0) brt_vdev_create(brt, brtvd, tx); + VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries, + FTAG, &dn)); + c = NULL; while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { - brt_sync_entry(brt, brtvd, bre, tx); + brt_sync_entry(dn, bre, tx); brt_entry_free(bre); ASSERT(brt->brt_nentries > 0); brt->brt_nentries--; } + dnode_rele(dn, FTAG); + brt_vdev_sync(brt, brtvd, tx); if (brtvd->bv_totalcount == 0) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 324bf8cbc276..6798fc2d5bdc 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -4174,21 +4174,6 @@ dmu_buf_get_objset(dmu_buf_t *db) return (dbi->db_objset); } -dnode_t * -dmu_buf_dnode_enter(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_ENTER(dbi); - return (DB_DNODE(dbi)); -} - -void -dmu_buf_dnode_exit(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_EXIT(dbi); -} - static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index d82211e6d4c7..8986f55e792a 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -712,8 +712,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; - int64_t level2 = level; - uint64_t start, end, start2, end2; if (dmu_prefetch_max == 0 || len == 0) { dmu_prefetch_dnode(os, object, pri); @@ -723,6 +721,18 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, if (dnode_hold(os, object, FTAG, &dn) != 0) return; + dmu_prefetch_by_dnode(dn, level, offset, len, pri); + + dnode_rele(dn, FTAG); +} + +void +dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, + uint64_t len, zio_priority_t pri) +{ + int64_t level2 = level; + uint64_t start, end, start2, end2; + /* * Depending on len we may do two prefetches: blocks [start, end) at * level, and following blocks [start2, end2) at higher level2. 
@@ -762,8 +772,6 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, for (uint64_t i = start2; i < end2; i++) dbuf_prefetch(dn, level2, i, pri, 0); rw_exit(&dn->dn_struct_rwlock); - - dnode_rele(dn, FTAG); } /* @@ -2563,6 +2571,8 @@ EXPORT_SYMBOL(dmu_bonus_hold_by_dnode); EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); EXPORT_SYMBOL(dmu_buf_rele_array); EXPORT_SYMBOL(dmu_prefetch); +EXPORT_SYMBOL(dmu_prefetch_by_dnode); +EXPORT_SYMBOL(dmu_prefetch_dnode); EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 2cf10909738b..9f1c25f866f7 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -2353,7 +2353,6 @@ receive_process_write_record(struct receive_writer_arg *rwa, if (rwa->heal) { blkptr_t *bp; dmu_buf_t *dbp; - dnode_t *dn; int flags = DB_RF_CANFAIL; if (rwa->raw) @@ -2385,19 +2384,15 @@ receive_process_write_record(struct receive_writer_arg *rwa, dmu_buf_rele(dbp, FTAG); return (err); } - dn = dmu_buf_dnode_enter(dbp); /* Make sure the on-disk block and recv record sizes match */ - if (drrw->drr_logical_size != - dn->dn_datablkszsec << SPA_MINBLOCKSHIFT) { + if (drrw->drr_logical_size != dbp->db_size) { err = ENOTSUP; - dmu_buf_dnode_exit(dbp); dmu_buf_rele(dbp, FTAG); return (err); } /* Get the block pointer for the corrupted block */ bp = dmu_buf_get_blkptr(dbp); err = do_corrective_recv(rwa, drrw, rrd, bp); - dmu_buf_dnode_exit(dbp); dmu_buf_rele(dbp, FTAG); return (err); } diff --git a/module/zfs/zap.c b/module/zfs/zap.c index dde05d7005c2..da86defb445c 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -133,7 +133,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) * set up block 1 - the first leaf */ dmu_buf_t *db; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, 1<zt_numblks * 2); tbl->zt_nextblk = newblk; ASSERT0(tbl->zt_blks_copied); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, tbl->zt_blk << bs, tbl->zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); } @@ -193,21 +193,21 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, uint64_t b = tbl->zt_blks_copied; dmu_buf_t *db_old; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); if (err != 0) return (err); /* first half of entries in old[b] go to new[2*b+0] */ dmu_buf_t *db_new; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, @@ -255,7 +255,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, uint64_t off = idx & ((1<<(bs-3))-1); dmu_buf_t *db; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); @@ -267,7 +267,7 @@ zap_table_store(zap_t *zap, 
zap_table_phys_t *tbl, uint64_t idx, uint64_t val, uint64_t off2 = idx2 & ((1<<(bs-3))-1); dmu_buf_t *db2; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, DMU_READ_NO_PREFETCH); if (err != 0) { @@ -296,16 +296,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) uint64_t blk = idx >> (bs-3); uint64_t off = idx & ((1<<(bs-3))-1); - /* - * Note: this is equivalent to dmu_buf_hold(), but we use - * _dnode_enter / _by_dnode because it's faster because we don't - * have to hold the dnode. - */ - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); dmu_buf_t *db; - int err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); *valp = ((uint64_t *)db->db_data)[off]; @@ -319,11 +312,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) */ blk = (idx*2) >> (bs-3); - dn = dmu_buf_dnode_enter(zap->zap_dbuf); - err = dmu_buf_hold_by_dnode(dn, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err == 0) dmu_buf_rele(db, FTAG); } @@ -368,7 +359,7 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) uint64_t newblk = zap_allocate_blocks(zap, 1); dmu_buf_t *db_new; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, DMU_READ_NO_PREFETCH); if (err != 0) @@ -433,7 +424,7 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx) l->l_blkid = zap_allocate_blocks(zap, 1); l->l_dbuf = NULL; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, DMU_READ_NO_PREFETCH)); dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); @@ -533,10 +524,8 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, return (SET_ERROR(ENOENT)); int bs = FZAP_BLOCK_SHIFT(zap); - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); - int err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); @@ -985,7 +974,7 @@ fzap_prefetch(zap_name_t *zn) if (zap_idx_to_blk(zap, idx, &blk) != 0) return; int bs = FZAP_BLOCK_SHIFT(zap); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs, ZIO_PRIORITY_SYNC_READ); } @@ -1228,7 +1217,7 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) */ if (zc->zc_hash == 0 && zap_iterate_prefetch && zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { - dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0, zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), ZIO_PRIORITY_ASYNC_READ); } @@ -1356,7 +1345,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); } else { - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); @@ -1366,7 +1355,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t 
*zs) dmu_buf_t *db; int err; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 085d9cd8b4b6..d806988af96d 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -415,7 +415,7 @@ mze_destroy(zap_t *zap) } static zap_t * -mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) +mzap_open(dmu_buf_t *db) { zap_t *winner; uint64_t *zap_hdr = (uint64_t *)db->db_data; @@ -427,8 +427,8 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL); rw_enter(&zap->zap_rwlock, RW_WRITER); - zap->zap_objset = os; - zap->zap_object = obj; + zap->zap_objset = dmu_buf_get_objset(db); + zap->zap_object = db->db_object; zap->zap_dbuf = db; if (zap_block_type != ZBT_MICRO) { @@ -518,7 +518,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) * have the specified tag. */ static int -zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, +zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { ASSERT0(db->db_offset); @@ -528,13 +528,13 @@ zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, *zapp = NULL; - dmu_object_info_from_db(db, &doi); + dmu_object_info_from_dnode(dn, &doi); if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) return (SET_ERROR(EINVAL)); zap_t *zap = dmu_buf_get_user(db); if (zap == NULL) { - zap = mzap_open(os, obj, db); + zap = mzap_open(db); if (zap == NULL) { /* * mzap_open() didn't like what it saw on-disk. @@ -563,6 +563,7 @@ zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, } zap->zap_objset = os; + zap->zap_dnode = dn; if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); @@ -598,23 +599,16 @@ zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, zap_t **zapp) { dmu_buf_t *db; + int err; - int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); - if (err != 0) { + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) return (err); - } -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); - } -#endif - - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) { + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) dmu_buf_rele(db, tag); - } + else + VERIFY(dnode_add_ref(dn, tag)); return (err); } @@ -623,21 +617,23 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, zap_t **zapp) { + dnode_t *dn; dmu_buf_t *db; + int err; - int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); + err = dnode_hold(os, obj, tag, &dn); if (err != 0) return (err); -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + dnode_rele(dn, tag); + return (err); } -#endif - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) { dmu_buf_rele(db, tag); + dnode_rele(dn, tag); + } return (err); } @@ -645,6 +641,7 @@ 
void zap_unlockdir(zap_t *zap, const void *tag) { rw_exit(&zap->zap_rwlock); + dnode_rele(zap->zap_dnode, tag); dmu_buf_rele(zap->zap_dbuf, tag); } @@ -730,7 +727,8 @@ mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx) if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. */ - VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, + VERIFY(dnode_add_ref(dn, FTAG)); + VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER, B_FALSE, B_FALSE, &zap)); VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); zap_unlockdir(zap, FTAG); @@ -1325,6 +1323,26 @@ zap_add_by_dnode(dnode_t *dn, const char *key, return (err); } +static int +zap_add_uint64_impl(zap_t *zap, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_add(zn, integer_size, num_integers, val, tag, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap, tag); + return (err); +} + int zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, @@ -1336,16 +1354,26 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap, FTAG); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1396,27 +1424,56 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, return (err); } -int -zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +static int +zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, + const void *tag) { - zap_t *zap; + int err; - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap, FTAG); + zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } - err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); + err = fzap_update(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if 
fzap_upgrade() failed */ - zap_unlockdir(zap, FTAG); + zap_unlockdir(zap, tag); + return (err); +} + +int +zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, const void *val, + dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_update_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_update_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1481,6 +1538,23 @@ zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) return (err); } +static int +zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_remove(zn, tx); + zap_name_free(zn); + zap_unlockdir(zap, tag); + return (err); +} + int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx) @@ -1491,14 +1565,23 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_remove(zn, tx); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1704,14 +1787,17 @@ EXPORT_SYMBOL(zap_prefetch_uint64); EXPORT_SYMBOL(zap_add); EXPORT_SYMBOL(zap_add_by_dnode); EXPORT_SYMBOL(zap_add_uint64); +EXPORT_SYMBOL(zap_add_uint64_by_dnode); EXPORT_SYMBOL(zap_update); EXPORT_SYMBOL(zap_update_uint64); +EXPORT_SYMBOL(zap_update_uint64_by_dnode); EXPORT_SYMBOL(zap_length); EXPORT_SYMBOL(zap_length_uint64); EXPORT_SYMBOL(zap_remove); EXPORT_SYMBOL(zap_remove_by_dnode); EXPORT_SYMBOL(zap_remove_norm); EXPORT_SYMBOL(zap_remove_uint64); +EXPORT_SYMBOL(zap_remove_uint64_by_dnode); EXPORT_SYMBOL(zap_count); EXPORT_SYMBOL(zap_value_search); EXPORT_SYMBOL(zap_join); From 4616b96a643c941e96ee0d1d816c573df9f0de28 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 25 Mar 2024 17:59:55 -0400 Subject: [PATCH 12/34] BRT: Relax brt_pending_apply() locking Since brt_pending_apply() is running in syncing context, no other brt_pending_tree accesses are possible for the TXG. We don't need to acquire brt_pending_lock here. 
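To sketch the pattern being relied on here (illustrative only, not the actual BRT code; example_state_t, example_entry_t and example_apply() are made-up names): a per-TXG tree that is only drained from syncing context can be emptied with avl_destroy_nodes() without taking a lock, because open-context producers only ever insert into the tree of the currently open TXG.

    typedef struct example_entry {
            avl_node_t ee_node;
            /* ... payload ... */
    } example_entry_t;

    typedef struct example_state {
            avl_tree_t es_pending_tree[TXG_SIZE];   /* indexed by txg & TXG_MASK */
    } example_state_t;

    extern void example_apply(example_entry_t *);

    static void
    example_pending_apply(example_state_t *s, uint64_t txg)
    {
            /* Syncing context: no concurrent access to this TXG's tree. */
            avl_tree_t *tree = &s->es_pending_tree[txg & TXG_MASK];
            void *cookie = NULL;
            example_entry_t *e;

            while ((e = avl_destroy_nodes(tree, &cookie)) != NULL) {
                    example_apply(e);       /* may block; no lock is held */
                    kmem_free(e, sizeof (*e));
            }
    }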
Reviewed-by: Pawel Jakub Dawidek Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson
Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15955
--- module/zfs/brt.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 5e10df9dfe56..416caeb11c7e 100644
--- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -1473,26 +1473,23 @@ brt_pending_remove(spa_t *spa,
const blkptr_t *bp, dmu_tx_t *tx) void brt_pending_apply(spa_t *spa, uint64_t txg) { - brt_t *brt;
+ brt_t *brt = spa->spa_brt; brt_pending_entry_t *bpe; avl_tree_t *pending_tree;
- kmutex_t *pending_lock; void *c; ASSERT3U(txg, !=, 0); - brt = spa->spa_brt; + /*
+ * We are in syncing context, so no other brt_pending_tree accesses + * are possible for the TXG.
Don't need to acquire brt_pending_lock. + */ pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
- pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; - - mutex_enter(pending_lock); c = NULL;
while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) { boolean_t added_to_ddt;
- mutex_exit(pending_lock); - for (int i = 0; i < bpe->bpe_count; i++) { /*
* If the block has DEDUP bit set, it means that it @@ -1510,10 +1507,7 @@ brt_pending_apply(spa_t *spa,
uint64_t txg) } kmem_cache_free(brt_pending_entry_cache, bpe); - mutex_enter(pending_lock); } -
- mutex_exit(pending_lock); } static void

From 493fcce9be165bd751434879d2478938cd5bb926 Mon Sep 17 00:00:00 2001 From: George Wilson
Date: Mon, 25 Mar 2024 18:01:54 -0400 Subject: [PATCH 13/34] Provide macros for setting and getting
blkptr birth times

There exist a couple of macros that are used to update the blkptr birth times but they can often be
confusing. For example, the BP_PHYSICAL_BIRTH() macro will provide either the physical birth time if
it is set or else return back the logical birth time. The complement to this macro is BP_SET_BIRTH()
which will set the logical birth time and set the physical birth time if they are not the same.
Consumers may get confused when they are trying to get the physical birth time and use the
BP_PHYSICAL_BIRTH() macro only to find out that the logical birth time is what is actually returned.
This change cleans up these macros and makes them symmetrical. The same functionality is preserved
but the name is changed. Instead of calling BP_PHYSICAL_BIRTH(), consumers can now call
BP_GET_BIRTH(). In addition to cleaning up the naming conventions, two new sets of macros are
introduced -- BP_[SET|GET]_LOGICAL_BIRTH() and BP_[SET|GET]_PHYSICAL_BIRTH(). These new macros allow
the consumer to get and set the specific birth time. As part of the cleanup, the unused GRID macros
have been removed and that portion of the blkptr is currently unused.
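For illustration, typical use of the new macros looks like this (a hedged sketch; bp, logical_txg
and physical_txg are hypothetical variables, not code from this change):

    blkptr_t *bp;                           /* a valid, non-embedded block pointer */
    uint64_t logical_txg, physical_txg;

    /* Store both birth times; the physical word is set only if it differs. */
    BP_SET_BIRTH(bp, logical_txg, physical_txg);

    /* Read back exactly what each word holds. */
    uint64_t lbirth = BP_GET_LOGICAL_BIRTH(bp);
    uint64_t pbirth = BP_GET_PHYSICAL_BIRTH(bp);    /* 0 when equal to logical */

    /*
     * BP_GET_BIRTH() keeps the old BP_PHYSICAL_BIRTH() semantics: it
     * returns the physical birth when one is recorded, otherwise the
     * logical birth, and 0 for embedded block pointers.
     */
    uint64_t birth = BP_GET_BIRTH(bp);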
Reviewed-by: Matthew Ahrens Reviewed-by: Alexander Motin Reviewed-by: Mark Maybee Signed-off-by: George Wilson Closes #15962 --- cmd/zdb/zdb.c | 19 +++++------ cmd/zdb/zdb_il.c | 12 +++---- cmd/zhack.c | 4 +-- include/sys/spa.h | 59 ++++++++++++++++++----------------- include/sys/uberblock_impl.h | 2 +- lib/libzdb/libzdb.c | 4 +-- module/zfs/arc.c | 16 +++++----- module/zfs/bpobj.c | 7 +++-- module/zfs/brt.c | 3 +- module/zfs/dbuf.c | 22 +++++++------ module/zfs/ddt.c | 4 +-- module/zfs/dmu.c | 19 +++++------ module/zfs/dmu_recv.c | 11 ++++--- module/zfs/dmu_send.c | 6 ++-- module/zfs/dmu_traverse.c | 11 ++++--- module/zfs/dnode.c | 4 +-- module/zfs/dsl_bookmark.c | 3 +- module/zfs/dsl_dataset.c | 21 +++++++------ module/zfs/dsl_deadlist.c | 7 ++--- module/zfs/dsl_destroy.c | 13 +++++--- module/zfs/dsl_pool.c | 2 +- module/zfs/dsl_scan.c | 31 +++++++++--------- module/zfs/metaslab.c | 11 ++++--- module/zfs/spa.c | 9 +++--- module/zfs/spa_errlog.c | 26 +++++---------- module/zfs/spa_log_spacemap.c | 2 +- module/zfs/uberblock.c | 2 +- module/zfs/vdev_mirror.c | 2 +- module/zfs/vdev_raidz.c | 13 ++++---- module/zfs/zil.c | 14 ++++----- module/zfs/zio.c | 45 +++++++++++++------------- module/zfs/zio_checksum.c | 2 +- 32 files changed, 209 insertions(+), 197 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 4880c8048726..449b6bf2ccb3 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -199,7 +199,8 @@ sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free, break; sublivelist_verify_block_t svb = { .svb_dva = bp->blk_dva[i], - .svb_allocated_txg = bp->blk_birth + .svb_allocated_txg = + BP_GET_LOGICAL_BIRTH(bp) }; if (zfs_btree_find(&sv->sv_leftover, &svb, @@ -2340,7 +2341,7 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, (int)BPE_GET_ETYPE(bp), (u_longlong_t)BPE_GET_LSIZE(bp), (u_longlong_t)BPE_GET_PSIZE(bp), - (u_longlong_t)bp->blk_birth); + (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); return; } @@ -2358,7 +2359,7 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, buflen - strlen(blkbuf), "%llxL B=%llu", (u_longlong_t)BP_GET_LSIZE(bp), - (u_longlong_t)bp->blk_birth); + (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); } else { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), @@ -2366,8 +2367,8 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_PSIZE(bp), (u_longlong_t)BP_GET_FILL(bp), - (u_longlong_t)bp->blk_birth, - (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); + (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp), + (u_longlong_t)BP_GET_BIRTH(bp)); if (bp_freed) (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " %s", "FREE"); @@ -2417,7 +2418,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, { int err = 0; - if (bp->blk_birth == 0) + if (BP_GET_LOGICAL_BIRTH(bp) == 0) return (0); print_indirect(spa, bp, zb, dnp); @@ -2605,7 +2606,7 @@ dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) (void) arg, (void) tx; char blkbuf[BP_SPRINTF_LEN]; - if (bp->blk_birth != 0) { + if (BP_GET_LOGICAL_BIRTH(bp) != 0) { snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); } @@ -2646,7 +2647,7 @@ dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) (void) arg, (void) tx; char blkbuf[BP_SPRINTF_LEN]; - ASSERT(bp->blk_birth != 0); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) != 0); snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed); (void) printf("\t%s\n", blkbuf); return (0); @@ -5788,7 
+5789,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (zb->zb_level == ZB_DNODE_LEVEL) return (0); - if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { + if (dump_opt['b'] >= 5 && BP_GET_LOGICAL_BIRTH(bp) > 0) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("objset %llu object %llu " diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index 63d95ddedc3b..e3caaeb70e14 100644 --- a/cmd/zdb/zdb_il.c +++ b/cmd/zdb/zdb_il.c @@ -173,8 +173,8 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg) if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { (void) printf("%shas blkptr, %s\n", tab_prefix, - !BP_IS_HOLE(bp) && - bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ? + !BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >= + spa_min_claim_txg(zilog->zl_spa) ? "will claim" : "won't claim"); print_log_bp(bp, tab_prefix); @@ -186,7 +186,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg) (void) printf("%s\n", tab_prefix); return; } - if (bp->blk_birth < zilog->zl_header->zh_claim_txg) { + if (BP_GET_LOGICAL_BIRTH(bp) < zilog->zl_header->zh_claim_txg) { (void) printf("%s\n", tab_prefix); return; @@ -237,8 +237,8 @@ zil_prt_rec_write_enc(zilog_t *zilog, int txtype, const void *arg) if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { (void) printf("%shas blkptr, %s\n", tab_prefix, - !BP_IS_HOLE(bp) && - bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ? + !BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >= + spa_min_claim_txg(zilog->zl_spa) ? "will claim" : "won't claim"); print_log_bp(bp, tab_prefix); } @@ -473,7 +473,7 @@ print_log_block(zilog_t *zilog, const blkptr_t *bp, void *arg, if (claim_txg != 0) claim = "already claimed"; - else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa)) + else if (BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(zilog->zl_spa)) claim = "will claim"; else claim = "won't claim"; diff --git a/cmd/zhack.c b/cmd/zhack.c index 44611887dd25..f15a6ece538c 100644 --- a/cmd/zhack.c +++ b/cmd/zhack.c @@ -612,8 +612,8 @@ zhack_repair_undetach(uberblock_t *ub, nvlist_t *cfg, const int l) * Uberblock root block pointer has valid birth TXG. 
* Copying it to the label NVlist */ - if (ub->ub_rootbp.blk_birth != 0) { - const uint64_t txg = ub->ub_rootbp.blk_birth; + if (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) != 0) { + const uint64_t txg = BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp); ub->ub_txg = txg; if (nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG) != 0) { diff --git a/include/sys/spa.h b/include/sys/spa.h index cada3c841037..fb4c93431a31 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -125,15 +125,15 @@ typedef struct zio_cksum_salt { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | pad | vdev1 | GRID | ASIZE | + * 0 | pad | vdev1 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 2 | pad | vdev2 | GRID | ASIZE | + * 2 | pad | vdev2 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 4 | pad | vdev3 | GRID | ASIZE | + * 4 | pad | vdev3 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -165,7 +165,6 @@ typedef struct zio_cksum_salt { * LSIZE logical size * PSIZE physical size (after compression) * ASIZE allocated size (including RAID-Z parity and gang block headers) - * GRID RAID-Z layout information (reserved for future use) * cksum checksum function * comp compression function * G gang block indicator @@ -190,11 +189,11 @@ typedef struct zio_cksum_salt { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | vdev1 | GRID | ASIZE | + * 0 | vdev1 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 2 | vdev2 | GRID | ASIZE | + * 2 | vdev2 | pad | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -355,7 +354,7 @@ typedef enum bp_embedded_type { #define BPE_NUM_WORDS 14 #define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) #define BPE_IS_PAYLOADWORD(bp, wp) \ - ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) + ((wp) != &(bp)->blk_prop && (wp) != (&(bp)->blk_birth_word[1])) #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ @@ -374,8 +373,7 @@ typedef struct blkptr { dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ uint64_t blk_prop; /* size, compression, type, etc */ uint64_t blk_pad[2]; /* Extra space for the future */ - uint64_t blk_phys_birth; /* txg when block was allocated */ - uint64_t blk_birth; /* transaction group at birth */ + uint64_t blk_birth_word[2]; uint64_t blk_fill; /* fill count */ zio_cksum_t blk_cksum; /* 256-bit checksum */ } blkptr_t; @@ -395,9 +393,6 @@ typedef struct blkptr { BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ SPA_MINBLOCKSHIFT, 0, x) -#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) -#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) - #define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) #define DVA_SET_VDEV(dva, x) \ BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) @@ -480,15 +475,23 @@ typedef struct blkptr { #define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 
0, 1) #define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x) -#define BP_PHYSICAL_BIRTH(bp) \ - (BP_IS_EMBEDDED(bp) ? 0 : \ - (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) +#define BP_GET_LOGICAL_BIRTH(bp) (bp)->blk_birth_word[1] +#define BP_SET_LOGICAL_BIRTH(bp, x) ((bp)->blk_birth_word[1] = (x)) + +#define BP_GET_PHYSICAL_BIRTH(bp) (bp)->blk_birth_word[0] +#define BP_SET_PHYSICAL_BIRTH(bp, x) ((bp)->blk_birth_word[0] = (x)) + +#define BP_GET_BIRTH(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + BP_GET_PHYSICAL_BIRTH(bp) ? BP_GET_PHYSICAL_BIRTH(bp) : \ + BP_GET_LOGICAL_BIRTH(bp)) #define BP_SET_BIRTH(bp, logical, physical) \ { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ - (bp)->blk_birth = (logical); \ - (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ + BP_SET_LOGICAL_BIRTH(bp, logical); \ + BP_SET_PHYSICAL_BIRTH(bp, \ + ((logical) == (physical) ? 0 : (physical))); \ } #define BP_GET_FILL(bp) \ @@ -541,8 +544,8 @@ typedef struct blkptr { (dva1)->dva_word[0] == (dva2)->dva_word[0]) #define BP_EQUAL(bp1, bp2) \ - (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ - (bp1)->blk_birth == (bp2)->blk_birth && \ + (BP_GET_BIRTH(bp1) == BP_GET_BIRTH(bp2) && \ + BP_GET_LOGICAL_BIRTH(bp1) == BP_GET_LOGICAL_BIRTH(bp2) && \ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) @@ -581,8 +584,8 @@ typedef struct blkptr { (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ - (bp)->blk_phys_birth = 0; \ - (bp)->blk_birth = 0; \ + (bp)->blk_birth_word[0] = 0; \ + (bp)->blk_birth_word[1] = 0; \ (bp)->blk_fill = 0; \ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } @@ -631,7 +634,7 @@ typedef struct blkptr { (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ - (u_longlong_t)bp->blk_birth); \ + (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \ } else if (BP_IS_EMBEDDED(bp)) { \ len = func(buf + len, size - len, \ "EMBEDDED [L%llu %s] et=%u %s " \ @@ -642,14 +645,14 @@ typedef struct blkptr { compress, \ (u_longlong_t)BPE_GET_LSIZE(bp), \ (u_longlong_t)BPE_GET_PSIZE(bp), \ - (u_longlong_t)bp->blk_birth); \ + (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \ } else if (BP_IS_REDACTED(bp)) { \ len += func(buf + len, size - len, \ "REDACTED [L%llu %s] size=%llxL birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (u_longlong_t)BP_GET_LSIZE(bp), \ - (u_longlong_t)bp->blk_birth); \ + (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \ } else { \ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ const dva_t *dva = &bp->blk_dva[d]; \ @@ -691,8 +694,8 @@ typedef struct blkptr { ws, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)BP_GET_PSIZE(bp), \ - (u_longlong_t)bp->blk_birth, \ - (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ + (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp), \ + (u_longlong_t)BP_GET_BIRTH(bp), \ (u_longlong_t)BP_GET_FILL(bp), \ ws, \ (u_longlong_t)bp->blk_cksum.zc_word[0], \ @@ -1142,9 +1145,9 @@ extern const char *spa_state_to_name(spa_t *spa); /* error handling */ struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, - const uint64_t *birth); + const uint64_t birth); extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, - const uint64_t *birth); + uint64_t birth); extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t state); extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd, diff --git 
a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h index d3a71cc8f84b..1736b32cd3c6 100644 --- a/include/sys/uberblock_impl.h +++ b/include/sys/uberblock_impl.h @@ -165,7 +165,7 @@ struct uberblock { * pool from a checkpointed uberblock [see spa_ld_select_uberblock()], * the value of the field is used to determine which ZIL blocks have * been allocated according to the ms_sm when we are rewinding to a - * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then + * checkpoint. Specifically, if logical birth > ub_checkpoint_txg,then * the ZIL block is not allocated [see uses of spa_min_claim_txg()]. */ uint64_t ub_checkpoint_txg; diff --git a/lib/libzdb/libzdb.c b/lib/libzdb/libzdb.c index 9989fa1eb80f..12144dc65e75 100644 --- a/lib/libzdb/libzdb.c +++ b/lib/libzdb/libzdb.c @@ -93,9 +93,9 @@ livelist_compare(const void *larg, const void *rarg) * Since we're storing blkptrs without cancelling FREE/ALLOC pairs, * it's possible the offsets are equal. In that case, sort by txg */ - if (l->blk_birth < r->blk_birth) { + if (BP_GET_LOGICAL_BIRTH(l) < BP_GET_LOGICAL_BIRTH(r)) { return (-1); - } else if (l->blk_birth > r->blk_birth) { + } else if (BP_GET_LOGICAL_BIRTH(l) > BP_GET_LOGICAL_BIRTH(r)) { return (+1); } return (0); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 3bcffb3c7ede..b1bcac6c44bc 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -1014,7 +1014,7 @@ static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); - uint64_t birth = BP_PHYSICAL_BIRTH(bp); + uint64_t birth = BP_GET_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; @@ -2183,7 +2183,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, * (and generate an ereport) before leaving the ARC. */ ret = SET_ERROR(EIO); - spa_log_error(spa, zb, &buf->b_hdr->b_birth); + spa_log_error(spa, zb, buf->b_hdr->b_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } @@ -5251,7 +5251,7 @@ arc_read_done(zio_t *zio) if (HDR_IN_HASH_TABLE(hdr)) { arc_buf_hdr_t *found; - ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); + ASSERT3U(hdr->b_birth, ==, BP_GET_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, @@ -5354,7 +5354,7 @@ arc_read_done(zio_t *zio) error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(zio->io_spa, &acb->acb_zb, - &zio->io_bp->blk_birth); + BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, zio->io_spa, NULL, &acb->acb_zb, zio, 0); @@ -5639,7 +5639,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, */ rc = SET_ERROR(EIO); if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { - spa_log_error(spa, zb, &hdr->b_birth); + spa_log_error(spa, zb, hdr->b_birth); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); @@ -5686,12 +5686,12 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, * embedded data. 
*/ arc_buf_hdr_t *exists = NULL; - hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, + hdr = arc_hdr_alloc(guid, psize, lsize, BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(bp); + hdr->b_birth = BP_GET_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { @@ -6557,7 +6557,7 @@ arc_write_done(zio_t *zio) buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); + hdr->b_birth = BP_GET_BIRTH(zio->io_bp); } } else { ASSERT(HDR_EMPTY(hdr)); diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index e772caead29b..96e1601c4e9c 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -893,7 +893,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, */ memset(&stored_bp, 0, sizeof (stored_bp)); stored_bp.blk_prop = bp->blk_prop; - stored_bp.blk_birth = bp->blk_birth; + BP_SET_LOGICAL_BIRTH(&stored_bp, BP_GET_LOGICAL_BIRTH(bp)); } else if (!BP_GET_DEDUP(bp)) { /* The bpobj will compress better without the checksum */ memset(&stored_bp.blk_cksum, 0, sizeof (stored_bp.blk_cksum)); @@ -953,7 +953,8 @@ space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) (void) bp_freed, (void) tx; struct space_range_arg *sra = arg; - if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { + if (BP_GET_LOGICAL_BIRTH(bp) > sra->mintxg && + BP_GET_LOGICAL_BIRTH(bp) <= sra->maxtxg) { if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) sra->used += bp_get_dsize_sync(sra->spa, bp); else @@ -985,7 +986,7 @@ bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) /* * Return the amount of space in the bpobj which is: - * mintxg < blk_birth <= maxtxg + * mintxg < logical birth <= maxtxg */ int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 416caeb11c7e..0b5a09df3724 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -1384,8 +1384,7 @@ brt_pending_entry_compare(const void *x1, const void *x2) cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), DVA_GET_OFFSET(&bp2->blk_dva[0])); if (unlikely(cmp == 0)) { - cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), - BP_PHYSICAL_BIRTH(bp2)); + cmp = TREE_CMP(BP_GET_BIRTH(bp1), BP_GET_BIRTH(bp2)); } } diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 6798fc2d5bdc..4e190c131e1d 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1217,7 +1217,7 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT0(bp->blk_pad[1]); ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(BP_IS_HOLE(bp)); - ASSERT0(bp->blk_phys_birth); + ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); } } } @@ -1457,7 +1457,7 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp) dn->dn_datablksz : BP_GET_LSIZE(dbbp)); BP_SET_TYPE(bp, BP_GET_TYPE(dbbp)); BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1); - BP_SET_BIRTH(bp, dbbp->blk_birth, 0); + BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0); } } @@ -1486,7 +1486,7 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp) memset(db->db.db_data, 0, db->db.db_size); if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) && - bp->blk_birth != 0) { + BP_GET_LOGICAL_BIRTH(bp) != 0) { dbuf_handle_indirect_hole(db, dn, bp); } db->db_state = DB_CACHED; @@ -1633,7 +1633,8 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, * If this is not true it indicates tampering and we report an error. 
*/ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) { - spa_log_error(db->db_objset->os_spa, &zb, &bpp->blk_birth); + spa_log_error(db->db_objset->os_spa, &zb, + BP_GET_LOGICAL_BIRTH(bpp)); err = SET_ERROR(EIO); goto early_unlock; } @@ -2832,7 +2833,7 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx) dl = &dr->dt.dl; dl->dr_overridden_by = *bp; dl->dr_override_state = DR_OVERRIDDEN; - dl->dr_overridden_by.blk_birth = dr->dr_txg; + BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg); } boolean_t @@ -2909,7 +2910,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); dl->dr_override_state = DR_OVERRIDDEN; - dl->dr_overridden_by.blk_birth = dr->dr_txg; + BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg); } void @@ -4712,7 +4713,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; - if (bp->blk_birth != 0) { + if (BP_GET_LOGICAL_BIRTH(bp) != 0) { ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && @@ -4999,7 +5000,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); drica.drica_os = dn->dn_objset; - drica.drica_blk_birth = bp->blk_birth; + drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp); drica.drica_tx = tx; if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, &drica)) { @@ -5014,7 +5015,8 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) if (dn->dn_objset != spa_meta_objset(spa)) { dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset); if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && - bp->blk_birth > ds->ds_dir->dd_origin_txg) { + BP_GET_LOGICAL_BIRTH(bp) > + ds->ds_dir->dd_origin_txg) { ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, @@ -5136,7 +5138,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) } ASSERT(db->db_level == 0 || data == db->db_buf); - ASSERT3U(db->db_blkptr->blk_birth, <=, txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg); ASSERT(pio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index de8640e58a2c..4c53cb0a2f9b 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -437,7 +437,7 @@ ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) for (int d = 0; d < SPA_DVAS_PER_BP; d++) ddp->ddp_dva[d] = bp->blk_dva[d]; - ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); + ddp->ddp_phys_birth = BP_GET_BIRTH(bp); } void @@ -485,7 +485,7 @@ ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && - BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) + BP_GET_BIRTH(bp) == ddp->ddp_phys_birth) return (ddp); } return (NULL); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 8986f55e792a..b88cf447d296 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1627,7 +1627,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) * it's an old style hole. 
*/ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && - dr->dt.dl.dr_overridden_by.blk_birth == 0) + BP_GET_LOGICAL_BIRTH(&dr->dt.dl.dr_overridden_by) == 0) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; @@ -1658,7 +1658,7 @@ dmu_sync_late_arrival_done(zio_t *zio) blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); - ASSERT(zio->io_bp->blk_birth == zio->io_txg); + ASSERT(BP_GET_LOGICAL_BIRTH(zio->io_bp) == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } @@ -2285,11 +2285,11 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, * operation into ZIL, or it may be impossible to replay, since * the block may appear not yet allocated at that point. */ - if (BP_PHYSICAL_BIRTH(bp) > spa_freeze_txg(os->os_spa)) { + if (BP_GET_BIRTH(bp) > spa_freeze_txg(os->os_spa)) { error = SET_ERROR(EINVAL); goto out; } - if (BP_PHYSICAL_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) { + if (BP_GET_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) { error = SET_ERROR(EAGAIN); goto out; } @@ -2364,13 +2364,14 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dl->dr_brtwrite = B_TRUE; dl->dr_override_state = DR_OVERRIDDEN; if (BP_IS_HOLE(bp)) { - dl->dr_overridden_by.blk_birth = 0; - dl->dr_overridden_by.blk_phys_birth = 0; + BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, 0); + BP_SET_PHYSICAL_BIRTH(&dl->dr_overridden_by, 0); } else { - dl->dr_overridden_by.blk_birth = dr->dr_txg; + BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, + dr->dr_txg); if (!BP_IS_EMBEDDED(bp)) { - dl->dr_overridden_by.blk_phys_birth = - BP_PHYSICAL_BIRTH(bp); + BP_SET_PHYSICAL_BIRTH(&dl->dr_overridden_by, + BP_GET_BIRTH(bp)); } } diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 9f1c25f866f7..680aed4513bc 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -1352,8 +1352,10 @@ corrective_read_done(zio_t *zio) { cr_cb_data_t *data = zio->io_private; /* Corruption corrected; update error log if needed */ - if (zio->io_error == 0) - spa_remove_error(data->spa, &data->zb, &zio->io_bp->blk_birth); + if (zio->io_error == 0) { + spa_remove_error(data->spa, &data->zb, + BP_GET_LOGICAL_BIRTH(zio->io_bp)); + } kmem_free(data, sizeof (cr_cb_data_t)); abd_free(zio->io_abd); } @@ -1480,8 +1482,9 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw, } rrd->abd = abd; - io = zio_rewrite(NULL, rwa->os->os_spa, bp->blk_birth, bp, abd, - BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags, &zb); + io = zio_rewrite(NULL, rwa->os->os_spa, BP_GET_LOGICAL_BIRTH(bp), bp, + abd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags, + &zb); ASSERT(abd_get_size(abd) == BP_GET_LSIZE(bp) || abd_get_size(abd) == BP_GET_PSIZE(bp)); diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 37c68528bf95..b6cc2f0a5e91 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -619,7 +619,7 @@ dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, /* See comment in dump_dnode() for full details */ if (zfs_send_unmodified_spill_blocks && - (bp->blk_birth <= dscp->dsc_fromtxg)) { + (BP_GET_LOGICAL_BIRTH(bp) <= dscp->dsc_fromtxg)) { drrs->drr_flags |= DRR_SPILL_UNMODIFIED; } @@ -804,7 +804,7 @@ dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, */ if (zfs_send_unmodified_spill_blocks && 
(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && - (DN_SPILL_BLKPTR(dnp)->blk_birth <= dscp->dsc_fromtxg)) { + (BP_GET_LOGICAL_BIRTH(DN_SPILL_BLKPTR(dnp)) <= dscp->dsc_fromtxg)) { struct send_range record; blkptr_t *bp = DN_SPILL_BLKPTR(dnp); @@ -1123,7 +1123,7 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ if (sta->os->os_encrypted && !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) { - spa_log_error(spa, zb, &bp->blk_birth); + spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); return (SET_ERROR(EIO)); } diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 809f7f6165f9..15cc2885e805 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -83,7 +83,8 @@ traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, if (BP_IS_HOLE(bp)) return (0); - if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa)) + if (claim_txg == 0 && + BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(td->td_spa)) return (-1); SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, @@ -108,7 +109,7 @@ traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, if (BP_IS_HOLE(bp)) return (0); - if (claim_txg == 0 || bp->blk_birth < claim_txg) + if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg) return (0); ASSERT3U(BP_GET_LSIZE(bp), !=, 0); @@ -192,7 +193,7 @@ traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp, */ if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE) return (B_FALSE); - if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) + if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) return (B_FALSE); if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) return (B_FALSE); @@ -235,7 +236,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, ASSERT(0); } - if (bp->blk_birth == 0) { + if (BP_GET_LOGICAL_BIRTH(bp) == 0) { /* * Since this block has a birth time of 0 it must be one of * two things: a hole created before the @@ -263,7 +264,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, zb->zb_object == DMU_META_DNODE_OBJECT) && td->td_hole_birth_enabled_txg <= td->td_min_txg) return (0); - } else if (bp->blk_birth <= td->td_min_txg) { + } else if (BP_GET_LOGICAL_BIRTH(bp) <= td->td_min_txg) { return (0); } diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index ba28aa06a91f..a703fd414f87 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -2557,7 +2557,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, } if (db != NULL && txg != 0 && (db->db_blkptr == NULL || - db->db_blkptr->blk_birth <= txg || + BP_GET_LOGICAL_BIRTH(db->db_blkptr) <= txg || BP_IS_HOLE(db->db_blkptr))) { /* * This can only happen when we are searching up the tree @@ -2605,7 +2605,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, i >= 0 && i < epb; i += inc) { if (BP_GET_FILL(&bp[i]) >= minfill && BP_GET_FILL(&bp[i]) <= maxfill && - (hole || bp[i].blk_birth > txg)) + (hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg)) break; if (inc > 0 || *offset > 0) *offset += inc; diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c index 4faefecbadbb..5fd8bc2a2682 100644 --- a/module/zfs/dsl_bookmark.c +++ b/module/zfs/dsl_bookmark.c @@ -1520,7 +1520,8 @@ dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) * If the block was live (referenced) at the time of this * bookmark, add its space to the bookmark's FBN. 
*/ - if (bp->blk_birth <= dbn->dbn_phys.zbm_creation_txg && + if (BP_GET_LOGICAL_BIRTH(bp) <= + dbn->dbn_phys.zbm_creation_txg && (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) { mutex_enter(&dbn->dbn_lock); dbn->dbn_phys.zbm_referenced_freed_before_next_snap += diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 62a1649d3786..b4de0e7ff073 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -156,7 +156,8 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) return; } - ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >, + dsl_dataset_phys(ds)->ds_prev_snap_txg); dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_lock); delta = parent_delta(ds, used); @@ -190,7 +191,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) * they do not need to be freed. */ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && - bp->blk_birth > ds->ds_dir->dd_origin_txg && + BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg && !(BP_IS_EMBEDDED(bp))) { ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, @@ -236,7 +237,7 @@ dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset, mutex_exit(&ds->ds_remap_deadlist_lock); BP_ZERO(&fakebp); - fakebp.blk_birth = birth; + BP_SET_LOGICAL_BIRTH(&fakebp, birth); DVA_SET_VDEV(dva, vdev); DVA_SET_OFFSET(dva, offset); DVA_SET_ASIZE(dva, size); @@ -259,7 +260,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, return (0); ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(bp->blk_birth <= tx->tx_txg); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) <= tx->tx_txg); if (ds == NULL) { dsl_free(tx->tx_pool, tx->tx_txg, bp); @@ -277,7 +278,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, * they do not need to be freed. 
*/ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && - bp->blk_birth > ds->ds_dir->dd_origin_txg && + BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg && !(BP_IS_EMBEDDED(bp))) { ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, @@ -285,7 +286,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, bplist_append(&ds->ds_dir->dd_pending_frees, bp); } - if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { + if (BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds)->ds_prev_snap_txg) { int64_t delta; dprintf_bp(bp, "freeing ds=%llu", (u_longlong_t)ds->ds_object); @@ -317,16 +318,16 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, ASSERT3U(ds->ds_prev->ds_object, ==, dsl_dataset_phys(ds)->ds_prev_snap_obj); ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0); - /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ + /* if (logical birth > prev prev snap txg) prev unique += bs */ if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == - ds->ds_object && bp->blk_birth > + ds->ds_object && BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); mutex_enter(&ds->ds_prev->ds_lock); dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used; mutex_exit(&ds->ds_prev->ds_lock); } - if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { + if (BP_GET_LOGICAL_BIRTH(bp) > ds->ds_dir->dd_origin_txg) { dsl_dir_transfer_space(ds->ds_dir, used, DD_USED_HEAD, DD_USED_SNAP, tx); } @@ -2895,7 +2896,7 @@ dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) if (snap == NULL) return (B_FALSE); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - birth = dsl_dataset_get_blkptr(ds)->blk_birth; + birth = BP_GET_LOGICAL_BIRTH(dsl_dataset_get_blkptr(ds)); rrw_exit(&ds->ds_bp_rwlock, FTAG); if (birth > dsl_dataset_phys(snap)->ds_creation_txg) { objset_t *os, *os_snap; diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index e6c8d4be13b4..eff1f7de7731 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -474,7 +474,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp); dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp); - dle_tofind.dle_mintxg = bp->blk_birth; + dle_tofind.dle_mintxg = BP_GET_LOGICAL_BIRTH(bp); dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); @@ -483,7 +483,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, if (dle == NULL) { zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu", - bp, (longlong_t)bp->blk_birth); + bp, (longlong_t)BP_GET_LOGICAL_BIRTH(bp)); dle = avl_first(&dl->dl_tree); } @@ -1039,8 +1039,7 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp)); ASSERT3U(BP_GET_CHECKSUM(bp), ==, BP_GET_CHECKSUM(&found->le_bp)); - ASSERT3U(BP_PHYSICAL_BIRTH(bp), ==, - BP_PHYSICAL_BIRTH(&found->le_bp)); + ASSERT3U(BP_GET_BIRTH(bp), ==, BP_GET_BIRTH(&found->le_bp)); } if (bp_freed) { if (found == NULL) { diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index d9d88a981e05..d4a6e5b6e9fd 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -132,10 +132,11 @@ process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) ASSERT(!BP_IS_HOLE(bp)); - if (bp->blk_birth <= 
dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { + if (BP_GET_LOGICAL_BIRTH(bp) <= + dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx); if (poa->ds_prev && !poa->after_branch_point && - bp->blk_birth > + BP_GET_LOGICAL_BIRTH(bp) > dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) { dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes += bp_get_dsize_sync(dp->dp_spa, bp); @@ -313,7 +314,8 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, + tx->tx_txg); rrw_exit(&ds->ds_bp_rwlock, FTAG); ASSERT(zfs_refcount_is_zero(&ds->ds_longholds)); @@ -727,7 +729,7 @@ kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); } else { ASSERT(zilog == NULL); - ASSERT3U(bp->blk_birth, >, + ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), >, dsl_dataset_phys(ka->ds)->ds_prev_snap_txg); (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); } @@ -1017,7 +1019,8 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) ASSERT(ds->ds_prev == NULL || dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(ds)->ds_bp), <=, + tx->tx_txg); rrw_exit(&ds->ds_bp_rwlock, FTAG); ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 370c6a010dca..342ec5c15c79 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -1047,7 +1047,7 @@ upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) * will be wrong. 
*/ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth); + ASSERT0(BP_GET_LOGICAL_BIRTH(&dsl_dataset_phys(prev)->ds_bp)); rrw_exit(&ds->ds_bp_rwlock, FTAG); /* The origin doesn't get attached to itself */ diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 060a5cc36d70..55e89b89f06a 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -429,8 +429,8 @@ sio2bp(const scan_io_t *sio, blkptr_t *bp) { memset(bp, 0, sizeof (*bp)); bp->blk_prop = sio->sio_blk_prop; - bp->blk_phys_birth = sio->sio_phys_birth; - bp->blk_birth = sio->sio_birth; + BP_SET_PHYSICAL_BIRTH(bp, sio->sio_phys_birth); + BP_SET_LOGICAL_BIRTH(bp, sio->sio_birth); bp->blk_fill = 1; /* we always only work with data pointers */ bp->blk_cksum = sio->sio_cksum; @@ -444,8 +444,8 @@ static inline void bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) { sio->sio_blk_prop = bp->blk_prop; - sio->sio_phys_birth = bp->blk_phys_birth; - sio->sio_birth = bp->blk_birth; + sio->sio_phys_birth = BP_GET_PHYSICAL_BIRTH(bp); + sio->sio_birth = BP_GET_LOGICAL_BIRTH(bp); sio->sio_cksum = bp->blk_cksum; sio->sio_nr_dvas = BP_GET_NDVAS(bp); @@ -1721,7 +1721,8 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, zbookmark_phys_t zb; ASSERT(!BP_IS_REDACTED(bp)); - if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + if (BP_IS_HOLE(bp) || + BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) return (0); /* @@ -1730,7 +1731,8 @@ dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, * (on-disk) even if it hasn't been claimed (even though for * scrub there's nothing to do to it). */ - if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa)) + if (claim_txg == 0 && + BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(dp->dp_spa)) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], @@ -1756,7 +1758,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, ASSERT(!BP_IS_REDACTED(bp)); if (BP_IS_HOLE(bp) || - bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) return (0); /* @@ -1764,7 +1766,7 @@ dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, * already txg sync'ed (but this log block contains * other records that are not synced) */ - if (claim_txg == 0 || bp->blk_birth < claim_txg) + if (claim_txg == 0 || BP_GET_LOGICAL_BIRTH(bp) < claim_txg) return (0); ASSERT3U(BP_GET_LSIZE(bp), !=, 0); @@ -1903,7 +1905,8 @@ dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp)) return; - if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg || + if (BP_IS_HOLE(bp) || + BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) return; @@ -2174,7 +2177,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, if (dnp != NULL && dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) { scn->scn_phys.scn_errors++; - spa_log_error(spa, zb, &bp->blk_birth); + spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); return (SET_ERROR(EINVAL)); } @@ -2270,7 +2273,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, * by arc_read() for the cases above. 
*/ scn->scn_phys.scn_errors++; - spa_log_error(spa, zb, &bp->blk_birth); + spa_log_error(spa, zb, BP_GET_LOGICAL_BIRTH(bp)); return (SET_ERROR(EINVAL)); } @@ -2347,7 +2350,7 @@ dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb, if (f != SPA_FEATURE_NONE) ASSERT(dsl_dataset_feature_is_active(ds, f)); - if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { + if (BP_GET_LOGICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_min_txg) { scn->scn_lt_min_this_txg++; return; } @@ -2373,7 +2376,7 @@ dsl_scan_visitbp(const blkptr_t *bp, const zbookmark_phys_t *zb, * Don't scan it now unless we need to because something * under it was modified. */ - if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { + if (BP_GET_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { scn->scn_gt_max_this_txg++; return; } @@ -4714,7 +4717,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; - uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); + uint64_t phys_birth = BP_GET_BIRTH(bp); size_t psize = BP_GET_PSIZE(bp); boolean_t needs_io = B_FALSE; int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 7237fa8eeb59..c4aa98ced433 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -5495,8 +5495,9 @@ remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, DVA_GET_VDEV(&bp->blk_dva[0])); vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; - bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, + uint64_t physical_birth = vdev_indirect_births_physbirth(vib, DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); + BP_SET_PHYSICAL_BIRTH(bp, physical_birth); DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); DVA_SET_OFFSET(&bp->blk_dva[0], offset); @@ -5845,8 +5846,8 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; int error = 0; - ASSERT(bp->blk_birth == 0); - ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); + ASSERT0(BP_GET_LOGICAL_BIRTH(bp)); + ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); @@ -5900,7 +5901,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) int ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); + ASSERT(!now || BP_GET_LOGICAL_BIRTH(bp) >= spa_syncing_txg(spa)); /* * If we have a checkpoint for the pool we need to make sure that @@ -5918,7 +5919,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) * normally as they will be referenced by the checkpointed uberblock. 
*/ boolean_t checkpoint = B_FALSE; - if (bp->blk_birth <= spa->spa_checkpoint_txg && + if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg && spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { /* * At this point, if the block is part of the checkpoint diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b144d0652930..30c528a53049 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -2655,8 +2655,8 @@ spa_claim_notify(zio_t *zio) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ - if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) - spa->spa_claim_max_txg = zio->io_bp->blk_birth; + if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp)) + spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp); mutex_exit(&spa->spa_props_lock); } @@ -6266,7 +6266,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *nvl; if (props == NULL || - nvlist_lookup_string(props, "tname", &poolname) != 0) + nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) poolname = (char *)pool; /* @@ -9801,7 +9802,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) * don't want to rely on that here). */ if (pass == 1 && - spa->spa_uberblock.ub_rootbp.blk_birth < txg && + BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg && !dmu_objset_is_dirty(mos, txg)) { /* * Nothing changed on the first pass, therefore this diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index 244b4d264212..62d7b4fa2df2 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -180,7 +180,7 @@ static int get_head_ds(spa_t *spa, uint64_t dsobj, uint64_t *head_ds) * during spa_errlog_sync(). */ void -spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth) +spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t birth) { spa_error_entry_t search; spa_error_entry_t *new; @@ -223,13 +223,7 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth) new->se_zep.zb_object = zb->zb_object; new->se_zep.zb_level = zb->zb_level; new->se_zep.zb_blkid = zb->zb_blkid; - - /* - * birth may end up being NULL, e.g. in zio_done(). We - * will handle this in process_error_block(). 
- */ - if (birth != NULL) - new->se_zep.zb_birth = *birth; + new->se_zep.zb_birth = birth; } avl_insert(tree, new, where); @@ -258,7 +252,7 @@ find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, if (error == 0 && BP_IS_HOLE(&bp)) error = SET_ERROR(ENOENT); - *birth_txg = bp.blk_birth; + *birth_txg = BP_GET_LOGICAL_BIRTH(&bp); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); @@ -535,7 +529,7 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, */ zbookmark_phys_t zb; zep_to_zb(head_ds, zep, &zb); - spa_remove_error(spa, &zb, &zep->zb_birth); + spa_remove_error(spa, &zb, zep->zb_birth); } return (error); @@ -563,7 +557,7 @@ spa_get_last_errlog_size(spa_t *spa) */ static void spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb, - const uint64_t *birth) + const uint64_t birth) { char name[NAME_MAX_LEN]; @@ -618,11 +612,7 @@ spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb, healed_zep.zb_object = healed_zb->zb_object; healed_zep.zb_level = healed_zb->zb_level; healed_zep.zb_blkid = healed_zb->zb_blkid; - - if (birth != NULL) - healed_zep.zb_birth = *birth; - else - healed_zep.zb_birth = 0; + healed_zep.zb_birth = birth; errphys_to_name(&healed_zep, name, sizeof (name)); @@ -742,7 +732,7 @@ spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx) * later in spa_remove_healed_errors(). */ void -spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, const uint64_t *birth) +spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, uint64_t birth) { spa_add_healed_error(spa, spa->spa_errlog_last, zb, birth); spa_add_healed_error(spa, spa->spa_errlog_scrub, zb, birth); @@ -890,7 +880,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, if (error == EACCES) error = 0; else if (!error) - zep.zb_birth = bp.blk_birth; + zep.zb_birth = BP_GET_LOGICAL_BIRTH(&bp); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index 873089a53e34..32158e8c592c 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -783,7 +783,7 @@ spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx) * request of flushing everything before we attempt to return * immediately. 
*/ - if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && + if (BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg && !dmu_objset_is_dirty(spa_meta_objset(spa), txg) && !spa_flush_all_logs_requested(spa)) return; diff --git a/module/zfs/uberblock.c b/module/zfs/uberblock.c index 1921be107660..22ee8036c473 100644 --- a/module/zfs/uberblock.c +++ b/module/zfs/uberblock.c @@ -70,5 +70,5 @@ uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay) } ub->ub_checkpoint_txg = 0; - return (ub->ub_rootbp.blk_birth == txg); + return (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) == txg); } diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index f9a01c9f53f4..102eacb03349 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -531,7 +531,7 @@ vdev_mirror_child_select(zio_t *zio) uint64_t txg = zio->io_txg; int c, lowest_load; - ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); + ASSERT(zio->io_bp == NULL || BP_GET_BIRTH(zio->io_bp) == txg); lowest_load = INT_MAX; mm->mm_preferred_cnt = 0; diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 9d0b8763f16f..b03331ec69c6 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -2190,12 +2190,11 @@ vdev_raidz_close(vdev_t *vd) /* * Return the logical width to use, given the txg in which the allocation - * happened. Note that BP_PHYSICAL_BIRTH() is usually the txg in which the + * happened. Note that BP_GET_BIRTH() is usually the txg in which the * BP was allocated. Remapped BP's (that were relocated due to device - * removal, see remap_blkptr_cb()), will have a more recent - * BP_PHYSICAL_BIRTH() which reflects when the BP was relocated, but we can - * ignore these because they can't be on RAIDZ (device removal doesn't - * support RAIDZ). + * removal, see remap_blkptr_cb()), will have a more recent physical birth + * which reflects when the BP was relocated, but we can ignore these because + * they can't be on RAIDZ (device removal doesn't support RAIDZ). */ static uint64_t vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) @@ -2295,7 +2294,7 @@ vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) logical_rs.rs_start = rr->rr_offset; logical_rs.rs_end = logical_rs.rs_start + vdev_raidz_asize(zio->io_vd, rr->rr_size, - BP_PHYSICAL_BIRTH(zio->io_bp)); + BP_GET_BIRTH(zio->io_bp)); raidz_col_t *rc = &rr->rr_col[col]; vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; @@ -2518,7 +2517,7 @@ vdev_raidz_io_start(zio_t *zio) raidz_map_t *rm; uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, - BP_PHYSICAL_BIRTH(zio->io_bp)); + BP_GET_BIRTH(zio->io_bp)); if (logical_width != vdrz->vd_physical_width) { zfs_locked_range_t *lr = NULL; uint64_t synced_offset = UINT64_MAX; diff --git a/module/zfs/zil.c b/module/zfs/zil.c index e549e1895f39..1af357c58006 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -557,7 +557,7 @@ zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, * that we rewind to is invalid. Thus, we return -1 so * zil_parse() doesn't attempt to read it. */ - if (bp->blk_birth >= first_txg) + if (BP_GET_LOGICAL_BIRTH(bp) >= first_txg) return (-1); if (zil_bp_tree_add(zilog, bp) != 0) @@ -583,7 +583,7 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, * Claim log block if not already committed and not already claimed. * If tx == NULL, just verify that the block is claimable. 
*/ - if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || + if (BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) < first_txg || zil_bp_tree_add(zilog, bp) != 0) return (0); @@ -608,7 +608,7 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) * waited for all writes to be stable first), so it is semantically * correct to declare this the end of the log. */ - if (lr->lr_blkptr.blk_birth >= first_txg) { + if (BP_GET_LOGICAL_BIRTH(&lr->lr_blkptr) >= first_txg) { error = zil_read_log_data(zilog, lr, NULL); if (error != 0) return (error); @@ -655,7 +655,7 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx, * just in case lets be safe and just stop here now instead of * corrupting the pool. */ - if (BP_PHYSICAL_BIRTH(bp) >= first_txg) + if (BP_GET_BIRTH(bp) >= first_txg) return (SET_ERROR(ENOENT)); /* @@ -710,8 +710,8 @@ zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) /* * If we previously claimed it, we need to free it. */ - if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && - !BP_IS_HOLE(bp)) { + if (BP_GET_LOGICAL_BIRTH(bp) >= claim_txg && + zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) { zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); } @@ -1965,7 +1965,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) &slog); } if (error == 0) { - ASSERT3U(bp->blk_birth, ==, txg); + ASSERT3U(BP_GET_LOGICAL_BIRTH(bp), ==, txg); BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); bp->blk_cksum = lwb->lwb_blk.blk_cksum; diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 213fe5c483f2..e96bbda35a04 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -613,7 +613,7 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, &zio->io_bookmark, - &zio->io_bp->blk_birth); + BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0); } @@ -1052,8 +1052,8 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, (long long)bp->blk_prop, (long long)bp->blk_pad[0], (long long)bp->blk_pad[1], - (long long)bp->blk_phys_birth, - (long long)bp->blk_birth, + (long long)BP_GET_PHYSICAL_BIRTH(bp), + (long long)BP_GET_LOGICAL_BIRTH(bp), (long long)bp->blk_fill, (long long)bp->blk_cksum.zc_word[0], (long long)bp->blk_cksum.zc_word[1], @@ -1156,10 +1156,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, /* * Pool-specific checks. * - * Note: it would be nice to verify that the blk_birth and - * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() - * allows the birth time of log blocks (and dmu_sync()-ed blocks - * that are in the log) to be arbitrarily large. + * Note: it would be nice to verify that the logical birth + * and physical birth are not too large. However, + * spa_freeze() allows the birth time of log blocks (and + * dmu_sync()-ed blocks that are in the log) to be arbitrarily + * large. */ for (int i = 0; i < BP_GET_NDVAS(bp); i++) { const dva_t *dva = &bp->blk_dva[i]; @@ -1246,7 +1247,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, { zio_t *zio; - zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, + zio = zio_create(pio, spa, BP_GET_BIRTH(bp), bp, data, size, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
@@ -1435,7 +1436,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, * starts allocating blocks -- so that nothing is allocated twice. * If txg == 0 we just verify that the block is claimable. */ - ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, + ASSERT3U(BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp), <, spa_min_claim_txg(spa)); ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */ @@ -1731,7 +1732,7 @@ zio_write_bp_init(zio_t *zio) blkptr_t *bp = zio->io_bp; zio_prop_t *zp = &zio->io_prop; - ASSERT(bp->blk_birth != zio->io_txg); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) != zio->io_txg); *bp = *zio->io_bp_override; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; @@ -1819,7 +1820,7 @@ zio_write_compress(zio_t *zio) ASSERT(zio->io_child_type != ZIO_CHILD_DDT); ASSERT(zio->io_bp_override == NULL); - if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { + if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg) { /* * We're rewriting an existing block, which means we're * working on behalf of spa_sync(). For spa_sync() to @@ -1866,7 +1867,7 @@ zio_write_compress(zio_t *zio) BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); zio_buf_free(cbuf, lsize); - bp->blk_birth = zio->io_txg; + BP_SET_LOGICAL_BIRTH(bp, zio->io_txg); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, SPA_FEATURE_EMBEDDED_DATA)); @@ -1947,7 +1948,7 @@ zio_write_compress(zio_t *zio) * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. */ - if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && + if (!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass >= zfs_sync_pass_rewrite) { VERIFY3U(psize, !=, 0); @@ -1961,7 +1962,7 @@ zio_write_compress(zio_t *zio) } if (psize == 0) { - if (zio->io_bp_orig.blk_birth != 0 && + if (BP_GET_LOGICAL_BIRTH(&zio->io_bp_orig) != 0 && spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, zp->zp_type); @@ -3539,7 +3540,7 @@ zio_ddt_write(zio_t *zio) else ddt_phys_addref(ddp); } else if (zio->io_bp_override) { - ASSERT(bp->blk_birth == txg); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); ASSERT(BP_EQUAL(bp, zio->io_bp_override)); ddt_phys_fill(ddp, bp); ddt_phys_addref(ddp); @@ -3810,11 +3811,13 @@ zio_dva_claim(zio_t *zio) static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { - ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_bp_override == NULL); - if (!BP_IS_HOLE(bp)) - metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); + if (!BP_IS_HOLE(bp)) { + metaslab_free(zio->io_spa, bp, BP_GET_LOGICAL_BIRTH(bp), + B_TRUE); + } if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { @@ -4555,8 +4558,8 @@ zio_ready(zio_t *zio) if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); - ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || - (zio->io_flags & ZIO_FLAG_NOPWRITE)); + ASSERT(BP_GET_LOGICAL_BIRTH(bp) == zio->io_txg || + BP_IS_HOLE(bp) || (zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); zio->io_ready(zio); @@ -4852,7 +4855,7 @@ zio_done(zio_t *zio) * error and generate a logical data ereport. 
*/ spa_log_error(zio->io_spa, &zio->io_bookmark, - &zio->io_bp->blk_birth); + BP_GET_LOGICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index e511b31fee6d..ce6772a40c8b 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -272,7 +272,7 @@ static void zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) { const dva_t *dva = BP_IDENTITY(bp); - uint64_t txg = BP_PHYSICAL_BIRTH(bp); + uint64_t txg = BP_GET_BIRTH(bp); ASSERT(BP_IS_GANG(bp)); From f68bde7236699353b89de176fd35f7fa92bfc30b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 25 Mar 2024 18:02:38 -0400 Subject: [PATCH 14/34] BRT: Make BRT block sizes configurable Similar to DDT make BRT data and indirect block sizes configurable via module parameters. I am not sure what would be the best yet, but similar to DDT 4KB blocks kill all chances of compression on vdev with ashift=12 or more, that on my tests reaches 3x. While here, fix documentation for respective DDT parameters. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15967 --- man/man4/zfs.4 | 17 +++++++++++++++-- module/zfs/brt.c | 22 +++++++++++----------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 30c168253f96..759a68784aca 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -244,12 +244,25 @@ For blocks that could be forced to be a gang block (due to .Sy metaslab_force_ganging ) , force this many of them to be gang blocks. . -.It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int +.It Sy brt_zap_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int +Controls prefetching BRT records for blocks which are going to be cloned. +. +.It Sy brt_zap_default_bs Ns = Ns Sy 12 Po 4 KiB Pc Pq int +Default BRT ZAP data block size as a power of 2. Note that changing this after +creating a BRT on the pool will not affect existing BRTs, only newly created +ones. +. +.It Sy brt_zap_default_ibs Ns = Ns Sy 12 Po 4 KiB Pc Pq int +Default BRT ZAP indirect block size as a power of 2. Note that changing this +after creating a BRT on the pool will not affect existing BRTs, only newly +created ones. +. +.It Sy ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP data block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. . -.It Sy zfs_ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int +.It Sy ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP indirect block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 0b5a09df3724..5d1f4728b645 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -248,7 +248,7 @@ static kmem_cache_t *brt_pending_entry_cache; /* * Enable/disable prefetching of BRT entries that we are going to modify. */ -int zfs_brt_prefetch = 1; +static int brt_zap_prefetch = 1; #ifdef ZFS_DEBUG #define BRT_DEBUG(...) do { \ @@ -260,8 +260,8 @@ int zfs_brt_prefetch = 1; #define BRT_DEBUG(...) 
do { } while (0) #endif -int brt_zap_leaf_blockshift = 12; -int brt_zap_indirect_blockshift = 12; +static int brt_zap_default_bs = 12; +static int brt_zap_default_ibs = 12; static kstat_t *brt_ksp; @@ -458,8 +458,7 @@ brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0, ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, - brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE, - 0, tx); + brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx); VERIFY(brtvd->bv_mos_entries != 0); BRT_DEBUG("MOS entries created, object=%llu", (u_longlong_t)brtvd->bv_mos_entries); @@ -1363,7 +1362,7 @@ brt_prefetch(brt_t *brt, const blkptr_t *bp) ASSERT(bp != NULL); - if (!zfs_brt_prefetch) + if (!brt_zap_prefetch) return; brt_entry_fill(bp, &bre, &vdevid); @@ -1679,9 +1678,10 @@ brt_unload(spa_t *spa) } /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW, - "Enable prefetching of BRT entries"); -#ifdef ZFS_BRT_DEBUG -ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug"); -#endif +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW, + "Enable prefetching of BRT ZAP entries"); +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW, + "BRT ZAP leaf blockshift"); +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW, + "BRT ZAP indirect blockshift"); /* END CSTYLED */ From df04efe321a49c650f1fbaa6fd701fa2928cbe21 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 13 Nov 2023 17:55:29 +1100 Subject: [PATCH 15/34] linux 5.4 compat: page_size() Before 5.4 we have to do a little math. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Closes #15533 Closes #15588 --- config/kernel-mm-page-size.m4 | 17 +++++++++++ config/kernel.m4 | 2 ++ include/os/linux/Makefile.am | 1 + include/os/linux/kernel/linux/mm_compat.h | 36 +++++++++++++++++++++++ 4 files changed, 56 insertions(+) create mode 100644 config/kernel-mm-page-size.m4 create mode 100644 include/os/linux/kernel/linux/mm_compat.h diff --git a/config/kernel-mm-page-size.m4 b/config/kernel-mm-page-size.m4 new file mode 100644 index 000000000000..d5ebd926986a --- /dev/null +++ b/config/kernel-mm-page-size.m4 @@ -0,0 +1,17 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [ + ZFS_LINUX_TEST_SRC([page_size], [ + #include + ],[ + unsigned long s; + s = page_size(NULL); + ]) +]) +AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [ + AC_MSG_CHECKING([whether page_size() is available]) + ZFS_LINUX_TEST_RESULT([page_size], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 1d0c5a27fc7f..548905ccd04d 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -167,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ ZFS_AC_KERNEL_SRC_SYNC_BDEV + ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -316,6 +317,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_COPY_SPLICE_READ ZFS_AC_KERNEL_SYNC_BDEV + ZFS_AC_KERNEL_MM_PAGE_SIZE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am index 3830d198dfff..51c27132b4ef 100644 --- a/include/os/linux/Makefile.am +++ b/include/os/linux/Makefile.am @@ -5,6 +5,7 @@ kernel_linux_HEADERS = \ %D%/kernel/linux/compiler_compat.h \ %D%/kernel/linux/dcache_compat.h \ %D%/kernel/linux/kmap_compat.h \ + %D%/kernel/linux/mm_compat.h \ %D%/kernel/linux/mod_compat.h \ %D%/kernel/linux/page_compat.h \ %D%/kernel/linux/percpu_compat.h \ diff --git a/include/os/linux/kernel/linux/mm_compat.h b/include/os/linux/kernel/linux/mm_compat.h new file mode 100644 index 000000000000..40056c68d6dd --- /dev/null +++ b/include/os/linux/kernel/linux/mm_compat.h @@ -0,0 +1,36 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2023, 2024, Klara Inc. + */ + +#ifndef _ZFS_MM_COMPAT_H +#define _ZFS_MM_COMPAT_H + +#include + +/* 5.4 introduced page_size(). 
Older kernels can use a trivial macro instead */ +#ifndef HAVE_MM_PAGE_SIZE +#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p))) +#endif + +#endif /* _ZFS_MM_COMPAT_H */ From 390b448726c580999dd337be7a40b0e95cf1d50b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 11 Dec 2023 16:05:54 +1100 Subject: [PATCH 16/34] abd: add page iterator The regular ABD iterators yield data buffers, so they have to map and unmap pages into kernel memory. If the caller only wants to count chunks, or can use page pointers directly, then the map/unmap is just unnecessary overhead. This adds adb_iterate_page_func, which yields unmapped struct page instead. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 --- include/sys/abd.h | 7 +++ include/sys/abd_impl.h | 26 ++++++++- module/os/freebsd/zfs/abd_os.c | 4 +- module/os/linux/zfs/abd_os.c | 104 ++++++++++++++++++++++++++++++--- module/zfs/abd.c | 42 +++++++++++++ 5 files changed, 169 insertions(+), 14 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index b48dc36423f7..3a500e2c9ae7 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -79,6 +79,9 @@ typedef struct abd { typedef int abd_iter_func_t(void *buf, size_t len, void *priv); typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv); +#if defined(__linux__) && defined(_KERNEL) +typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *); +#endif extern int zfs_abd_scatter_enabled; @@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *); int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, abd_iter_func2_t *, void *); +#if defined(__linux__) && defined(_KERNEL) +int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, + void *); +#endif void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index 40546d4af137..f88ea25e245d 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. 
*/ #ifndef _ABD_IMPL_H @@ -38,12 +39,30 @@ typedef enum abd_stats_op { ABDSTAT_DECR /* Decrease abdstat values */ } abd_stats_op_t; -struct scatterlist; /* forward declaration */ +/* forward declarations */ +struct scatterlist; +struct page; struct abd_iter { /* public interface */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ + union { + /* for abd_iter_map()/abd_iter_unmap() */ + struct { + /* addr corresponding to iter_pos */ + void *iter_mapaddr; + /* length of data valid at mapaddr */ + size_t iter_mapsize; + }; + /* for abd_iter_page() */ + struct { + /* current page */ + struct page *iter_page; + /* offset of data in page */ + size_t iter_page_doff; + /* size of data in page */ + size_t iter_page_dsize; + }; + }; /* private */ abd_t *iter_abd; /* ABD being iterated through */ @@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *); void abd_iter_advance(struct abd_iter *, size_t); void abd_iter_map(struct abd_iter *); void abd_iter_unmap(struct abd_iter *); +void abd_iter_page(struct abd_iter *); /* * Helper macros diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index 58a37df62b69..3b812271f98b 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - aiter->iter_pos = 0; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; } /* diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 24390fbbf125..dae1280121da 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ /* @@ -59,6 +60,7 @@ #include #ifdef _KERNEL #include +#include #include #endif @@ -895,14 +897,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; - aiter->iter_pos = 0; - if (abd_is_linear(abd)) { - aiter->iter_offset = 0; - aiter->iter_sg = NULL; - } else { + if (!abd_is_linear(abd)) { aiter->iter_offset = ABD_SCATTER(abd).abd_offset; aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; } @@ -915,6 +912,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) boolean_t abd_iter_at_end(struct abd_iter *aiter) { + ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size); return (aiter->iter_pos == aiter->iter_abd->abd_size); } @@ -926,8 +924,15 @@ abd_iter_at_end(struct abd_iter *aiter) void abd_iter_advance(struct abd_iter *aiter, size_t amount) { + /* + * Ensure that last chunk is not in use. abd_iterate_*() must clear + * this state (directly or abd_iter_unmap()) before advancing. + */ ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); + ASSERT3P(aiter->iter_page, ==, NULL); + ASSERT0(aiter->iter_page_doff); + ASSERT0(aiter->iter_page_dsize); /* There's nothing left to advance to, so do nothing */ if (abd_iter_at_end(aiter)) @@ -1009,6 +1014,88 @@ abd_cache_reap_now(void) } #if defined(_KERNEL) +/* + * Yield the next page struct and data offset and size within it, without + * mapping it into the address space. 
+ */ +void +abd_iter_page(struct abd_iter *aiter) +{ + if (abd_iter_at_end(aiter)) { + aiter->iter_page = NULL; + aiter->iter_page_doff = 0; + aiter->iter_page_dsize = 0; + return; + } + + struct page *page; + size_t doff, dsize; + + if (abd_is_linear(aiter->iter_abd)) { + ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); + + /* memory address at iter_pos */ + void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos; + + /* struct page for address */ + page = is_vmalloc_addr(paddr) ? + vmalloc_to_page(paddr) : virt_to_page(paddr); + + /* offset of address within the page */ + doff = offset_in_page(paddr); + + /* total data remaining in abd from this position */ + dsize = aiter->iter_abd->abd_size - aiter->iter_offset; + } else { + ASSERT(!abd_is_gang(aiter->iter_abd)); + + /* current scatter page */ + page = sg_page(aiter->iter_sg); + + /* position within page */ + doff = aiter->iter_offset; + + /* remaining data in scatterlist */ + dsize = MIN(aiter->iter_sg->length - aiter->iter_offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + } + ASSERT(page); + + if (PageTail(page)) { + /* + * This page is part of a "compound page", which is a group of + * pages that can be referenced from a single struct page *. + * Its organised as a "head" page, followed by a series of + * "tail" pages. + * + * In OpenZFS, compound pages are allocated using the + * __GFP_COMP flag, which we get from scatter ABDs and SPL + * vmalloc slabs (ie >16K allocations). So a great many of the + * IO buffers we get are going to be of this type. + * + * The tail pages are just regular PAGE_SIZE pages, and can be + * safely used as-is. However, the head page has length + * covering itself and all the tail pages. If this ABD chunk + * spans multiple pages, then we can use the head page and a + * >PAGE_SIZE length, which is far more efficient. + * + * To do this, we need to adjust the offset to be counted from + * the head page. struct page for compound pages are stored + * contiguously, so we can just adjust by a simple offset. + */ + struct page *head = compound_head(page); + doff += ((page - head) * PAGESIZE); + page = head; + } + + /* final page and position within it */ + aiter->iter_page = page; + aiter->iter_page_doff = doff; + + /* amount of data in the chunk, up to the end of the page */ + aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff); +} + /* * bio_nr_pages for ABD. 
* @off is the offset in @abd @@ -1163,4 +1250,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size, module_param(zfs_abd_scatter_max_order, uint, 0644); MODULE_PARM_DESC(zfs_abd_scatter_max_order, "Maximum order allocation used for a scatter ABD."); -#endif + +#endif /* _KERNEL */ diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 0a2411a2d572..2c0cda25dbc6 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, return (ret); } +#if defined(__linux__) && defined(_KERNEL) +int +abd_iterate_page_func(abd_t *abd, size_t off, size_t size, + abd_iter_page_func_t *func, void *private) +{ + struct abd_iter aiter; + int ret = 0; + + if (size == 0) + return (0); + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); + + while (size > 0) { + IMPLY(abd_is_gang(abd), c_abd != NULL); + + abd_iter_page(&aiter); + + size_t len = MIN(aiter.iter_page_dsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_page, aiter.iter_page_doff, + len, private); + + aiter.iter_page = NULL; + aiter.iter_page_doff = 0; + aiter.iter_page_dsize = 0; + + if (ret != 0) + break; + + size -= len; + c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len); + } + + return (ret); +} +#endif + struct buf_arg { void *arg_buf; }; From f3b85d706bae82957d2e3e0ef1d53a1cfab60eb4 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 9 Jan 2024 12:12:56 +1100 Subject: [PATCH 17/34] vdev_disk: rename existing functions to vdev_classic_* This is just renaming the existing functions we're about to replace and grouping them together to make the next commits easier to follow. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 --- include/sys/abd.h | 2 + module/os/linux/zfs/abd_os.c | 5 + module/os/linux/zfs/vdev_disk.c | 215 +++++++++++++++++--------------- 3 files changed, 120 insertions(+), 102 deletions(-) diff --git a/include/sys/abd.h b/include/sys/abd.h index 3a500e2c9ae7..19fe96292d5f 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -220,6 +220,8 @@ void abd_fini(void); /* * Linux ABD bio functions + * Note: these are only needed to support vdev_classic. See comment in + * vdev_disk.c. */ #if defined(__linux__) && defined(_KERNEL) unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index dae1280121da..3fe01c0b7d77 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -1096,6 +1096,11 @@ abd_iter_page(struct abd_iter *aiter) aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff); } +/* + * Note: ABD BIO functions only needed to support vdev_classic. See comments in + * vdev_disk.c. + */ + /* * bio_nr_pages for ABD. * @off is the offset in @abd diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index b0bda5fa2012..957619b87afd 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -83,17 +83,6 @@ static uint_t zfs_vdev_open_timeout_ms = 1000; */ #define EFI_MIN_RESV_SIZE (16 * 1024) -/* - * Virtual device vector for disks. 
- */ -typedef struct dio_request { - zio_t *dr_zio; /* Parent ZIO */ - atomic_t dr_ref; /* References */ - int dr_error; /* Bio error */ - int dr_bio_count; /* Count of bio's */ - struct bio *dr_bio[]; /* Attached bio's */ -} dio_request_t; - /* * BIO request failfast mask. */ @@ -467,85 +456,6 @@ vdev_disk_close(vdev_t *v) v->vdev_tsd = NULL; } -static dio_request_t * -vdev_disk_dio_alloc(int bio_count) -{ - dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + - sizeof (struct bio *) * bio_count, KM_SLEEP); - atomic_set(&dr->dr_ref, 0); - dr->dr_bio_count = bio_count; - dr->dr_error = 0; - - for (int i = 0; i < dr->dr_bio_count; i++) - dr->dr_bio[i] = NULL; - - return (dr); -} - -static void -vdev_disk_dio_free(dio_request_t *dr) -{ - int i; - - for (i = 0; i < dr->dr_bio_count; i++) - if (dr->dr_bio[i]) - bio_put(dr->dr_bio[i]); - - kmem_free(dr, sizeof (dio_request_t) + - sizeof (struct bio *) * dr->dr_bio_count); -} - -static void -vdev_disk_dio_get(dio_request_t *dr) -{ - atomic_inc(&dr->dr_ref); -} - -static void -vdev_disk_dio_put(dio_request_t *dr) -{ - int rc = atomic_dec_return(&dr->dr_ref); - - /* - * Free the dio_request when the last reference is dropped and - * ensure zio_interpret is called only once with the correct zio - */ - if (rc == 0) { - zio_t *zio = dr->dr_zio; - int error = dr->dr_error; - - vdev_disk_dio_free(dr); - - if (zio) { - zio->io_error = error; - ASSERT3S(zio->io_error, >=, 0); - if (zio->io_error) - vdev_disk_error(zio); - - zio_delay_interrupt(zio); - } - } -} - -BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) -{ - dio_request_t *dr = bio->bi_private; - - if (dr->dr_error == 0) { -#ifdef HAVE_1ARG_BIO_END_IO_T - dr->dr_error = BIO_END_IO_ERROR(bio); -#else - if (error) - dr->dr_error = -(error); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - dr->dr_error = EIO; -#endif - } - - /* Drop reference acquired by __vdev_disk_physio */ - vdev_disk_dio_put(dr); -} - static inline void vdev_submit_bio_impl(struct bio *bio) { @@ -697,8 +607,107 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, return (bio); } +/* ========== */ + +/* + * This is the classic, battle-tested BIO submission code. + * + * These functions have been renamed to vdev_classic_* to make it clear what + * they belong to, but their implementations are unchanged. + */ + +/* + * Virtual device vector for disks. 
+ */ +typedef struct dio_request { + zio_t *dr_zio; /* Parent ZIO */ + atomic_t dr_ref; /* References */ + int dr_error; /* Bio error */ + int dr_bio_count; /* Count of bio's */ + struct bio *dr_bio[]; /* Attached bio's */ +} dio_request_t; + +static dio_request_t * +vdev_classic_dio_alloc(int bio_count) +{ + dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + + sizeof (struct bio *) * bio_count, KM_SLEEP); + atomic_set(&dr->dr_ref, 0); + dr->dr_bio_count = bio_count; + dr->dr_error = 0; + + for (int i = 0; i < dr->dr_bio_count; i++) + dr->dr_bio[i] = NULL; + + return (dr); +} + +static void +vdev_classic_dio_free(dio_request_t *dr) +{ + int i; + + for (i = 0; i < dr->dr_bio_count; i++) + if (dr->dr_bio[i]) + bio_put(dr->dr_bio[i]); + + kmem_free(dr, sizeof (dio_request_t) + + sizeof (struct bio *) * dr->dr_bio_count); +} + +static void +vdev_classic_dio_get(dio_request_t *dr) +{ + atomic_inc(&dr->dr_ref); +} + +static void +vdev_classic_dio_put(dio_request_t *dr) +{ + int rc = atomic_dec_return(&dr->dr_ref); + + /* + * Free the dio_request when the last reference is dropped and + * ensure zio_interpret is called only once with the correct zio + */ + if (rc == 0) { + zio_t *zio = dr->dr_zio; + int error = dr->dr_error; + + vdev_classic_dio_free(dr); + + if (zio) { + zio->io_error = error; + ASSERT3S(zio->io_error, >=, 0); + if (zio->io_error) + vdev_disk_error(zio); + + zio_delay_interrupt(zio); + } + } +} + +BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error) +{ + dio_request_t *dr = bio->bi_private; + + if (dr->dr_error == 0) { +#ifdef HAVE_1ARG_BIO_END_IO_T + dr->dr_error = BIO_END_IO_ERROR(bio); +#else + if (error) + dr->dr_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + dr->dr_error = EIO; +#endif + } + + /* Drop reference acquired by vdev_classic_physio */ + vdev_classic_dio_put(dr); +} + static inline unsigned int -vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) +vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) { unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, bio_size, abd_offset); @@ -711,7 +720,7 @@ vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) } static int -__vdev_disk_physio(struct block_device *bdev, zio_t *zio, +vdev_classic_physio(struct block_device *bdev, zio_t *zio, size_t io_size, uint64_t io_offset, int rw, int flags) { dio_request_t *dr; @@ -736,7 +745,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, } retry: - dr = vdev_disk_dio_alloc(bio_count); + dr = vdev_classic_dio_alloc(bio_count); if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && zio->io_vd->vdev_failfast == B_TRUE) { @@ -771,23 +780,23 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, * this should be rare - see the comment above. 
*/ if (dr->dr_bio_count == i) { - vdev_disk_dio_free(dr); + vdev_classic_dio_free(dr); bio_count *= 2; goto retry; } - nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset); + nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset); dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); if (unlikely(dr->dr_bio[i] == NULL)) { - vdev_disk_dio_free(dr); + vdev_classic_dio_free(dr); return (SET_ERROR(ENOMEM)); } - /* Matching put called by vdev_disk_physio_completion */ - vdev_disk_dio_get(dr); + /* Matching put called by vdev_classic_physio_completion */ + vdev_classic_dio_get(dr); BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; - dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; + dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion; dr->dr_bio[i]->bi_private = dr; bio_set_op_attrs(dr->dr_bio[i], rw, flags); @@ -801,7 +810,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, } /* Extra reference to protect dio_request during vdev_submit_bio */ - vdev_disk_dio_get(dr); + vdev_classic_dio_get(dr); if (dr->dr_bio_count > 1) blk_start_plug(&plug); @@ -815,11 +824,13 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, if (dr->dr_bio_count > 1) blk_finish_plug(&plug); - vdev_disk_dio_put(dr); + vdev_classic_dio_put(dr); return (error); } +/* ========== */ + BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) { zio_t *zio = bio->bi_private; @@ -1023,7 +1034,7 @@ vdev_disk_io_start(zio_t *zio) } zio->io_target_timestamp = zio_handle_io_delay(zio); - error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio, + error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio, zio->io_size, zio->io_offset, rw, 0); rw_exit(&vd->vd_lock); From 867178ae1db28e73051c8a7ce662f2f2f81cd8e6 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 9 Jan 2024 12:23:30 +1100 Subject: [PATCH 18/34] vdev_disk: reorganise vdev_disk_io_start Light reshuffle to make it a bit more linear to read and get rid of a bunch of args that aren't needed in all cases. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 --- module/os/linux/zfs/vdev_disk.c | 51 ++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 957619b87afd..51e7cef2fc78 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -720,9 +720,16 @@ vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) } static int -vdev_classic_physio(struct block_device *bdev, zio_t *zio, - size_t io_size, uint64_t io_offset, int rw, int flags) +vdev_classic_physio(zio_t *zio) { + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + struct block_device *bdev = BDH_BDEV(vd->vd_bdh); + size_t io_size = zio->io_size; + uint64_t io_offset = zio->io_offset; + int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE; + int flags = 0; + dio_request_t *dr; uint64_t abd_offset; uint64_t bio_offset; @@ -944,7 +951,7 @@ vdev_disk_io_start(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; - int rw, error; + int error; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. 
@@ -1007,13 +1014,6 @@ vdev_disk_io_start(zio_t *zio) rw_exit(&vd->vd_lock); zio_execute(zio); return; - case ZIO_TYPE_WRITE: - rw = WRITE; - break; - - case ZIO_TYPE_READ: - rw = READ; - break; case ZIO_TYPE_TRIM: zio->io_error = vdev_disk_io_trim(zio); @@ -1026,23 +1026,34 @@ vdev_disk_io_start(zio_t *zio) #endif return; - default: + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: + zio->io_target_timestamp = zio_handle_io_delay(zio); + error = vdev_classic_physio(zio); rw_exit(&vd->vd_lock); - zio->io_error = SET_ERROR(ENOTSUP); - zio_interrupt(zio); + if (error) { + zio->io_error = error; + zio_interrupt(zio); + } return; - } - zio->io_target_timestamp = zio_handle_io_delay(zio); - error = vdev_classic_physio(BDH_BDEV(vd->vd_bdh), zio, - zio->io_size, zio->io_offset, rw, 0); - rw_exit(&vd->vd_lock); + default: + /* + * Getting here means our parent vdev has made a very strange + * request of us, and shouldn't happen. Assert here to force a + * crash in dev builds, but in production return the IO + * unhandled. The pool will likely suspend anyway but that's + * nicer than crashing the kernel. + */ + ASSERT3S(zio->io_type, ==, -1); - if (error) { - zio->io_error = error; + rw_exit(&vd->vd_lock); + zio->io_error = SET_ERROR(ENOTSUP); zio_interrupt(zio); return; } + + __builtin_unreachable(); } static void From c4a13ba483f08a81aa47479d2f763a470d95b2b0 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 9 Jan 2024 12:29:19 +1100 Subject: [PATCH 19/34] vdev_disk: make read/write IO function configurable This is just setting up for the next couple of commits, which will add a new IO function and a parameter to select it. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 --- module/os/linux/zfs/vdev_disk.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 51e7cef2fc78..de4dba72fa3c 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -946,6 +946,8 @@ vdev_disk_io_trim(zio_t *zio) #endif } +int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL; + static void vdev_disk_io_start(zio_t *zio) { @@ -1029,7 +1031,7 @@ vdev_disk_io_start(zio_t *zio) case ZIO_TYPE_READ: case ZIO_TYPE_WRITE: zio->io_target_timestamp = zio_handle_io_delay(zio); - error = vdev_classic_physio(zio); + error = vdev_disk_io_rw_fn(zio); rw_exit(&vd->vd_lock); if (error) { zio->io_error = error; @@ -1102,8 +1104,25 @@ vdev_disk_rele(vdev_t *vd) /* XXX: Implement me as a vnode rele for the device */ } +/* + * At first use vdev use, set the submission function from the default value if + * it hasn't been set already. + */ +static int +vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) +{ + (void) spa; + (void) nv; + (void) tsd; + + if (vdev_disk_io_rw_fn == NULL) + vdev_disk_io_rw_fn = vdev_classic_physio; + + return (0); +} + vdev_ops_t vdev_disk_ops = { - .vdev_op_init = NULL, + .vdev_op_init = vdev_disk_init, .vdev_op_fini = NULL, .vdev_op_open = vdev_disk_open, .vdev_op_close = vdev_disk_close, From 06a196020e6f70d2fedbd4d0d05bbe0c1ac6e4d8 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 18 Jul 2023 11:11:29 +1000 Subject: [PATCH 20/34] vdev_disk: rewrite BIO filling machinery to avoid split pages This commit tackles a number of issues in the way BIOs (`struct bio`) are constructed for submission to the Linux block layer. 
The kernel has a hard upper limit on the number of pages/segments that can be added to a BIO, as well as a separate limit for each device (related to its queue depth and other scheduling characteristics). ZFS counts the number of memory pages in the request ABD (`abd_nr_pages_off()`), and then uses that as the number of segments to put into the BIO, up to the hard upper limit. If it requires more than the limit, it will create multiple BIOs.

Leaving aside the fact that the page count method is wrong (see below), not limiting to the device segment max means that the device driver will need to split the BIO in half. This alone is not necessarily a problem, but it interacts with another issue to cause a much larger problem.

The kernel function to add a segment to a BIO (`bio_add_page()`) takes a `struct page` pointer, and offset+len within it. `struct page` can represent a run of contiguous memory pages (known as a "compound page"). It can be of arbitrary length.

The ZFS functions that count ABD pages and load them into the BIO (`abd_nr_pages_off()`, `bio_map()` and `abd_bio_map_off()`) will never consider a page to be more than `PAGE_SIZE` (4K), even if the `struct page` is for multiple pages. In this case, it will load the same `struct page` into the BIO multiple times, with the offset adjusted each time.

With a sufficiently large ABD, this can easily lead to the BIO being entirely filled much earlier than it could have been. This also further contributes to the problem caused by the incorrect segment limit calculation, as it's much easier to go past the device limit, and so require a split. Again, this is not a problem on its own.

The logic for "never submit more than `PAGE_SIZE`" is actually a little more subtle: it will never submit a buffer that crosses a 4K page boundary. In practice, this is fine, as most ABDs are scattered, that is, a list of complete 4K pages, and so are loaded in as such.

Linear ABDs are typically allocated from slabs, and for small sizes they are frequently not aligned to page boundaries. For example, a 12K allocation can span four pages, eg:

     -- 4K -- -- 4K -- -- 4K -- -- 4K --
    |        |        |        |        |
          :## ######## ######## ######:    [1K, 4K, 4K, 3K]

Such an allocation would be loaded into a BIO as you see:

    [1K, 4K, 4K, 3K]

This tends not to be a problem in practice, because even if the BIO were filled and needed to be split, each half would still have either a start or end aligned to the logical block size of the device (assuming 4K at least).

---

In ideal circumstances, these shortcomings don't cause any particular problems. It's when they start to interact with other ZFS features that things get interesting.

Aggregation will create a "gang" ABD, which is simply a list of other ABDs. Iterating over a gang ABD is just iterating over each ABD within it in turn. Because the segments are simply loaded in order, we can end up with uneven segments on either side of the "gap" between the two ABDs. For example, two 12K ABDs might be aggregated and then loaded as:

    [1K, 4K, 4K, 3K, 2K, 4K, 4K, 2K]

Should a split occur, each individual BIO can end up having either a start or end offset that is not aligned to the logical block size, which some drivers (eg SCSI) will reject. However, this tends not to happen, because the default aggregation limit usually keeps the BIO small enough to not require more than one split, and most pages are actually full 4K pages, so hitting an uneven gap is very rare anyway.
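To make the alignment rule concrete, here is a small, stand-alone user-space sketch (not part of the patch; "struct chunk" and chunks_are_splittable() are invented for illustration). It models, in simplified form, the check this patch introduces below as vdev_disk_check_pages(): a list of data segments can be split between any two entries without producing a misaligned BIO only if every segment after the first starts on a logical-block-size boundary and every segment before the last has a length that is a multiple of the logical block size. Fed the aggregated example above, it reports the list as unsplittable, which is the situation where the new code falls back to a linearised copy of the data.

    /*
     * Illustrative sketch only: decide whether a list of (offset, length)
     * data segments could be split between any two entries without
     * producing a BIO that starts or ends off a logical block boundary.
     * The names here are invented for the example, not part of the patch.
     */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct chunk {
            size_t off;     /* offset of the data within its page */
            size_t len;     /* length of the data */
    };

    static bool
    chunks_are_splittable(const struct chunk *c, size_t n, size_t lbs)
    {
            size_t mask = lbs - 1;  /* lbs assumed to be a power of two */

            for (size_t i = 0; i < n; i++) {
                    /* segments after the first must start block-aligned */
                    if (i > 0 && (c[i].off & mask) != 0)
                            return (false);
                    /* segments before the last must be a block multiple */
                    if (i < n - 1 && (c[i].len & mask) != 0)
                            return (false);
            }
            return (true);
    }

    int
    main(void)
    {
            /* the aggregated 12K+12K example: [1K,4K,4K,3K,2K,4K,4K,2K] */
            struct chunk agg[] = {
                    { 3072, 1024 }, { 0, 4096 }, { 0, 4096 }, { 0, 3072 },
                    { 2048, 2048 }, { 0, 4096 }, { 0, 4096 }, { 0, 2048 },
            };

            printf("splittable: %s\n",
                chunks_are_splittable(agg, sizeof (agg) / sizeof (agg[0]),
                4096) ? "yes" : "no");  /* prints "no" */
            return (0);
    }
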
If the pool is under particular memory pressure, then an IO can be broken down into a "gang block", a 512-byte block composed of a header and up to three block pointers. Each points to a fragment of the original write, or in turn, another gang block, breaking the original data up over and over until space can be found in the pool for each of them. Each gang header is a separate 512-byte memory allocation from a slab, that needs to be written down to disk. When the gang header is added to the BIO, its a single 512-byte segment. Pulling all this together, consider a large aggregated write of gang blocks. This results a BIO containing lots of 512-byte segments. Given our tendency to overfill the BIO, a split is likely, and most possible split points will yield a pair of BIOs that are misaligned. Drivers that care, like the SCSI driver, will reject them. --- This commit is a substantial refactor and rewrite of much of `vdev_disk` to sort all this out. `vdev_bio_max_segs()` now returns the ideal maximum size for the device, if available. There's also a tuneable `zfs_vdev_disk_max_segs` to override this, to assist with testing. We scan the ABD up front to count the number of pages within it, and to confirm that if we submitted all those pages to one or more BIOs, it could be split at any point with creating a misaligned BIO. If the pages in the BIO are not usable (as in any of the above situations), the ABD is linearised, and then checked again. This is the same technique used in `vdev_geom` on FreeBSD, adjusted for Linux's variable page size and allocator quirks. `vbio_t` is a cleanup and enhancement of the old `dio_request_t`. The idea is simply that it can hold all the state needed to create, submit and return multiple BIOs, including all the refcounts, the ABD copy if it was needed, and so on. Apart from what I hope is a clearer interface, the major difference is that because we know how many BIOs we'll need up front, we don't need the old overflow logic that would grow the BIO array, throw away all the old work and restart. We can get it right from the start. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 --- include/os/linux/kernel/linux/mod_compat.h | 1 + man/man4/zfs.4 | 10 +- module/os/linux/zfs/vdev_disk.c | 439 ++++++++++++++++++++- 3 files changed, 447 insertions(+), 3 deletions(-) diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h index 8e20a9613539..039865b703ef 100644 --- a/include/os/linux/kernel/linux/mod_compat.h +++ b/include/os/linux/kernel/linux/mod_compat.h @@ -68,6 +68,7 @@ enum scope_prefix_types { zfs_trim, zfs_txg, zfs_vdev, + zfs_vdev_disk, zfs_vdev_file, zfs_vdev_mirror, zfs_vnops, diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 759a68784aca..61f1df9c81d5 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2,6 +2,7 @@ .\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved. .\" Copyright (c) 2019 Datto Inc. +.\" Copyright (c) 2023, 2024 Klara, Inc. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. 
You can obtain a copy of the license at @@ -15,7 +16,7 @@ .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd July 21, 2023 +.Dd January 9, 2024 .Dt ZFS 4 .Os . @@ -1375,6 +1376,13 @@ _ 4 Driver No driver retries on driver errors. .TE . +.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint +Maximum number of segments to add to a BIO (min 4). +If this is higher than the maximum allowed by the device queue or the kernel +itself, it will be clamped. +Setting it to zero will cause the kernel's ideal size to be used. +This parameter only applies on Linux. +. .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int Time before expiring .Pa .zfs/snapshot . diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index de4dba72fa3c..0ccb9ad96fa5 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -24,6 +24,7 @@ * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, 2024, Klara Inc. */ #include @@ -66,6 +67,13 @@ typedef struct vdev_disk { krwlock_t vd_lock; } vdev_disk_t; +/* + * Maximum number of segments to add to a bio (min 4). If this is higher than + * the maximum allowed by the device queue or the kernel itself, it will be + * clamped. Setting it to zero will cause the kernel's ideal size to be used. + */ +uint_t zfs_vdev_disk_max_segs = 0; + /* * Unique identifier for the exclusive vdev holder. */ @@ -607,10 +615,433 @@ vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, return (bio); } +static inline uint_t +vdev_bio_max_segs(struct block_device *bdev) +{ + /* + * Smallest of the device max segs and the tuneable max segs. Minimum + * 4, so there's room to finish split pages if they come up. + */ + const uint_t dev_max_segs = queue_max_segments(bdev_get_queue(bdev)); + const uint_t tune_max_segs = (zfs_vdev_disk_max_segs > 0) ? + MAX(4, zfs_vdev_disk_max_segs) : dev_max_segs; + const uint_t max_segs = MIN(tune_max_segs, dev_max_segs); + +#ifdef HAVE_BIO_MAX_SEGS + return (bio_max_segs(max_segs)); +#else + return (MIN(max_segs, BIO_MAX_PAGES)); +#endif +} + +static inline uint_t +vdev_bio_max_bytes(struct block_device *bdev) +{ + return (queue_max_sectors(bdev_get_queue(bdev)) << 9); +} + + +/* + * Virtual block IO object (VBIO) + * + * Linux block IO (BIO) objects have a limit on how many data segments (pages) + * they can hold. Depending on how they're allocated and structured, a large + * ZIO can require more than one BIO to be submitted to the kernel, which then + * all have to complete before we can return the completed ZIO back to ZFS. + * + * A VBIO is a wrapper around multiple BIOs, carrying everything needed to + * translate a ZIO down into the kernel block layer and back again. + * + * Note that these are only used for data ZIOs (read/write). Meta-operations + * (flush/trim) don't need multiple BIOs and so can just make the call + * directly. 
+ */ +typedef struct { + zio_t *vbio_zio; /* parent zio */ + + struct block_device *vbio_bdev; /* blockdev to submit bios to */ + + abd_t *vbio_abd; /* abd carrying borrowed linear buf */ + + atomic_t vbio_ref; /* bio refcount */ + int vbio_error; /* error from failed bio */ + + uint_t vbio_max_segs; /* max segs per bio */ + + uint_t vbio_max_bytes; /* max bytes per bio */ + uint_t vbio_lbs_mask; /* logical block size mask */ + + uint64_t vbio_offset; /* start offset of next bio */ + + struct bio *vbio_bio; /* pointer to the current bio */ + struct bio *vbio_bios; /* list of all bios */ +} vbio_t; + +static vbio_t * +vbio_alloc(zio_t *zio, struct block_device *bdev) +{ + vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); + + vbio->vbio_zio = zio; + vbio->vbio_bdev = bdev; + atomic_set(&vbio->vbio_ref, 0); + vbio->vbio_max_segs = vdev_bio_max_segs(bdev); + vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); + vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1); + vbio->vbio_offset = zio->io_offset; + + return (vbio); +} + +static int +vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) +{ + struct bio *bio; + uint_t ssize; + + while (size > 0) { + bio = vbio->vbio_bio; + if (bio == NULL) { + /* New BIO, allocate and set up */ + bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO, + vbio->vbio_max_segs); + if (unlikely(bio == NULL)) + return (SET_ERROR(ENOMEM)); + BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; + + bio->bi_next = vbio->vbio_bios; + vbio->vbio_bios = vbio->vbio_bio = bio; + } + + /* + * Only load as much of the current page data as will fit in + * the space left in the BIO, respecting lbs alignment. Older + * kernels will error if we try to overfill the BIO, while + * newer ones will accept it and split the BIO. This ensures + * everything works on older kernels, and avoids an additional + * overhead on the new. + */ + ssize = MIN(size, (vbio->vbio_max_bytes - BIO_BI_SIZE(bio)) & + vbio->vbio_lbs_mask); + if (ssize > 0 && + bio_add_page(bio, page, ssize, offset) == ssize) { + /* Accepted, adjust and load any remaining. */ + size -= ssize; + offset += ssize; + continue; + } + + /* No room, set up for a new BIO and loop */ + vbio->vbio_offset += BIO_BI_SIZE(bio); + + /* Signal new BIO allocation wanted */ + vbio->vbio_bio = NULL; + } + + return (0); +} + +BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error); +static void vbio_put(vbio_t *vbio); + +static void +vbio_submit(vbio_t *vbio, int flags) +{ + ASSERT(vbio->vbio_bios); + struct bio *bio = vbio->vbio_bios; + vbio->vbio_bio = vbio->vbio_bios = NULL; + + /* + * We take a reference for each BIO as we submit it, plus one to + * protect us from BIOs completing before we're done submitting them + * all, causing vbio_put() to free vbio out from under us and/or the + * zio to be returned before all its IO has completed. + */ + atomic_set(&vbio->vbio_ref, 1); + + /* + * If we're submitting more than one BIO, inform the block layer so + * it can batch them if it wants. + */ + struct blk_plug plug; + boolean_t do_plug = (bio->bi_next != NULL); + if (do_plug) + blk_start_plug(&plug); + + /* Submit all the BIOs */ + while (bio != NULL) { + atomic_inc(&vbio->vbio_ref); + + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + + bio->bi_end_io = vdev_disk_io_rw_completion; + bio->bi_private = vbio; + bio_set_op_attrs(bio, + vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? 
+ WRITE : READ, flags); + + vdev_submit_bio(bio); + + bio = next; + } + + /* Finish the batch */ + if (do_plug) + blk_finish_plug(&plug); + + /* Release the extra reference */ + vbio_put(vbio); +} + +static void +vbio_return_abd(vbio_t *vbio) +{ + zio_t *zio = vbio->vbio_zio; + if (vbio->vbio_abd == NULL) + return; + + /* + * If we copied the ABD before issuing it, clean up and return the copy + * to the ADB, with changes if appropriate. + */ + void *buf = abd_to_buf(vbio->vbio_abd); + abd_free(vbio->vbio_abd); + vbio->vbio_abd = NULL; + + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_abd, buf, zio->io_size); + else + abd_return_buf(zio->io_abd, buf, zio->io_size); +} + +static void +vbio_free(vbio_t *vbio) +{ + VERIFY0(atomic_read(&vbio->vbio_ref)); + + vbio_return_abd(vbio); + + kmem_free(vbio, sizeof (vbio_t)); +} + +static void +vbio_put(vbio_t *vbio) +{ + if (atomic_dec_return(&vbio->vbio_ref) > 0) + return; + + /* + * This was the last reference, so the entire IO is completed. Clean + * up and submit it for processing. + */ + + /* + * Get any data buf back to the original ABD, if necessary. We do this + * now so we can get the ZIO into the pipeline as quickly as possible, + * and then do the remaining cleanup after. + */ + vbio_return_abd(vbio); + + zio_t *zio = vbio->vbio_zio; + + /* + * Set the overall error. If multiple BIOs returned an error, only the + * first will be taken; the others are dropped (see + * vdev_disk_io_rw_completion()). Its pretty much impossible for + * multiple IOs to the same device to fail with different errors, so + * there's no real risk. + */ + zio->io_error = vbio->vbio_error; + if (zio->io_error) + vdev_disk_error(zio); + + /* All done, submit for processing */ + zio_delay_interrupt(zio); + + /* Finish cleanup */ + vbio_free(vbio); +} + +BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error) +{ + vbio_t *vbio = bio->bi_private; + + if (vbio->vbio_error == 0) { +#ifdef HAVE_1ARG_BIO_END_IO_T + vbio->vbio_error = BIO_END_IO_ERROR(bio); +#else + if (error) + vbio->vbio_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + vbio->vbio_error = EIO; +#endif + } + + /* + * Destroy the BIO. This is safe to do; the vbio owns its data and the + * kernel won't touch it again after the completion function runs. + */ + bio_put(bio); + + /* Drop this BIOs reference acquired by vbio_submit() */ + vbio_put(vbio); +} + +/* + * Iterator callback to count ABD pages and check their size & alignment. + * + * On Linux, each BIO segment can take a page pointer, and an offset+length of + * the data within that page. A page can be arbitrarily large ("compound" + * pages) but we still have to ensure the data portion is correctly sized and + * aligned to the logical block size, to ensure that if the kernel wants to + * split the BIO, the two halves will still be properly aligned. + */ +typedef struct { + uint_t bmask; + uint_t npages; + uint_t end; +} vdev_disk_check_pages_t; + +static int +vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) +{ + vdev_disk_check_pages_t *s = priv; + + /* + * If we didn't finish on a block size boundary last time, then there + * would be a gap if we tried to use this ABD as-is, so abort. + */ + if (s->end != 0) + return (1); + + /* + * Note if we're taking less than a full block, so we can check it + * above on the next call. + */ + s->end = len & s->bmask; + + /* All blocks after the first must start on a block size boundary. 
*/ + if (s->npages != 0 && (off & s->bmask) != 0) + return (1); + + s->npages++; + return (0); +} + +/* + * Check if we can submit the pages in this ABD to the kernel as-is. Returns + * the number of pages, or 0 if it can't be submitted like this. + */ +static boolean_t +vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev) +{ + vdev_disk_check_pages_t s = { + .bmask = bdev_logical_block_size(bdev)-1, + .npages = 0, + .end = 0, + }; + + if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s)) + return (B_FALSE); + + return (B_TRUE); +} + +/* Iterator callback to submit ABD pages to the vbio. */ +static int +vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv) +{ + vbio_t *vbio = priv; + return (vbio_add_page(vbio, page, len, off)); +} + +static int +vdev_disk_io_rw(zio_t *zio) +{ + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + struct block_device *bdev = BDH_BDEV(vd->vd_bdh); + int flags = 0; + + /* + * Accessing outside the block device is never allowed. + */ + if (zio->io_offset + zio->io_size > bdev->bd_inode->i_size) { + vdev_dbgmsg(zio->io_vd, + "Illegal access %llu size %llu, device size %llu", + (u_longlong_t)zio->io_offset, + (u_longlong_t)zio->io_size, + (u_longlong_t)i_size_read(bdev->bd_inode)); + return (SET_ERROR(EIO)); + } + + if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && + v->vdev_failfast == B_TRUE) { + bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, + zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); + } + + /* + * Check alignment of the incoming ABD. If any part of it would require + * submitting a page that is not aligned to the logical block size, + * then we take a copy into a linear buffer and submit that instead. + * This should be impossible on a 512b LBS, and fairly rare on 4K, + * usually requiring abnormally-small data blocks (eg gang blocks) + * mixed into the same ABD as larger ones (eg aggregated). + */ + abd_t *abd = zio->io_abd; + if (!vdev_disk_check_pages(abd, zio->io_size, bdev)) { + void *buf; + if (zio->io_type == ZIO_TYPE_READ) + buf = abd_borrow_buf(zio->io_abd, zio->io_size); + else + buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); + + /* + * Wrap the copy in an abd_t, so we can use the same iterators + * to count and fill the vbio later. + */ + abd = abd_get_from_buf(buf, zio->io_size); + + /* + * False here would mean the borrowed copy has an invalid + * alignment too, which would mean we've somehow been passed a + * linear ABD with an interior page that has a non-zero offset + * or a size not a multiple of PAGE_SIZE. This is not possible. + * It would mean either zio_buf_alloc() or its underlying + * allocators have done something extremely strange, or our + * math in vdev_disk_check_pages() is wrong. In either case, + * something in seriously wrong and its not safe to continue. + */ + VERIFY(vdev_disk_check_pages(abd, zio->io_size, bdev)); + } + + /* Allocate vbio, with a pointer to the borrowed ABD if necessary */ + int error = 0; + vbio_t *vbio = vbio_alloc(zio, bdev); + if (abd != zio->io_abd) + vbio->vbio_abd = abd; + + /* Fill it with pages */ + error = abd_iterate_page_func(abd, 0, zio->io_size, + vdev_disk_fill_vbio_cb, vbio); + if (error != 0) { + vbio_free(vbio); + return (error); + } + + vbio_submit(vbio, flags); + return (0); +} + /* ========== */ /* - * This is the classic, battle-tested BIO submission code. + * This is the classic, battle-tested BIO submission code. 
Until we're totally + * sure that the new code is safe and correct in all cases, this will remain + * available and can be enabled by setting zfs_vdev_disk_classic=1 at module + * load time. * * These functions have been renamed to vdev_classic_* to make it clear what * they belong to, but their implementations are unchanged. @@ -1116,7 +1547,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) (void) tsd; if (vdev_disk_io_rw_fn == NULL) - vdev_disk_io_rw_fn = vdev_classic_physio; + /* XXX make configurable */ + vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw; return (0); } @@ -1215,3 +1647,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, "Defines failfast mask: 1 - device, 2 - transport, 4 - driver"); + +ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, + "Maximum number of data segments to add to an IO request (min 4)"); From df2169d141aadc0c2cc728c5c5261d6f5c2a27f7 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 9 Jan 2024 13:28:57 +1100 Subject: [PATCH 21/34] vdev_disk: add module parameter to select BIO submission method This makes the submission method selectable at module load time via the `zfs_vdev_disk_classic` parameter, allowing this change to be backported to 2.2 safely, and disabled in favour of the "classic" submission method if new problems come up. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 --- man/man4/zfs.4 | 16 ++++++++++++++++ module/os/linux/zfs/vdev_disk.c | 31 +++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 61f1df9c81d5..cacb214d1dc1 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1382,6 +1382,22 @@ If this is higher than the maximum allowed by the device queue or the kernel itself, it will be clamped. Setting it to zero will cause the kernel's ideal size to be used. This parameter only applies on Linux. +This parameter is ignored if +.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 . +. +.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint +If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2 +and earlier. +This "classic" method has known issues with highly fragmented IO requests and +is slower on many workloads, but it has been in use for many years and is known +to be very stable. +If you set this parameter, please also open a bug report why you did so, +including the workload involved and any error messages. +.Pp +This parameter and the classic submission method will be removed once we have +total confidence in the new method. +.Pp +This parameter only applies on Linux, and can only be set at module load time. . .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int Time before expiring diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 0ccb9ad96fa5..a9110623ace0 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1535,6 +1535,29 @@ vdev_disk_rele(vdev_t *vd) /* XXX: Implement me as a vnode rele for the device */ } +/* + * BIO submission method. See comment above about vdev_classic. 
+ * Set zfs_vdev_disk_classic=0 for new, =1 for classic + */ +static uint_t zfs_vdev_disk_classic = 0; /* default new */ + +/* Set submission function from module parameter */ +static int +vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp) +{ + int err = param_set_uint(buf, kp); + if (err < 0) + return (SET_ERROR(err)); + + vdev_disk_io_rw_fn = + zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw; + + printk(KERN_INFO "ZFS: forcing %s BIO submission\n", + zfs_vdev_disk_classic ? "classic" : "new"); + + return (0); +} + /* * At first use vdev use, set the submission function from the default value if * it hasn't been set already. @@ -1547,8 +1570,8 @@ vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd) (void) tsd; if (vdev_disk_io_rw_fn == NULL) - /* XXX make configurable */ - vdev_disk_io_rw_fn = 0 ? vdev_classic_physio : vdev_disk_io_rw; + vdev_disk_io_rw_fn = zfs_vdev_disk_classic ? + vdev_classic_physio : vdev_disk_io_rw; return (0); } @@ -1650,3 +1673,7 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW, "Maximum number of data segments to add to an IO request (min 4)"); + +ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic, + vdev_disk_param_set_classic, param_get_uint, ZMOD_RD, + "Use classic BIO submission method"); From 72fd834c47558cb10d847948d1a4615e894c77c3 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 21 Feb 2024 11:07:21 +1100 Subject: [PATCH 22/34] vdev_disk: use bio_chain() to submit multiple BIOs Simplifies our code a lot, so we don't have to wait for each and reassemble them. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Closes #15533 Closes #15588 --- module/os/linux/zfs/vdev_disk.c | 231 +++++++++++--------------------- 1 file changed, 80 insertions(+), 151 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index a9110623ace0..36468fc21132 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -454,10 +454,9 @@ vdev_disk_close(vdev_t *v) if (v->vdev_reopening || vd == NULL) return; - if (vd->vd_bdh != NULL) { + if (vd->vd_bdh != NULL) vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), zfs_vdev_holder); - } rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); @@ -663,9 +662,6 @@ typedef struct { abd_t *vbio_abd; /* abd carrying borrowed linear buf */ - atomic_t vbio_ref; /* bio refcount */ - int vbio_error; /* error from failed bio */ - uint_t vbio_max_segs; /* max segs per bio */ uint_t vbio_max_bytes; /* max bytes per bio */ @@ -674,43 +670,52 @@ typedef struct { uint64_t vbio_offset; /* start offset of next bio */ struct bio *vbio_bio; /* pointer to the current bio */ - struct bio *vbio_bios; /* list of all bios */ + int vbio_flags; /* bio flags */ } vbio_t; static vbio_t * -vbio_alloc(zio_t *zio, struct block_device *bdev) +vbio_alloc(zio_t *zio, struct block_device *bdev, int flags) { vbio_t *vbio = kmem_zalloc(sizeof (vbio_t), KM_SLEEP); vbio->vbio_zio = zio; vbio->vbio_bdev = bdev; - atomic_set(&vbio->vbio_ref, 0); + vbio->vbio_abd = NULL; vbio->vbio_max_segs = vdev_bio_max_segs(bdev); vbio->vbio_max_bytes = vdev_bio_max_bytes(bdev); vbio->vbio_lbs_mask = ~(bdev_logical_block_size(bdev)-1); vbio->vbio_offset = zio->io_offset; + vbio->vbio_bio = NULL; + vbio->vbio_flags = flags; return (vbio); } +BIO_END_IO_PROTO(vbio_completion, bio, error); + static int vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) { - struct bio *bio; + struct bio *bio = vbio->vbio_bio; uint_t ssize; while (size > 0) { - bio = vbio->vbio_bio; if (bio == NULL) { /* New BIO, allocate and set up */ bio = vdev_bio_alloc(vbio->vbio_bdev, GFP_NOIO, vbio->vbio_max_segs); - if (unlikely(bio == NULL)) - return (SET_ERROR(ENOMEM)); + VERIFY(bio); + BIO_BI_SECTOR(bio) = vbio->vbio_offset >> 9; + bio_set_op_attrs(bio, + vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? + WRITE : READ, vbio->vbio_flags); - bio->bi_next = vbio->vbio_bios; - vbio->vbio_bios = vbio->vbio_bio = bio; + if (vbio->vbio_bio) { + bio_chain(vbio->vbio_bio, bio); + vdev_submit_bio(vbio->vbio_bio); + } + vbio->vbio_bio = bio; } /* @@ -735,157 +740,97 @@ vbio_add_page(vbio_t *vbio, struct page *page, uint_t size, uint_t offset) vbio->vbio_offset += BIO_BI_SIZE(bio); /* Signal new BIO allocation wanted */ - vbio->vbio_bio = NULL; + bio = NULL; } return (0); } -BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error); -static void vbio_put(vbio_t *vbio); +/* Iterator callback to submit ABD pages to the vbio. 
*/ +static int +vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv) +{ + vbio_t *vbio = priv; + return (vbio_add_page(vbio, page, len, off)); +} +/* Create some BIOs, fill them with data and submit them */ static void -vbio_submit(vbio_t *vbio, int flags) +vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) { - ASSERT(vbio->vbio_bios); - struct bio *bio = vbio->vbio_bios; - vbio->vbio_bio = vbio->vbio_bios = NULL; - - /* - * We take a reference for each BIO as we submit it, plus one to - * protect us from BIOs completing before we're done submitting them - * all, causing vbio_put() to free vbio out from under us and/or the - * zio to be returned before all its IO has completed. - */ - atomic_set(&vbio->vbio_ref, 1); + ASSERT(vbio->vbio_bdev); /* - * If we're submitting more than one BIO, inform the block layer so - * it can batch them if it wants. + * We plug so we can submit the BIOs as we go and only unplug them when + * they are fully created and submitted. This is important; if we don't + * plug, then the kernel may start executing earlier BIOs while we're + * still creating and executing later ones, and if the device goes + * away while that's happening, older kernels can get confused and + * trample memory. */ struct blk_plug plug; - boolean_t do_plug = (bio->bi_next != NULL); - if (do_plug) - blk_start_plug(&plug); + blk_start_plug(&plug); - /* Submit all the BIOs */ - while (bio != NULL) { - atomic_inc(&vbio->vbio_ref); + (void) abd_iterate_page_func(abd, 0, size, vbio_fill_cb, vbio); + ASSERT(vbio->vbio_bio); - struct bio *next = bio->bi_next; - bio->bi_next = NULL; + vbio->vbio_bio->bi_end_io = vbio_completion; + vbio->vbio_bio->bi_private = vbio; - bio->bi_end_io = vdev_disk_io_rw_completion; - bio->bi_private = vbio; - bio_set_op_attrs(bio, - vbio->vbio_zio->io_type == ZIO_TYPE_WRITE ? - WRITE : READ, flags); + vdev_submit_bio(vbio->vbio_bio); - vdev_submit_bio(bio); - - bio = next; - } - - /* Finish the batch */ - if (do_plug) - blk_finish_plug(&plug); + blk_finish_plug(&plug); - /* Release the extra reference */ - vbio_put(vbio); + vbio->vbio_bio = NULL; + vbio->vbio_bdev = NULL; } -static void -vbio_return_abd(vbio_t *vbio) +/* IO completion callback */ +BIO_END_IO_PROTO(vbio_completion, bio, error) { + vbio_t *vbio = bio->bi_private; zio_t *zio = vbio->vbio_zio; - if (vbio->vbio_abd == NULL) - return; - - /* - * If we copied the ABD before issuing it, clean up and return the copy - * to the ADB, with changes if appropriate. - */ - void *buf = abd_to_buf(vbio->vbio_abd); - abd_free(vbio->vbio_abd); - vbio->vbio_abd = NULL; - - if (zio->io_type == ZIO_TYPE_READ) - abd_return_buf_copy(zio->io_abd, buf, zio->io_size); - else - abd_return_buf(zio->io_abd, buf, zio->io_size); -} -static void -vbio_free(vbio_t *vbio) -{ - VERIFY0(atomic_read(&vbio->vbio_ref)); - - vbio_return_abd(vbio); + ASSERT(zio); - kmem_free(vbio, sizeof (vbio_t)); -} + /* Capture and log any errors */ +#ifdef HAVE_1ARG_BIO_END_IO_T + zio->io_error = BIO_END_IO_ERROR(bio); +#else + zio->io_error = 0; + if (error) + zio->io_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + zio->io_error = EIO; +#endif + ASSERT3U(zio->io_error, >=, 0); -static void -vbio_put(vbio_t *vbio) -{ - if (atomic_dec_return(&vbio->vbio_ref) > 0) - return; + if (zio->io_error) + vdev_disk_error(zio); - /* - * This was the last reference, so the entire IO is completed. Clean - * up and submit it for processing. 
- */ + /* Return the BIO to the kernel */ + bio_put(bio); /* - * Get any data buf back to the original ABD, if necessary. We do this - * now so we can get the ZIO into the pipeline as quickly as possible, - * and then do the remaining cleanup after. + * If we copied the ABD before issuing it, clean up and return the copy + * to the ADB, with changes if appropriate. */ - vbio_return_abd(vbio); + if (vbio->vbio_abd != NULL) { + void *buf = abd_to_buf(vbio->vbio_abd); + abd_free(vbio->vbio_abd); + vbio->vbio_abd = NULL; - zio_t *zio = vbio->vbio_zio; + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_abd, buf, zio->io_size); + else + abd_return_buf(zio->io_abd, buf, zio->io_size); + } - /* - * Set the overall error. If multiple BIOs returned an error, only the - * first will be taken; the others are dropped (see - * vdev_disk_io_rw_completion()). Its pretty much impossible for - * multiple IOs to the same device to fail with different errors, so - * there's no real risk. - */ - zio->io_error = vbio->vbio_error; - if (zio->io_error) - vdev_disk_error(zio); + /* Final cleanup */ + kmem_free(vbio, sizeof (vbio_t)); /* All done, submit for processing */ zio_delay_interrupt(zio); - - /* Finish cleanup */ - vbio_free(vbio); -} - -BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error) -{ - vbio_t *vbio = bio->bi_private; - - if (vbio->vbio_error == 0) { -#ifdef HAVE_1ARG_BIO_END_IO_T - vbio->vbio_error = BIO_END_IO_ERROR(bio); -#else - if (error) - vbio->vbio_error = -(error); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - vbio->vbio_error = EIO; -#endif - } - - /* - * Destroy the BIO. This is safe to do; the vbio owns its data and the - * kernel won't touch it again after the completion function runs. - */ - bio_put(bio); - - /* Drop this BIOs reference acquired by vbio_submit() */ - vbio_put(vbio); } /* @@ -948,14 +893,6 @@ vdev_disk_check_pages(abd_t *abd, uint64_t size, struct block_device *bdev) return (B_TRUE); } -/* Iterator callback to submit ABD pages to the vbio. */ -static int -vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv) -{ - vbio_t *vbio = priv; - return (vbio_add_page(vbio, page, len, off)); -} - static int vdev_disk_io_rw(zio_t *zio) { @@ -1018,20 +955,12 @@ vdev_disk_io_rw(zio_t *zio) } /* Allocate vbio, with a pointer to the borrowed ABD if necessary */ - int error = 0; - vbio_t *vbio = vbio_alloc(zio, bdev); + vbio_t *vbio = vbio_alloc(zio, bdev, flags); if (abd != zio->io_abd) vbio->vbio_abd = abd; - /* Fill it with pages */ - error = abd_iterate_page_func(abd, 0, zio->io_size, - vdev_disk_fill_vbio_cb, vbio); - if (error != 0) { - vbio_free(vbio); - return (error); - } - - vbio_submit(vbio, flags); + /* Fill it with data pages and submit it to the kernel */ + vbio_submit(vbio, abd, zio->io_size); return (0); } From c6be6ce1755a3d9a3cbe70256cd8958ef83d8542 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 14 Mar 2024 10:57:30 +1100 Subject: [PATCH 23/34] abd_iter_page: don't use compound heads on Linux <4.5 Before 4.5 (specifically, torvalds/linux@ddc58f2), head and tail pages in a compound page were refcounted separately. This means that using the head page without taking a reference to it could see it cleaned up later before we're finished with it. Specifically, bio_add_page() would take a reference, and drop its reference after the bio completion callback returns. 
If the zio is executed immediately from the completion callback, this is usually ok, as any data is referenced through the tail page referenced by the ABD, and so becomes "live" that way. If there's a delay in zio execution (high load, error injection), then the head page can be freed, along with any dirty flags or other indicators that the underlying memory is used. Later, when the zio completes and that memory is accessed, its either unmapped and an unhandled fault takes down the entire system, or it is mapped and we end up messing around in someone else's memory. Both of these are very bad. The solution on these older kernels is to take a reference to the head page when we use it, and release it when we're done. There's not really a sensible way under our current structure to do this; the "best" would be to keep a list of head page references in the ABD, and release them when the ABD is freed. Since this additional overhead is totally unnecessary on 4.5+, where head and tail pages share refcounts, I've opted to simply not use the compound head in ABD page iteration there. This is theoretically less efficient (though cleaning up head page references would add overhead), but its safe, and we still get the other benefits of not mapping pages before adding them to a bio and not mis-splitting pages. There doesn't appear to be an obvious symbol name or config option we can match on to discover this behaviour in configure (and the mm/page APIs have changed a lot since then anyway), so I've gone with a simple version check. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Closes #15533 Closes #15588 --- module/os/linux/zfs/abd_os.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 3fe01c0b7d77..d3255dcbc0f7 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -62,6 +62,7 @@ #include #include #include +#include #endif #ifdef _KERNEL @@ -1061,6 +1062,7 @@ abd_iter_page(struct abd_iter *aiter) } ASSERT(page); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) if (PageTail(page)) { /* * This page is part of a "compound page", which is a group of @@ -1082,11 +1084,23 @@ abd_iter_page(struct abd_iter *aiter) * To do this, we need to adjust the offset to be counted from * the head page. struct page for compound pages are stored * contiguously, so we can just adjust by a simple offset. + * + * Before kernel 4.5, compound page heads were refcounted + * separately, such that moving back to the head page would + * require us to take a reference to it and releasing it once + * we're completely finished with it. In practice, that means + * when our caller is done with the ABD, which we have no + * insight into from here. Rather than contort this API to + * track head page references on such ancient kernels, we just + * compile this block out and use the tail pages directly. This + * is slightly less efficient, but makes everything far + * simpler. 
*/ struct page *head = compound_head(page); doff += ((page - head) * PAGESIZE); page = head; } +#endif /* final page and position within it */ aiter->iter_page = page; From 8cd8ccca5383dcdd9bf55d4d22921a6b43b4ebe1 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 25 Mar 2024 20:13:45 -0400 Subject: [PATCH 24/34] BRT: Skip getting length in brt_entry_lookup() Unlike DDT, where ZAP values may have different lengths due to compression, all BRT entries are identical 8-byte counters. It does not make sense to first fetch the length only to assert it. zap_lookup_uint64() is specifically designed to work with counters of different size and should return error if something odd found. Calling it straight allows to save some measurable CPU time. Reviewed-by: Pawel Jakub Dawidek Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15950 --- module/zfs/brt.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 5d1f4728b645..ea8c0735c4b7 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -900,7 +900,6 @@ static int brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) { uint64_t mos_entries; - uint64_t one, physsize; int error; ASSERT(RW_LOCK_HELD(&brt->brt_lock)); @@ -918,21 +917,8 @@ brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) brt_unlock(brt); - error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, - BRT_KEY_WORDS, &one, &physsize); - if (error == 0) { - ASSERT3U(one, ==, 1); - ASSERT3U(physsize, ==, sizeof (bre->bre_refcount)); - - error = zap_lookup_uint64(brt->brt_mos, mos_entries, - &bre->bre_offset, BRT_KEY_WORDS, 1, - sizeof (bre->bre_refcount), &bre->bre_refcount); - BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu " - "count=%llu error=%d", (u_longlong_t)mos_entries, - (u_longlong_t)brtvd->bv_vdevid, - (u_longlong_t)bre->bre_offset, - error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error); - } + error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, + BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount); brt_wlock(brt); From a89d209bb60c3f32881da7624bd01d28023da4f4 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 18 Mar 2024 14:19:53 -0400 Subject: [PATCH 25/34] BRT: Fix holes cloning. - When reading L0 block pointers handle buffers without ones and without dirty records as a holes. Those appear when dnode size was increased, but the end was never written, so there are no new indirection levels to store the pointers. It makes no sense to return EAGAIN here, since sync won't create new indirection levels until there will be actual writes. - When cloning blocks set destination hole logical birth time to the current TXG. Otherwise if we are cloning over existing data, newly created holes may not be properly replicated later. Use BP_SET_BIRTH() when possible to not replicate its logic. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15994 Closes #16007 --- module/zfs/dmu.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index b88cf447d296..753dde6d5205 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2265,11 +2265,13 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, if (bp == NULL) { /* - * The block was created in this transaction group, - * so it has no BP yet. 
+ * The file size was increased, but the block was never + * written, otherwise we would either have the block + * pointer or the dirty record and would not get here. + * It is effectively a hole, so report it as such. */ - error = SET_ERROR(EAGAIN); - goto out; + BP_ZERO(&bps[i]); + continue; } /* * Make sure we clone only data blocks. @@ -2361,19 +2363,17 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; dl->dr_overridden_by = *bp; - dl->dr_brtwrite = B_TRUE; - dl->dr_override_state = DR_OVERRIDDEN; - if (BP_IS_HOLE(bp)) { - BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, 0); - BP_SET_PHYSICAL_BIRTH(&dl->dr_overridden_by, 0); - } else { - BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, - dr->dr_txg); + if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) { if (!BP_IS_EMBEDDED(bp)) { - BP_SET_PHYSICAL_BIRTH(&dl->dr_overridden_by, + BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg, BP_GET_BIRTH(bp)); + } else { + BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, + dr->dr_txg); } } + dl->dr_brtwrite = B_TRUE; + dl->dr_override_state = DR_OVERRIDDEN; mutex_exit(&db->db_mtx); From b4034276247bfe430a7ff8d8ef9b06826e83cb9d Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 19 Mar 2024 12:25:14 -0400 Subject: [PATCH 26/34] BRT: Fix tests to work on non-empty pools It should not normally happen, but if it does, better to not fail everything for no good reason, or it may be hard to debug. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #16007 --- .../functional/bclone/bclone_common.kshlib | 47 ++++++++++--------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib index 3b8eaea5bb54..84b92b4dcdc9 100644 --- a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib +++ b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib @@ -97,20 +97,19 @@ function verify_pool_prop_eq function verify_pool_props { - typeset -r dsize=$1 - typeset -r ratio=$2 + typeset -r oused=$1 + typeset -r osaved=$2 + typeset dsize=$3 + typeset ratio=$4 if [[ $dsize -eq 0 ]]; then - verify_pool_prop_eq bcloneused 0 - verify_pool_prop_eq bclonesaved 0 - verify_pool_prop_eq bcloneratio 1.00 - else - if [[ $ratio -eq 1 ]]; then - verify_pool_prop_eq bcloneused 0 - else - verify_pool_prop_eq bcloneused $dsize - fi - verify_pool_prop_eq bclonesaved $((dsize*(ratio-1))) + ratio=1 + elif [[ $ratio -eq 1 ]]; then + dsize=0 + fi + verify_pool_prop_eq bcloneused $(($oused+$dsize)) + verify_pool_prop_eq bclonesaved $(($osaved+dsize*(ratio-1))) + if [[ $oused -eq 0 ]]; then verify_pool_prop_eq bcloneratio "${ratio}.00" fi } @@ -124,16 +123,22 @@ function bclone_test typeset -r srcdir=$4 typeset -r dstdir=$5 typeset dsize + typeset oused + typeset osaved typeset -r original="${srcdir}/original" typeset -r clone="${dstdir}/clone" log_note "Testing file copy with datatype $datatype, file size $filesize, embedded $embedded" + # Save current block cloning stats for later use. + sync_pool $TESTPOOL + oused=$(get_pool_prop bcloneused $TESTPOOL) + osaved=$(get_pool_prop bclonesaved $TESTPOOL) + # Create a test file with known content. 
case $datatype in random|text) - sync_pool $TESTPOOL if [[ $datatype = "random" ]]; then dd if=/dev/urandom of=$original bs=$filesize count=1 2>/dev/null else @@ -146,13 +151,13 @@ function bclone_test sync_pool $TESTPOOL # It is hard to predict block sizes that will be used, # so just do one clone and take it from bcloneused. - filesize=$(zpool get -Hp -o value bcloneused $TESTPOOL) + dsize=$(get_pool_prop bcloneused $TESTPOOL) + dsize=$(($dsize-$oused)) if [[ $embedded = "false" ]]; then - log_must test $filesize -gt 0 + log_must test $dsize -gt 0 fi rm -f "${clone}-tmp" sync_pool $TESTPOOL - dsize=$filesize ;; hole) log_must truncate_test -s $filesize -f $original @@ -217,7 +222,7 @@ function bclone_test test_file_integrity $original_checksum "${clone}4" $filesize test_file_integrity $original_checksum "${clone}5" $filesize - verify_pool_props $dsize 7 + verify_pool_props $oused $osaved $dsize 7 # Clear cache and test after fresh import. log_must zpool export $TESTPOOL @@ -240,7 +245,7 @@ function bclone_test sync_pool $TESTPOOL - verify_pool_props $dsize 11 + verify_pool_props $oused $osaved $dsize 11 log_must zpool export $TESTPOOL log_must zpool import $TESTPOOL @@ -268,7 +273,7 @@ function bclone_test test_file_integrity $original_checksum "${clone}8" $filesize test_file_integrity $original_checksum "${clone}9" $filesize - verify_pool_props $dsize 6 + verify_pool_props $oused $osaved $dsize 6 rm -f "${clone}0" "${clone}2" "${clone}4" "${clone}8" "${clone}9" @@ -276,11 +281,11 @@ function bclone_test test_file_integrity $original_checksum "${clone}6" $filesize - verify_pool_props $dsize 1 + verify_pool_props $oused $osaved $dsize 1 rm -f "${clone}6" sync_pool $TESTPOOL - verify_pool_props $dsize 1 + verify_pool_props $oused $osaved $dsize 1 } From 0c8eb974ff3bba965d7303d3fa7db2007ef4bdfa Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 19 Mar 2024 13:08:05 -0400 Subject: [PATCH 27/34] BRT: Check pool clone stats in more tests This should allow to catch some leaks, if those happen. While there fix some cosmetic issues. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #16007 --- .../bclone/bclone_corner_cases.kshlib | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib b/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib index ddfbfc999c4e..aeb8efe91715 100644 --- a/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib +++ b/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib @@ -66,7 +66,7 @@ function bclone_corner_cases_init export SECOND_HALF_ORIG0_CHECKSUM=$(second_half_checksum $ORIG0) export SECOND_HALF_ORIG1_CHECKSUM=$(second_half_checksum $ORIG1) export SECOND_HALF_ORIG2_CHECKSUM=$(second_half_checksum $ORIG2) - export ZEROS_CHECKSUM=$(dd if=/dev/zero bs=$HALFRECORDSIZE count=1 | sha256digest) + export ZEROS_CHECKSUM=$(dd if=/dev/zero bs=$HALFRECORDSIZE count=1 2>/dev/null | sha256digest) export FIRST_HALF_CHECKSUM="" export SECOND_HALF_CHECKSUM="" } @@ -210,6 +210,8 @@ function bclone_corner_cases_test typeset -r dstdir=$2 typeset limit=$3 typeset -i count=0 + typeset oused + typeset osaved if [[ $srcdir != "count" ]]; then if [[ -n "$limit" ]]; then @@ -217,6 +219,11 @@ function bclone_corner_cases_test limit=$(random_int_between 1 $total_count $((limit*2)) | sort -nu | head -n $limit | xargs) fi bclone_corner_cases_init $srcdir $dstdir + + # Save current block cloning stats for later use. + sync_pool $TESTPOOL + oused=$(get_pool_prop bcloneused $TESTPOOL) + osaved=$(get_pool_prop bclonesaved $TESTPOOL) fi # @@ -285,21 +292,24 @@ function bclone_corner_cases_test overwrite_clone "$second_overwrite" if checksum_compare $read_after; then - log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after" + log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after" else - log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after" + log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after" fi log_must zpool export $TESTPOOL log_must zpool import $TESTPOOL if checksum_compare "yes"; then - log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after / read_next_txg" + log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after / read_next_txg" else - log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after / read_next_txg" + log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: 
$second_overwrite / read_after: $read_after / read_next_txg" fi rm -f "$CLONE" + sync_pool $TESTPOOL + verify_pool_prop_eq bcloneused $oused + verify_pool_prop_eq bclonesaved $osaved done done done From e39e20b6dc73bd7df1f097c23b5297bcc989ed53 Mon Sep 17 00:00:00 2001 From: Robert Evans Date: Wed, 27 Mar 2024 17:59:16 -0400 Subject: [PATCH 28/34] ZTS: fix flakiness in cp_files_002_pos Fix RANDOM to not return zero. Overwriting with `dd ... count=0` does not test anything. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: George Melikov Reviewed-by: Allan Jude Signed-off-by: Robert Evans Closes #16029 --- .../tests/functional/cp_files/cp_files_002_pos.ksh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh index 60817449ab03..4db968ffae05 100755 --- a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh @@ -76,7 +76,7 @@ log_onexit cleanup SRC_FILE=src.data DST_FILE=dst.data -SRC_SIZE=$(($RANDOM % 2048)) +SRC_SIZE=$((1024 + $RANDOM % 1024)) # A smaller recordsize is used merely to speed up the test. RECORDSIZE=4096 @@ -120,7 +120,7 @@ for mode in "never" "auto" "always"; do # Overwrite a random range of an existing file and immediately copy it. sync_pool $TESTPOOL log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \ - seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc + seek=$(($RANDOM % $SRC_SIZE)) count=$((1 + $RANDOM % 16)) conv=notrunc if [[ "$mode" == "always" ]]; then log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE log_must ls -l $CP_TESTDIR @@ -152,7 +152,7 @@ for mode in "never" "auto" "always"; do # Overwrite a random range of an existing file and immediately copy it. log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \ - seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc + seek=$(($RANDOM % $SRC_SIZE)) count=$((1 + $RANDOM % 16)) conv=notrunc log_must cp --reflink=$mode $SRC_FILE $DST_FILE verify_copy $SRC_FILE $DST_FILE log_must rm -f $SRC_FILE $DST_FILE From b1e46f869e773086c23c565d7d5b261577023cfb Mon Sep 17 00:00:00 2001 From: George Wilson Date: Fri, 29 Mar 2024 15:15:56 -0400 Subject: [PATCH 29/34] Add ashift validation when adding devices to a pool Currently, zpool add allows users to add top-level vdevs that have different ashifts but doing so prevents users from being able to perform a top-level vdev removal. Often times consumers may not realize that they have mismatched ashifts until the top-level removal fails. This feature adds ashift validation to the zpool add command and will fail the operation if the sector size of the specified vdev does not match the existing pool. This behavior can be disabled by using the -f flag. In addition, new flags have been added to provide fine-grained control to disable specific checks. These flags are: --allow-in-use --allow-ashift-mismatch --allow-replicaton-mismatch The force flag will disable all of these checks. 
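
As an illustration of the intended behaviour (a sketch only; the pool and
device names below are hypothetical and not taken from this patch):

    # Rejected if the new vdev's ashift does not match the pool:
    zpool add tank /dev/sdb

    # Override only the ashift check; the in-use and replication
    # checks still apply:
    zpool add --allow-ashift-mismatch tank /dev/sdb

    # -f disables all three checks at once (it cannot be combined
    # with the individual --allow-* options):
    zpool add -f tank /dev/sdb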
Reviewed by: Brian Behlendorf Reviewed by: Alexander Motin Reviewed-by: Mark Maybee Signed-off-by: George Wilson Closes #15509 --- cmd/zpool/zpool_main.c | 76 ++++++++++++++----- cmd/ztest.c | 8 +- include/libzfs.h | 5 +- include/sys/fs/zfs.h | 3 +- include/sys/spa.h | 4 +- lib/libzfs/libzfs.abi | 76 +++++++++++++++---- lib/libzfs/libzfs_pool.c | 5 +- lib/libzfs/libzfs_util.c | 8 +- man/man8/zpool-add.8 | 18 ++++- module/zfs/spa.c | 14 +++- module/zfs/zfs_ioctl.c | 4 +- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 1 + .../cli_root/zpool_add/add-o_ashift.ksh | 17 ++++- .../cli_root/zpool_add/add_prop_ashift.ksh | 16 +++- .../zpool_add--allow-ashift-mismatch.ksh | 0 .../cli_root/zpool_add/zpool_add_002_pos.ksh | 11 +++ .../cli_root/zpool_add/zpool_add_004_pos.ksh | 2 +- .../cli_root/zpool_add/zpool_add_005_pos.ksh | 2 + .../cli_root/zpool_add/zpool_add_009_neg.ksh | 2 + .../cli_root/zpool_add/zpool_add_010_pos.ksh | 2 +- 21 files changed, 219 insertions(+), 58 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 987d44062865..c85a5f285154 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. @@ -131,6 +131,13 @@ static int zpool_do_help(int argc, char **argv); static zpool_compat_status_t zpool_do_load_compat( const char *, boolean_t *); +enum zpool_options { + ZPOOL_OPTION_POWER = 1024, + ZPOOL_OPTION_ALLOW_INUSE, + ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH, + ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH +}; + /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. @@ -347,7 +354,7 @@ get_usage(zpool_help_t idx) { switch (idx) { case HELP_ADD: - return (gettext("\tadd [-fgLnP] [-o property=value] " + return (gettext("\tadd [-afgLnP] [-o property=value] " " ...\n")); case HELP_ATTACH: return (gettext("\tattach [-fsw] [-o property=value] " @@ -1009,8 +1016,9 @@ add_prop_list_default(const char *propname, const char *propval, } /* - * zpool add [-fgLnP] [-o property=value] ... + * zpool add [-afgLnP] [-o property=value] ... * + * -a Disable the ashift validation checks * -f Force addition of devices, even if they appear in use * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. 
@@ -1026,8 +1034,11 @@ add_prop_list_default(const char *propname, const char *propval, int zpool_do_add(int argc, char **argv) { - boolean_t force = B_FALSE; + boolean_t check_replication = B_TRUE; + boolean_t check_inuse = B_TRUE; boolean_t dryrun = B_FALSE; + boolean_t check_ashift = B_TRUE; + boolean_t force = B_FALSE; int name_flags = 0; int c; nvlist_t *nvroot; @@ -1038,8 +1049,18 @@ zpool_do_add(int argc, char **argv) nvlist_t *props = NULL; char *propval; + struct option long_options[] = { + {"allow-in-use", no_argument, NULL, ZPOOL_OPTION_ALLOW_INUSE}, + {"allow-replication-mismatch", no_argument, NULL, + ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH}, + {"allow-ashift-mismatch", no_argument, NULL, + ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH}, + {0, 0, 0, 0} + }; + /* check options */ - while ((c = getopt(argc, argv, "fgLno:P")) != -1) { + while ((c = getopt_long(argc, argv, "fgLno:P", long_options, NULL)) + != -1) { switch (c) { case 'f': force = B_TRUE; @@ -1069,6 +1090,15 @@ zpool_do_add(int argc, char **argv) case 'P': name_flags |= VDEV_NAME_PATH; break; + case ZPOOL_OPTION_ALLOW_INUSE: + check_inuse = B_FALSE; + break; + case ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH: + check_replication = B_FALSE; + break; + case ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH: + check_ashift = B_FALSE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -1089,6 +1119,19 @@ zpool_do_add(int argc, char **argv) usage(B_FALSE); } + if (force) { + if (!check_inuse || !check_replication || !check_ashift) { + (void) fprintf(stderr, gettext("'-f' option is not " + "allowed with '--allow-replication-mismatch', " + "'--allow-ashift-mismatch', or " + "'--allow-in-use'\n")); + usage(B_FALSE); + } + check_inuse = B_FALSE; + check_replication = B_FALSE; + check_ashift = B_FALSE; + } + poolname = argv[0]; argc--; @@ -1119,8 +1162,8 @@ zpool_do_add(int argc, char **argv) } /* pass off to make_root_vdev for processing */ - nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun, - argc, argv); + nvroot = make_root_vdev(zhp, props, !check_inuse, + check_replication, B_FALSE, dryrun, argc, argv); if (nvroot == NULL) { zpool_close(zhp); return (1); @@ -1224,7 +1267,7 @@ zpool_do_add(int argc, char **argv) ret = 0; } else { - ret = (zpool_add(zhp, nvroot) != 0); + ret = (zpool_add(zhp, nvroot, check_ashift) != 0); } nvlist_free(props); @@ -7081,7 +7124,6 @@ zpool_do_split(int argc, char **argv) return (ret); } -#define POWER_OPT 1024 /* * zpool online [--power] ... 
@@ -7099,7 +7141,7 @@ zpool_do_online(int argc, char **argv) int flags = 0; boolean_t is_power_on = B_FALSE; struct option long_options[] = { - {"power", no_argument, NULL, POWER_OPT}, + {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, {0, 0, 0, 0} }; @@ -7109,7 +7151,7 @@ zpool_do_online(int argc, char **argv) case 'e': flags |= ZFS_ONLINE_EXPAND; break; - case POWER_OPT: + case ZPOOL_OPTION_POWER: is_power_on = B_TRUE; break; case '?': @@ -7222,7 +7264,7 @@ zpool_do_offline(int argc, char **argv) boolean_t is_power_off = B_FALSE; struct option long_options[] = { - {"power", no_argument, NULL, POWER_OPT}, + {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, {0, 0, 0, 0} }; @@ -7235,7 +7277,7 @@ zpool_do_offline(int argc, char **argv) case 't': istmp = B_TRUE; break; - case POWER_OPT: + case ZPOOL_OPTION_POWER: is_power_off = B_TRUE; break; case '?': @@ -7335,7 +7377,7 @@ zpool_do_clear(int argc, char **argv) char *pool, *device; struct option long_options[] = { - {"power", no_argument, NULL, POWER_OPT}, + {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, {0, 0, 0, 0} }; @@ -7352,7 +7394,7 @@ zpool_do_clear(int argc, char **argv) case 'X': xtreme_rewind = B_TRUE; break; - case POWER_OPT: + case ZPOOL_OPTION_POWER: is_power_on = B_TRUE; break; case '?': @@ -9208,7 +9250,7 @@ zpool_do_status(int argc, char **argv) char *cmd = NULL; struct option long_options[] = { - {"power", no_argument, NULL, POWER_OPT}, + {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, {0, 0, 0, 0} }; @@ -9276,7 +9318,7 @@ zpool_do_status(int argc, char **argv) case 'x': cb.cb_explain = B_TRUE; break; - case POWER_OPT: + case ZPOOL_OPTION_POWER: cb.cb_print_power = B_TRUE; break; case '?': diff --git a/cmd/ztest.c b/cmd/ztest.c index 1d414a9f6fd5..684ab586bb93 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -3375,7 +3375,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) "log" : NULL, raidz_children, zs->zs_mirrors, 1); - error = spa_vdev_add(spa, nvroot); + error = spa_vdev_add(spa, nvroot, B_FALSE); fnvlist_free(nvroot); switch (error) { @@ -3438,7 +3438,7 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, class, raidz_children, zs->zs_mirrors, 1); - error = spa_vdev_add(spa, nvroot); + error = spa_vdev_add(spa, nvroot, B_FALSE); fnvlist_free(nvroot); if (error == ENOSPC) @@ -3545,7 +3545,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) */ nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); - error = spa_vdev_add(spa, nvroot); + error = spa_vdev_add(spa, nvroot, B_FALSE); switch (error) { case 0: diff --git a/include/libzfs.h b/include/libzfs.h index 4f06b5d3c24c..2823b8845827 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2022 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright Joyent, Inc. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2016, Intel Corporation. 
@@ -158,6 +158,7 @@ typedef enum zfs_error { EZFS_RESUME_EXISTS, /* Resume on existing dataset without force */ EZFS_SHAREFAILED, /* filesystem share failed */ EZFS_RAIDZ_EXPAND_IN_PROGRESS, /* a raidz is currently expanding */ + EZFS_ASHIFT_MISMATCH, /* can't add vdevs with different ashifts */ EZFS_UNKNOWN } zfs_error_t; @@ -261,7 +262,7 @@ _LIBZFS_H boolean_t zpool_skip_pool(const char *); _LIBZFS_H int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, nvlist_t *, nvlist_t *); _LIBZFS_H int zpool_destroy(zpool_handle_t *, const char *); -_LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *); +_LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *, boolean_t check_ashift); typedef struct splitflags { /* do not split, but return the config that would be split off */ diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 025567e2183f..21f99bacccf3 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -1603,6 +1603,7 @@ typedef enum { ZFS_ERR_RESUME_EXISTS, ZFS_ERR_CRYPTO_NOTSUP, ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, + ZFS_ERR_ASHIFT_MISMATCH, } zfs_errno_t; /* diff --git a/include/sys/spa.h b/include/sys/spa.h index fb4c93431a31..b969f05afe48 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2021 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -785,7 +785,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_DETACH_SPARE 0x4000 /* device manipulation */ -extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); +extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t ashift_check); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index cdd2f04c2629..2bbaae6345ab 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -1112,14 +1112,11 @@ - + - - - - + @@ -2832,6 +2829,9 @@ + + + @@ -2844,6 +2844,10 @@ + + + + @@ -2964,6 +2968,24 @@ + + + + + + + + + + + + + + + + + + @@ -2979,6 +3001,7 @@ + @@ -3214,9 +3237,13 @@ + + + + @@ -3249,6 +3276,7 @@ + @@ -3353,6 +3381,10 @@ + + + + @@ -3436,8 +3468,9 @@ - + + @@ -3794,12 +3827,18 @@ + + + + + + @@ -4232,9 +4271,13 @@ - - - + + + + + + + @@ -4258,9 +4301,13 @@ - - - + + + + + + + @@ -6315,6 +6362,7 @@ + @@ -6778,7 +6826,7 @@ - + @@ -6786,7 +6834,7 @@ - + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 402c14a6baee..b42e93e3db5d 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -22,7 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. 
+ * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2018 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. @@ -1724,7 +1724,7 @@ zpool_discard_checkpoint(zpool_handle_t *zhp) * necessary verification to ensure that the vdev specification is well-formed. */ int -zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) +zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot, boolean_t check_ashift) { zfs_cmd_t zc = {"\0"}; int ret; @@ -1756,6 +1756,7 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) zcmd_write_conf_nvlist(hdl, &zc, nvroot); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_flags = check_ashift; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) { switch (errno) { diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 8e70af2e5830..73ae0950ccb6 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2020 Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2017 Datto Inc. * Copyright (c) 2020 The FreeBSD Foundation @@ -319,6 +319,9 @@ libzfs_error_description(libzfs_handle_t *hdl) "dataset without force")); case EZFS_RAIDZ_EXPAND_IN_PROGRESS: return (dgettext(TEXT_DOMAIN, "raidz expansion in progress")); + case EZFS_ASHIFT_MISMATCH: + return (dgettext(TEXT_DOMAIN, "adding devices with " + "different physical sector sizes is not allowed")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: @@ -768,6 +771,9 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS: zfs_verror(hdl, EZFS_RAIDZ_EXPAND_IN_PROGRESS, fmt, ap); break; + case ZFS_ERR_ASHIFT_MISMATCH: + zfs_verror(hdl, EZFS_ASHIFT_MISMATCH, fmt, ap); + break; default: zfs_error_aux(hdl, "%s", zfs_strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); diff --git a/man/man8/zpool-add.8 b/man/man8/zpool-add.8 index 8ccdcccc7b06..60b35f1a511a 100644 --- a/man/man8/zpool-add.8 +++ b/man/man8/zpool-add.8 @@ -24,8 +24,9 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2024 by Delphix. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd March 8, 2024 .Dt ZPOOL-ADD 8 .Os . @@ -36,6 +37,7 @@ .Nm zpool .Cm add .Op Fl fgLnP +.Op Fl -allow-in-use -allow-replication-mismatch -allow-ashift-mismatch .Oo Fl o Ar property Ns = Ns Ar value Oc .Ar pool vdev Ns … . @@ -56,7 +58,8 @@ subcommand. .It Fl f Forces use of .Ar vdev Ns s , -even if they appear in use or specify a conflicting replication level. +even if they appear in use, have conflicting ashift values, or specify +a conflicting replication level. Not all devices can be overridden in this manner. .It Fl g Display @@ -91,6 +94,17 @@ See the manual page for a list of valid properties that can be set. The only property supported at the moment is .Sy ashift . +.It Fl -allow-ashift-mismatch +Disable the ashift validation which allows mismatched ashift values in the +pool. +Adding top-level +.Ar vdev Ns s +with different sector sizes will prohibit future device removal operations, see +.Xr zpool-remove 8 . +.It Fl -allow-in-use +Allow vdevs to be added even if they might be in use in another pool. 
+.It Fl -allow-replication-mismatch +Allow vdevs with conflicting replication levels to be added to the pool. .El . .Sh EXAMPLES diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 30c528a53049..3704ffd08820 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -7083,7 +7083,7 @@ spa_draid_feature_incr(void *arg, dmu_tx_t *tx) * Add a device to a storage pool. */ int -spa_vdev_add(spa_t *spa, nvlist_t *nvroot) +spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) { uint64_t txg, ndraid = 0; int error; @@ -7174,6 +7174,16 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } } + if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) { + for (int c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + if (tvd->vdev_ashift != spa->spa_max_ashift) { + return (spa_vdev_exit(spa, vd, txg, + ZFS_ERR_ASHIFT_MISMATCH)); + } + } + } + for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index b2b06881bdd4..dca15f4b826d 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -27,7 +27,7 @@ * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] @@ -1886,7 +1886,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); if (error == 0) { - error = spa_vdev_add(spa, config); + error = spa_vdev_add(spa, config, zc->zc_flags); nvlist_free(config); } spa_close(spa, FTAG); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 502b4de2bae9..d4c5a21828a1 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -372,7 +372,8 @@ tags = ['functional', 'cli_root', 'zpool'] tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', - 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output'] + 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output', + 'zpool_add--allow-ashift-mismatch'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index fe9c92108725..866ea5b9e7ec 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -988,6 +988,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_add/add_prop_ashift.ksh \ functional/cli_root/zpool_add/cleanup.ksh \ functional/cli_root/zpool_add/setup.ksh \ + functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh \ functional/cli_root/zpool_add/zpool_add_001_pos.ksh \ functional/cli_root/zpool_add/zpool_add_002_pos.ksh \ functional/cli_root/zpool_add/zpool_add_003_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh index 7ecaf849e44b..51871934dd22 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh @@ -22,7 +22,7 @@ # # Copyright 2017, loli10K. All rights reserved. -# Copyright (c) 2020 by Delphix. All rights reserved. +# Copyright (c) 2020, 2024 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -60,12 +60,23 @@ log_must mkfile $SIZE $disk2 logical_ashift=$(get_tunable VDEV_FILE_LOGICAL_ASHIFT) orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT) max_auto_ashift=$(get_tunable VDEV_MAX_AUTO_ASHIFT) +opt="" typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do + # + # Need to add the --allow-ashift-mismatch option to disable the + # ashift mismatch checks in zpool add. + # + if [[ $ashift -eq $orig_ashift ]]; then + opt="" + else + opt="--allow-ashift-mismatch" + fi + log_must zpool create $TESTPOOL $disk1 - log_must zpool add -o ashift=$ashift $TESTPOOL $disk2 + log_must zpool add $opt -o ashift=$ashift $TESTPOOL $disk2 log_must verify_ashift $disk2 $ashift # clean things for the next run @@ -78,7 +89,7 @@ do # log_must zpool create $TESTPOOL $disk1 log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT $ashift - log_must zpool add $TESTPOOL $disk2 + log_must zpool add $opt $TESTPOOL $disk2 exp=$(( (ashift <= max_auto_ashift) ? 
ashift : logical_ashift )) log_must verify_ashift $disk2 $exp diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh index 228f62232aae..6a3283d0618f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh @@ -22,7 +22,7 @@ # # Copyright 2017, loli10K. All rights reserved. -# Copyright (c) 2020 by Delphix. All rights reserved. +# Copyright (c) 2020, 2024 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -68,8 +68,13 @@ log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT 16 typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do + if [ $ashift -eq $orig_ashift ];then + opt="" + else + opt="--allow-ashift-mismatch" + fi log_must zpool create -o ashift=$ashift $TESTPOOL $disk1 - log_must zpool add $TESTPOOL $disk2 + log_must zpool add $opt $TESTPOOL $disk2 log_must verify_ashift $disk2 $ashift # clean things for the next run @@ -82,8 +87,13 @@ for ashift in ${ashifts[@]} do for cmdval in ${ashifts[@]} do + if [ $ashift -eq $cmdval ];then + opt="" + else + opt="--allow-ashift-mismatch" + fi log_must zpool create -o ashift=$ashift $TESTPOOL $disk1 - log_must zpool add -o ashift=$cmdval $TESTPOOL $disk2 + log_must zpool add $opt -o ashift=$cmdval $TESTPOOL $disk2 log_must verify_ashift $disk2 $cmdval # clean things for the next run diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh new file mode 100755 index 000000000000..e69de29bb2d1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh index c5c06f76340b..afee34a33469 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh @@ -65,4 +65,15 @@ log_mustnot vdevs_in_pool $TESTPOOL $DISK2 log_must zpool add -f $TESTPOOL $DISK2 log_must vdevs_in_pool $TESTPOOL $DISK2 +log_must zpool destroy $TESTPOOL + +create_pool $TESTPOOL mirror $DISK0 $DISK1 +log_must poolexists $TESTPOOL + +log_mustnot zpool add $TESTPOOL $DISK2 +log_mustnot vdevs_in_pool $TESTPOOL $DISK2 + +log_must zpool add --allow-replication-mismatch $TESTPOOL $DISK2 +log_must vdevs_in_pool $TESTPOOL $DISK2 + log_pass "'zpool add -f ...' executes successfully." 
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh index 646edc1a4557..cecda56ab125 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh @@ -70,7 +70,7 @@ if is_freebsd; then recursive=$(get_tunable VOL_RECURSIVE) log_must set_tunable64 VOL_RECURSIVE 1 fi -log_must zpool add $TESTPOOL $ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL +log_must zpool add --allow-ashift-mismatch $TESTPOOL $ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL log_must vdevs_in_pool "$TESTPOOL" "$ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh index 4990ef9d29b0..0e9d9f5f030f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh @@ -75,7 +75,9 @@ log_must poolexists $TESTPOOL1 unset NOINUSE_CHECK log_mustnot zpool add -f $TESTPOOL $DISK1 +log_mustnot zpool add --allow-in-use $TESTPOOL $DISK1 log_mustnot zpool add -f $TESTPOOL $mnttab_dev +log_mustnot zpool add --allow-in-use $TESTPOOL $mnttab_dev if is_linux; then log_mustnot zpool add $TESTPOOL $vfstab_dev else diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh index d7f3a900e8fd..a13a27160e76 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh @@ -64,7 +64,9 @@ log_mustnot zpool add -f $TESTPOOL $DISK0 for type in "" "mirror" "raidz" "draid" "spare" "log" "dedup" "special" "cache" do log_mustnot zpool add -f $TESTPOOL $type $DISK0 $DISK1 + log_mustnot zpool add --allow-in-use $TESTPOOL $type $DISK0 $DISK1 log_mustnot zpool add -f $TESTPOOL $type $DISK1 $DISK1 + log_mustnot zpool add --allow-in-use $TESTPOOL $type $DISK1 $DISK1 done log_pass "'zpool add' get fail as expected if vdevs are the same or vdev is " \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh index b8b25db1b9f9..22860e9caf1d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh @@ -138,7 +138,7 @@ function zpool_create_forced_add while ((j < ${#add_args[@]})); do log_must zpool create $TESTPOOL1 ${create_args[$i]} log_mustnot zpool add $TESTPOOL1 ${add_args[$j]} - log_must zpool add -f $TESTPOOL1 ${add_args[$j]} + log_must zpool add --allow-replication-mismatch $TESTPOOL1 ${add_args[$j]} log_must zpool destroy -f $TESTPOOL1 ((j += 1)) From c0aab8b8f91f5ecb2c625a8fa7265f26c260e10a Mon Sep 17 00:00:00 2001 From: Fabian-Gruenbichler Date: Fri, 29 Mar 2024 22:37:40 +0100 Subject: [PATCH 30/34] zvols: prevent overflow of minor device numbers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit currently, the linux kernel allows 2^20 minor devices per major device number. ZFS reserves blocks of 2^4 minors per zvol: 1 for the zvol itself, the other 15 for the first partitions of that zvol. as a result, only 2^16 such blocks are available for use. 
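the same arithmetic as a standalone sketch (userspace constants mirroring the
kernel's 20-bit minor space and the 2^4 reservation described above;
hypothetical demo code, not the in-kernel implementation):

#include <stdio.h>

#define	MINORBITS	20			/* Linux: 20-bit minor space */
#define	MINORMASK	((1u << MINORBITS) - 1)
#define	MINOR(dev)	((dev) & MINORMASK)

#define	ZVOL_MINOR_BITS	4			/* 1 zvol + 15 partition minors */

int
main(void)
{
	/* idx stands in for the slot the ida allocator hands back */
	for (unsigned int idx = (1u << 16) - 1; idx <= (1u << 16); idx++) {
		unsigned int minor = idx << ZVOL_MINOR_BITS;

		if (MINOR(minor) != minor)
			printf("zvol %u: minor %u overflows the minor space\n",
			    idx, minor);
		else
			printf("zvol %u: minor %u fits\n", idx, minor);
	}
	return (0);
}

running it shows zvol 65535 still fits (minor 1048560) while zvol 65536 wraps
(1048576 no longer fits in 20 bits), which is exactly the boundary the check
added below guards against.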
there are no checks in place to avoid overflowing into the major device number when more than 2^16 zvols are allocated (with volmode=dev or default). instead of ignoring this limit, which comes with all sorts of weird knock-on effects, detect this situation and simply fail allocating the zvol block device early on. without this safeguard, the kernel will reject the attempt to create an already existing block device, but ZFS doesn't handle this error and gets confused about which zvol occupies which minor slot, potentially resulting in kernel NULL derefs and other issues later on. Reviewed-by: Tony Hutter Reviewed by: Brian Behlendorf Signed-off-by: Fabian Grünbichler Closes #16006 --- module/os/linux/zfs/zvol_os.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 8d5d1f06fce9..26cc63d426eb 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1314,6 +1314,13 @@ zvol_os_create_minor(const char *name) if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; + if (MINOR(minor) != minor) { + /* too many partitions can cause an overflow */ + zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", + name, minor, MINOR(minor)); + ida_simple_remove(&zvol_ida, idx); + return (SET_ERROR(EINVAL)); + } zv = zvol_find_by_name_hash(name, hash, RW_NONE); if (zv) { From cfb96c772b8448dca6eaeb66a540b8bb39c9908c Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 30 Mar 2024 08:51:33 +1100 Subject: [PATCH 31/34] vdev_disk: clean up spa/bdev mode conversion 43e8f6e37 introduced a subtle API misuse, in that it passed the output from vdev_bdev_mode() back into itself. Fortunately, the SPA_MODE_(READ|WRITE) bit values exactly map to the FMODE_(READ|WRITE) & BLK_OPEN_(READ|WRITE) bit values, so it didn't result in a bug, but it was hard to read and understand, so I cleaned it up. In doing so, I noticed that the only call to vdev_bdev_mode() without the "exclusive" flag set was in that misuse, and actually, we never do a non-exclusive blkdev_get_by_path(). So I've just made exclusive be always-on. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Reviewed by: Brian Behlendorf Reviewed-by: Allan Jude Signed-off-by: Rob Norris Closes #15995 --- module/os/linux/zfs/vdev_disk.c | 81 ++++++++++++++++----------------- 1 file changed, 39 insertions(+), 42 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 36468fc21132..ac8fe6cb1bf9 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -97,38 +97,41 @@ static uint_t zfs_vdev_open_timeout_ms = 1000; static unsigned int zfs_vdev_failfast_mask = 1; +/* + * Convert SPA mode flags into bdev open mode flags. 
+ */ #ifdef HAVE_BLK_MODE_T -static blk_mode_t +typedef blk_mode_t vdev_bdev_mode_t; +#define VDEV_BDEV_MODE_READ BLK_OPEN_READ +#define VDEV_BDEV_MODE_WRITE BLK_OPEN_WRITE +#define VDEV_BDEV_MODE_EXCL BLK_OPEN_EXCL +#define VDEV_BDEV_MODE_MASK (BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL) #else -static fmode_t +typedef fmode_t vdev_bdev_mode_t; +#define VDEV_BDEV_MODE_READ FMODE_READ +#define VDEV_BDEV_MODE_WRITE FMODE_WRITE +#define VDEV_BDEV_MODE_EXCL FMODE_EXCL +#define VDEV_BDEV_MODE_MASK (FMODE_READ|FMODE_WRITE|FMODE_EXCL) #endif -vdev_bdev_mode(spa_mode_t spa_mode, boolean_t exclusive) -{ -#ifdef HAVE_BLK_MODE_T - blk_mode_t mode = 0; - - if (spa_mode & SPA_MODE_READ) - mode |= BLK_OPEN_READ; - if (spa_mode & SPA_MODE_WRITE) - mode |= BLK_OPEN_WRITE; +static vdev_bdev_mode_t +vdev_bdev_mode(spa_mode_t smode) +{ + ASSERT3U(smode, !=, SPA_MODE_UNINIT); + ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE)); - if (exclusive) - mode |= BLK_OPEN_EXCL; -#else - fmode_t mode = 0; + vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL; - if (spa_mode & SPA_MODE_READ) - mode |= FMODE_READ; + if (smode & SPA_MODE_READ) + bmode |= VDEV_BDEV_MODE_READ; - if (spa_mode & SPA_MODE_WRITE) - mode |= FMODE_WRITE; + if (smode & SPA_MODE_WRITE) + bmode |= VDEV_BDEV_MODE_WRITE; - if (exclusive) - mode |= FMODE_EXCL; -#endif + ASSERT(bmode & VDEV_BDEV_MODE_MASK); + ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK); - return (mode); + return (bmode); } /* @@ -235,30 +238,28 @@ vdev_disk_kobj_evt_post(vdev_t *v) } static zfs_bdev_handle_t * -vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder) +vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder) { + vdev_bdev_mode_t bmode = vdev_bdev_mode(smode); + #if defined(HAVE_BDEV_OPEN_BY_PATH) - return (bdev_open_by_path(path, - vdev_bdev_mode(mode, B_TRUE), holder, NULL)); + return (bdev_open_by_path(path, bmode, holder, NULL)); #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) - return (blkdev_get_by_path(path, - vdev_bdev_mode(mode, B_TRUE), holder, NULL)); + return (blkdev_get_by_path(path, bmode, holder, NULL)); #else - return (blkdev_get_by_path(path, - vdev_bdev_mode(mode, B_TRUE), holder)); + return (blkdev_get_by_path(path, bmode, holder)); #endif } static void -vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t mode, void *holder) +vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder) { #if defined(HAVE_BDEV_RELEASE) return (bdev_release(bdh)); #elif defined(HAVE_BLKDEV_PUT_HOLDER) return (blkdev_put(BDH_BDEV(bdh), holder)); #else - return (blkdev_put(BDH_BDEV(bdh), - vdev_bdev_mode(mode, B_TRUE))); + return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode))); #endif } @@ -267,11 +268,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { zfs_bdev_handle_t *bdh; -#ifdef HAVE_BLK_MODE_T - blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); -#else - fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); -#endif + spa_mode_t smode = spa_mode(v->vdev_spa); hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); vdev_disk_t *vd; @@ -322,16 +319,16 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, reread_part = B_TRUE; } - vdev_blkdev_put(bdh, mode, zfs_vdev_holder); + vdev_blkdev_put(bdh, smode, zfs_vdev_holder); } if (reread_part) { - bdh = vdev_blkdev_get_by_path(disk_name, mode, + bdh = vdev_blkdev_get_by_path(disk_name, smode, zfs_vdev_holder); if (!BDH_IS_ERR(bdh)) { int error = vdev_bdev_reread_part(BDH_BDEV(bdh)); - 
vdev_blkdev_put(bdh, mode, zfs_vdev_holder); + vdev_blkdev_put(bdh, smode, zfs_vdev_holder); if (error == 0) { timeout = MSEC2NSEC( zfs_vdev_open_timeout_ms * 2); @@ -376,7 +373,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, hrtime_t start = gethrtime(); bdh = BDH_ERR_PTR(-ENXIO); while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) { - bdh = vdev_blkdev_get_by_path(v->vdev_path, mode, + bdh = vdev_blkdev_get_by_path(v->vdev_path, smode, zfs_vdev_holder); if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) { /* From 2553f94c4299aaf31c5ceea4bfbfcc811cf76513 Mon Sep 17 00:00:00 2001 From: Robert Evans Date: Fri, 29 Mar 2024 17:59:23 -0400 Subject: [PATCH 32/34] Fix buffer underflow if sysfs file is empty Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter Reviewed-by: Jason Lee Signed-off-by: Robert Evans Closes #16028 Closes #16035 --- cmd/zpool/os/linux/zpool_vdev_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/zpool/os/linux/zpool_vdev_os.c b/cmd/zpool/os/linux/zpool_vdev_os.c index 006a3a7d8e01..80627b58211c 100644 --- a/cmd/zpool/os/linux/zpool_vdev_os.c +++ b/cmd/zpool/os/linux/zpool_vdev_os.c @@ -458,7 +458,7 @@ static char *zpool_sysfs_gets(char *path) } /* Remove trailing newline */ - if (buf[count - 1] == '\n') + if (count > 0 && buf[count - 1] == '\n') buf[count - 1] = 0; close(fd); From 39be46f43f96fb7420386d03751b01f5cb376d6b Mon Sep 17 00:00:00 2001 From: Robert Evans Date: Fri, 29 Mar 2024 20:11:52 -0400 Subject: [PATCH 33/34] Linux 5.18+ compat: Detect filemap_range_has_page In v5.18 `filemap_range_has_page` moved to `pagemap.h` `pagemap.h` has been around since 3.10 so just include both Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Robert Evans Closes #16034 --- config/kernel-filemap.m4 | 1 + 1 file changed, 1 insertion(+) diff --git a/config/kernel-filemap.m4 b/config/kernel-filemap.m4 index 745928168f92..0b7da828d299 100644 --- a/config/kernel-filemap.m4 +++ b/config/kernel-filemap.m4 @@ -4,6 +4,7 @@ dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_FILEMAP], [ ZFS_LINUX_TEST_SRC([filemap_range_has_page], [ #include + #include ],[ struct address_space *mapping = NULL; loff_t lstart = 0; From a8bc2a41828b3aaff4095918e8c83e6659348872 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 2 Apr 2024 15:14:54 +1100 Subject: [PATCH 34/34] vdev_disk: don't touch vbio after its handed off to the kernel After IO is unplugged, it may complete immediately and vbio_completion be called on interrupt context. That may interrupt or deschedule our task. If its the last bio, the vbio will be freed. Then, we get rescheduled, and try to write to freed memory through vbio->. This patch just removes the the cleanup, and the corresponding assert. These were leftovers from a previous iteration of vbio_submit() and were always "belt and suspenders" ops anyway, never strictly required. Reported-by: Rich Ercolani Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. 
Closes: #16045 Closes: #16050 --- module/os/linux/zfs/vdev_disk.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index ac8fe6cb1bf9..df5fa067797a 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -755,8 +755,6 @@ vbio_fill_cb(struct page *page, size_t off, size_t len, void *priv) static void vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) { - ASSERT(vbio->vbio_bdev); - /* * We plug so we can submit the BIOs as we go and only unplug them when * they are fully created and submitted. This is important; if we don't @@ -774,12 +772,15 @@ vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) vbio->vbio_bio->bi_end_io = vbio_completion; vbio->vbio_bio->bi_private = vbio; + /* + * Once submitted, vbio_bio now owns vbio (through bi_private) and we + * can't touch it again. The bio may complete and vbio_completion() be + * called and free the vbio before this task is run again, so we must + * consider it invalid from this point. + */ vdev_submit_bio(vbio->vbio_bio); blk_finish_plug(&plug); - - vbio->vbio_bio = NULL; - vbio->vbio_bdev = NULL; } /* IO completion callback */
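
A standalone sketch of the ownership handoff this change relies on
(hypothetical names in plain userspace C, not the ZFS code): anything the
submitter still needs is copied out before the handoff, and the request is
never dereferenced afterwards, because the completion path may already have
freed it.

#include <stdio.h>
#include <stdlib.h>

struct request {
	int	id;
	void	(*done)(struct request *);	/* completion callback */
};

/* Completion path: owns the request from here on and frees it. */
static void
request_done(struct request *req)
{
	printf("request %d complete\n", req->id);
	free(req);
}

/*
 * Submit path: once the completer can see the request it may free it at any
 * time, so anything needed afterwards is copied out first and the pointer is
 * not used again -- the rule vbio_submit() follows after this patch.
 */
static void
submit(struct request *req)
{
	int id = req->id;	/* save before handoff */

	req->done(req);		/* ownership transfers; may free immediately */

	printf("submitted request %d; not touching it again\n", id);
}

int
main(void)
{
	struct request *req = malloc(sizeof (*req));

	if (req == NULL)
		return (1);
	req->id = 1;
	req->done = request_done;
	submit(req);
	return (0);
}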