From 8131a96544fd19411c23424af85140e91e41c001 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Mon, 12 Sep 2022 14:22:15 -0400 Subject: [PATCH 01/39] Fix use-after-free in btree code Coverty static analysis found these. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Neal Gompa Signed-off-by: Richard Yao Closes #10989 Closes #13861 --- module/zfs/btree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/btree.c b/module/zfs/btree.c index 03c46473c1ec..36755f97929c 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -1608,8 +1608,8 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1); new_rm_hdr->bth_count = 0; - zfs_btree_node_destroy(tree, new_rm_hdr); zfs_btree_remove_from_node(tree, parent, new_rm_hdr); + zfs_btree_node_destroy(tree, new_rm_hdr); } /* Remove the element at the specific location. */ @@ -1817,10 +1817,10 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) /* Move our elements to the left neighbor. */ bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1); - zfs_btree_node_destroy(tree, rm_hdr); /* Remove the emptied node from the parent. */ zfs_btree_remove_from_node(tree, parent, rm_hdr); + zfs_btree_node_destroy(tree, rm_hdr); zfs_btree_verify(tree); } From e1b49e3f1d53038a8c2bbd4a297988b5c850aab4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Neal=20Gompa=20=28=E3=83=8B=E3=83=BC=E3=83=AB=E3=83=BB?= =?UTF-8?q?=E3=82=B4=E3=83=B3=E3=83=91=29?= Date: Tue, 24 May 2022 17:07:01 -0400 Subject: [PATCH 02/39] rpm: Use the correct version-release information in dependencies This tightly links the subpackages together and ensures that everything is upgraded together. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Neal Gompa Closes #13489 --- rpm/generic/zfs.spec.in | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 1cd3f6b520ea..c2894aa7bb80 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -120,12 +120,12 @@ License: @ZFS_META_LICENSE@ URL: https://github.com/openzfs/zfs Source0: %{name}-%{version}.tar.gz BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) -Requires: libzpool5 = %{version} -Requires: libnvpair3 = %{version} -Requires: libuutil3 = %{version} -Requires: libzfs5 = %{version} +Requires: libzpool5%{?_isa} = %{version}-%{release} +Requires: libnvpair3%{?_isa} = %{version}-%{release} +Requires: libuutil3%{?_isa} = %{version}-%{release} +Requires: libzfs5%{?_isa} = %{version}-%{release} Requires: %{name}-kmod = %{version} -Provides: %{name}-kmod-common = %{version} +Provides: %{name}-kmod-common = %{version}-%{release} Obsoletes: spl # zfs-fuse provides the same commands and man pages that OpenZFS does. 
@@ -255,13 +255,13 @@ This package provides support for managing ZFS filesystems %package -n libzfs5-devel Summary: Development headers Group: System Environment/Kernel -Requires: libzfs5 = %{version} -Requires: libzpool5 = %{version} -Requires: libnvpair3 = %{version} -Requires: libuutil3 = %{version} -Provides: libzpool5-devel -Provides: libnvpair3-devel -Provides: libuutil3-devel +Requires: libzfs5%{?_isa} = %{version}-%{release} +Requires: libzpool5%{?_isa} = %{version}-%{release} +Requires: libnvpair3%{?_isa} = %{version}-%{release} +Requires: libuutil3%{?_isa} = %{version}-%{release} +Provides: libzpool5-devel = %{version}-%{release} +Provides: libnvpair3-devel = %{version}-%{release} +Provides: libuutil3-devel = %{version}-%{release} Obsoletes: zfs-devel Obsoletes: libzfs2-devel Obsoletes: libzfs4-devel @@ -313,8 +313,8 @@ Summary: Python %{python_version} wrapper for libzfs_core Group: Development/Languages/Python License: Apache-2.0 BuildArch: noarch -Requires: libzfs5 = %{version} -Requires: libnvpair3 = %{version} +Requires: libzfs5 = %{version}-%{release} +Requires: libnvpair3 = %{version}-%{release} Requires: libffi Requires: python%{__python_pkg_version} Requires: %{__python_cffi_pkg} @@ -339,7 +339,6 @@ This package provides a python wrapper for the libzfs_core C library. Summary: Initramfs module Group: System Environment/Kernel Requires: %{name}%{?_isa} = %{version}-%{release} -Requires: %{name} = %{version}-%{release} Requires: initramfs-tools %description initramfs From f48d9b426919c8c8bce8d075e05f59cce9053728 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 11 Jul 2022 11:35:01 -0700 Subject: [PATCH 03/39] rpm: Silence "unversioned Obsoletes" warnings on EL 9 Get rid of RPM warnings on AlmaLinux 9: "It's not recommended to have unversioned Obsoletes" Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #13584 Closes #13638 --- rpm/generic/zfs-dkms.spec.in | 2 +- rpm/generic/zfs.spec.in | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in index 55f0f1cf5249..920b90e88912 100644 --- a/rpm/generic/zfs-dkms.spec.in +++ b/rpm/generic/zfs-dkms.spec.in @@ -31,7 +31,7 @@ Requires(post): gcc, make, perl, diffutils %if 0%{?rhel}%{?fedora}%{?mageia}%{?suse_version} Requires: kernel-devel >= @ZFS_META_KVER_MIN@, kernel-devel <= @ZFS_META_KVER_MAX@.999 Requires(post): kernel-devel >= @ZFS_META_KVER_MIN@, kernel-devel <= @ZFS_META_KVER_MAX@.999 -Obsoletes: spl-dkms +Obsoletes: spl-dkms <= %{version} %endif Provides: %{module}-kmod = %{version} AutoReqProv: no diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index c2894aa7bb80..8cab1c3d70bb 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -126,7 +126,7 @@ Requires: libuutil3%{?_isa} = %{version}-%{release} Requires: libzfs5%{?_isa} = %{version}-%{release} Requires: %{name}-kmod = %{version} Provides: %{name}-kmod-common = %{version}-%{release} -Obsoletes: spl +Obsoletes: spl <= %{version} # zfs-fuse provides the same commands and man pages that OpenZFS does. # Renaming those on either side would conflict with all available documentation. @@ -178,8 +178,8 @@ This package contains the core ZFS command line utilities. 
%package -n libzpool5 Summary: Native ZFS pool library for Linux Group: System Environment/Kernel -Obsoletes: libzpool2 -Obsoletes: libzpool4 +Obsoletes: libzpool2 <= %{version} +Obsoletes: libzpool4 <= %{version} %description -n libzpool5 This package contains the zpool library, which provides support @@ -195,7 +195,7 @@ for managing zpools %package -n libnvpair3 Summary: Solaris name-value library for Linux Group: System Environment/Kernel -Obsoletes: libnvpair1 +Obsoletes: libnvpair1 <= %{version} %description -n libnvpair3 This package contains routines for packing and unpacking name-value @@ -213,7 +213,7 @@ to write self describing data structures on disk. %package -n libuutil3 Summary: Solaris userland utility library for Linux Group: System Environment/Kernel -Obsoletes: libuutil1 +Obsoletes: libuutil1 <= %{version} %description -n libuutil3 This library provides a variety of compatibility functions for OpenZFS: @@ -239,8 +239,8 @@ This library provides a variety of compatibility functions for OpenZFS: %package -n libzfs5 Summary: Native ZFS filesystem library for Linux Group: System Environment/Kernel -Obsoletes: libzfs2 -Obsoletes: libzfs4 +Obsoletes: libzfs2 <= %{version} +Obsoletes: libzfs4 <= %{version} %description -n libzfs5 This package provides support for managing ZFS filesystems @@ -262,9 +262,9 @@ Requires: libuutil3%{?_isa} = %{version}-%{release} Provides: libzpool5-devel = %{version}-%{release} Provides: libnvpair3-devel = %{version}-%{release} Provides: libuutil3-devel = %{version}-%{release} -Obsoletes: zfs-devel -Obsoletes: libzfs2-devel -Obsoletes: libzfs4-devel +Obsoletes: zfs-devel <= %{version} +Obsoletes: libzfs2-devel <= %{version} +Obsoletes: libzfs4-devel <= %{version} %description -n libzfs5-devel This package contains the header files needed for building additional From acd74646399c7b40ab5f54ea2f286ed9c9edaa64 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 11 Jul 2022 13:35:19 -0700 Subject: [PATCH 04/39] zed: Ignore false 'atari' partitions in autoreplace libudev will sometimes falsely identify an 'atari' partition on a blank disk, preventing it from being used in an autoreplace. This seems to be a known issue. The workaround is to just ignore the fake partition and continue with the autoreplace. 
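Condensed into a hypothetical standalone check (the function, names and sample values are illustrative only, not zed code; the actual change to zed_udev_monitor() is in the diff below), the skip decision this patch makes reduces to:

#include <stdio.h>
#include <string.h>

/*
 * Hypothetical condensation of the check added below: a DEVTYPE=disk
 * event that already carries a partition-table type is normally skipped
 * in favor of the later partition event, except when that type is the
 * spurious "atari" label reported on blank disks.
 */
static int
skip_disk_event(const char *devtype, const char *part_type)
{
	if (devtype == NULL || strcmp(devtype, "disk") != 0)
		return (0);		/* not a whole-disk event */
	if (part_type == NULL || part_type[0] == '\0')
		return (0);		/* no partition table: keep it */
	if (strcmp(part_type, "atari") == 0)
		return (0);		/* assumed false positive: keep it */
	return (1);			/* real partition table: skip it */
}

int
main(void)
{
	printf("blank disk:  skip=%d\n", skip_disk_event("disk", ""));
	printf("atari label: skip=%d\n", skip_disk_event("disk", "atari"));
	printf("gpt label:   skip=%d\n", skip_disk_event("disk", "gpt"));
	return (0);
}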
Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #13497 Closes #13632 --- cmd/zed/zed_disk_event.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c index 52b80d8c4c93..ed774effe256 100644 --- a/cmd/zed/zed_disk_event.c +++ b/cmd/zed/zed_disk_event.c @@ -208,6 +208,12 @@ zed_udev_monitor(void *arg) * if this is a disk and it is partitioned, then the * zfs label will reside in a DEVTYPE=partition and * we can skip passing this event + * + * Special case: Blank disks are sometimes reported with + * an erroneous 'atari' partition, and should not be + * excluded from being used as an autoreplace disk: + * + * https://github.com/openzfs/zfs/issues/13497 */ type = udev_device_get_property_value(dev, "DEVTYPE"); part = udev_device_get_property_value(dev, @@ -215,14 +221,23 @@ zed_udev_monitor(void *arg) if (type != NULL && type[0] != '\0' && strcmp(type, "disk") == 0 && part != NULL && part[0] != '\0') { - zed_log_msg(LOG_INFO, - "%s: skip %s since it has a %s partition already", - __func__, - udev_device_get_property_value(dev, "DEVNAME"), - part); - /* skip and wait for partition event */ - udev_device_unref(dev); - continue; + const char *devname = + udev_device_get_property_value(dev, "DEVNAME"); + + if (strcmp(part, "atari") == 0) { + zed_log_msg(LOG_INFO, + "%s: %s is reporting an atari partition, " + "but we're going to assume it's a false " + "positive and still use it (issue #13497)", + __func__, devname); + } else { + zed_log_msg(LOG_INFO, + "%s: skip %s since it has a %s partition " + "already", __func__, devname, part); + /* skip and wait for partition event */ + udev_device_unref(dev); + continue; + } } /* From 65f8f92d12c081aa67d6312b5210fa10518d7ebf Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 14 Jul 2022 10:19:37 -0700 Subject: [PATCH 05/39] zed: Look for NVMe DEVPATH if no ID_BUS We tried replacing an NVMe drive using autoreplace, only to see zed reject it with: zed[27955]: zed_udev_monitor: /dev/nvme5n1 no devid source This happened because ZED saw that ID_BUS was not set by udev for the NVMe drive, and thus didn't think it was "real drive". This commit allows NVMe drives to be autoreplaced even if ID_BUS is not set. Reviewed-by: Don Brady Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #13512 Closes #13646 --- cmd/zed/zed_disk_event.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c index ed774effe256..d9b355292ab9 100644 --- a/cmd/zed/zed_disk_event.c +++ b/cmd/zed/zed_disk_event.c @@ -169,7 +169,7 @@ zed_udev_monitor(void *arg) while (1) { struct udev_device *dev; const char *action, *type, *part, *sectors; - const char *bus, *uuid; + const char *bus, *uuid, *devpath; const char *class, *subclass; nvlist_t *nvl; boolean_t is_zfs = B_FALSE; @@ -263,10 +263,19 @@ zed_udev_monitor(void *arg) * device id string is required in the message schema * for matching with vdevs. Preflight here for expected * udev information. + * + * Special case: + * NVMe devices don't have ID_BUS set (at least on RHEL 7-8), + * but they are valid for autoreplace. 
Add a special case for + * them by searching for "/nvme/" in the udev DEVPATH: + * + * DEVPATH=/devices/pci0000:00/0000:00:1e.0/nvme/nvme2/nvme2n1 */ bus = udev_device_get_property_value(dev, "ID_BUS"); uuid = udev_device_get_property_value(dev, "DM_UUID"); - if (!is_zfs && (bus == NULL && uuid == NULL)) { + devpath = udev_device_get_devpath(dev); + if (!is_zfs && (bus == NULL && uuid == NULL && + strstr(devpath, "/nvme/") == NULL)) { zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid " "source", udev_device_get_devnode(dev)); udev_device_unref(dev); From b1be0a5c151bb8f7d85430465c57d85d9be48075 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 9 Aug 2022 13:26:46 -0700 Subject: [PATCH 06/39] ZTS: Fix zpool_expand_001_pos `zpool_expand_001_pos` was often failing due to not seeing autoexpand commands in the `zpool history`. During testing, I found this to be unreliable (sometimes the "online" wouldn't appear in `zpool history`) and unnecessary, as we could simply check that the pool increased in size. This commit revamps the test to check for the expanded pool size and corresponding new free space. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #13743 --- .../zpool_expand/zpool_expand_001_pos.ksh | 112 +++++++++--------- 1 file changed, 55 insertions(+), 57 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh index 6bbd46289f7c..8760f48dd2a4 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh @@ -54,6 +54,14 @@ verify_runnable "global" +# We override $org_size and $exp_size from zpool_expand.cfg to make sure we get +# an expected free space value every time. Otherwise, if we left it +# configurable, the free space ratio to pool size ratio would diverge too much +# much at low $org_size values. +# +org_size=$((1024 * 1024 * 1024)) +exp_size=$(($org_size * 2)) + function cleanup { poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 @@ -68,11 +76,35 @@ function cleanup unload_scsi_debug } +# Wait for the size of a pool to autoexpand to $1 and the total free space to +# expand to $2 (both values allowing a 10% tolerance). 
+# +# Wait for up to 10 seconds for this to happen (typically takes 1-2 seconds) +# +function wait_for_autoexpand +{ + typeset exp_new_size=$1 + typeset exp_new_free=$2 + + for i in $(seq 1 10) ; do + typeset new_size=$(get_pool_prop size $TESTPOOL1) + typeset new_free=$(get_prop avail $TESTPOOL1) + # Values need to be within 90% of each other (10% tolerance) + if within_percent $new_size $exp_new_size 90 > /dev/null && \ + within_percent $new_free $exp_new_free 90 > /dev/null ; then + return + fi + sleep 1 + done + log_fail "$TESTPOOL never expanded to $exp_new_size with $exp_new_free" \ + " free space (got $new_size with $new_free free space)" +} + log_onexit cleanup log_assert "zpool can be autoexpanded after set autoexpand=on on vdev expansion" -for type in " " mirror raidz draid:1s; do +for type in " " mirror raidz; do log_note "Setting up loopback, scsi_debug, and file vdevs" log_must truncate -s $org_size $FILE_LO DEV1=$(losetup -f) @@ -105,72 +137,38 @@ for type in " " mirror raidz draid:1s; do log_note "Expanding loopback, scsi_debug, and file vdevs" log_must truncate -s $exp_size $FILE_LO log_must losetup -c $DEV1 - sleep 3 echo "2" > /sys/bus/pseudo/drivers/scsi_debug/virtual_gb echo "1" > /sys/class/block/$DEV2/device/rescan block_device_wait - sleep 3 log_must truncate -s $exp_size $FILE_RAW log_must zpool online -e $TESTPOOL1 $FILE_RAW - typeset expand_size=$(get_pool_prop size $TESTPOOL1) - typeset zfs_expand_size=$(get_prop avail $TESTPOOL1) - - log_note "$TESTPOOL1 $type has previous size: $prev_size and " \ - "expanded size: $expand_size" - # compare available pool size from zfs - if [[ $zfs_expand_size -gt $zfs_prev_size ]]; then - # check for zpool history for the pool size expansion - if [[ $type == " " ]]; then - typeset expansion_size=$(($exp_size-$org_size)) - typeset size_addition=$(zpool history -il $TESTPOOL1 |\ - grep "pool '$TESTPOOL1' size:" | \ - grep "vdev online" | \ - grep "(+${expansion_size}" | wc -l) - - if [[ $size_addition -ne 3 ]]; then - log_fail "pool $TESTPOOL1 has not expanded, " \ - "$size_addition/3 vdevs expanded" - fi - elif [[ $type == "mirror" ]]; then - typeset expansion_size=$(($exp_size-$org_size)) - zpool history -il $TESTPOOL1 | \ - grep "pool '$TESTPOOL1' size:" | \ - grep "vdev online" | \ - grep "(+${expansion_size})" >/dev/null 2>&1 - - if [[ $? -ne 0 ]] ; then - log_fail "pool $TESTPOOL1 has not expanded" - fi - elif [[ $type == "draid:1s" ]]; then - typeset expansion_size=$((2*($exp_size-$org_size))) - zpool history -il $TESTPOOL1 | \ - grep "pool '$TESTPOOL1' size:" | \ - grep "vdev online" | \ - grep "(+${expansion_size})" >/dev/null 2>&1 - - if [[ $? -ne 0 ]]; then - log_fail "pool $TESTPOOL has not expanded" - fi - else - typeset expansion_size=$((3*($exp_size-$org_size))) - zpool history -il $TESTPOOL1 | \ - grep "pool '$TESTPOOL1' size:" | \ - grep "vdev online" | \ - grep "(+${expansion_size})" >/dev/null 2>&1 - - if [[ $? -ne 0 ]]; then - log_fail "pool $TESTPOOL has not expanded" - fi - fi - else - log_fail "pool $TESTPOOL1 is not autoexpanded after vdev " \ - "expansion. Previous size: $zfs_prev_size and expanded " \ - "size: $zfs_expand_size" + + # The expected free space values below were observed at the time of + # this commit. However, we know ZFS overhead will change over time, + # and thus we do not do an exact comparison to these values in + # wait_for_autoexpand. Rather, we make sure the free space + # is within some small percentage threshold of these values. 
+ typeset exp_new_size=$(($prev_size * 2)) + if [[ "$type" == " " ]] ; then + exp_new_free=6045892608 + elif [[ "$type" == "mirror" ]] ; then + exp_new_free=1945997312 + elif [[ "$type" == "raidz" ]] ; then + exp_new_free=3977637338 + elif [[ "$type" == "draid:1s" ]] then + exp_new_free=1946000384 fi + wait_for_autoexpand $exp_new_size $exp_new_free + + expand_size=$(get_pool_prop size $TESTPOOL1) + + log_note "$TESTPOOL1 '$type' grew from $prev_size -> $expand_size with" \ + "free space from $zfs_prev_size -> $(get_prop avail $TESTPOOL1)" + cleanup done From 15b64fbc94cc7aae1cef8787be2a36f677da92a7 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Fri, 26 Aug 2022 16:04:27 -0500 Subject: [PATCH 07/39] Importing from cachefile can trip assertion When importing from cachefile, it is possible that the builtin retry logic will trip an assertion because it also fails to find the pool. This fix addresses that case and returns the correct error message to the user. Reviewed-by: Richard Yao Reviewed-by: Serapheim Dimitropoulos Reviewed-by: Brian Behlendorf Signed-off-by: George Wilson Closes #13781 --- lib/libzutil/zutil_import.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index f6f125e7a5df..1658215199f2 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -1660,6 +1660,8 @@ zpool_find_import_cached(libpc_handle_t *hdl, importargs_t *iarg) * caller. */ nvpair_t *pair = nvlist_next_nvpair(nv, NULL); + if (pair == NULL) + continue; fnvlist_add_nvlist(pools, nvpair_name(pair), fnvpair_value_nvlist(pair)); From b6ebf270ebd2a202dae0e14d2f16a929cdfdd61c Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 2 Sep 2022 16:21:18 -0400 Subject: [PATCH 08/39] Apply arc_shrink_shift to ARC above arc_c_min It makes sense to free memory in smaller chunks when approaching arc_c_min to let other kernel subsystems to free more, since after that point we can't free anything. This also matches behavior on Linux, where to shrinker reported only the size above arc_c_min. 
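A rough worked example of the difference (the pool sizes and the 2 GiB floor below are assumptions chosen for illustration, not values taken from this patch): the old code freed arc_c >> arc_shrink_shift per step regardless of the floor, while the new code frees (arc_c - arc_c_min) >> arc_shrink_shift, so the reclaim step shrinks toward zero as the target approaches arc_c_min.

#include <stdio.h>
#include <stdint.h>

/* Illustrative arithmetic only; all sizes here are assumed example values. */
int
main(void)
{
	const int64_t GiB = 1LL << 30;
	const int64_t arc_c_min = 2 * GiB;
	const int arc_shrink_shift = 7;

	for (int64_t arc_c = 8 * GiB; arc_c > arc_c_min; arc_c -= 2 * GiB) {
		int64_t old_step = arc_c >> arc_shrink_shift;
		int64_t new_step = (arc_c - arc_c_min) >> arc_shrink_shift;
		printf("arc_c=%lld GiB: old step %lld MiB, new step %lld MiB\n",
		    (long long)(arc_c / GiB),
		    (long long)(old_step >> 20),
		    (long long)(new_step >> 20));
	}
	return (0);
}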
Reviewed-by: Ryan Moeller Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Closes #13794 --- module/os/freebsd/zfs/arc_os.c | 5 ++++- module/zfs/arc.c | 9 +++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index fddb1f0e87cb..77af092e1ed4 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -223,7 +223,10 @@ arc_lowmem(void *arg __unused, int howto __unused) arc_warm = B_TRUE; arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); free_memory = arc_available_memory(); - to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0); + int64_t can_free = arc_c - arc_c_min; + if (can_free <= 0) + return; + to_free = (can_free >> arc_shrink_shift) - MIN(free_memory, 0); DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free); arc_reduce_target_size(to_free); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 8d3882694718..215250ea6fec 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5036,10 +5036,11 @@ arc_reap_cb(void *arg, zthr_t *zthr) */ free_memory = arc_available_memory(); - int64_t to_free = - (arc_c >> arc_shrink_shift) - free_memory; - if (to_free > 0) { - arc_reduce_target_size(to_free); + int64_t can_free = arc_c - arc_c_min; + if (can_free > 0) { + int64_t to_free = (can_free >> arc_shrink_shift) - free_memory; + if (to_free > 0) + arc_reduce_target_size(to_free); } spl_fstrans_unmark(cookie); } From 78206a2e447a2973203aaebb92633a5e8e6d2d34 Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Tue, 9 Aug 2022 09:05:29 +0000 Subject: [PATCH 09/39] FreeBSD: Mark ZFS_MODULE_PARAM_CALL as MPSAFE ZFS_MODULE_PARAM_CALL handlers implement their own locking if needed and do not require Giant. Reviewed-by: Alexander Motin Signed-off-by: Ryan Moeller Closes #13756 --- include/os/freebsd/spl/sys/mod_os.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index 5695abee7b85..46ea2d15ac6e 100644 --- a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -52,7 +52,7 @@ #define ZFS_MODULE_PARAM_CALL_IMPL(parent, name, perm, args, desc) \ SYSCTL_DECL(parent); \ - SYSCTL_PROC(parent, OID_AUTO, name, perm | args, desc) + SYSCTL_PROC(parent, OID_AUTO, name, CTLFLAG_MPSAFE | perm | args, desc) #define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, func, _, perm, desc) \ ZFS_MODULE_PARAM_CALL_IMPL(_vfs_ ## scope_prefix, name, perm, func ## _args(name_prefix ## name), desc) From aa9e887d2a715e987c02c7bb57f88f63442c553b Mon Sep 17 00:00:00 2001 From: Samuel <50765275+npc203@users.noreply.github.com> Date: Tue, 6 Sep 2022 22:07:47 +0530 Subject: [PATCH 10/39] Fix column width in 'zpool iostat -v' and 'zpool list -v' This commit fixes a minor spacing issue caused when enumerating vdev names, which originated from #13031 Reviewed-by: Brian Behlendorf Reviewed-by: Akash B Signed-off-by: Samuel Wycliffe Closes #13811 --- cmd/zpool/zpool_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index b93a6196beea..b2e7dc4a5b4d 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -5458,8 +5458,8 @@ get_namewidth_iostat(zpool_handle_t *zhp, void *data) * get_namewidth() returns the maximum width of any name in that column * for any pool/vdev/device line that will be output. 
*/ - width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, - cb->cb_verbose); + width = get_namewidth(zhp, cb->cb_namewidth, + cb->cb_name_flags | VDEV_NAME_TYPE_ID, cb->cb_verbose); /* * The width we are calculating is the width of the header and also the @@ -6282,8 +6282,8 @@ get_namewidth_list(zpool_handle_t *zhp, void *data) list_cbdata_t *cb = data; int width; - width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, - cb->cb_verbose); + width = get_namewidth(zhp, cb->cb_namewidth, + cb->cb_name_flags | VDEV_NAME_TYPE_ID, cb->cb_verbose); if (width < 9) width = 9; From 2010c183bcc1a2edc2cb8ed5f2d065a35a891fec Mon Sep 17 00:00:00 2001 From: Walter Huf Date: Tue, 6 Sep 2022 10:02:18 -0700 Subject: [PATCH 11/39] Add xattr_handler support for Android kernels Some ARM BSPs run the Android kernel, which has a modified xattr_handler->get() function signature. This adds support to compile against these kernels. Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Walter Huf Closes #13824 --- config/kernel-xattr-handler.m4 | 29 +++++++++++++++++++- include/os/linux/kernel/linux/xattr_compat.h | 14 ++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/config/kernel-xattr-handler.m4 b/config/kernel-xattr-handler.m4 index 00b1e74a9ccb..b6cbfa155007 100644 --- a/config/kernel-xattr-handler.m4 +++ b/config/kernel-xattr-handler.m4 @@ -100,6 +100,19 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_GET], [ .get = get, }; ],[]) + + ZFS_LINUX_TEST_SRC([xattr_handler_get_dentry_inode_flags], [ + #include + + int get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, + size_t size, int flags) { return 0; } + static const struct xattr_handler + xops __attribute__ ((unused)) = { + .get = get, + }; + ],[]) ]) AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET], [ @@ -142,7 +155,21 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET], [ AC_DEFINE(HAVE_XATTR_GET_DENTRY, 1, [xattr_handler->get() wants dentry]) ],[ - ZFS_LINUX_TEST_ERROR([xattr get()]) + dnl # + dnl # Android API change, + dnl # The xattr_handler->get() callback was + dnl # changed to take dentry, inode and flags. + dnl # + AC_MSG_RESULT(no) + AC_MSG_CHECKING( + [whether xattr_handler->get() wants dentry and inode and flags]) + ZFS_LINUX_TEST_RESULT([xattr_handler_get_dentry_inode_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_XATTR_GET_DENTRY_INODE_FLAGS, 1, + [xattr_handler->get() wants dentry and inode and flags]) + ],[ + ZFS_LINUX_TEST_ERROR([xattr get()]) + ]) ]) ]) ]) diff --git a/include/os/linux/kernel/linux/xattr_compat.h b/include/os/linux/kernel/linux/xattr_compat.h index 54690727eab9..30403fe87397 100644 --- a/include/os/linux/kernel/linux/xattr_compat.h +++ b/include/os/linux/kernel/linux/xattr_compat.h @@ -115,6 +115,20 @@ fn(struct dentry *dentry, const char *name, void *buffer, size_t size, \ { \ return (__ ## fn(dentry->d_inode, name, buffer, size)); \ } +/* + * Android API change, + * The xattr_handler->get() callback was changed to take a dentry and inode + * and flags, because the dentry might not be attached to an inode yet. 
+ */ +#elif defined(HAVE_XATTR_GET_DENTRY_INODE_FLAGS) +#define ZPL_XATTR_GET_WRAPPER(fn) \ +static int \ +fn(const struct xattr_handler *handler, struct dentry *dentry, \ + struct inode *inode, const char *name, void *buffer, \ + size_t size, int flags) \ +{ \ + return (__ ## fn(inode, name, buffer, size)); \ +} #else #error "Unsupported kernel" #endif From 7bbfac9d04dc06ba339be0540fd50e5166df8516 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 8 Sep 2022 10:32:30 -0700 Subject: [PATCH 12/39] zed: Fix config_sync autoexpand flood Users were seeing floods of `config_sync` events when autoexpand was enabled. This happened because all "disk status change" udev events invoke the autoexpand codepath, which calls zpool_relabel_disk(), which in turn cause another "disk status change" event to happen, in a feedback loop. Note that "disk status change" happens every time a user calls close() on a block device. This commit breaks the feedback loop by only allowing an autoexpand to happen if the disk actually changed size. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes: #7132 Closes: #7366 Closes #13729 --- cmd/zed/agents/zfs_mod.c | 155 +++++++++++++++++++++++++++++++++++-- cmd/zed/zed_disk_event.c | 16 ++++ include/sys/sysevent/dev.h | 3 + 3 files changed, 166 insertions(+), 8 deletions(-) diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index a510d646e1f9..a4e23ca1a3b0 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -894,14 +894,90 @@ zfs_deliver_check(nvlist_t *nvl) return (0); } +/* + * Given a path to a vdev, lookup the vdev's physical size from its + * config nvlist. + * + * Returns the vdev's physical size in bytes on success, 0 on error. + */ +static uint64_t +vdev_size_from_config(zpool_handle_t *zhp, const char *vdev_path) +{ + nvlist_t *nvl = NULL; + boolean_t avail_spare, l2cache, log; + vdev_stat_t *vs = NULL; + uint_t c; + + nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log); + if (!nvl) + return (0); + + verify(nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + if (!vs) { + zed_log_msg(LOG_INFO, "%s: no nvlist for '%s'", __func__, + vdev_path); + return (0); + } + + return (vs->vs_pspace); +} + +/* + * Given a path to a vdev, lookup if the vdev is a "whole disk" in the + * config nvlist. "whole disk" means that ZFS was passed a whole disk + * at pool creation time, which it partitioned up and has full control over. + * Thus a partition with wholedisk=1 set tells us that zfs created the + * partition at creation time. A partition without whole disk set would have + * been created by externally (like with fdisk) and passed to ZFS. + * + * Returns the whole disk value (either 0 or 1). + */ +static uint64_t +vdev_whole_disk_from_config(zpool_handle_t *zhp, const char *vdev_path) +{ + nvlist_t *nvl = NULL; + boolean_t avail_spare, l2cache, log; + uint64_t wholedisk; + + nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log); + if (!nvl) + return (0); + + verify(nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk) == 0); + + return (wholedisk); +} + +/* + * If the device size grew more than 1% then return true. 
+ */ +#define DEVICE_GREW(oldsize, newsize) \ + ((newsize > oldsize) && \ + ((newsize / (newsize - oldsize)) <= 100)) + static int zfsdle_vdev_online(zpool_handle_t *zhp, void *data) { - char *devname = data; boolean_t avail_spare, l2cache; + nvlist_t *udev_nvl = data; nvlist_t *tgt; int error; + char *tmp_devname, devname[MAXPATHLEN]; + uint64_t guid; + + if (nvlist_lookup_uint64(udev_nvl, ZFS_EV_VDEV_GUID, &guid) == 0) { + sprintf(devname, "%llu", (u_longlong_t)guid); + } else if (nvlist_lookup_string(udev_nvl, DEV_PHYS_PATH, + &tmp_devname) == 0) { + strlcpy(devname, tmp_devname, MAXPATHLEN); + zfs_append_partition(devname, MAXPATHLEN); + } else { + zed_log_msg(LOG_INFO, "%s: no guid or physpath", __func__); + } + zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'", devname, zpool_get_name(zhp)); @@ -953,12 +1029,75 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data) vdev_state_t newstate; if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { - error = zpool_vdev_online(zhp, fullpath, 0, - &newstate); - zed_log_msg(LOG_INFO, "zfsdle_vdev_online: " - "setting device '%s' to ONLINE state " - "in pool '%s': %d", fullpath, - zpool_get_name(zhp), error); + /* + * If this disk size has not changed, then + * there's no need to do an autoexpand. To + * check we look at the disk's size in its + * config, and compare it to the disk size + * that udev is reporting. + */ + uint64_t udev_size = 0, conf_size = 0, + wholedisk = 0, udev_parent_size = 0; + + /* + * Get the size of our disk that udev is + * reporting. + */ + if (nvlist_lookup_uint64(udev_nvl, DEV_SIZE, + &udev_size) != 0) { + udev_size = 0; + } + + /* + * Get the size of our disk's parent device + * from udev (where sda1's parent is sda). + */ + if (nvlist_lookup_uint64(udev_nvl, + DEV_PARENT_SIZE, &udev_parent_size) != 0) { + udev_parent_size = 0; + } + + conf_size = vdev_size_from_config(zhp, + fullpath); + + wholedisk = vdev_whole_disk_from_config(zhp, + fullpath); + + /* + * Only attempt an autoexpand if the vdev size + * changed. There are two different cases + * to consider. + * + * 1. wholedisk=1 + * If you do a 'zpool create' on a whole disk + * (like /dev/sda), then zfs will create + * partitions on the disk (like /dev/sda1). In + * that case, wholedisk=1 will be set in the + * partition's nvlist config. So zed will need + * to see if your parent device (/dev/sda) + * expanded in size, and if so, then attempt + * the autoexpand. + * + * 2. wholedisk=0 + * If you do a 'zpool create' on an existing + * partition, or a device that doesn't allow + * partitions, then wholedisk=0, and you will + * simply need to check if the device itself + * expanded in size. 
+ */ + if (DEVICE_GREW(conf_size, udev_size) || + (wholedisk && DEVICE_GREW(conf_size, + udev_parent_size))) { + error = zpool_vdev_online(zhp, fullpath, + 0, &newstate); + + zed_log_msg(LOG_INFO, + "%s: autoexpanding '%s' from %llu" + " to %llu bytes in pool '%s': %d", + __func__, fullpath, conf_size, + MAX(udev_size, udev_parent_size), + zpool_get_name(zhp), error); + } } } zpool_close(zhp); @@ -989,7 +1128,7 @@ zfs_deliver_dle(nvlist_t *nvl) zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath"); } - if (zpool_iter(g_zfshdl, zfsdle_vdev_online, name) != 1) { + if (zpool_iter(g_zfshdl, zfsdle_vdev_online, nvl) != 1) { zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not " "found", name); return (1); diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c index d9b355292ab9..e31ec4cfc7e7 100644 --- a/cmd/zed/zed_disk_event.c +++ b/cmd/zed/zed_disk_event.c @@ -78,6 +78,8 @@ zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl) zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval); if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0) zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval); + if (nvlist_lookup_uint64(nvl, DEV_PARENT_SIZE, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_PARENT_SIZE, numval); if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0) zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval); if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0) @@ -130,6 +132,20 @@ dev_event_nvlist(struct udev_device *dev) numval *= strtoull(value, NULL, 10); (void) nvlist_add_uint64(nvl, DEV_SIZE, numval); + + /* + * If the device has a parent, then get the parent block + * device's size as well. For example, /dev/sda1's parent + * is /dev/sda. + */ + struct udev_device *parent_dev = udev_device_get_parent(dev); + if ((value = udev_device_get_sysattr_value(parent_dev, "size")) + != NULL) { + uint64_t numval = DEV_BSIZE; + + numval *= strtoull(value, NULL, 10); + (void) nvlist_add_uint64(nvl, DEV_PARENT_SIZE, numval); + } } /* diff --git a/include/sys/sysevent/dev.h b/include/sys/sysevent/dev.h index 1117538d822d..2418bbad469d 100644 --- a/include/sys/sysevent/dev.h +++ b/include/sys/sysevent/dev.h @@ -244,6 +244,9 @@ extern "C" { #define DEV_PATH "path" #define DEV_IS_PART "is_slice" #define DEV_SIZE "dev_size" + +/* Size of the whole parent block device (if dev is a partition) */ +#define DEV_PARENT_SIZE "dev_parent_size" #endif /* __linux__ */ #define EV_V1 1 From c8f795ba53acfe0239bfa5d75f64dce8e390a992 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Thu, 23 Dec 2021 21:26:19 +0100 Subject: [PATCH 13/39] config: check for parallel(1), use it for cstyle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before: $ time make cstyle real 0m23.118s user 0m23.002s sys 0m0.114s After: $ time make cstyle real 0m4.577s user 0m31.487s sys 0m0.699s Reviewed-by: Brian Behlendorf Signed-off-by: Ahelenia Ziemiańska Issue #12899 --- Makefile.am | 7 ++++++- config/always-parallel.m4 | 8 ++++++++ config/zfs-build.m4 | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 config/always-parallel.m4 diff --git a/Makefile.am b/Makefile.am index 7e2b10b39dee..36d8cd2d6fb8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -114,6 +114,11 @@ commitcheck: ${top_srcdir}/scripts/commitcheck.sh; \ fi +if HAVE_PARALLEL +cstyle_line = -print0 | parallel -X0 ${top_srcdir}/scripts/cstyle.pl -cpP {} +else +cstyle_line = -exec ${top_srcdir}/scripts/cstyle.pl -cpP {} + 
+endif PHONY += cstyle cstyle: @find ${top_srcdir} -name build -prune \ @@ -122,7 +127,7 @@ cstyle: ! -name 'opt_global.h' ! -name '*_if*.h' \ ! -name 'zstd_compat_wrapper.h' \ ! -path './module/zstd/lib/*' \ - -exec ${top_srcdir}/scripts/cstyle.pl -cpP {} \+ + $(cstyle_line) filter_executable = -exec test -x '{}' \; -print diff --git a/config/always-parallel.m4 b/config/always-parallel.m4 new file mode 100644 index 000000000000..c1f1ae78e7e7 --- /dev/null +++ b/config/always-parallel.m4 @@ -0,0 +1,8 @@ +dnl # +dnl # Check if GNU parallel is available. +dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PARALLEL], [ + AC_CHECK_PROG([PARALLEL], [parallel], [yes]) + + AM_CONDITIONAL([HAVE_PARALLEL], [test "x$PARALLEL" = "xyes"]) +]) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 8ca596ecf06b..bd8e3ac80201 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -226,6 +226,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ ZFS_AC_CONFIG_ALWAYS_SED ZFS_AC_CONFIG_ALWAYS_CPPCHECK ZFS_AC_CONFIG_ALWAYS_SHELLCHECK + ZFS_AC_CONFIG_ALWAYS_PARALLEL ]) AC_DEFUN([ZFS_AC_CONFIG], [ From 8bd3dca9bf3e9a4315d58be316bcfaf8e76c6a6a Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Thu, 11 Nov 2021 21:52:16 +0100 Subject: [PATCH 14/39] Introduce a tunable to exclude special class buffers from L2ARC Special allocation class or dedup vdevs may have roughly the same performance as L2ARC vdevs. Introduce a new tunable to exclude those buffers from being cacheable on L2ARC. Reviewed-by: Don Brady Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #11761 Closes #12285 --- include/sys/arc.h | 1 + include/sys/dbuf.h | 11 +------ include/sys/dmu_objset.h | 4 --- man/man4/zfs.4 | 5 +++ module/zfs/arc.c | 12 +++++++ module/zfs/dbuf.c | 71 +++++++++++++++++++++++++++++++++++++--- module/zfs/dmu.c | 2 +- module/zfs/dmu_objset.c | 34 +++++++++++++++++-- 8 files changed, 119 insertions(+), 21 deletions(-) diff --git a/include/sys/arc.h b/include/sys/arc.h index a3241f3685a6..5d8176894e60 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -85,6 +85,7 @@ typedef void arc_prune_func_t(int64_t bytes, void *priv); /* Shared module parameters */ extern int zfs_arc_average_blocksize; +extern int l2arc_exclude_special; /* generic arc_done_func_t's which you can use */ arc_read_done_func_t arc_bcopy_func; diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 93d80066be82..2e7385113ec5 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -441,16 +441,7 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg) (dbuf_is_metadata(_db) && \ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) -#define DBUF_IS_L2CACHEABLE(_db) \ - ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ - (dbuf_is_metadata(_db) && \ - ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) - -#define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level) \ - ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || \ - (((_level) > 0 || \ - DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) && \ - ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA))) +boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db); #ifdef ZFS_DEBUG diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index e89ee64ea686..7ade2dc91247 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -200,10 +200,6 @@ struct objset { #define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode) #define DMU_PROJECTUSED_DNODE(os) ((os)->os_projectused_dnode.dnh_dnode) -#define DMU_OS_IS_L2CACHEABLE(os) \ - 
((os)->os_secondary_cache == ZFS_CACHE_ALL || \ - (os)->os_secondary_cache == ZFS_CACHE_METADATA) - /* called from zpl */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 6495e9b4cd20..c32dd4b1b27f 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -109,6 +109,11 @@ A value of .Sy 100 disables this feature. . +.It Sy l2arc_exclude_special Ns = Ns Sy 0 Ns | Ns 1 Pq int +Controls whether buffers present on special vdevs are eligibile for caching +into L2ARC. +If set to 1, exclude dbufs on special vdevs from being cached to L2ARC. +. .It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq int Controls whether only MFU metadata and data are cached from ARC into L2ARC. This may be desired to avoid wasting space on L2ARC when reading/writing large diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 215250ea6fec..0ba366f1858f 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -877,6 +877,14 @@ static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, #define l2arc_hdr_arcstats_decrement_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE) +/* + * l2arc_exclude_special : A zfs module parameter that controls whether buffers + * present on special vdevs are eligibile for caching in L2ARC. If + * set to 1, exclude dbufs on special vdevs from being cached to + * L2ARC. + */ +int l2arc_exclude_special = 0; + /* * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU * metadata and data are cached from ARC into L2ARC. @@ -11136,6 +11144,10 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, "Cache only MFU data from ARC into L2ARC"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW, + "If set to 1 exclude dbufs on special vdevs from being cached to " + "L2ARC."); + ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes"); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index e687d96501ed..1a022c8b8a07 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -53,6 +53,7 @@ #include #include #include +#include kstat_t *dbuf_ksp; @@ -594,6 +595,68 @@ dbuf_is_metadata(dmu_buf_impl_t *db) } } +/* + * We want to exclude buffers that are on a special allocation class from + * L2ARC. 
+ */ +boolean_t +dbuf_is_l2cacheable(dmu_buf_impl_t *db) +{ + vdev_t *vd = NULL; + zfs_cache_type_t cache = db->db_objset->os_secondary_cache; + blkptr_t *bp = db->db_blkptr; + + if (bp != NULL && !BP_IS_HOLE(bp)) { + uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); + vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev; + + if (vdev < rvd->vdev_children) + vd = rvd->vdev_child[vdev]; + + if (cache == ZFS_CACHE_ALL || + (dbuf_is_metadata(db) && cache == ZFS_CACHE_METADATA)) { + if (vd == NULL) + return (B_TRUE); + + if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || + l2arc_exclude_special == 0) + return (B_TRUE); + } + } + + return (B_FALSE); +} + +static inline boolean_t +dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level) +{ + vdev_t *vd = NULL; + zfs_cache_type_t cache = dn->dn_objset->os_secondary_cache; + + if (bp != NULL && !BP_IS_HOLE(bp)) { + uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); + vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev; + + if (vdev < rvd->vdev_children) + vd = rvd->vdev_child[vdev]; + + if (cache == ZFS_CACHE_ALL || ((level > 0 || + DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)) && + cache == ZFS_CACHE_METADATA)) { + if (vd == NULL) + return (B_TRUE); + + if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || + l2arc_exclude_special == 0) + return (B_TRUE); + } + } + + return (B_FALSE); +} + /* * This function *must* return indices evenly distributed between all @@ -1523,7 +1586,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, DTRACE_SET_STATE(db, "read issued"); mutex_exit(&db->db_mtx); - if (DBUF_IS_L2CACHEABLE(db)) + if (dbuf_is_l2cacheable(db)) aflags |= ARC_FLAG_L2CACHE; dbuf_add_ref(db, NULL); @@ -3372,7 +3435,7 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, dpa->dpa_arg = arg; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) + if (dnode_level_is_l2cacheable(&bp, dn, level)) dpa->dpa_aflags |= ARC_FLAG_L2CACHE; /* @@ -3390,7 +3453,7 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) + if (dnode_level_is_l2cacheable(&bp, dn, level)) iter_aflags |= ARC_FLAG_L2CACHE; SET_BOOKMARK(&zb, ds != NULL ? 
ds->ds_object : DMU_META_OBJSET, @@ -4989,7 +5052,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) children_ready_cb = dbuf_write_children_ready; dr->dr_zio = arc_write(pio, os->os_spa, txg, - &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), + &dr->dr_bp_copy, data, dbuf_is_l2cacheable(db), &zp, dbuf_write_ready, children_ready_cb, dbuf_write_physdone, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 4e7127bd1bab..e38c9b452a28 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1846,7 +1846,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, - zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), + zgd->zgd_bp, dr->dt.dl.dr_data, dbuf_is_l2cacheable(db), &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index b9380890230c..a8975797e8af 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -63,6 +63,8 @@ #include #include #include "zfs_namecheck.h" +#include +#include /* * Needed to close a window in dnode_move() that allows the objset to be freed @@ -411,6 +413,34 @@ dnode_multilist_index_func(multilist_t *ml, void *obj) multilist_get_num_sublists(ml)); } +static inline boolean_t +dmu_os_is_l2cacheable(objset_t *os) +{ + vdev_t *vd = NULL; + zfs_cache_type_t cache = os->os_secondary_cache; + blkptr_t *bp = os->os_rootbp; + + if (bp != NULL && !BP_IS_HOLE(bp)) { + uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); + vdev_t *rvd = os->os_spa->spa_root_vdev; + + if (vdev < rvd->vdev_children) + vd = rvd->vdev_child[vdev]; + + if (cache == ZFS_CACHE_ALL || cache == ZFS_CACHE_METADATA) { + if (vd == NULL) + return (B_TRUE); + + if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || + l2arc_exclude_special == 0) + return (B_TRUE); + } + } + + return (B_FALSE); +} + /* * Instantiates the objset_t in-memory structure corresponding to the * objset_phys_t that's pointed to by the specified blkptr_t. @@ -453,7 +483,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - if (DMU_OS_IS_L2CACHEABLE(os)) + if (dmu_os_is_l2cacheable(os)) aflags |= ARC_FLAG_L2CACHE; if (ds != NULL && ds->ds_dir->dd_crypto_obj != 0) { @@ -1661,7 +1691,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) } zio = arc_write(pio, os->os_spa, tx->tx_txg, - blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), + blkptr_copy, os->os_phys_buf, dmu_os_is_l2cacheable(os), &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); From 03fa3ef264ab80296bad9b4ad54af8714a03df95 Mon Sep 17 00:00:00 2001 From: Akash B Date: Wed, 9 Mar 2022 05:50:41 +0530 Subject: [PATCH 15/39] Add physical device size to SIZE column in 'zpool list -v' Add physical device size/capacity only for physical devices in 'zpool list -v' instead of displaying "-" in the SIZE column. This would make it easier to see the individual device capacity and to determine which spares are large enough to replace which devices. 
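For callers that want the new value programmatically, a minimal sketch of reading vs_pspace, modeled directly on the vdev_size_from_config() helper added by patch 12 earlier in this series; the function name is hypothetical, the caller is assumed to already hold an open zpool_handle_t, and error handling plus the surrounding zpool_iter() loop are omitted:

#include <libzfs.h>

/*
 * Sketch only: look up a vdev by path and return its physical capacity
 * from the new vs_pspace stat field.
 */
static uint64_t
leaf_physical_size(zpool_handle_t *zhp, const char *vdev_path)
{
	boolean_t avail_spare, l2cache, log;
	vdev_stat_t *vs = NULL;
	nvlist_t *nvl;
	uint_t c;

	nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log);
	if (nvl == NULL || nvlist_lookup_uint64_array(nvl,
	    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) != 0 || vs == NULL)
		return (0);

	return (vs->vs_pspace);	/* stays 0 when the vdev is not a leaf */
}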
Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Dipak Ghosh Signed-off-by: Akash B Closes #12561 Closes #13106 --- cmd/zpool/zpool_main.c | 9 +++++++-- include/sys/fs/zfs.h | 1 + module/zfs/vdev.c | 1 + .../functional/pool_checkpoint/checkpoint_lun_expsz.ksh | 6 +++++- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index b2e7dc4a5b4d..3f46bd0513c9 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -6035,6 +6035,7 @@ print_one_column(zpool_prop_t prop, uint64_t value, const char *str, size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); switch (prop) { + case ZPOOL_PROP_SIZE: case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: case ZPOOL_PROP_DEDUPRATIO: @@ -6130,8 +6131,12 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, * 'toplevel' boolean value is passed to the print_one_column() * to indicate that the value is valid. */ - print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, NULL, scripted, - toplevel, format); + if (vs->vs_pspace) + print_one_column(ZPOOL_PROP_SIZE, vs->vs_pspace, NULL, + scripted, B_TRUE, format); + else + print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, NULL, + scripted, toplevel, format); print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 5bbac576df02..c8e199fc679c 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1102,6 +1102,7 @@ typedef struct vdev_stat { uint64_t vs_configured_ashift; /* TLV vdev_ashift */ uint64_t vs_logical_ashift; /* vdev_logical_ashift */ uint64_t vs_physical_ashift; /* vdev_physical_ashift */ + uint64_t vs_pspace; /* physical capacity */ } vdev_stat_t; /* BEGIN CSTYLED */ diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 636bb5005a1e..ccc35adc9f4b 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4406,6 +4406,7 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { + vs->vs_pspace = vd->vdev_psize; vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh index 59f64081a977..a18e634cefa7 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh @@ -48,14 +48,18 @@ log_must zpool checkpoint $NESTEDPOOL log_must truncate -s $EXPSZ $FILEDISK1 log_must zpool online -e $NESTEDPOOL $FILEDISK1 NEWSZ=$(zpool list -v | grep "$FILEDISK1" | awk '{print $2}') +DEXPSZ=$(zpool list -v | grep "$FILEDISK1" | awk '{print $6}') nested_change_state_after_checkpoint log_mustnot [ "$INITSZ" = "$NEWSZ" ] +log_must [ "$DEXPSZ" = "-" ] log_must zpool export $NESTEDPOOL log_must zpool import -d $FILEDISKDIR --rewind-to-checkpoint $NESTEDPOOL nested_verify_pre_checkpoint_state FINSZ=$(zpool list -v | grep "$FILEDISK1" | awk '{print $2}') -log_must [ "$INITSZ" = "$FINSZ" ] +DEXPSZ=$(zpool list -v | grep "$FILEDISK1" | awk '{print $6}') +log_must [ "$EXPSZ" = "$FINSZ" ] +log_must [ "$DEXPSZ" != "-" ] log_pass "LUN expansion rewinded correctly." 
From 3f7c174b50a3430adf68ae2f91146cc4206c5e0c Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Mon, 12 Sep 2022 15:51:17 -0400 Subject: [PATCH 16/39] vdev_draid_lookup_map() should not iterate outside draid_maps Coverity reported this as an out-of-bounds read. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Neal Gompa Signed-off-by: Richard Yao Closes #13865 --- module/zfs/vdev_draid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index db87e69f2057..7e654ca24d20 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -541,7 +541,7 @@ vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp) int vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp) { - for (int i = 0; i <= VDEV_DRAID_MAX_MAPS; i++) { + for (int i = 0; i < VDEV_DRAID_MAX_MAPS; i++) { if (draid_maps[i].dm_children == children) { *mapp = &draid_maps[i]; return (0); From cde04badd1d76e3af050fb0232d06af8550e8b8b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 Sep 2022 02:04:15 +0200 Subject: [PATCH 17/39] make DMU_OT_IS_METADATA and DMU_OT_IS_ENCRYPTED return B_TRUE or B_FALSE Without this patch, the ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); at the beginning of dbuf_assign_arcbuf can panic if the object type is a DMU_OT_NEWTYPE that has DMU_OT_METADATA set. While we're at it, fix DMU_OT_IS_ENCRYPTED as well. Reviewed-by: Richard Yao Reviewed-by: Alexander Motin Signed-off-by: Christian Schwarz Closes #13842 --- include/sys/dmu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 0cf4dbc9f925..070d27fde3a9 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -136,7 +136,7 @@ typedef enum dmu_object_byteswap { #endif #define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ - ((ot) & DMU_OT_METADATA) : \ + (((ot) & DMU_OT_METADATA) != 0) : \ DMU_OT_IS_METADATA_IMPL(ot)) #define DMU_OT_IS_DDT(ot) \ @@ -147,7 +147,7 @@ typedef enum dmu_object_byteswap { ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER) #define DMU_OT_IS_ENCRYPTED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ - ((ot) & DMU_OT_ENCRYPTED) : \ + (((ot) & DMU_OT_ENCRYPTED) != 0) : \ DMU_OT_IS_ENCRYPTED_IMPL(ot)) /* From a5b0d42540594c3df5c3c09cb552974c8bed0064 Mon Sep 17 00:00:00 2001 From: Ameer Hamza <106930537+ixhamza@users.noreply.github.com> Date: Sat, 17 Sep 2022 01:52:25 +0500 Subject: [PATCH 18/39] zfs recv hangs if max recordsize is less than received recordsize - Some optimizations for bqueue enqueue/dequeue. - Added a fix to prevent deadlock when both bqueue_enqueue_impl() and bqueue_dequeue() waits for signal to be triggered. 
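A plausible reading of the hang, inferred from the guard added below rather than stated explicitly above: with the old wait condition, a single record larger than bq_maxsize arriving on an empty queue left the producer sleeping for space that could never appear while the consumer slept waiting for data. A hypothetical before/after comparison of just that predicate (the 16 MiB and 32 MiB figures are made-up example values):

#include <stdio.h>
#include <stddef.h>

/* Illustration only, not part of the patch. */
static int
old_must_wait(size_t bq_size, size_t item_size, size_t bq_maxsize)
{
	return (bq_size + item_size > bq_maxsize);
}

static int
new_must_wait(size_t bq_size, size_t item_size, size_t bq_maxsize)
{
	/* an empty queue always admits the item, even an oversized one */
	return (bq_size != 0 && bq_size + item_size > bq_maxsize);
}

int
main(void)
{
	size_t bq_maxsize = 16 << 20;	/* queue sized for small records */
	size_t item_size = 32 << 20;	/* received record exceeds it */

	/* queue empty, consumer asleep in bqueue_dequeue() */
	printf("old predicate blocks producer: %d\n",
	    old_must_wait(0, item_size, bq_maxsize));
	printf("new predicate blocks producer: %d\n",
	    new_must_wait(0, item_size, bq_maxsize));
	return (0);
}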
Reviewed-by: Alexander Motin Reviewed-by: Ryan Moeller Signed-off-by: Ameer Hamza Closes #13855 --- include/sys/bqueue.h | 14 +++++++------- include/sys/spa.h | 6 +++--- module/zfs/bqueue.c | 23 +++++++++++++---------- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/include/sys/bqueue.h b/include/sys/bqueue.h index 797aecd791a3..b9621966027a 100644 --- a/include/sys/bqueue.h +++ b/include/sys/bqueue.h @@ -30,22 +30,22 @@ typedef struct bqueue { kmutex_t bq_lock; kcondvar_t bq_add_cv; kcondvar_t bq_pop_cv; - uint64_t bq_size; - uint64_t bq_maxsize; - uint64_t bq_fill_fraction; + size_t bq_size; + size_t bq_maxsize; + uint_t bq_fill_fraction; size_t bq_node_offset; } bqueue_t; typedef struct bqueue_node { list_node_t bqn_node; - uint64_t bqn_size; + size_t bqn_size; } bqueue_node_t; -int bqueue_init(bqueue_t *, uint64_t, uint64_t, size_t); +int bqueue_init(bqueue_t *, uint_t, size_t, size_t); void bqueue_destroy(bqueue_t *); -void bqueue_enqueue(bqueue_t *, void *, uint64_t); -void bqueue_enqueue_flush(bqueue_t *, void *, uint64_t); +void bqueue_enqueue(bqueue_t *, void *, size_t); +void bqueue_enqueue_flush(bqueue_t *, void *, size_t); void *bqueue_dequeue(bqueue_t *); boolean_t bqueue_empty(bqueue_t *); diff --git a/include/sys/spa.h b/include/sys/spa.h index f168015abffc..3eebcd84fccf 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -78,9 +78,9 @@ struct dsl_crypto_params; * against the cost of COWing a giant block to modify one byte, and the * large latency of reading or writing a large block. * - * Note that although blocks up to 16MB are supported, the recordsize - * property can not be set larger than zfs_max_recordsize (default 1MB). - * See the comment near zfs_max_recordsize in dsl_dataset.c for details. + * The recordsize property can not be set larger than zfs_max_recordsize + * (default 16MB on 64-bit and 1MB on 32-bit). See the comment near + * zfs_max_recordsize in dsl_dataset.c for details. * * Note that although the LSIZE field of the blkptr_t can store sizes up * to 32MB, the dnode's dn_datablkszsec can only store sizes up to diff --git a/module/zfs/bqueue.c b/module/zfs/bqueue.c index 22539efc4e23..ec5ce4388ec8 100644 --- a/module/zfs/bqueue.c +++ b/module/zfs/bqueue.c @@ -42,8 +42,7 @@ obj2node(bqueue_t *q, void *data) * Return 0 on success, or -1 on failure. 
*/ int -bqueue_init(bqueue_t *q, uint64_t fill_fraction, uint64_t size, - size_t node_offset) +bqueue_init(bqueue_t *q, uint_t fill_fraction, size_t size, size_t node_offset) { if (fill_fraction == 0) { return (-1); @@ -78,22 +77,26 @@ bqueue_destroy(bqueue_t *q) } static void -bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size, - boolean_t flush) +bqueue_enqueue_impl(bqueue_t *q, void *data, size_t item_size, boolean_t flush) { ASSERT3U(item_size, >, 0); ASSERT3U(item_size, <=, q->bq_maxsize); mutex_enter(&q->bq_lock); obj2node(q, data)->bqn_size = item_size; - while (q->bq_size + item_size > q->bq_maxsize) { + while (q->bq_size && q->bq_size + item_size > q->bq_maxsize) { + /* + * Wake up bqueue_dequeue() thread if already sleeping in order + * to prevent the deadlock condition + */ + cv_signal(&q->bq_pop_cv); cv_wait_sig(&q->bq_add_cv, &q->bq_lock); } q->bq_size += item_size; list_insert_tail(&q->bq_list, data); - if (q->bq_size >= q->bq_maxsize / q->bq_fill_fraction) - cv_signal(&q->bq_pop_cv); if (flush) cv_broadcast(&q->bq_pop_cv); + else if (q->bq_size >= q->bq_maxsize / q->bq_fill_fraction) + cv_signal(&q->bq_pop_cv); mutex_exit(&q->bq_lock); } @@ -103,7 +106,7 @@ bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size, * > 0. */ void -bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) +bqueue_enqueue(bqueue_t *q, void *data, size_t item_size) { bqueue_enqueue_impl(q, data, item_size, B_FALSE); } @@ -117,7 +120,7 @@ bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) * destroy the condvar before the enqueuing thread is done. */ void -bqueue_enqueue_flush(bqueue_t *q, void *data, uint64_t item_size) +bqueue_enqueue_flush(bqueue_t *q, void *data, size_t item_size) { bqueue_enqueue_impl(q, data, item_size, B_TRUE); } @@ -130,7 +133,7 @@ void * bqueue_dequeue(bqueue_t *q) { void *ret = NULL; - uint64_t item_size; + size_t item_size; mutex_enter(&q->bq_lock); while (q->bq_size == 0) { cv_wait_sig(&q->bq_pop_cv, &q->bq_lock); From 999830a0212e74530d41b9ef3022fa91166c5d58 Mon Sep 17 00:00:00 2001 From: Kevin Jin <33590050+jxdking@users.noreply.github.com> Date: Thu, 1 Jul 2021 11:20:27 -0400 Subject: [PATCH 19/39] Optimize txg_kick() process (#12274) Use dp_dirty_pertxg[] for txg_kick(), instead of dp_dirty_total in original code. Extra parameter "txg" is added for txg_kick(), thus it knows which txg to kick. Also txg_kick() call is moved from dsl_pool_need_dirty_delay() to dsl_pool_dirty_space() so that we can know the txg number assigned for txg_kick(). Some unnecessary code regarding dp_dirty_total in txg_sync_thread() is also cleaned up. Reviewed-by: Brian Behlendorf Reviewed-by: Matthew Ahrens Reviewed-by: Alexander Motin Signed-off-by: jxdking Closes #12274 --- include/sys/txg.h | 2 +- module/zfs/dsl_pool.c | 25 +++++++++++++++++++------ module/zfs/txg.c | 36 ++++++++++++++---------------------- 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/include/sys/txg.h b/include/sys/txg.h index 22158bd1a5e6..f38f0006c040 100644 --- a/include/sys/txg.h +++ b/include/sys/txg.h @@ -78,7 +78,7 @@ extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta, hrtime_t resolution); -extern void txg_kick(struct dsl_pool *dp); +extern void txg_kick(struct dsl_pool *dp, uint64_t txg); /* * Wait until the given transaction group has finished syncing. 
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 456ef5372e2e..ba1cb96a76fc 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -902,18 +902,26 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - uint64_t dirty_min_bytes = - zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; - uint64_t dirty; mutex_enter(&dp->dp_lock); - dirty = dp->dp_dirty_total; + uint64_t dirty = dp->dp_dirty_total; mutex_exit(&dp->dp_lock); - if (dirty > dirty_min_bytes) - txg_kick(dp); + return (dirty > delay_min_bytes); } +static boolean_t +dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg) +{ + ASSERT(MUTEX_HELD(&dp->dp_lock)); + + uint64_t dirty_min_bytes = + zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; + uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; + + return (dirty > dirty_min_bytes); +} + void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) { @@ -921,7 +929,12 @@ dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) mutex_enter(&dp->dp_lock); dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; dsl_pool_dirty_delta(dp, space); + boolean_t needsync = !dmu_tx_is_syncing(tx) && + dsl_pool_need_dirty_sync(dp, tx->tx_txg); mutex_exit(&dp->dp_lock); + + if (needsync) + txg_kick(dp, tx->tx_txg); } } diff --git a/module/zfs/txg.c b/module/zfs/txg.c index c55b1d8f9601..c9eb84bbdb12 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -498,14 +498,6 @@ txg_wait_callbacks(dsl_pool_t *dp) taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0); } -static boolean_t -txg_is_syncing(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); - return (tx->tx_syncing_txg != 0); -} - static boolean_t txg_is_quiescing(dsl_pool_t *dp) { @@ -539,8 +531,6 @@ txg_sync_thread(void *arg) clock_t timeout = zfs_txg_timeout * hz; clock_t timer; uint64_t txg; - uint64_t dirty_min_bytes = - zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; /* * We sync when we're scanning, there's someone waiting @@ -551,8 +541,7 @@ txg_sync_thread(void *arg) while (!dsl_scan_active(dp->dp_scan) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && - !txg_has_quiesced_to_sync(dp) && - dp->dp_dirty_total < dirty_min_bytes) { + !txg_has_quiesced_to_sync(dp)) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", (u_longlong_t)tx->tx_synced_txg, (u_longlong_t)tx->tx_sync_txg_waiting, dp); @@ -566,6 +555,11 @@ txg_sync_thread(void *arg) * prompting it to do so if necessary. */ while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { + if (txg_is_quiescing(dp)) { + txg_thread_wait(tx, &cpr, + &tx->tx_quiesce_done_cv, 0); + continue; + } if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; cv_broadcast(&tx->tx_quiesce_more_cv); @@ -791,24 +785,22 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce) } /* - * If there isn't a txg syncing or in the pipeline, push another txg through - * the pipeline by quiescing the open txg. + * Pass in the txg number that should be synced. 
*/ void -txg_kick(dsl_pool_t *dp) +txg_kick(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; ASSERT(!dsl_pool_config_held(dp)); + if (tx->tx_sync_txg_waiting >= txg) + return; + mutex_enter(&tx->tx_sync_lock); - if (!txg_is_syncing(dp) && - !txg_is_quiescing(dp) && - tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && - tx->tx_sync_txg_waiting <= tx->tx_synced_txg && - tx->tx_quiesced_txg <= tx->tx_synced_txg) { - tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; - cv_broadcast(&tx->tx_quiesce_more_cv); + if (tx->tx_sync_txg_waiting < txg) { + tx->tx_sync_txg_waiting = txg; + cv_broadcast(&tx->tx_sync_more_cv); } mutex_exit(&tx->tx_sync_lock); } From d05f3039f7749508229e16b981723ec95c8764d1 Mon Sep 17 00:00:00 2001 From: Kevin Jin <33590050+jxdking@users.noreply.github.com> Date: Tue, 20 Jul 2021 11:40:24 -0400 Subject: [PATCH 20/39] Add Module Parameter Regarding Log Size Limit zfs_wrlog_data_max The upper limit of TX_WRITE log data. Once it is reached, write operation is blocked, until log data is cleared out after txg sync. It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. Reviewed-by: Prakash Surya Reviewed-by: Brian Behlendorf Signed-off-by: jxdking Closes #12284 --- include/sys/dmu_tx.h | 1 + include/sys/dsl_pool.h | 7 ++++++ man/man4/zfs.4 | 12 +++++++++ module/zfs/arc.c | 12 +++++++++ module/zfs/dmu_tx.c | 7 ++++++ module/zfs/dsl_pool.c | 57 ++++++++++++++++++++++++++++++++++++++++++ module/zfs/zfs_log.c | 5 ++++ module/zfs/zvol.c | 7 ++++-- 8 files changed, 106 insertions(+), 2 deletions(-) diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h index 60e9ed6e26f5..71a9ac7ca7bf 100644 --- a/include/sys/dmu_tx.h +++ b/include/sys/dmu_tx.h @@ -124,6 +124,7 @@ typedef struct dmu_tx_stats { kstat_named_t dmu_tx_dirty_throttle; kstat_named_t dmu_tx_dirty_delay; kstat_named_t dmu_tx_dirty_over_max; + kstat_named_t dmu_tx_wrlog_over_max; kstat_named_t dmu_tx_dirty_frees_delay; kstat_named_t dmu_tx_quota; } dmu_tx_stats_t; diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index e5eb9a20e9ca..1b4e2924facf 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -40,6 +40,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -58,6 +59,7 @@ struct dsl_deadlist; extern unsigned long zfs_dirty_data_max; extern unsigned long zfs_dirty_data_max_max; +extern unsigned long zfs_wrlog_data_max; extern int zfs_dirty_data_sync_percent; extern int zfs_dirty_data_max_percent; extern int zfs_dirty_data_max_max_percent; @@ -118,6 +120,9 @@ typedef struct dsl_pool { uint64_t dp_mos_compressed_delta; uint64_t dp_mos_uncompressed_delta; + aggsum_t dp_wrlog_pertxg[TXG_SIZE]; + aggsum_t dp_wrlog_total; + /* * Time of most recently scheduled (furthest in the future) * wakeup for delayed transactions. 
@@ -158,6 +163,8 @@ uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy); uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy); uint64_t dsl_pool_deferred_space(dsl_pool_t *dp); +void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg); +boolean_t dsl_pool_wrlog_over_max(dsl_pool_t *dp); void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index c32dd4b1b27f..19c67a61ad74 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1096,6 +1096,18 @@ Start syncing out a transaction group if there's at least this much dirty data This should be less than .Sy zfs_vdev_async_write_active_min_dirty_percent . . +.It Sy zfs_wrlog_data_max Ns = Pq int +The upper limit of write-transaction zil log data size in bytes. +Once it is reached, write operation is blocked, until log data is cleared out +after transaction group sync. Because of some overhead, it should be set +at least 2 times the size of +.Sy zfs_dirty_data_max +.No to prevent harming normal write throughput. +It also should be smaller than the size of the slog device if slog is present. +.Pp +Defaults to +.Sy zfs_dirty_data_max*2 +. .It Sy zfs_fallocate_reserve_percent Ns = Ns Sy 110 Ns % Pq uint Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be preallocated for a file in order to guarantee that later writes will not diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 0ba366f1858f..17193ed079fe 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8062,6 +8062,18 @@ arc_init(void) zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } + + if (zfs_wrlog_data_max == 0) { + + /* + * dp_wrlog_total is reduced for each txg at the end of + * spa_sync(). However, dp_dirty_total is reduced every time + * a block is written out. Thus under normal operation, + * dp_wrlog_total could grow 2 times as big as + * zfs_dirty_data_max. + */ + zfs_wrlog_data_max = zfs_dirty_data_max * 2; + } } void diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 0beb983f992f..5fa516866668 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -53,6 +53,7 @@ dmu_tx_stats_t dmu_tx_stats = { { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, + { "dmu_tx_wrlog_over_max", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_quota", KSTAT_DATA_UINT64 }, }; @@ -884,6 +885,12 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) return (SET_ERROR(ERESTART)); } + if (!tx->tx_dirty_delayed && + dsl_pool_wrlog_over_max(tx->tx_pool)) { + DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max); + return (SET_ERROR(ERESTART)); + } + if (!tx->tx_dirty_delayed && dsl_pool_need_dirty_delay(tx->tx_pool)) { tx->tx_wait_dirty = B_TRUE; diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index ba1cb96a76fc..b91dd4cfa8a6 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -104,6 +104,14 @@ unsigned long zfs_dirty_data_max_max = 0; int zfs_dirty_data_max_percent = 10; int zfs_dirty_data_max_max_percent = 25; +/* + * zfs_wrlog_data_max, the upper limit of TX_WRITE log data. + * Once it is reached, write operation is blocked, + * until log data is cleared out after txg sync. + * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. 
+ */ +unsigned long zfs_wrlog_data_max = 0; + /* * If there's at least this much dirty data (as a percentage of * zfs_dirty_data_max), push out a txg. This should be less than @@ -220,6 +228,11 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); + aggsum_init(&dp->dp_wrlog_total, 0); + for (int i = 0; i < TXG_SIZE; i++) { + aggsum_init(&dp->dp_wrlog_pertxg[i], 0); + } + dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri, boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); @@ -416,6 +429,14 @@ dsl_pool_close(dsl_pool_t *dp) rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); cv_destroy(&dp->dp_spaceavail_cv); + + ASSERT0(aggsum_value(&dp->dp_wrlog_total)); + aggsum_fini(&dp->dp_wrlog_total); + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(aggsum_value(&dp->dp_wrlog_pertxg[i])); + aggsum_fini(&dp->dp_wrlog_pertxg[i]); + } + taskq_destroy(dp->dp_unlinked_drain_taskq); taskq_destroy(dp->dp_zrele_taskq); if (dp->dp_blkstats != NULL) @@ -590,6 +611,36 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) cv_signal(&dp->dp_spaceavail_cv); } +void +dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg) +{ + ASSERT3S(size, >=, 0); + + aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], size); + aggsum_add(&dp->dp_wrlog_total, size); + + /* Choose a value slightly bigger than min dirty sync bytes */ + uint64_t sync_min = + zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100; + if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0) + txg_kick(dp, txg); +} + +boolean_t +dsl_pool_wrlog_over_max(dsl_pool_t *dp) +{ + return (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0); +} + +static void +dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg) +{ + int64_t delta; + delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); + aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta); + aggsum_add(&dp->dp_wrlog_total, delta); +} + #ifdef ZFS_DEBUG static boolean_t dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) @@ -814,6 +865,9 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); dmu_buf_rele(ds->ds_dbuf, zilog); } + + dsl_pool_wrlog_clear(dp, txg); + ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); } @@ -1409,6 +1463,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW, "Determines the dirty space limit"); +ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW, + "The size limit of write-transaction zil log data"); + /* zfs_dirty_data_max_max only applied at module load in arc_init(). 
*/ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD, "zfs_dirty_data_max upper bound in bytes"); diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index fd4c848d57f2..9e52bed77a61 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -538,6 +538,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, itx_wr_state_t write_state; uintptr_t fsync_cnt; uint64_t gen = 0; + ssize_t size = resid; if (zil_replaying(zilog, tx) || zp->z_unlinked || zfs_xattr_owner_unlinked(zp)) { @@ -623,6 +624,10 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, off += len; resid -= len; } + + if (write_state == WR_COPIED || write_state == WR_NEED_COPY) { + dsl_pool_wrlog_count(zilog->zl_dmu_pool, size, tx->tx_txg); + } } /* diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 59b05b4b08d0..7d141a12288b 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -84,10 +84,8 @@ #include #include #include - #include - unsigned int zvol_inhibit_dev = 0; unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; @@ -577,6 +575,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; itx_wr_state_t write_state; + uint64_t sz = size; if (zil_replaying(zilog, tx)) return; @@ -628,6 +627,10 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, offset += len; size -= len; } + + if (write_state == WR_COPIED || write_state == WR_NEED_COPY) { + dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg); + } } /* From ebbbe01e31f6cdc79b32a8de2f50691972d184f7 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Thu, 31 Mar 2022 13:09:18 -0400 Subject: [PATCH 21/39] Ask libtool to stop hiding some errors For #13083, curiously, it did not print the actual error, just that the compile failed with "Error 1". In theory, this flag should cause it to report errors twice sometimes. In practice, I'm pretty okay with reporting some twice if it avoids reporting some never. 
Reviewed-by: Brian Behlendorf Reviewed-by: Damian Szuberski Signed-off-by: Rich Ercolani Closes #13086 --- lib/libavl/Makefile.am | 3 +++ lib/libefi/Makefile.am | 3 +++ lib/libicp/Makefile.am | 2 ++ lib/libnvpair/Makefile.am | 3 +++ lib/libshare/Makefile.am | 3 +++ lib/libspl/Makefile.am | 3 +++ lib/libtpool/Makefile.am | 3 +++ lib/libunicode/Makefile.am | 3 +++ lib/libuutil/Makefile.am | 3 +++ lib/libzfs/Makefile.am | 3 +++ lib/libzfs_core/Makefile.am | 3 +++ lib/libzfsbootenv/Makefile.am | 3 +++ lib/libzpool/Makefile.am | 3 +++ lib/libzstd/Makefile.am | 2 ++ lib/libzutil/Makefile.am | 3 +++ 15 files changed, 43 insertions(+) diff --git a/lib/libavl/Makefile.am b/lib/libavl/Makefile.am index 2e0a431c77fb..de8ba34d5ba0 100644 --- a/lib/libavl/Makefile.am +++ b/lib/libavl/Makefile.am @@ -5,6 +5,9 @@ VPATH = $(top_srcdir)/module/avl/ # Includes kernel code, generate warnings for large stack frames AM_CFLAGS += $(FRAME_LARGER_THAN) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libavl.la KERNEL_C = \ diff --git a/lib/libefi/Makefile.am b/lib/libefi/Makefile.am index b26f7a6dcd5b..5f77ac480a9f 100644 --- a/lib/libefi/Makefile.am +++ b/lib/libefi/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am AM_CFLAGS += $(LIBUUID_CFLAGS) $(ZLIB_CFLAGS) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libefi.la USER_C = \ diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am index e4a9ee862101..9a2510d0d222 100644 --- a/lib/libicp/Makefile.am +++ b/lib/libicp/Makefile.am @@ -6,6 +6,8 @@ VPATH = \ # Includes kernel code, generate warnings for large stack frames AM_CFLAGS += $(FRAME_LARGER_THAN) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress noinst_LTLIBRARIES = libicp.la diff --git a/lib/libnvpair/Makefile.am b/lib/libnvpair/Makefile.am index a3e1fa307f7c..f9f1eb539239 100644 --- a/lib/libnvpair/Makefile.am +++ b/lib/libnvpair/Makefile.am @@ -8,6 +8,9 @@ VPATH = \ # and required CFLAGS for libtirpc AM_CFLAGS += $(FRAME_LARGER_THAN) $(LIBTIRPC_CFLAGS) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + lib_LTLIBRARIES = libnvpair.la include $(top_srcdir)/config/Abigail.am diff --git a/lib/libshare/Makefile.am b/lib/libshare/Makefile.am index 7cef13c3da7c..0fce333506ae 100644 --- a/lib/libshare/Makefile.am +++ b/lib/libshare/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am DEFAULT_INCLUDES += -I$(srcdir) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libshare.la USER_C = \ diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am index 61432225a708..b59919bfb9e9 100644 --- a/lib/libspl/Makefile.am +++ b/lib/libspl/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am SUBDIRS = include +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libspl_assert.la libspl.la libspl_assert_la_SOURCES = \ diff --git a/lib/libtpool/Makefile.am b/lib/libtpool/Makefile.am index 3aff56f05f1e..ce9d03a67919 100644 --- a/lib/libtpool/Makefile.am +++ b/lib/libtpool/Makefile.am @@ -3,6 +3,9 @@ include $(top_srcdir)/config/Rules.am # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61118 AM_CFLAGS += $(NO_CLOBBERED) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libtpool.la USER_C = \ diff --git 
a/lib/libunicode/Makefile.am b/lib/libunicode/Makefile.am index b82975f68efd..5b12b3e916f3 100644 --- a/lib/libunicode/Makefile.am +++ b/lib/libunicode/Makefile.am @@ -5,6 +5,9 @@ VPATH = $(top_srcdir)/module/unicode # Includes kernel code, generate warnings for large stack frames AM_CFLAGS += $(FRAME_LARGER_THAN) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libunicode.la KERNEL_C = \ diff --git a/lib/libuutil/Makefile.am b/lib/libuutil/Makefile.am index 16d5023451bb..05b7ed0db8cb 100644 --- a/lib/libuutil/Makefile.am +++ b/lib/libuutil/Makefile.am @@ -1,5 +1,8 @@ include $(top_srcdir)/config/Rules.am +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + lib_LTLIBRARIES = libuutil.la include $(top_srcdir)/config/Abigail.am diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index 7acaee4183a5..77e12b9e8d8a 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -8,6 +8,9 @@ VPATH = \ # Suppress unused but set variable warnings often due to ASSERTs AM_CFLAGS += $(LIBCRYPTO_CFLAGS) $(ZLIB_CFLAGS) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + pkgconfig_DATA = libzfs.pc lib_LTLIBRARIES = libzfs.la diff --git a/lib/libzfs_core/Makefile.am b/lib/libzfs_core/Makefile.am index 67e554dc8706..33a889a09586 100644 --- a/lib/libzfs_core/Makefile.am +++ b/lib/libzfs_core/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am pkgconfig_DATA = libzfs_core.pc +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + lib_LTLIBRARIES = libzfs_core.la include $(top_srcdir)/config/Abigail.am diff --git a/lib/libzfsbootenv/Makefile.am b/lib/libzfsbootenv/Makefile.am index 984df0b8a353..8a6bb76acfe7 100644 --- a/lib/libzfsbootenv/Makefile.am +++ b/lib/libzfsbootenv/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am pkgconfig_DATA = libzfsbootenv.pc +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + lib_LTLIBRARIES = libzfsbootenv.la include $(top_srcdir)/config/Abigail.am diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index db7c376318d5..4ce3b4cd2f1d 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -24,6 +24,9 @@ AM_CFLAGS += $(ZLIB_CFLAGS) AM_CFLAGS += -DLIB_ZPOOL_BUILD +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + lib_LTLIBRARIES = libzpool.la USER_C = \ diff --git a/lib/libzstd/Makefile.am b/lib/libzstd/Makefile.am index c9ed7e2aafbc..e3bc5c446ee9 100644 --- a/lib/libzstd/Makefile.am +++ b/lib/libzstd/Makefile.am @@ -5,6 +5,8 @@ VPATH = $(top_srcdir)/module/zstd # -fno-tree-vectorize is set for gcc in zstd/common/compiler.h # Set it for other compilers, too. 
AM_CFLAGS += -fno-tree-vectorize +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress noinst_LTLIBRARIES = libzstd.la diff --git a/lib/libzutil/Makefile.am b/lib/libzutil/Makefile.am index 6351e0ebf64b..f55b7798f1c0 100644 --- a/lib/libzutil/Makefile.am +++ b/lib/libzutil/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am AM_CFLAGS += $(LIBBLKID_CFLAGS) $(LIBUDEV_CFLAGS) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + DEFAULT_INCLUDES += -I$(srcdir) noinst_LTLIBRARIES = libzutil.la From 44cec45f729e35275cd479770e8d1402137fe99b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 8 Sep 2022 13:30:53 -0400 Subject: [PATCH 22/39] Improve too large physical ashift handling When iterating through children's physical ashifts for a vdev, prefer ones above the maximum logical ashift that we can actually use, but within the administrator-defined maximum. When selecting the top-level vdev ashift, do not set it to the defined maximum in case the physical ashift is even higher, but just ignore it. Using the maximum does not prevent misaligned writes, but reduces space efficiency. Since ZFS tries to write data sequentially and aggregates the writes, in many cases large misaligned writes may not be as bad as the space penalty otherwise. Allow internal physical ashifts for vdevs higher than ASHIFT_MAX. Maybe one day the allocator or aggregation could benefit from that. Reduce the zfs_vdev_max_auto_ashift default from 16 (64KB) to 14 (16KB), so that ZFS may still use bigger ashifts up to ASHIFT_MAX (64KB), but only if it really has to or is explicitly told to, not as an "optimization". There are some read-intensive NVMe SSDs that report a Preferred Write Alignment of 64KB, and an attempt to build RAIDZ2 of those leads to a space inefficiency that can't be justified. Instead these changes make ZFS fall back to a logical ashift of 12 (4KB) by default and only warn the user that it may be suboptimal for performance. Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #13798 --- include/sys/vdev_impl.h | 1 + man/man4/zfs.4 | 5 ++- module/os/freebsd/zfs/vdev_geom.c | 3 +- module/zfs/vdev.c | 36 +++++++++++++++++-- module/zfs/vdev_draid.c | 10 ++++-- module/zfs/vdev_mirror.c | 10 ++++-- module/zfs/vdev_raidz.c | 10 ++++-- tests/zfs-tests/include/tunables.cfg | 2 ++ .../cli_root/zpool_add/add-o_ashift.ksh | 5 ++- 9 files changed, 69 insertions(+), 13 deletions(-) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 3cfde40a77fe..da846d8504fe 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -642,6 +642,7 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); void vdev_metaslab_group_create(vdev_t *vd); +uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b); /* * Vdev ashift optimization tunables diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 19c67a61ad74..3508ac9c83fd 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -347,9 +347,12 @@ When a vdev is added, target this number of metaslabs per top-level vdev. .It Sy zfs_vdev_default_ms_shift Ns = Ns Sy 29 Po 512MB Pc Pq int Default limit for metaslab size. . -.It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy ASHIFT_MAX Po 16 Pc Pq ulong +.It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy 14 Pq ulong Maximum ashift used when optimizing for logical -> physical sector size on new top-level vdevs.
+May be increased up to +.Sy ASHIFT_MAX Po 16 Pc , +but this may negatively impact pool space efficiency. . .It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq ulong Minimum ashift used when creating new top-level vdevs. diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index 5447eb922062..c8fa2b00c002 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -956,8 +956,7 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; *physical_ashift = 0; if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) && - ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) && - pp->stripeoffset == 0) + ISP2(pp->stripesize) && pp->stripeoffset == 0) *physical_ashift = highbit(pp->stripesize) - 1; /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index ccc35adc9f4b..67fb5bf8f17e 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -134,7 +134,15 @@ int zfs_vdev_standard_sm_blksz = (1 << 17); */ int zfs_nocacheflush = 0; -uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX; +/* + * Maximum and minimum ashift values that can be automatically set based on + * vdev's physical ashift (disk's physical sector size). While ASHIFT_MAX + * is higher than the maximum value, it is intentionally limited here to not + * excessively impact pool space efficiency. Higher ashift values may still + * be forced by vdev logical ashift or by user via ashift property, but won't + * be set automatically as a performance optimization. + */ +uint64_t zfs_vdev_max_auto_ashift = 14; uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; /*PRINTFLIKE2*/ @@ -1835,6 +1843,24 @@ vdev_set_deflate_ratio(vdev_t *vd) } } +/* + * Choose the best of two ashifts, preferring one between logical ashift + * (absolute minimum) and administrator defined maximum, otherwise take + * the biggest of the two. + */ +uint64_t +vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b) +{ + if (a > logical && a <= zfs_vdev_max_auto_ashift) { + if (b <= logical || b > zfs_vdev_max_auto_ashift) + return (a); + else + return (MAX(a, b)); + } else if (b <= logical || b > zfs_vdev_max_auto_ashift) + return (MAX(a, b)); + return (b); +} + /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining @@ -1846,7 +1872,8 @@ vdev_ashift_optimize(vdev_t *vd) { ASSERT(vd == vd->vdev_top); - if (vd->vdev_ashift < vd->vdev_physical_ashift) { + if (vd->vdev_ashift < vd->vdev_physical_ashift && + vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, @@ -4452,7 +4479,10 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_configured_ashift = vd->vdev_top != NULL ? 
vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; - vs->vs_physical_ashift = vd->vdev_physical_ashift; + if (vd->vdev_physical_ashift <= ASHIFT_MAX) + vs->vs_physical_ashift = vd->vdev_physical_ashift; + else + vs->vs_physical_ashift = 0; /* * Report fragmentation and rebuild progress for top-level, diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 7e654ca24d20..10d09517effd 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1496,8 +1496,14 @@ vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep, asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1; max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1; logical_ashift = MAX(logical_ashift, cvd->vdev_ashift); - physical_ashift = MAX(physical_ashift, - cvd->vdev_physical_ashift); + } + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_ops == &vdev_draid_spare_ops) + continue; + physical_ashift = vdev_best_ashift(logical_ashift, + physical_ashift, cvd->vdev_physical_ashift); } *asizep = asize; diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 50b86725b78a..d80a767043a5 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -409,8 +409,14 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); + } + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error) + continue; + *physical_ashift = vdev_best_ashift(*logical_ashift, + *physical_ashift, cvd->vdev_physical_ashift); } if (numerrors == vd->vdev_children) { diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 424de0b33e09..5c25007f17b9 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1426,8 +1426,14 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); + } + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) + continue; + *physical_ashift = vdev_best_ashift(*logical_ashift, + *physical_ashift, cvd->vdev_physical_ashift); } *asize *= vd->vdev_children; diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index fff43e469165..0fd2f48f2c1f 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -81,7 +81,9 @@ TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch TXG_HISTORY txg.history zfs_txg_history TXG_TIMEOUT txg.timeout zfs_txg_timeout UNLINK_SUSPEND_PROGRESS UNSUPPORTED zfs_unlink_suspend_progress +VDEV_FILE_LOGICAL_ASHIFT vdev.file.logical_ashift vdev_file_logical_ashift VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift +VDEV_MAX_AUTO_ASHIFT vdev.max_auto_ashift zfs_vdev_max_auto_ashift VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh 
b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh index 89cc4b0d3082..0fa1c0055b3c 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh @@ -57,7 +57,9 @@ disk2=$TEST_BASE_DIR/disk2 log_must mkfile $SIZE $disk1 log_must mkfile $SIZE $disk2 +logical_ashift=$(get_tunable VDEV_FILE_LOGICAL_ASHIFT) orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT) +max_auto_ashift=$(get_tunable VDEV_MAX_AUTO_ASHIFT) typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} @@ -81,7 +83,8 @@ do log_must zpool create $TESTPOOL $disk1 log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $ashift log_must zpool add $TESTPOOL $disk2 - verify_ashift $disk2 $ashift + exp=$(( (ashift <= max_auto_ashift) ? ashift : logical_ashift )) + verify_ashift $disk2 $exp if [[ $? -ne 0 ]] then log_fail "Device was added without setting ashift value to "\ From faa1e4082d64ccdb8b9b9b40bcff23b259133a75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Thu, 27 May 2021 17:09:32 +0200 Subject: [PATCH 23/39] include: move SPA_MINBLOCKSHIFT and zio_encrypt to sys/fs/zfs.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These are used by userspace, so should live in a public header Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Ahelenia Ziemiańska Closes #12116 --- include/sys/fs/zfs.h | 41 +++++++++++++++++++++++++++++++++++++++++ include/sys/spa.h | 21 --------------------- include/sys/zio.h | 17 ----------------- 3 files changed, 41 insertions(+), 38 deletions(-) diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index c8e199fc679c..be6a6b2074cd 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1615,6 +1615,47 @@ typedef enum { #define ZFS_EV_HIST_DSID "history_dsid" #define ZFS_EV_RESILVER_TYPE "resilver_type" + +/* + * We currently support block sizes from 512 bytes to 16MB. + * The benefits of larger blocks, and thus larger IO, need to be weighed + * against the cost of COWing a giant block to modify one byte, and the + * large latency of reading or writing a large block. + * + * Note that although blocks up to 16MB are supported, the recordsize + * property can not be set larger than zfs_max_recordsize (default 1MB). + * See the comment near zfs_max_recordsize in dsl_dataset.c for details. + * + * Note that although the LSIZE field of the blkptr_t can store sizes up + * to 32MB, the dnode's dn_datablkszsec can only store sizes up to + * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. 
+ */ +#define SPA_MINBLOCKSHIFT 9 +#define SPA_OLD_MAXBLOCKSHIFT 17 +#define SPA_MAXBLOCKSHIFT 24 +#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) +#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) +#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) + + +/* supported encryption algorithms */ +enum zio_encrypt { + ZIO_CRYPT_INHERIT = 0, + ZIO_CRYPT_ON, + ZIO_CRYPT_OFF, + ZIO_CRYPT_AES_128_CCM, + ZIO_CRYPT_AES_192_CCM, + ZIO_CRYPT_AES_256_CCM, + ZIO_CRYPT_AES_128_GCM, + ZIO_CRYPT_AES_192_GCM, + ZIO_CRYPT_AES_256_GCM, + ZIO_CRYPT_FUNCTIONS +}; + +#define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_GCM +#define ZIO_CRYPT_DEFAULT ZIO_CRYPT_OFF + + #ifdef __cplusplus } #endif diff --git a/include/sys/spa.h b/include/sys/spa.h index 3eebcd84fccf..67724a68f0e8 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -72,27 +72,6 @@ struct dsl_pool; struct dsl_dataset; struct dsl_crypto_params; -/* - * We currently support block sizes from 512 bytes to 16MB. - * The benefits of larger blocks, and thus larger IO, need to be weighed - * against the cost of COWing a giant block to modify one byte, and the - * large latency of reading or writing a large block. - * - * The recordsize property can not be set larger than zfs_max_recordsize - * (default 16MB on 64-bit and 1MB on 32-bit). See the comment near - * zfs_max_recordsize in dsl_dataset.c for details. - * - * Note that although the LSIZE field of the blkptr_t can store sizes up - * to 32MB, the dnode's dn_datablkszsec can only store sizes up to - * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. - */ -#define SPA_MINBLOCKSHIFT 9 -#define SPA_OLD_MAXBLOCKSHIFT 17 -#define SPA_MAXBLOCKSHIFT 24 -#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) -#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) -#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) - /* * Alignment Shift (ashift) is an immutable, internal top-level vdev property * which can only be set at vdev creation time. Physical writes are always done diff --git a/include/sys/zio.h b/include/sys/zio.h index 5bb712083458..39de5175b7db 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -108,23 +108,6 @@ enum zio_checksum { #define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 -/* supported encryption algorithms */ -enum zio_encrypt { - ZIO_CRYPT_INHERIT = 0, - ZIO_CRYPT_ON, - ZIO_CRYPT_OFF, - ZIO_CRYPT_AES_128_CCM, - ZIO_CRYPT_AES_192_CCM, - ZIO_CRYPT_AES_256_CCM, - ZIO_CRYPT_AES_128_GCM, - ZIO_CRYPT_AES_192_GCM, - ZIO_CRYPT_AES_256_GCM, - ZIO_CRYPT_FUNCTIONS -}; - -#define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_GCM -#define ZIO_CRYPT_DEFAULT ZIO_CRYPT_OFF - /* macros defining encryption lengths */ #define ZIO_OBJSET_MAC_LEN 32 #define ZIO_DATA_IV_LEN 12 From d5105f068f19f0b1f3aa3e14a76ffd377a72bb58 Mon Sep 17 00:00:00 2001 From: Ameer Hamza <106930537+ixhamza@users.noreply.github.com> Date: Sat, 17 Sep 2022 01:52:25 +0500 Subject: [PATCH 24/39] zfs recv hangs if max recordsize is less than received recordsize - Some optimizations for bqueue enqueue/dequeue. - Added a fix to prevent deadlock when both bqueue_enqueue_impl() and bqueue_dequeue() waits for signal to be triggered. 
Reviewed-by: Alexander Motin Reviewed-by: Ryan Moeller Signed-off-by: Ameer Hamza Closes #13855 --- include/sys/fs/zfs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index be6a6b2074cd..df1cc060130a 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1622,9 +1622,9 @@ typedef enum { * against the cost of COWing a giant block to modify one byte, and the * large latency of reading or writing a large block. * - * Note that although blocks up to 16MB are supported, the recordsize - * property can not be set larger than zfs_max_recordsize (default 1MB). - * See the comment near zfs_max_recordsize in dsl_dataset.c for details. + * The recordsize property can not be set larger than zfs_max_recordsize + * (default 16MB on 64-bit and 1MB on 32-bit). See the comment near + * zfs_max_recordsize in dsl_dataset.c for details. * * Note that although the LSIZE field of the blkptr_t can store sizes up * to 32MB, the dnode's dn_datablkszsec can only store sizes up to From 035e52f59152bbea35653e5fa2215152f81823f8 Mon Sep 17 00:00:00 2001 From: Ameer Hamza <106930537+ixhamza@users.noreply.github.com> Date: Wed, 21 Sep 2022 03:19:05 +0500 Subject: [PATCH 25/39] Delay ZFS_PROP_SHARESMB property to handle it for encrypted raw receive For encrypted raw receive, objset creation is delayed until a call to dmu_recv_stream(). ZFS_PROP_SHARESMB property requires objset to be populated when calling zpl_earlier_version(). To correctly handle the ZFS_PROP_SHARESMB property for encrypted raw receive, this change delays setting the property. Reviewed-by: Alexander Motin Reviewed-by: Ryan Moeller Reviewed-by: Brian Behlendorf Signed-off-by: Ameer Hamza Closes #13878 --- module/zfs/zfs_ioctl.c | 15 +++++++++++++++ .../functional/rsend/send_encrypted_props.ksh | 8 ++++++++ 2 files changed, 23 insertions(+) diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 3336bb783251..3d2492b9b908 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -4787,6 +4787,11 @@ extract_delay_props(nvlist_t *props) static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, ZFS_PROP_KEYLOCATION, + /* + * Setting ZFS_PROP_SHARESMB requires the objset type to be + * known, which is not possible prior to receipt of raw sends. 
+ */ + ZFS_PROP_SHARESMB, 0 }; int i; @@ -4850,6 +4855,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, offset_t off, noff; nvlist_t *local_delayprops = NULL; nvlist_t *recv_delayprops = NULL; + nvlist_t *inherited_delayprops = NULL; nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *origrecvd = NULL; /* existing received properties */ boolean_t first_recvd_props = B_FALSE; @@ -4964,6 +4970,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, local_delayprops = extract_delay_props(oprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, oprops, *errors); + inherited_delayprops = extract_delay_props(xprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, xprops, *errors); @@ -5021,6 +5028,10 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, local_delayprops, *errors); } + if (inherited_delayprops != NULL && error == 0) { + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, + inherited_delayprops, *errors); + } } /* @@ -5040,6 +5051,10 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0); nvlist_free(local_delayprops); } + if (inherited_delayprops != NULL) { + ASSERT(nvlist_merge(localprops, inherited_delayprops, 0) == 0); + nvlist_free(inherited_delayprops); + } *read_bytes = off - noff; #ifdef ZFS_DEBUG diff --git a/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh b/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh index 793904db91ca..c0c7b682def9 100755 --- a/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh @@ -133,6 +133,14 @@ recv_cksum=$(md5digest /$ds/$TESTFILE0) log_must test "$recv_cksum" == "$cksum" log_must zfs destroy -r $ds +# Test that we can override sharesmb property for encrypted raw stream. +log_note "Must be able to override sharesmb property for encrypted raw stream" +ds=$TESTPOOL/recv +log_must eval "zfs send -w $esnap > $sendfile" +log_must eval "zfs recv -o sharesmb=on $ds < $sendfile" +log_must test "$(get_prop 'sharesmb' $ds)" == "on" +log_must zfs destroy -r $ds + # Test that we can override encryption properties on a properties stream # of an unencrypted dataset, turning it into an encryption root. log_note "Must be able to receive stream with props as encryption root" From 5096ed31c8c3d5fb673b34a2e555da6f9e0dc62b Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 15 Sep 2022 19:21:21 -0400 Subject: [PATCH 26/39] Fix incorrect size given to bqueue_enqueue() call in dmu_redact.c We pass sizeof (struct redact_record *) rather than sizeof (struct redact_record). Passing the pointer size is wrong. Coverity caught this in two places. 
Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13885 --- module/zfs/dmu_redact.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c index 7efe423d35f0..5184ef6888df 100644 --- a/module/zfs/dmu_redact.c +++ b/module/zfs/dmu_redact.c @@ -141,7 +141,7 @@ record_merge_enqueue(bqueue_t *q, struct redact_record **build, { if (new->eos_marker) { if (*build != NULL) - bqueue_enqueue(q, *build, sizeof (*build)); + bqueue_enqueue(q, *build, sizeof (**build)); bqueue_enqueue_flush(q, new, sizeof (*new)); return; } @@ -823,7 +823,7 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads, avl_destroy(&end_tree); kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes)); if (current_record != NULL) - bqueue_enqueue(q, current_record, sizeof (current_record)); + bqueue_enqueue(q, current_record, sizeof (*current_record)); return (err); } From b66f8d3c2b79d6a6bd7e2ee850ec9b892dc31093 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 15 Sep 2022 19:22:33 -0400 Subject: [PATCH 27/39] Add zfs_btree_verify_intensity kernel module parameter I see a few issues in the issue tracker that might be aided by being able to turn this on. We have no module parameter for it, so I would like to add one. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13874 --- cmd/zdb/zdb.c | 2 +- man/man4/zfs.4 | 16 ++++++++++++++++ module/zfs/btree.c | 8 +++++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index db8e2200a72b..4e57538d2234 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -112,7 +112,7 @@ extern int zfs_vdev_async_read_max_active; extern boolean_t spa_load_verify_dryrun; extern boolean_t spa_mode_readable_spacemaps; extern int zfs_reconstruct_indirect_combinations_max; -extern int zfs_btree_verify_intensity; +extern uint_t zfs_btree_verify_intensity; static const char cmdname[] = "zdb"; uint8_t dump_opt[256]; diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 3508ac9c83fd..97539bcc5010 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1348,6 +1348,22 @@ _ .TE .Sy \& * No Requires debug build. . +.It Sy zfs_btree_verify_intensity Ns = Ns Sy 0 Pq uint +Enables btree verification. +The following settings are culminative: +.TS +box; +lbz r l l . + Value Description + + 1 Verify height. + 2 Verify pointers from children to parent. + 3 Verify element counts. + 4 Verify element order. (expensive) +* 5 Verify unused memory is poisoned. (expensive) +.TE +.Sy \& * No Requires debug build. +. .It Sy zfs_free_leak_on_eio Ns = Ns Sy 0 Ns | Ns 1 Pq int If destroy encounters an .Sy EIO diff --git a/module/zfs/btree.c b/module/zfs/btree.c index 36755f97929c..e16c4ebef6ba 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -53,7 +53,7 @@ kmem_cache_t *zfs_btree_leaf_cache; * (while the asymptotic complexity of the other steps is the same, the * importance of the constant factors cannot be denied). */ -int zfs_btree_verify_intensity = 0; +uint_t zfs_btree_verify_intensity = 0; /* * Convenience functions to silence warnings from memcpy/memmove's @@ -2171,3 +2171,9 @@ zfs_btree_verify(zfs_btree_t *tree) return; zfs_btree_verify_poison(tree); } + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW, + "Enable btree verification. 
Levels above 4 require ZFS be built " "with debugging"); /* END CSTYLED */ From 91e02156ddea27d980fa8e5f7f3d10dda06139d2 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 19 Sep 2022 11:07:15 -0700 Subject: [PATCH 28/39] Revert "Reduce dbuf_find() lock contention" This reverts commit 34dbc618f50cfcd392f90af80c140398c38cbcd1. While this change resolved the lock contention observed for certain workloads, it inadvertently reduced the maximum hash inserts/removes per second. This appears to be due to the slightly higher acquisition cost of an rwlock vs a mutex. Reviewed-by: Brian Behlendorf --- include/sys/dbuf.h | 7 ++++--- module/zfs/dbuf.c | 26 +++++++++++++------------- module/zfs/dbuf_stats.c | 4 ++-- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 2e7385113ec5..b757b2664178 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -321,12 +321,13 @@ typedef struct dmu_buf_impl { uint8_t db_dirtycnt; } dmu_buf_impl_t; -#define DBUF_RWLOCKS 8192 -#define DBUF_HASH_RWLOCK(h, idx) (&(h)->hash_rwlocks[(idx) & (DBUF_RWLOCKS-1)]) +/* Note: the dbuf hash table is exposed only for the mdb module */ +#define DBUF_MUTEXES 2048 +#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) typedef struct dbuf_hash_table { uint64_t hash_table_mask; dmu_buf_impl_t **hash_table; - krwlock_t hash_rwlocks[DBUF_RWLOCKS] ____cacheline_aligned; + kmutex_t hash_mutexes[DBUF_MUTEXES] ____cacheline_aligned; } dbuf_hash_table_t; typedef void (*dbuf_prefetch_fn)(void *, uint64_t, uint64_t, boolean_t); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 1a022c8b8a07..7ecc2812b4e4 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -339,18 +339,18 @@ dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; - rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_READER); + mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { if (DBUF_EQUAL(db, os, obj, level, blkid)) { mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { - rw_exit(DBUF_HASH_RWLOCK(h, idx)); + mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (db); } mutex_exit(&db->db_mtx); } } - rw_exit(DBUF_HASH_RWLOCK(h, idx)); + mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (NULL); } @@ -393,13 +393,13 @@ dbuf_hash_insert(dmu_buf_impl_t *db) hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; - rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_WRITER); + mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (dbf = h->hash_table[idx], i = 0; dbf != NULL; dbf = dbf->db_hash_next, i++) { if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { mutex_enter(&dbf->db_mtx); if (dbf->db_state != DB_EVICTING) { - rw_exit(DBUF_HASH_RWLOCK(h, idx)); + mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (dbf); } mutex_exit(&dbf->db_mtx); @@ -417,7 +417,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db) mutex_enter(&db->db_mtx); db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; - rw_exit(DBUF_HASH_RWLOCK(h, idx)); + mutex_exit(DBUF_HASH_MUTEX(h, idx)); uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64); DBUF_STAT_MAX(hash_elements_max, he); @@ -474,13 +474,13 @@ dbuf_hash_remove(dmu_buf_impl_t *db) /* * We mustn't hold db_mtx to maintain lock ordering: - * DBUF_HASH_RWLOCK > db_mtx.
*/ ASSERT(zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_state == DB_EVICTING); ASSERT(!MUTEX_HELD(&db->db_mtx)); - rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_WRITER); + mutex_enter(DBUF_HASH_MUTEX(h, idx)); dbp = &h->hash_table[idx]; while ((dbf = *dbp) != db) { dbp = &dbf->db_hash_next; @@ -491,7 +491,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db) if (h->hash_table[idx] && h->hash_table[idx]->db_hash_next == NULL) DBUF_STAT_BUMPDOWN(hash_chains); - rw_exit(DBUF_HASH_RWLOCK(h, idx)); + mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_dec_64(&dbuf_stats.hash_elements.value.ui64); } @@ -914,8 +914,8 @@ dbuf_init(void) sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); - for (i = 0; i < DBUF_RWLOCKS; i++) - rw_init(&h->hash_rwlocks[i], NULL, RW_DEFAULT, NULL); + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); dbuf_stats_init(h); @@ -981,8 +981,8 @@ dbuf_fini(void) dbuf_stats_destroy(); - for (i = 0; i < DBUF_RWLOCKS; i++) - rw_destroy(&h->hash_rwlocks[i]); + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_destroy(&h->hash_mutexes[i]); #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c index 037190a81bb3..12bb568a08cc 100644 --- a/module/zfs/dbuf_stats.c +++ b/module/zfs/dbuf_stats.c @@ -137,7 +137,7 @@ dbuf_stats_hash_table_data(char *buf, size_t size, void *data) if (size) buf[0] = 0; - rw_enter(DBUF_HASH_RWLOCK(h, dsh->idx), RW_READER); + mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) { /* * Returning ENOMEM will cause the data and header functions @@ -158,7 +158,7 @@ dbuf_stats_hash_table_data(char *buf, size_t size, void *data) mutex_exit(&db->db_mtx); } - rw_exit(DBUF_HASH_RWLOCK(h, dsh->idx)); + mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); return (error); } From 33223cbc3cbed37fdcecedc18b4b82406c73c01b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 24 May 2022 12:46:35 -0400 Subject: [PATCH 29/39] Refactor Log Size Limit Original Log Size Limit implementation blocked all writes in case of limit reached until the TXG is committed and the log is freed. It caused huge delays and following speed spikes in application writes. This implementation instead smoothly throttles writes, using exactly the same mechanism as used for dirty data. Reviewed-by: Brian Behlendorf Reviewed-by: jxdking Signed-off-by: Alexander Motin Sponsored-By: iXsystems, Inc. 
Issue #12284 Closes #13476 --- include/sys/dmu_tx.h | 2 +- include/sys/dsl_pool.h | 2 +- man/man4/zfs.4 | 6 ++--- module/zfs/dmu_tx.c | 58 +++++++++++++++++++++++++++--------------- module/zfs/dsl_pool.c | 17 ++++++++----- 5 files changed, 53 insertions(+), 32 deletions(-) diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h index 71a9ac7ca7bf..ad3f1b0e47ca 100644 --- a/include/sys/dmu_tx.h +++ b/include/sys/dmu_tx.h @@ -124,8 +124,8 @@ typedef struct dmu_tx_stats { kstat_named_t dmu_tx_dirty_throttle; kstat_named_t dmu_tx_dirty_delay; kstat_named_t dmu_tx_dirty_over_max; - kstat_named_t dmu_tx_wrlog_over_max; kstat_named_t dmu_tx_dirty_frees_delay; + kstat_named_t dmu_tx_wrlog_delay; kstat_named_t dmu_tx_quota; } dmu_tx_stats_t; diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 1b4e2924facf..e93bd0557c1e 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -164,7 +164,7 @@ uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy); uint64_t dsl_pool_deferred_space(dsl_pool_t *dp); void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg); -boolean_t dsl_pool_wrlog_over_max(dsl_pool_t *dp); +boolean_t dsl_pool_need_wrlog_delay(dsl_pool_t *dp); void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 97539bcc5010..fcb97d716d54 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1101,9 +1101,9 @@ This should be less than . .It Sy zfs_wrlog_data_max Ns = Pq int The upper limit of write-transaction zil log data size in bytes. -Once it is reached, write operation is blocked, until log data is cleared out -after transaction group sync. Because of some overhead, it should be set -at least 2 times the size of +Write operations are throttled when approaching the limit until log data is +cleared out after transaction group sync. +Because of some overhead, it should be set at least 2 times the size of .Sy zfs_dirty_data_max .No to prevent harming normal write throughput. It also should be smaller than the size of the slog device if slog is present. diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 5fa516866668..1eed0526b51d 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -53,8 +53,8 @@ dmu_tx_stats_t dmu_tx_stats = { { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, - { "dmu_tx_wrlog_over_max", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 }, + { "dmu_tx_wrlog_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_quota", KSTAT_DATA_UINT64 }, }; @@ -781,34 +781,49 @@ static void dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) { dsl_pool_t *dp = tx->tx_pool; - uint64_t delay_min_bytes = + uint64_t delay_min_bytes, wrlog; + hrtime_t wakeup, tx_time = 0, now; + + /* Calculate minimum transaction time for the dirty data amount. */ + delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - hrtime_t wakeup, min_tx_time, now; + if (dirty > delay_min_bytes) { + /* + * The caller has already waited until we are under the max. + * We make them pass us the amount of dirty data so we don't + * have to handle the case of it being >= the max, which + * could cause a divide-by-zero if it's == the max. 
+ */ + ASSERT3U(dirty, <, zfs_dirty_data_max); - if (dirty <= delay_min_bytes) - return; + tx_time = zfs_delay_scale * (dirty - delay_min_bytes) / + (zfs_dirty_data_max - dirty); + } - /* - * The caller has already waited until we are under the max. - * We make them pass us the amount of dirty data so we don't - * have to handle the case of it being >= the max, which could - * cause a divide-by-zero if it's == the max. - */ - ASSERT3U(dirty, <, zfs_dirty_data_max); + /* Calculate minimum transaction time for the TX_WRITE log size. */ + wrlog = aggsum_upper_bound(&dp->dp_wrlog_total); + delay_min_bytes = + zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; + if (wrlog >= zfs_wrlog_data_max) { + tx_time = zfs_delay_max_ns; + } else if (wrlog > delay_min_bytes) { + tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) / + (zfs_wrlog_data_max - wrlog), tx_time); + } + if (tx_time == 0) + return; + + tx_time = MIN(tx_time, zfs_delay_max_ns); now = gethrtime(); - min_tx_time = zfs_delay_scale * - (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); - min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); - if (now > tx->tx_start + min_tx_time) + if (now > tx->tx_start + tx_time) return; DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, - uint64_t, min_tx_time); + uint64_t, tx_time); mutex_enter(&dp->dp_lock); - wakeup = MAX(tx->tx_start + min_tx_time, - dp->dp_last_wakeup + min_tx_time); + wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time); dp->dp_last_wakeup = wakeup; mutex_exit(&dp->dp_lock); @@ -886,8 +901,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) } if (!tx->tx_dirty_delayed && - dsl_pool_wrlog_over_max(tx->tx_pool)) { - DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max); + dsl_pool_need_wrlog_delay(tx->tx_pool)) { + tx->tx_wait_dirty = B_TRUE; + DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay); return (SET_ERROR(ERESTART)); } diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index b91dd4cfa8a6..4036c8671f2d 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -105,9 +105,8 @@ int zfs_dirty_data_max_percent = 10; int zfs_dirty_data_max_max_percent = 25; /* - * zfs_wrlog_data_max, the upper limit of TX_WRITE log data. - * Once it is reached, write operation is blocked, - * until log data is cleared out after txg sync. + * The upper limit of TX_WRITE log data. Write operations are throttled + * when approaching the limit until log data is cleared out after txg sync. * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. 
*/ unsigned long zfs_wrlog_data_max = 0; @@ -621,15 +620,18 @@ dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg) /* Choose a value slightly bigger than min dirty sync bytes */ uint64_t sync_min = - zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100; + zfs_wrlog_data_max * (zfs_dirty_data_sync_percent + 10) / 200; if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0) txg_kick(dp, txg); } boolean_t -dsl_pool_wrlog_over_max(dsl_pool_t *dp) +dsl_pool_need_wrlog_delay(dsl_pool_t *dp) { - return (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0); + uint64_t delay_min_bytes = + zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; + + return (aggsum_compare(&dp->dp_wrlog_total, delay_min_bytes) > 0); } static void @@ -639,6 +641,9 @@ dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg) delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta); aggsum_add(&dp->dp_wrlog_total, delta); + /* Compact per-CPU sums after the big change. */ + (void) aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); + (void) aggsum_value(&dp->dp_wrlog_total); } #ifdef ZFS_DEBUG From 835e03682c22f95a774f1a21d6e96e00f063fef9 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Mon, 26 Sep 2022 19:44:22 -0400 Subject: [PATCH 30/39] Linux: Fix uninitialized variable usage in zio_do_crypt_data() Coverity complained about this. An error from `hkdf_sha512()` before uio initialization will cause pointers to uninitialized memory to be passed to `zio_crypt_destroy_uio()`. This is a regression that was introduced by cf63739191b6cac629d053930a4aea592bca3819. Interestingly, this never affected FreeBSD, since the FreeBSD version never had that patch ported. Since moving uio initialization to the top of this function would slow down the qat_crypt() path, we only move the `memset()` calls to the top of the function. This is sufficient to fix this problem. Reviewed-by: Ryan Moeller Reviewed-by: Neal Gompa Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13944 --- module/os/linux/zfs/zio_crypt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c index 381769eab682..50e93909659f 100644 --- a/module/os/linux/zfs/zio_crypt.c +++ b/module/os/linux/zfs/zio_crypt.c @@ -1900,6 +1900,9 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, crypto_ctx_template_t tmpl; uint8_t *authbuf = NULL; + memset(&puio, 0, sizeof (puio)); + memset(&cuio, 0, sizeof (cuio)); + /* * If the needed key is the current one, just use it. Otherwise we * need to generate a temporary one from the given salt + master key. @@ -1960,9 +1963,6 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, /* If the hardware implementation fails fall back to software */ } - bzero(&puio, sizeof (zfs_uio_t)); - bzero(&cuio, sizeof (zfs_uio_t)); - /* create uios for encryption */ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, From c973929b29bb945d1a1fd9f54a5238360f0e1029 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 27 Sep 2022 19:44:13 -0400 Subject: [PATCH 31/39] LUA: Fix CVE-2014-5461 Apply the fix from upstream. 
http://www.lua.org/bugs.html#5.2.2-1 https://www.opencve.io/cve/CVE-2014-5461 It should be noted that exploiting this requires the `SYS_CONFIG` privilege, and anyone with that privilege likely has other opportunities to do exploits, so it is unlikely that bad actors could exploit this unless system administrators are executing untrusted ZFS Channel Programs. Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13949 --- module/lua/ldo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/lua/ldo.c b/module/lua/ldo.c index 08a952007d10..a9835c4f571d 100644 --- a/module/lua/ldo.c +++ b/module/lua/ldo.c @@ -406,7 +406,7 @@ int luaD_precall (lua_State *L, StkId func, int nresults) { StkId base; Proto *p = clLvalue(func)->p; n = cast_int(L->top - func) - 1; /* number of real arguments */ - luaD_checkstack(L, p->maxstacksize); + luaD_checkstack(L, p->maxstacksize + p->numparams); for (; n < p->numparams; n++) setnilvalue(L->top++); /* complete missing arguments */ if (!p->is_vararg) { From 8dcd6af62318a85606d664a3ba99d17b411a5892 Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Tue, 2 Aug 2022 19:34:23 -0400 Subject: [PATCH 32/39] FreeBSD: Ignore symlink to i386 includes A symlink to i386 includes is created in the build dir on amd64 since freebsd/freebsd-src@d07600c563039f252becc29ac7d9a454b6b0600d Tell git to ignore it like the other include links. Reviewed-by: Brian Behlendorf Signed-off-by: Ryan Moeller Closes #13719 --- module/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/module/.gitignore b/module/.gitignore index 7a4bd3673e77..0ec6052f1bb0 100644 --- a/module/.gitignore +++ b/module/.gitignore @@ -22,5 +22,6 @@ /export_syms /machine /x86 +/i386 !Makefile.in From 55816c64dadac5fecd858a88f769184283a4808b Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Wed, 14 Sep 2022 15:51:55 -0400 Subject: [PATCH 33/39] FreeBSD: Fix integer conversion for vnlru_free{,_vfsops}() When reviewing #13875, I noticed that our FreeBSD code has an issue where it converts from `int64_t` to `int` when calling `vnlru_free{,_vfsops}()`. The result is that if the int64_t is `1 << 36`, the int will be 0, since the low bits are 0. Even when some low bits are set, a value such as `((1 << 36) + 1)` would truncate to 1, which is wrong. There is protection against this on 32-bit platforms, but on 64-bit platforms, there is no check to protect us, so we add a check. Reviewed-by: Alexander Motin Reviewed-by: Ryan Moeller Signed-off-by: Richard Yao Closes #13882 --- module/os/freebsd/zfs/arc_os.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index 77af092e1ed4..590d1c04b9a5 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -161,6 +161,12 @@ arc_prune_task(void *arg) int64_t nr_scan = (intptr_t)arg; arc_reduce_target_size(ptob(nr_scan)); + +#ifndef __ILP32__ + if (nr_scan > INT_MAX) + nr_scan = INT_MAX; +#endif + #if __FreeBSD_version >= 1300139 sx_xlock(&arc_vnlru_lock); vnlru_free_vfsops(nr_scan, &zfs_vfsops, arc_vnlru_marker); From 2c8e3e4b28a40e6a1b1926ce03da1a725f0e60f8 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 20 Sep 2022 02:17:27 +0200 Subject: [PATCH 34/39] FreeBSD: stop passing LK_INTERLOCK to VOP_LOCK There is an ongoing effort to eliminate this feature. 
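As an aside to the vnlru_free integer-conversion fix above (patch 33), the truncation it guards against can be shown with a minimal standalone program; the values and names below are hypothetical and are not code from that patch:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int64_t nr_scan = (int64_t)1 << 36;

	/* Narrowing to a 32-bit int keeps only the low bits on typical systems. */
	printf("%d\n", (int)nr_scan);		/* 0 */
	printf("%d\n", (int)(nr_scan + 1));	/* 1 */

	/* Clamping before the narrowing call, as the patch does, preserves the intent. */
	if (nr_scan > INT_MAX)
		nr_scan = INT_MAX;
	printf("%d\n", (int)nr_scan);		/* 2147483647 */
	return (0);
}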
Reviewed-by: Alexander Motin Reviewed-by: Ryan Moeller Signed-off-by: Mateusz Guzik Closes #13908 --- module/os/freebsd/zfs/zfs_ctldir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/os/freebsd/zfs/zfs_ctldir.c b/module/os/freebsd/zfs/zfs_ctldir.c index 3a5c9f8caf0a..5bd2e1510ddb 100644 --- a/module/os/freebsd/zfs/zfs_ctldir.c +++ b/module/os/freebsd/zfs/zfs_ctldir.c @@ -976,12 +976,13 @@ zfsctl_snapdir_lookup(struct vop_lookup_args *ap) */ VI_LOCK(*vpp); if (((*vpp)->v_iflag & VI_MOUNT) == 0) { + VI_UNLOCK(*vpp); /* * Upgrade to exclusive lock in order to: * - avoid race conditions * - satisfy the contract of mount_snapshot() */ - err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK); + err = VOP_LOCK(*vpp, LK_TRYUPGRADE); if (err == 0) break; } else { From eec942cc54652254a39f36fb12976bcc3b4a2706 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 21 Sep 2022 00:21:30 +0200 Subject: [PATCH 35/39] FreeBSD: catch up to 1400068 Reviewed-by: Ryan Moeller Signed-off-by: Mateusz Guzik Closes #13909 --- module/os/freebsd/zfs/zfs_vnops_os.c | 41 ++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index f6bc9c0c6afb..ea6388dd515e 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -981,13 +981,17 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, case RENAME: if (error == ENOENT) { error = EJUSTRETURN; +#if __FreeBSD_version < 1400068 cnp->cn_flags |= SAVENAME; +#endif break; } fallthrough; case DELETE: +#if __FreeBSD_version < 1400068 if (error == 0) cnp->cn_flags |= SAVENAME; +#endif break; } } @@ -1337,7 +1341,10 @@ zfs_lookup_internal(znode_t *dzp, const char *name, vnode_t **vpp, cnp->cn_nameptr = __DECONST(char *, name); cnp->cn_namelen = strlen(name); cnp->cn_nameiop = nameiop; - cnp->cn_flags = ISLASTCN | SAVENAME; + cnp->cn_flags = ISLASTCN; +#if __FreeBSD_version < 1400068 + cnp->cn_flags |= SAVENAME; +#endif cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY; cnp->cn_cred = kcred; #if __FreeBSD_version < 1400037 @@ -4642,7 +4649,9 @@ zfs_freebsd_create(struct vop_create_args *ap) znode_t *zp = NULL; int rc, mode; +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif vattr_init_mask(vap); mode = vap->va_mode & ALLPERMS; @@ -4672,7 +4681,9 @@ static int zfs_freebsd_remove(struct vop_remove_args *ap) { +#if __FreeBSD_version < 1400068 ASSERT(ap->a_cnp->cn_flags & SAVENAME); +#endif return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_cred)); @@ -4694,7 +4705,9 @@ zfs_freebsd_mkdir(struct vop_mkdir_args *ap) znode_t *zp = NULL; int rc; +#if __FreeBSD_version < 1400068 ASSERT(ap->a_cnp->cn_flags & SAVENAME); +#endif vattr_init_mask(vap); *ap->a_vpp = NULL; @@ -4720,7 +4733,9 @@ zfs_freebsd_rmdir(struct vop_rmdir_args *ap) { struct componentname *cnp = ap->a_cnp; +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } @@ -4974,8 +4989,10 @@ zfs_freebsd_rename(struct vop_rename_args *ap) vnode_t *tvp = ap->a_tvp; int error; +#if __FreeBSD_version < 1400068 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); +#endif error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, ap->a_tcnp, ap->a_fcnp->cn_cred); @@ -5011,7 +5028,9 @@ zfs_freebsd_symlink(struct vop_symlink_args *ap) #endif int rc; +#if __FreeBSD_version < 
1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ vattr_init_mask(vap); @@ -5105,7 +5124,9 @@ zfs_freebsd_link(struct vop_link_args *ap) if (tdvp->v_mount != vp->v_mount) return (EXDEV); +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif return (zfs_link(VTOZ(tdvp), VTOZ(vp), cnp->cn_nameptr, cnp->cn_cred, 0)); @@ -5364,10 +5385,10 @@ zfs_getextattr_dir(struct vop_getextattr_args *ap, const char *attrname) NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp); #endif error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL); - vp = nd.ni_vp; - NDFREE_PNBUF(&nd); if (error != 0) return (error); + vp = nd.ni_vp; + NDFREE_PNBUF(&nd); if (ap->a_size != NULL) { error = VOP_GETATTR(vp, &va, ap->a_cred); @@ -5481,12 +5502,10 @@ zfs_deleteextattr_dir(struct vop_deleteextattr_args *ap, const char *attrname) UIO_SYSSPACE, attrname, xvp); #endif error = namei(&nd); - vp = nd.ni_vp; - if (error != 0) { - NDFREE_PNBUF(&nd); + if (error != 0) return (error); - } + vp = nd.ni_vp; error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); NDFREE_PNBUF(&nd); @@ -5612,10 +5631,10 @@ zfs_setextattr_dir(struct vop_setextattr_args *ap, const char *attrname) #endif error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred, NULL); - vp = nd.ni_vp; - NDFREE_PNBUF(&nd); if (error != 0) return (error); + vp = nd.ni_vp; + NDFREE_PNBUF(&nd); VATTR_NULL(&va); va.va_size = 0; @@ -5767,10 +5786,10 @@ zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix) UIO_SYSSPACE, ".", xvp); #endif error = namei(&nd); - vp = nd.ni_vp; - NDFREE_PNBUF(&nd); if (error != 0) return (error); + vp = nd.ni_vp; + NDFREE_PNBUF(&nd); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; From 63d4838b4ac5557510db70aff334ec7caa96114c Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 21 Sep 2022 00:22:32 +0200 Subject: [PATCH 36/39] FreeBSD: handle V_PCATCH See https://cgit.FreeBSD.org/src/commit/?id=a75d1ddd74312f5dd79bc1e965f7077679659f2e Reviewed-by: Ryan Moeller Reviewed-by: Alexander Motin Signed-off-by: Mateusz Guzik Closes #13910 --- module/os/freebsd/zfs/zfs_file_os.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/module/os/freebsd/zfs/zfs_file_os.c b/module/os/freebsd/zfs/zfs_file_os.c index fd86a75416e6..60c9ff0581e0 100644 --- a/module/os/freebsd/zfs/zfs_file_os.c +++ b/module/os/freebsd/zfs/zfs_file_os.c @@ -226,7 +226,11 @@ zfs_vop_fsync(vnode_t *vp) struct mount *mp; int error; +#if __FreeBSD_version < 1400068 if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) +#else + if ((error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH)) != 0) +#endif goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(vp, MNT_WAIT, curthread); From a2705b1dd5f8d186db02091b96efdd5f87e38090 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Fri, 23 Sep 2022 10:24:19 -0700 Subject: [PATCH 37/39] zpool: Don't print "repairing" on force faulted drives If you force fault a drive that's resilvering, its scan stats can get frozen in time, giving the false impression that it's being resilvered. This commit checks the vdev state to see if the vdev is healthy before reporting "resilvering" or "repairing" in zpool status. 
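The guard described above can be distilled into a small standalone illustration; the enums and the report_scan() helper below are simplified stand-ins assumed for this sketch, not the actual zpool interfaces:

#include <stdbool.h>
#include <stdio.h>

enum vdev_state { VDEV_FAULTED, VDEV_HEALTHY };
enum scan_state { SCAN_NONE, SCAN_SCANNING };

/*
 * Only report scan activity for a healthy leaf vdev, so frozen scan
 * stats on a force-faulted drive are not shown as "(resilvering)".
 */
static bool
report_scan(enum scan_state ss, unsigned children, enum vdev_state vs,
    unsigned long long processed)
{
	return (ss == SCAN_SCANNING && children == 0 &&
	    vs == VDEV_HEALTHY && processed != 0);
}

int
main(void)
{
	/* Faulted leaf with stale scan stats: not reported. */
	printf("%d\n", report_scan(SCAN_SCANNING, 0, VDEV_FAULTED, 4096));
	/* Healthy leaf actually being resilvered: reported. */
	printf("%d\n", report_scan(SCAN_SCANNING, 0, VDEV_HEALTHY, 4096));
	return (0);
}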
Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #13927 Closes #13930 --- cmd/zpool/zpool_main.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 3f46bd0513c9..54464731b52e 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2438,7 +2438,14 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, (void) nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); - if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0) { + /* + * If you force fault a drive that's resilvering, its scan stats can + * get frozen in time, giving the false impression that it's + * being resilvered. That's why we check the state to see if the vdev + * is healthy before reporting "resilvering" or "repairing". + */ + if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0 && + vs->vs_state == VDEV_STATE_HEALTHY) { if (vs->vs_scan_processed != 0) { (void) printf(gettext(" (%s)"), (ps->pss_func == POOL_SCAN_RESILVER) ? @@ -2450,7 +2457,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, /* The top-level vdevs have the rebuild stats */ if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE && - children == 0) { + children == 0 && vs->vs_state == VDEV_STATE_HEALTHY) { if (vs->vs_rebuild_processed != 0) { (void) printf(gettext(" (resilvering)")); } From 566e908fa01eb91e0637347987bc61772d47aee1 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 27 Sep 2022 15:36:58 -0400 Subject: [PATCH 38/39] Fix bad free in skein code Clang's static analyzer found a bad free caused by skein_mac_atomic(). It will allocate a context on the stack and then pass it to skein_final(), which attempts to free it. Upon inspection, skein_digest_atomic() also has the same problem. These functions were created to match the OpenSolaris ICP API, so I was curious how we avoided this in other providers and looked at the SHA2 code. It appears that SHA2 has a SHA2Final() helper function that is called by the exported sha2_mac_final()/sha2_digest_final() as well as the sha2_mac_atomic() and sha2_digest_atomic() functions. The real work is done in SHA2Final() while some checks and the free are done in sha2_mac_final()/sha2_digest_final(). We fix the use after free in the skein code by taking inspiration from the SHA2 code. We introduce a skein_final_nofree() that does most of the work, and make skein_final() into a function that calls it and then frees the memory. 
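The split described above, a finalizer that does the work without freeing plus a thin wrapper that frees the heap-allocated multi-part context, can be sketched as a standalone program; the toy digest and every name below are assumptions made for illustration and are not the ICP code:

#include <stdio.h>
#include <stdlib.h>

typedef struct toy_ctx { unsigned long acc; } toy_ctx_t;

static void
toy_update(toy_ctx_t *c, const char *buf, size_t len)
{
	for (size_t i = 0; i < len; i++)
		c->acc = c->acc * 31 + (unsigned char)buf[i];
}

/* Does the real work and never frees, so stack contexts are safe. */
static int
toy_final_nofree(toy_ctx_t *c, unsigned long *out)
{
	*out = c->acc;
	return (0);
}

/* Multi-part path: the context was heap-allocated, so free it here. */
static int
toy_final(toy_ctx_t *c, unsigned long *out)
{
	int err = toy_final_nofree(c, out);

	free(c);
	return (err);
}

/* One-shot ("atomic") path: stack context, so use the nofree variant. */
static int
toy_digest_atomic(const char *buf, size_t len, unsigned long *out)
{
	toy_ctx_t c = { 0 };

	toy_update(&c, buf, len);
	/* Calling toy_final() here would free stack memory. */
	return (toy_final_nofree(&c, out));
}

int
main(void)
{
	toy_ctx_t *c = calloc(1, sizeof (*c));
	unsigned long d1, d2;

	if (c == NULL)
		return (1);
	toy_update(c, "abc", 3);
	(void) toy_final(c, &d1);		/* heap context is freed */
	(void) toy_digest_atomic("abc", 3, &d2);	/* stack context is not */
	printf("%lu %lu\n", d1, d2);
	return (0);
}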
Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Signed-off-by: Richard Yao Closes #13954 --- module/icp/io/skein_mod.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/module/icp/io/skein_mod.c b/module/icp/io/skein_mod.c index 5ee36af12bcb..8992c5895e5b 100644 --- a/module/icp/io/skein_mod.c +++ b/module/icp/io/skein_mod.c @@ -494,7 +494,8 @@ skein_update(crypto_ctx_t *ctx, crypto_data_t *data, crypto_req_handle_t req) */ /*ARGSUSED*/ static int -skein_final(crypto_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req) +skein_final_nofree(crypto_ctx_t *ctx, crypto_data_t *digest, + crypto_req_handle_t req) { int error = CRYPTO_SUCCESS; @@ -525,6 +526,17 @@ skein_final(crypto_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req) else digest->cd_length = 0; + return (error); +} + +static int +skein_final(crypto_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req) +{ + int error = skein_final_nofree(ctx, digest, req); + + if (error == CRYPTO_BUFFER_TOO_SMALL) + return (error); + bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); kmem_free(SKEIN_CTX(ctx), sizeof (*(SKEIN_CTX(ctx)))); SKEIN_CTX_LVALUE(ctx) = NULL; @@ -560,7 +572,7 @@ skein_digest_atomic(crypto_provider_handle_t provider, if ((error = skein_update(&ctx, data, digest)) != CRYPTO_SUCCESS) goto out; - if ((error = skein_final(&ctx, data, digest)) != CRYPTO_SUCCESS) + if ((error = skein_final_nofree(&ctx, data, digest)) != CRYPTO_SUCCESS) goto out; out: @@ -669,7 +681,7 @@ skein_mac_atomic(crypto_provider_handle_t provider, if ((error = skein_update(&ctx, data, req)) != CRYPTO_SUCCESS) goto errout; - if ((error = skein_final(&ctx, mac, req)) != CRYPTO_SUCCESS) + if ((error = skein_final_nofree(&ctx, mac, req)) != CRYPTO_SUCCESS) goto errout; return (CRYPTO_SUCCESS); From 6a6bd493988c75331deab06e5352a9bed035a87d Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 19 Sep 2022 09:50:46 -0700 Subject: [PATCH 39/39] Tag zfs-2.1.6 META file and changelog updated. Signed-off-by: Tony Hutter --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index baf85fe89de0..7dd5b311d0c2 100644 --- a/META +++ b/META @@ -1,7 +1,7 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.1.5 +Version: 2.1.6 Release: 1 Release-Tags: relext License: CDDL