From 8131a96544fd19411c23424af85140e91e41c001 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Mon, 12 Sep 2022 14:22:15 -0400 Subject: [PATCH 01/39] Fix use-after-free in btree code Coverty static analysis found these. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Neal Gompa Signed-off-by: Richard Yao Closes #10989 Closes #13861 --- module/zfs/btree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/btree.c b/module/zfs/btree.c index 03c46473c1ec..36755f97929c 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -1608,8 +1608,8 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1); new_rm_hdr->bth_count = 0; - zfs_btree_node_destroy(tree, new_rm_hdr); zfs_btree_remove_from_node(tree, parent, new_rm_hdr); + zfs_btree_node_destroy(tree, new_rm_hdr); } /* Remove the element at the specific location. */ @@ -1817,10 +1817,10 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) /* Move our elements to the left neighbor. */ bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1); - zfs_btree_node_destroy(tree, rm_hdr); /* Remove the emptied node from the parent. */ zfs_btree_remove_from_node(tree, parent, rm_hdr); + zfs_btree_node_destroy(tree, rm_hdr); zfs_btree_verify(tree); } From e1b49e3f1d53038a8c2bbd4a297988b5c850aab4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Neal=20Gompa=20=28=E3=83=8B=E3=83=BC=E3=83=AB=E3=83=BB?= =?UTF-8?q?=E3=82=B4=E3=83=B3=E3=83=91=29?= Date: Tue, 24 May 2022 17:07:01 -0400 Subject: [PATCH 02/39] rpm: Use the correct version-release information in dependencies This tightly links the subpackages together and ensures that everything is upgraded together. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Neal Gompa Closes #13489 --- rpm/generic/zfs.spec.in | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 1cd3f6b520ea..c2894aa7bb80 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -120,12 +120,12 @@ License: @ZFS_META_LICENSE@ URL: https://github.com/openzfs/zfs Source0: %{name}-%{version}.tar.gz BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) -Requires: libzpool5 = %{version} -Requires: libnvpair3 = %{version} -Requires: libuutil3 = %{version} -Requires: libzfs5 = %{version} +Requires: libzpool5%{?_isa} = %{version}-%{release} +Requires: libnvpair3%{?_isa} = %{version}-%{release} +Requires: libuutil3%{?_isa} = %{version}-%{release} +Requires: libzfs5%{?_isa} = %{version}-%{release} Requires: %{name}-kmod = %{version} -Provides: %{name}-kmod-common = %{version} +Provides: %{name}-kmod-common = %{version}-%{release} Obsoletes: spl # zfs-fuse provides the same commands and man pages that OpenZFS does. 
@@ -255,13 +255,13 @@ This package provides support for managing ZFS filesystems %package -n libzfs5-devel Summary: Development headers Group: System Environment/Kernel -Requires: libzfs5 = %{version} -Requires: libzpool5 = %{version} -Requires: libnvpair3 = %{version} -Requires: libuutil3 = %{version} -Provides: libzpool5-devel -Provides: libnvpair3-devel -Provides: libuutil3-devel +Requires: libzfs5%{?_isa} = %{version}-%{release} +Requires: libzpool5%{?_isa} = %{version}-%{release} +Requires: libnvpair3%{?_isa} = %{version}-%{release} +Requires: libuutil3%{?_isa} = %{version}-%{release} +Provides: libzpool5-devel = %{version}-%{release} +Provides: libnvpair3-devel = %{version}-%{release} +Provides: libuutil3-devel = %{version}-%{release} Obsoletes: zfs-devel Obsoletes: libzfs2-devel Obsoletes: libzfs4-devel @@ -313,8 +313,8 @@ Summary: Python %{python_version} wrapper for libzfs_core Group: Development/Languages/Python License: Apache-2.0 BuildArch: noarch -Requires: libzfs5 = %{version} -Requires: libnvpair3 = %{version} +Requires: libzfs5 = %{version}-%{release} +Requires: libnvpair3 = %{version}-%{release} Requires: libffi Requires: python%{__python_pkg_version} Requires: %{__python_cffi_pkg} @@ -339,7 +339,6 @@ This package provides a python wrapper for the libzfs_core C library. Summary: Initramfs module Group: System Environment/Kernel Requires: %{name}%{?_isa} = %{version}-%{release} -Requires: %{name} = %{version}-%{release} Requires: initramfs-tools %description initramfs From f48d9b426919c8c8bce8d075e05f59cce9053728 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 11 Jul 2022 11:35:01 -0700 Subject: [PATCH 03/39] rpm: Silence "unversioned Obsoletes" warnings on EL 9 Get rid of RPM warnings on AlmaLinux 9: "It's not recommended to have unversioned Obsoletes" Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #13584 Closes #13638 --- rpm/generic/zfs-dkms.spec.in | 2 +- rpm/generic/zfs.spec.in | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in index 55f0f1cf5249..920b90e88912 100644 --- a/rpm/generic/zfs-dkms.spec.in +++ b/rpm/generic/zfs-dkms.spec.in @@ -31,7 +31,7 @@ Requires(post): gcc, make, perl, diffutils %if 0%{?rhel}%{?fedora}%{?mageia}%{?suse_version} Requires: kernel-devel >= @ZFS_META_KVER_MIN@, kernel-devel <= @ZFS_META_KVER_MAX@.999 Requires(post): kernel-devel >= @ZFS_META_KVER_MIN@, kernel-devel <= @ZFS_META_KVER_MAX@.999 -Obsoletes: spl-dkms +Obsoletes: spl-dkms <= %{version} %endif Provides: %{module}-kmod = %{version} AutoReqProv: no diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index c2894aa7bb80..8cab1c3d70bb 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -126,7 +126,7 @@ Requires: libuutil3%{?_isa} = %{version}-%{release} Requires: libzfs5%{?_isa} = %{version}-%{release} Requires: %{name}-kmod = %{version} Provides: %{name}-kmod-common = %{version}-%{release} -Obsoletes: spl +Obsoletes: spl <= %{version} # zfs-fuse provides the same commands and man pages that OpenZFS does. # Renaming those on either side would conflict with all available documentation. @@ -178,8 +178,8 @@ This package contains the core ZFS command line utilities. 
%package -n libzpool5 Summary: Native ZFS pool library for Linux Group: System Environment/Kernel -Obsoletes: libzpool2 -Obsoletes: libzpool4 +Obsoletes: libzpool2 <= %{version} +Obsoletes: libzpool4 <= %{version} %description -n libzpool5 This package contains the zpool library, which provides support @@ -195,7 +195,7 @@ for managing zpools %package -n libnvpair3 Summary: Solaris name-value library for Linux Group: System Environment/Kernel -Obsoletes: libnvpair1 +Obsoletes: libnvpair1 <= %{version} %description -n libnvpair3 This package contains routines for packing and unpacking name-value @@ -213,7 +213,7 @@ to write self describing data structures on disk. %package -n libuutil3 Summary: Solaris userland utility library for Linux Group: System Environment/Kernel -Obsoletes: libuutil1 +Obsoletes: libuutil1 <= %{version} %description -n libuutil3 This library provides a variety of compatibility functions for OpenZFS: @@ -239,8 +239,8 @@ This library provides a variety of compatibility functions for OpenZFS: %package -n libzfs5 Summary: Native ZFS filesystem library for Linux Group: System Environment/Kernel -Obsoletes: libzfs2 -Obsoletes: libzfs4 +Obsoletes: libzfs2 <= %{version} +Obsoletes: libzfs4 <= %{version} %description -n libzfs5 This package provides support for managing ZFS filesystems @@ -262,9 +262,9 @@ Requires: libuutil3%{?_isa} = %{version}-%{release} Provides: libzpool5-devel = %{version}-%{release} Provides: libnvpair3-devel = %{version}-%{release} Provides: libuutil3-devel = %{version}-%{release} -Obsoletes: zfs-devel -Obsoletes: libzfs2-devel -Obsoletes: libzfs4-devel +Obsoletes: zfs-devel <= %{version} +Obsoletes: libzfs2-devel <= %{version} +Obsoletes: libzfs4-devel <= %{version} %description -n libzfs5-devel This package contains the header files needed for building additional From acd74646399c7b40ab5f54ea2f286ed9c9edaa64 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 11 Jul 2022 13:35:19 -0700 Subject: [PATCH 04/39] zed: Ignore false 'atari' partitions in autoreplace libudev will sometimes falsely identify an 'atari' partition on a blank disk, preventing it from being used in an autoreplace. This seems to be a known issue. The workaround is to just ignore the fake partition and continue with the autoreplace. 
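Condensed into a hypothetical standalone check (the function, names and sample values are illustrative only, not zed code; the actual change to zed_udev_monitor() is in the diff below), the skip decision this patch makes reduces to:

#include <stdio.h>
#include <string.h>

/*
 * Hypothetical condensation of the check added below: a DEVTYPE=disk
 * event that already carries a partition-table type is normally skipped
 * in favor of the later partition event, except when that type is the
 * spurious "atari" label reported on blank disks.
 */
static int
skip_disk_event(const char *devtype, const char *part_type)
{
	if (devtype == NULL || strcmp(devtype, "disk") != 0)
		return (0);		/* not a whole-disk event */
	if (part_type == NULL || part_type[0] == '\0')
		return (0);		/* no partition table: keep it */
	if (strcmp(part_type, "atari") == 0)
		return (0);		/* assumed false positive: keep it */
	return (1);			/* real partition table: skip it */
}

int
main(void)
{
	printf("blank disk:  skip=%d\n", skip_disk_event("disk", ""));
	printf("atari label: skip=%d\n", skip_disk_event("disk", "atari"));
	printf("gpt label:   skip=%d\n", skip_disk_event("disk", "gpt"));
	return (0);
}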
Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #13497 Closes #13632 --- cmd/zed/zed_disk_event.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c index 52b80d8c4c93..ed774effe256 100644 --- a/cmd/zed/zed_disk_event.c +++ b/cmd/zed/zed_disk_event.c @@ -208,6 +208,12 @@ zed_udev_monitor(void *arg) * if this is a disk and it is partitioned, then the * zfs label will reside in a DEVTYPE=partition and * we can skip passing this event + * + * Special case: Blank disks are sometimes reported with + * an erroneous 'atari' partition, and should not be + * excluded from being used as an autoreplace disk: + * + * https://github.com/openzfs/zfs/issues/13497 */ type = udev_device_get_property_value(dev, "DEVTYPE"); part = udev_device_get_property_value(dev, @@ -215,14 +221,23 @@ zed_udev_monitor(void *arg) if (type != NULL && type[0] != '\0' && strcmp(type, "disk") == 0 && part != NULL && part[0] != '\0') { - zed_log_msg(LOG_INFO, - "%s: skip %s since it has a %s partition already", - __func__, - udev_device_get_property_value(dev, "DEVNAME"), - part); - /* skip and wait for partition event */ - udev_device_unref(dev); - continue; + const char *devname = + udev_device_get_property_value(dev, "DEVNAME"); + + if (strcmp(part, "atari") == 0) { + zed_log_msg(LOG_INFO, + "%s: %s is reporting an atari partition, " + "but we're going to assume it's a false " + "positive and still use it (issue #13497)", + __func__, devname); + } else { + zed_log_msg(LOG_INFO, + "%s: skip %s since it has a %s partition " + "already", __func__, devname, part); + /* skip and wait for partition event */ + udev_device_unref(dev); + continue; + } } /* From 65f8f92d12c081aa67d6312b5210fa10518d7ebf Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 14 Jul 2022 10:19:37 -0700 Subject: [PATCH 05/39] zed: Look for NVMe DEVPATH if no ID_BUS We tried replacing an NVMe drive using autoreplace, only to see zed reject it with: zed[27955]: zed_udev_monitor: /dev/nvme5n1 no devid source This happened because ZED saw that ID_BUS was not set by udev for the NVMe drive, and thus didn't think it was "real drive". This commit allows NVMe drives to be autoreplaced even if ID_BUS is not set. Reviewed-by: Don Brady Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #13512 Closes #13646 --- cmd/zed/zed_disk_event.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c index ed774effe256..d9b355292ab9 100644 --- a/cmd/zed/zed_disk_event.c +++ b/cmd/zed/zed_disk_event.c @@ -169,7 +169,7 @@ zed_udev_monitor(void *arg) while (1) { struct udev_device *dev; const char *action, *type, *part, *sectors; - const char *bus, *uuid; + const char *bus, *uuid, *devpath; const char *class, *subclass; nvlist_t *nvl; boolean_t is_zfs = B_FALSE; @@ -263,10 +263,19 @@ zed_udev_monitor(void *arg) * device id string is required in the message schema * for matching with vdevs. Preflight here for expected * udev information. + * + * Special case: + * NVMe devices don't have ID_BUS set (at least on RHEL 7-8), + * but they are valid for autoreplace. 
Add a special case for + * them by searching for "/nvme/" in the udev DEVPATH: + * + * DEVPATH=/devices/pci0000:00/0000:00:1e.0/nvme/nvme2/nvme2n1 */ bus = udev_device_get_property_value(dev, "ID_BUS"); uuid = udev_device_get_property_value(dev, "DM_UUID"); - if (!is_zfs && (bus == NULL && uuid == NULL)) { + devpath = udev_device_get_devpath(dev); + if (!is_zfs && (bus == NULL && uuid == NULL && + strstr(devpath, "/nvme/") == NULL)) { zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid " "source", udev_device_get_devnode(dev)); udev_device_unref(dev); From b1be0a5c151bb8f7d85430465c57d85d9be48075 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 9 Aug 2022 13:26:46 -0700 Subject: [PATCH 06/39] ZTS: Fix zpool_expand_001_pos `zpool_expand_001_pos` was often failing due to not seeing autoexpand commands in the `zpool history`. During testing, I found this to be unreliable (sometimes the "online" wouldn't appear in `zpool history`) and unnecessary, as we could simply check that the pool increased in size. This commit revamps the test to check for the expanded pool size and corresponding new free space. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #13743 --- .../zpool_expand/zpool_expand_001_pos.ksh | 112 +++++++++--------- 1 file changed, 55 insertions(+), 57 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh index 6bbd46289f7c..8760f48dd2a4 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh @@ -54,6 +54,14 @@ verify_runnable "global" +# We override $org_size and $exp_size from zpool_expand.cfg to make sure we get +# an expected free space value every time. Otherwise, if we left it +# configurable, the free space ratio to pool size ratio would diverge too much +# much at low $org_size values. +# +org_size=$((1024 * 1024 * 1024)) +exp_size=$(($org_size * 2)) + function cleanup { poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 @@ -68,11 +76,35 @@ function cleanup unload_scsi_debug } +# Wait for the size of a pool to autoexpand to $1 and the total free space to +# expand to $2 (both values allowing a 10% tolerance). 
+# +# Wait for up to 10 seconds for this to happen (typically takes 1-2 seconds) +# +function wait_for_autoexpand +{ + typeset exp_new_size=$1 + typeset exp_new_free=$2 + + for i in $(seq 1 10) ; do + typeset new_size=$(get_pool_prop size $TESTPOOL1) + typeset new_free=$(get_prop avail $TESTPOOL1) + # Values need to be within 90% of each other (10% tolerance) + if within_percent $new_size $exp_new_size 90 > /dev/null && \ + within_percent $new_free $exp_new_free 90 > /dev/null ; then + return + fi + sleep 1 + done + log_fail "$TESTPOOL never expanded to $exp_new_size with $exp_new_free" \ + " free space (got $new_size with $new_free free space)" +} + log_onexit cleanup log_assert "zpool can be autoexpanded after set autoexpand=on on vdev expansion" -for type in " " mirror raidz draid:1s; do +for type in " " mirror raidz; do log_note "Setting up loopback, scsi_debug, and file vdevs" log_must truncate -s $org_size $FILE_LO DEV1=$(losetup -f) @@ -105,72 +137,38 @@ for type in " " mirror raidz draid:1s; do log_note "Expanding loopback, scsi_debug, and file vdevs" log_must truncate -s $exp_size $FILE_LO log_must losetup -c $DEV1 - sleep 3 echo "2" > /sys/bus/pseudo/drivers/scsi_debug/virtual_gb echo "1" > /sys/class/block/$DEV2/device/rescan block_device_wait - sleep 3 log_must truncate -s $exp_size $FILE_RAW log_must zpool online -e $TESTPOOL1 $FILE_RAW - typeset expand_size=$(get_pool_prop size $TESTPOOL1) - typeset zfs_expand_size=$(get_prop avail $TESTPOOL1) - - log_note "$TESTPOOL1 $type has previous size: $prev_size and " \ - "expanded size: $expand_size" - # compare available pool size from zfs - if [[ $zfs_expand_size -gt $zfs_prev_size ]]; then - # check for zpool history for the pool size expansion - if [[ $type == " " ]]; then - typeset expansion_size=$(($exp_size-$org_size)) - typeset size_addition=$(zpool history -il $TESTPOOL1 |\ - grep "pool '$TESTPOOL1' size:" | \ - grep "vdev online" | \ - grep "(+${expansion_size}" | wc -l) - - if [[ $size_addition -ne 3 ]]; then - log_fail "pool $TESTPOOL1 has not expanded, " \ - "$size_addition/3 vdevs expanded" - fi - elif [[ $type == "mirror" ]]; then - typeset expansion_size=$(($exp_size-$org_size)) - zpool history -il $TESTPOOL1 | \ - grep "pool '$TESTPOOL1' size:" | \ - grep "vdev online" | \ - grep "(+${expansion_size})" >/dev/null 2>&1 - - if [[ $? -ne 0 ]] ; then - log_fail "pool $TESTPOOL1 has not expanded" - fi - elif [[ $type == "draid:1s" ]]; then - typeset expansion_size=$((2*($exp_size-$org_size))) - zpool history -il $TESTPOOL1 | \ - grep "pool '$TESTPOOL1' size:" | \ - grep "vdev online" | \ - grep "(+${expansion_size})" >/dev/null 2>&1 - - if [[ $? -ne 0 ]]; then - log_fail "pool $TESTPOOL has not expanded" - fi - else - typeset expansion_size=$((3*($exp_size-$org_size))) - zpool history -il $TESTPOOL1 | \ - grep "pool '$TESTPOOL1' size:" | \ - grep "vdev online" | \ - grep "(+${expansion_size})" >/dev/null 2>&1 - - if [[ $? -ne 0 ]]; then - log_fail "pool $TESTPOOL has not expanded" - fi - fi - else - log_fail "pool $TESTPOOL1 is not autoexpanded after vdev " \ - "expansion. Previous size: $zfs_prev_size and expanded " \ - "size: $zfs_expand_size" + + # The expected free space values below were observed at the time of + # this commit. However, we know ZFS overhead will change over time, + # and thus we do not do an exact comparison to these values in + # wait_for_autoexpand. Rather, we make sure the free space + # is within some small percentage threshold of these values. 
+ typeset exp_new_size=$(($prev_size * 2)) + if [[ "$type" == " " ]] ; then + exp_new_free=6045892608 + elif [[ "$type" == "mirror" ]] ; then + exp_new_free=1945997312 + elif [[ "$type" == "raidz" ]] ; then + exp_new_free=3977637338 + elif [[ "$type" == "draid:1s" ]] then + exp_new_free=1946000384 fi + wait_for_autoexpand $exp_new_size $exp_new_free + + expand_size=$(get_pool_prop size $TESTPOOL1) + + log_note "$TESTPOOL1 '$type' grew from $prev_size -> $expand_size with" \ + "free space from $zfs_prev_size -> $(get_prop avail $TESTPOOL1)" + cleanup done From 15b64fbc94cc7aae1cef8787be2a36f677da92a7 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Fri, 26 Aug 2022 16:04:27 -0500 Subject: [PATCH 07/39] Importing from cachefile can trip assertion When importing from cachefile, it is possible that the builtin retry logic will trip an assertion because it also fails to find the pool. This fix addresses that case and returns the correct error message to the user. Reviewed-by: Richard Yao Reviewed-by: Serapheim Dimitropoulos Reviewed-by: Brian Behlendorf Signed-off-by: George Wilson Closes #13781 --- lib/libzutil/zutil_import.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index f6f125e7a5df..1658215199f2 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -1660,6 +1660,8 @@ zpool_find_import_cached(libpc_handle_t *hdl, importargs_t *iarg) * caller. */ nvpair_t *pair = nvlist_next_nvpair(nv, NULL); + if (pair == NULL) + continue; fnvlist_add_nvlist(pools, nvpair_name(pair), fnvpair_value_nvlist(pair)); From b6ebf270ebd2a202dae0e14d2f16a929cdfdd61c Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 2 Sep 2022 16:21:18 -0400 Subject: [PATCH 08/39] Apply arc_shrink_shift to ARC above arc_c_min It makes sense to free memory in smaller chunks when approaching arc_c_min to let other kernel subsystems to free more, since after that point we can't free anything. This also matches behavior on Linux, where to shrinker reported only the size above arc_c_min. 
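A rough worked example of the difference (the pool sizes and the 2 GiB floor below are assumptions chosen for illustration, not values taken from this patch): the old code freed arc_c >> arc_shrink_shift per step regardless of the floor, while the new code frees (arc_c - arc_c_min) >> arc_shrink_shift, so the reclaim step shrinks toward zero as the target approaches arc_c_min.

#include <stdio.h>
#include <stdint.h>

/* Illustrative arithmetic only; all sizes here are assumed example values. */
int
main(void)
{
	const int64_t GiB = 1LL << 30;
	const int64_t arc_c_min = 2 * GiB;
	const int arc_shrink_shift = 7;

	for (int64_t arc_c = 8 * GiB; arc_c > arc_c_min; arc_c -= 2 * GiB) {
		int64_t old_step = arc_c >> arc_shrink_shift;
		int64_t new_step = (arc_c - arc_c_min) >> arc_shrink_shift;
		printf("arc_c=%lld GiB: old step %lld MiB, new step %lld MiB\n",
		    (long long)(arc_c / GiB),
		    (long long)(old_step >> 20),
		    (long long)(new_step >> 20));
	}
	return (0);
}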
Reviewed-by: Ryan Moeller Reviewed-by: Allan Jude Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Closes #13794 --- module/os/freebsd/zfs/arc_os.c | 5 ++++- module/zfs/arc.c | 9 +++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index fddb1f0e87cb..77af092e1ed4 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -223,7 +223,10 @@ arc_lowmem(void *arg __unused, int howto __unused) arc_warm = B_TRUE; arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); free_memory = arc_available_memory(); - to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0); + int64_t can_free = arc_c - arc_c_min; + if (can_free <= 0) + return; + to_free = (can_free >> arc_shrink_shift) - MIN(free_memory, 0); DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free); arc_reduce_target_size(to_free); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 8d3882694718..215250ea6fec 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5036,10 +5036,11 @@ arc_reap_cb(void *arg, zthr_t *zthr) */ free_memory = arc_available_memory(); - int64_t to_free = - (arc_c >> arc_shrink_shift) - free_memory; - if (to_free > 0) { - arc_reduce_target_size(to_free); + int64_t can_free = arc_c - arc_c_min; + if (can_free > 0) { + int64_t to_free = (can_free >> arc_shrink_shift) - free_memory; + if (to_free > 0) + arc_reduce_target_size(to_free); } spl_fstrans_unmark(cookie); } From 78206a2e447a2973203aaebb92633a5e8e6d2d34 Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Tue, 9 Aug 2022 09:05:29 +0000 Subject: [PATCH 09/39] FreeBSD: Mark ZFS_MODULE_PARAM_CALL as MPSAFE ZFS_MODULE_PARAM_CALL handlers implement their own locking if needed and do not require Giant. Reviewed-by: Alexander Motin Signed-off-by: Ryan Moeller Closes #13756 --- include/os/freebsd/spl/sys/mod_os.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index 5695abee7b85..46ea2d15ac6e 100644 --- a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -52,7 +52,7 @@ #define ZFS_MODULE_PARAM_CALL_IMPL(parent, name, perm, args, desc) \ SYSCTL_DECL(parent); \ - SYSCTL_PROC(parent, OID_AUTO, name, perm | args, desc) + SYSCTL_PROC(parent, OID_AUTO, name, CTLFLAG_MPSAFE | perm | args, desc) #define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, func, _, perm, desc) \ ZFS_MODULE_PARAM_CALL_IMPL(_vfs_ ## scope_prefix, name, perm, func ## _args(name_prefix ## name), desc) From aa9e887d2a715e987c02c7bb57f88f63442c553b Mon Sep 17 00:00:00 2001 From: Samuel <50765275+npc203@users.noreply.github.com> Date: Tue, 6 Sep 2022 22:07:47 +0530 Subject: [PATCH 10/39] Fix column width in 'zpool iostat -v' and 'zpool list -v' This commit fixes a minor spacing issue caused when enumerating vdev names, which originated from #13031 Reviewed-by: Brian Behlendorf Reviewed-by: Akash B Signed-off-by: Samuel Wycliffe Closes #13811 --- cmd/zpool/zpool_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index b93a6196beea..b2e7dc4a5b4d 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -5458,8 +5458,8 @@ get_namewidth_iostat(zpool_handle_t *zhp, void *data) * get_namewidth() returns the maximum width of any name in that column * for any pool/vdev/device line that will be output. 
*/ - width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, - cb->cb_verbose); + width = get_namewidth(zhp, cb->cb_namewidth, + cb->cb_name_flags | VDEV_NAME_TYPE_ID, cb->cb_verbose); /* * The width we are calculating is the width of the header and also the @@ -6282,8 +6282,8 @@ get_namewidth_list(zpool_handle_t *zhp, void *data) list_cbdata_t *cb = data; int width; - width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, - cb->cb_verbose); + width = get_namewidth(zhp, cb->cb_namewidth, + cb->cb_name_flags | VDEV_NAME_TYPE_ID, cb->cb_verbose); if (width < 9) width = 9; From 2010c183bcc1a2edc2cb8ed5f2d065a35a891fec Mon Sep 17 00:00:00 2001 From: Walter Huf Date: Tue, 6 Sep 2022 10:02:18 -0700 Subject: [PATCH 11/39] Add xattr_handler support for Android kernels Some ARM BSPs run the Android kernel, which has a modified xattr_handler->get() function signature. This adds support to compile against these kernels. Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Walter Huf Closes #13824 --- config/kernel-xattr-handler.m4 | 29 +++++++++++++++++++- include/os/linux/kernel/linux/xattr_compat.h | 14 ++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/config/kernel-xattr-handler.m4 b/config/kernel-xattr-handler.m4 index 00b1e74a9ccb..b6cbfa155007 100644 --- a/config/kernel-xattr-handler.m4 +++ b/config/kernel-xattr-handler.m4 @@ -100,6 +100,19 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_GET], [ .get = get, }; ],[]) + + ZFS_LINUX_TEST_SRC([xattr_handler_get_dentry_inode_flags], [ + #include + + int get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, + size_t size, int flags) { return 0; } + static const struct xattr_handler + xops __attribute__ ((unused)) = { + .get = get, + }; + ],[]) ]) AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET], [ @@ -142,7 +155,21 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET], [ AC_DEFINE(HAVE_XATTR_GET_DENTRY, 1, [xattr_handler->get() wants dentry]) ],[ - ZFS_LINUX_TEST_ERROR([xattr get()]) + dnl # + dnl # Android API change, + dnl # The xattr_handler->get() callback was + dnl # changed to take dentry, inode and flags. + dnl # + AC_MSG_RESULT(no) + AC_MSG_CHECKING( + [whether xattr_handler->get() wants dentry and inode and flags]) + ZFS_LINUX_TEST_RESULT([xattr_handler_get_dentry_inode_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_XATTR_GET_DENTRY_INODE_FLAGS, 1, + [xattr_handler->get() wants dentry and inode and flags]) + ],[ + ZFS_LINUX_TEST_ERROR([xattr get()]) + ]) ]) ]) ]) diff --git a/include/os/linux/kernel/linux/xattr_compat.h b/include/os/linux/kernel/linux/xattr_compat.h index 54690727eab9..30403fe87397 100644 --- a/include/os/linux/kernel/linux/xattr_compat.h +++ b/include/os/linux/kernel/linux/xattr_compat.h @@ -115,6 +115,20 @@ fn(struct dentry *dentry, const char *name, void *buffer, size_t size, \ { \ return (__ ## fn(dentry->d_inode, name, buffer, size)); \ } +/* + * Android API change, + * The xattr_handler->get() callback was changed to take a dentry and inode + * and flags, because the dentry might not be attached to an inode yet. 
+ */ +#elif defined(HAVE_XATTR_GET_DENTRY_INODE_FLAGS) +#define ZPL_XATTR_GET_WRAPPER(fn) \ +static int \ +fn(const struct xattr_handler *handler, struct dentry *dentry, \ + struct inode *inode, const char *name, void *buffer, \ + size_t size, int flags) \ +{ \ + return (__ ## fn(inode, name, buffer, size)); \ +} #else #error "Unsupported kernel" #endif From 7bbfac9d04dc06ba339be0540fd50e5166df8516 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 8 Sep 2022 10:32:30 -0700 Subject: [PATCH 12/39] zed: Fix config_sync autoexpand flood Users were seeing floods of `config_sync` events when autoexpand was enabled. This happened because all "disk status change" udev events invoke the autoexpand codepath, which calls zpool_relabel_disk(), which in turn cause another "disk status change" event to happen, in a feedback loop. Note that "disk status change" happens every time a user calls close() on a block device. This commit breaks the feedback loop by only allowing an autoexpand to happen if the disk actually changed size. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes: #7132 Closes: #7366 Closes #13729 --- cmd/zed/agents/zfs_mod.c | 155 +++++++++++++++++++++++++++++++++++-- cmd/zed/zed_disk_event.c | 16 ++++ include/sys/sysevent/dev.h | 3 + 3 files changed, 166 insertions(+), 8 deletions(-) diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index a510d646e1f9..a4e23ca1a3b0 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -894,14 +894,90 @@ zfs_deliver_check(nvlist_t *nvl) return (0); } +/* + * Given a path to a vdev, lookup the vdev's physical size from its + * config nvlist. + * + * Returns the vdev's physical size in bytes on success, 0 on error. + */ +static uint64_t +vdev_size_from_config(zpool_handle_t *zhp, const char *vdev_path) +{ + nvlist_t *nvl = NULL; + boolean_t avail_spare, l2cache, log; + vdev_stat_t *vs = NULL; + uint_t c; + + nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log); + if (!nvl) + return (0); + + verify(nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + if (!vs) { + zed_log_msg(LOG_INFO, "%s: no nvlist for '%s'", __func__, + vdev_path); + return (0); + } + + return (vs->vs_pspace); +} + +/* + * Given a path to a vdev, lookup if the vdev is a "whole disk" in the + * config nvlist. "whole disk" means that ZFS was passed a whole disk + * at pool creation time, which it partitioned up and has full control over. + * Thus a partition with wholedisk=1 set tells us that zfs created the + * partition at creation time. A partition without whole disk set would have + * been created by externally (like with fdisk) and passed to ZFS. + * + * Returns the whole disk value (either 0 or 1). + */ +static uint64_t +vdev_whole_disk_from_config(zpool_handle_t *zhp, const char *vdev_path) +{ + nvlist_t *nvl = NULL; + boolean_t avail_spare, l2cache, log; + uint64_t wholedisk; + + nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log); + if (!nvl) + return (0); + + verify(nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk) == 0); + + return (wholedisk); +} + +/* + * If the device size grew more than 1% then return true. 
+ */ +#define DEVICE_GREW(oldsize, newsize) \ + ((newsize > oldsize) && \ + ((newsize / (newsize - oldsize)) <= 100)) + static int zfsdle_vdev_online(zpool_handle_t *zhp, void *data) { - char *devname = data; boolean_t avail_spare, l2cache; + nvlist_t *udev_nvl = data; nvlist_t *tgt; int error; + char *tmp_devname, devname[MAXPATHLEN]; + uint64_t guid; + + if (nvlist_lookup_uint64(udev_nvl, ZFS_EV_VDEV_GUID, &guid) == 0) { + sprintf(devname, "%llu", (u_longlong_t)guid); + } else if (nvlist_lookup_string(udev_nvl, DEV_PHYS_PATH, + &tmp_devname) == 0) { + strlcpy(devname, tmp_devname, MAXPATHLEN); + zfs_append_partition(devname, MAXPATHLEN); + } else { + zed_log_msg(LOG_INFO, "%s: no guid or physpath", __func__); + } + zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'", devname, zpool_get_name(zhp)); @@ -953,12 +1029,75 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data) vdev_state_t newstate; if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { - error = zpool_vdev_online(zhp, fullpath, 0, - &newstate); - zed_log_msg(LOG_INFO, "zfsdle_vdev_online: " - "setting device '%s' to ONLINE state " - "in pool '%s': %d", fullpath, - zpool_get_name(zhp), error); + /* + * If this disk size has not changed, then + * there's no need to do an autoexpand. To + * check we look at the disk's size in its + * config, and compare it to the disk size + * that udev is reporting. + */ + uint64_t udev_size = 0, conf_size = 0, + wholedisk = 0, udev_parent_size = 0; + + /* + * Get the size of our disk that udev is + * reporting. + */ + if (nvlist_lookup_uint64(udev_nvl, DEV_SIZE, + &udev_size) != 0) { + udev_size = 0; + } + + /* + * Get the size of our disk's parent device + * from udev (where sda1's parent is sda). + */ + if (nvlist_lookup_uint64(udev_nvl, + DEV_PARENT_SIZE, &udev_parent_size) != 0) { + udev_parent_size = 0; + } + + conf_size = vdev_size_from_config(zhp, + fullpath); + + wholedisk = vdev_whole_disk_from_config(zhp, + fullpath); + + /* + * Only attempt an autoexpand if the vdev size + * changed. There are two different cases + * to consider. + * + * 1. wholedisk=1 + * If you do a 'zpool create' on a whole disk + * (like /dev/sda), then zfs will create + * partitions on the disk (like /dev/sda1). In + * that case, wholedisk=1 will be set in the + * partition's nvlist config. So zed will need + * to see if your parent device (/dev/sda) + * expanded in size, and if so, then attempt + * the autoexpand. + * + * 2. wholedisk=0 + * If you do a 'zpool create' on an existing + * partition, or a device that doesn't allow + * partitions, then wholedisk=0, and you will + * simply need to check if the device itself + * expanded in size. 
+ */ + if (DEVICE_GREW(conf_size, udev_size) || + (wholedisk && DEVICE_GREW(conf_size, + udev_parent_size))) { + error = zpool_vdev_online(zhp, fullpath, + 0, &newstate); + + zed_log_msg(LOG_INFO, + "%s: autoexpanding '%s' from %llu" + " to %llu bytes in pool '%s': %d", + __func__, fullpath, conf_size, + MAX(udev_size, udev_parent_size), + zpool_get_name(zhp), error); + } } } zpool_close(zhp); @@ -989,7 +1128,7 @@ zfs_deliver_dle(nvlist_t *nvl) zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath"); } - if (zpool_iter(g_zfshdl, zfsdle_vdev_online, name) != 1) { + if (zpool_iter(g_zfshdl, zfsdle_vdev_online, nvl) != 1) { zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not " "found", name); return (1); diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c index d9b355292ab9..e31ec4cfc7e7 100644 --- a/cmd/zed/zed_disk_event.c +++ b/cmd/zed/zed_disk_event.c @@ -78,6 +78,8 @@ zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl) zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval); if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0) zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval); + if (nvlist_lookup_uint64(nvl, DEV_PARENT_SIZE, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_PARENT_SIZE, numval); if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0) zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval); if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0) @@ -130,6 +132,20 @@ dev_event_nvlist(struct udev_device *dev) numval *= strtoull(value, NULL, 10); (void) nvlist_add_uint64(nvl, DEV_SIZE, numval); + + /* + * If the device has a parent, then get the parent block + * device's size as well. For example, /dev/sda1's parent + * is /dev/sda. + */ + struct udev_device *parent_dev = udev_device_get_parent(dev); + if ((value = udev_device_get_sysattr_value(parent_dev, "size")) + != NULL) { + uint64_t numval = DEV_BSIZE; + + numval *= strtoull(value, NULL, 10); + (void) nvlist_add_uint64(nvl, DEV_PARENT_SIZE, numval); + } } /* diff --git a/include/sys/sysevent/dev.h b/include/sys/sysevent/dev.h index 1117538d822d..2418bbad469d 100644 --- a/include/sys/sysevent/dev.h +++ b/include/sys/sysevent/dev.h @@ -244,6 +244,9 @@ extern "C" { #define DEV_PATH "path" #define DEV_IS_PART "is_slice" #define DEV_SIZE "dev_size" + +/* Size of the whole parent block device (if dev is a partition) */ +#define DEV_PARENT_SIZE "dev_parent_size" #endif /* __linux__ */ #define EV_V1 1 From c8f795ba53acfe0239bfa5d75f64dce8e390a992 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Thu, 23 Dec 2021 21:26:19 +0100 Subject: [PATCH 13/39] config: check for parallel(1), use it for cstyle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before: $ time make cstyle real 0m23.118s user 0m23.002s sys 0m0.114s After: $ time make cstyle real 0m4.577s user 0m31.487s sys 0m0.699s Reviewed-by: Brian Behlendorf Signed-off-by: Ahelenia Ziemiańska Issue #12899 --- Makefile.am | 7 ++++++- config/always-parallel.m4 | 8 ++++++++ config/zfs-build.m4 | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 config/always-parallel.m4 diff --git a/Makefile.am b/Makefile.am index 7e2b10b39dee..36d8cd2d6fb8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -114,6 +114,11 @@ commitcheck: ${top_srcdir}/scripts/commitcheck.sh; \ fi +if HAVE_PARALLEL +cstyle_line = -print0 | parallel -X0 ${top_srcdir}/scripts/cstyle.pl -cpP {} +else +cstyle_line = -exec ${top_srcdir}/scripts/cstyle.pl -cpP {} + 
+endif PHONY += cstyle cstyle: @find ${top_srcdir} -name build -prune \ @@ -122,7 +127,7 @@ cstyle: ! -name 'opt_global.h' ! -name '*_if*.h' \ ! -name 'zstd_compat_wrapper.h' \ ! -path './module/zstd/lib/*' \ - -exec ${top_srcdir}/scripts/cstyle.pl -cpP {} \+ + $(cstyle_line) filter_executable = -exec test -x '{}' \; -print diff --git a/config/always-parallel.m4 b/config/always-parallel.m4 new file mode 100644 index 000000000000..c1f1ae78e7e7 --- /dev/null +++ b/config/always-parallel.m4 @@ -0,0 +1,8 @@ +dnl # +dnl # Check if GNU parallel is available. +dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PARALLEL], [ + AC_CHECK_PROG([PARALLEL], [parallel], [yes]) + + AM_CONDITIONAL([HAVE_PARALLEL], [test "x$PARALLEL" = "xyes"]) +]) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 8ca596ecf06b..bd8e3ac80201 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -226,6 +226,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ ZFS_AC_CONFIG_ALWAYS_SED ZFS_AC_CONFIG_ALWAYS_CPPCHECK ZFS_AC_CONFIG_ALWAYS_SHELLCHECK + ZFS_AC_CONFIG_ALWAYS_PARALLEL ]) AC_DEFUN([ZFS_AC_CONFIG], [ From 8bd3dca9bf3e9a4315d58be316bcfaf8e76c6a6a Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Thu, 11 Nov 2021 21:52:16 +0100 Subject: [PATCH 14/39] Introduce a tunable to exclude special class buffers from L2ARC Special allocation class or dedup vdevs may have roughly the same performance as L2ARC vdevs. Introduce a new tunable to exclude those buffers from being cacheable on L2ARC. Reviewed-by: Don Brady Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #11761 Closes #12285 --- include/sys/arc.h | 1 + include/sys/dbuf.h | 11 +------ include/sys/dmu_objset.h | 4 --- man/man4/zfs.4 | 5 +++ module/zfs/arc.c | 12 +++++++ module/zfs/dbuf.c | 71 +++++++++++++++++++++++++++++++++++++--- module/zfs/dmu.c | 2 +- module/zfs/dmu_objset.c | 34 +++++++++++++++++-- 8 files changed, 119 insertions(+), 21 deletions(-) diff --git a/include/sys/arc.h b/include/sys/arc.h index a3241f3685a6..5d8176894e60 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -85,6 +85,7 @@ typedef void arc_prune_func_t(int64_t bytes, void *priv); /* Shared module parameters */ extern int zfs_arc_average_blocksize; +extern int l2arc_exclude_special; /* generic arc_done_func_t's which you can use */ arc_read_done_func_t arc_bcopy_func; diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 93d80066be82..2e7385113ec5 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -441,16 +441,7 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg) (dbuf_is_metadata(_db) && \ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) -#define DBUF_IS_L2CACHEABLE(_db) \ - ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ - (dbuf_is_metadata(_db) && \ - ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) - -#define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level) \ - ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || \ - (((_level) > 0 || \ - DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) && \ - ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA))) +boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db); #ifdef ZFS_DEBUG diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index e89ee64ea686..7ade2dc91247 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -200,10 +200,6 @@ struct objset { #define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode) #define DMU_PROJECTUSED_DNODE(os) ((os)->os_projectused_dnode.dnh_dnode) -#define DMU_OS_IS_L2CACHEABLE(os) \ - 
((os)->os_secondary_cache == ZFS_CACHE_ALL || \ - (os)->os_secondary_cache == ZFS_CACHE_METADATA) - /* called from zpl */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 6495e9b4cd20..c32dd4b1b27f 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -109,6 +109,11 @@ A value of .Sy 100 disables this feature. . +.It Sy l2arc_exclude_special Ns = Ns Sy 0 Ns | Ns 1 Pq int +Controls whether buffers present on special vdevs are eligibile for caching +into L2ARC. +If set to 1, exclude dbufs on special vdevs from being cached to L2ARC. +. .It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq int Controls whether only MFU metadata and data are cached from ARC into L2ARC. This may be desired to avoid wasting space on L2ARC when reading/writing large diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 215250ea6fec..0ba366f1858f 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -877,6 +877,14 @@ static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, #define l2arc_hdr_arcstats_decrement_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE) +/* + * l2arc_exclude_special : A zfs module parameter that controls whether buffers + * present on special vdevs are eligibile for caching in L2ARC. If + * set to 1, exclude dbufs on special vdevs from being cached to + * L2ARC. + */ +int l2arc_exclude_special = 0; + /* * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU * metadata and data are cached from ARC into L2ARC. @@ -11136,6 +11144,10 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, "Cache only MFU data from ARC into L2ARC"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW, + "If set to 1 exclude dbufs on special vdevs from being cached to " + "L2ARC."); + ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes"); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index e687d96501ed..1a022c8b8a07 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -53,6 +53,7 @@ #include #include #include +#include kstat_t *dbuf_ksp; @@ -594,6 +595,68 @@ dbuf_is_metadata(dmu_buf_impl_t *db) } } +/* + * We want to exclude buffers that are on a special allocation class from + * L2ARC. 
+ */ +boolean_t +dbuf_is_l2cacheable(dmu_buf_impl_t *db) +{ + vdev_t *vd = NULL; + zfs_cache_type_t cache = db->db_objset->os_secondary_cache; + blkptr_t *bp = db->db_blkptr; + + if (bp != NULL && !BP_IS_HOLE(bp)) { + uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); + vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev; + + if (vdev < rvd->vdev_children) + vd = rvd->vdev_child[vdev]; + + if (cache == ZFS_CACHE_ALL || + (dbuf_is_metadata(db) && cache == ZFS_CACHE_METADATA)) { + if (vd == NULL) + return (B_TRUE); + + if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || + l2arc_exclude_special == 0) + return (B_TRUE); + } + } + + return (B_FALSE); +} + +static inline boolean_t +dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level) +{ + vdev_t *vd = NULL; + zfs_cache_type_t cache = dn->dn_objset->os_secondary_cache; + + if (bp != NULL && !BP_IS_HOLE(bp)) { + uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); + vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev; + + if (vdev < rvd->vdev_children) + vd = rvd->vdev_child[vdev]; + + if (cache == ZFS_CACHE_ALL || ((level > 0 || + DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)) && + cache == ZFS_CACHE_METADATA)) { + if (vd == NULL) + return (B_TRUE); + + if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || + l2arc_exclude_special == 0) + return (B_TRUE); + } + } + + return (B_FALSE); +} + /* * This function *must* return indices evenly distributed between all @@ -1523,7 +1586,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, DTRACE_SET_STATE(db, "read issued"); mutex_exit(&db->db_mtx); - if (DBUF_IS_L2CACHEABLE(db)) + if (dbuf_is_l2cacheable(db)) aflags |= ARC_FLAG_L2CACHE; dbuf_add_ref(db, NULL); @@ -3372,7 +3435,7 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, dpa->dpa_arg = arg; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) + if (dnode_level_is_l2cacheable(&bp, dn, level)) dpa->dpa_aflags |= ARC_FLAG_L2CACHE; /* @@ -3390,7 +3453,7 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) + if (dnode_level_is_l2cacheable(&bp, dn, level)) iter_aflags |= ARC_FLAG_L2CACHE; SET_BOOKMARK(&zb, ds != NULL ? 
ds->ds_object : DMU_META_OBJSET, @@ -4989,7 +5052,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) children_ready_cb = dbuf_write_children_ready; dr->dr_zio = arc_write(pio, os->os_spa, txg, - &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), + &dr->dr_bp_copy, data, dbuf_is_l2cacheable(db), &zp, dbuf_write_ready, children_ready_cb, dbuf_write_physdone, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 4e7127bd1bab..e38c9b452a28 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1846,7 +1846,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, - zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), + zgd->zgd_bp, dr->dt.dl.dr_data, dbuf_is_l2cacheable(db), &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index b9380890230c..a8975797e8af 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -63,6 +63,8 @@ #include #include #include "zfs_namecheck.h" +#include +#include /* * Needed to close a window in dnode_move() that allows the objset to be freed @@ -411,6 +413,34 @@ dnode_multilist_index_func(multilist_t *ml, void *obj) multilist_get_num_sublists(ml)); } +static inline boolean_t +dmu_os_is_l2cacheable(objset_t *os) +{ + vdev_t *vd = NULL; + zfs_cache_type_t cache = os->os_secondary_cache; + blkptr_t *bp = os->os_rootbp; + + if (bp != NULL && !BP_IS_HOLE(bp)) { + uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); + vdev_t *rvd = os->os_spa->spa_root_vdev; + + if (vdev < rvd->vdev_children) + vd = rvd->vdev_child[vdev]; + + if (cache == ZFS_CACHE_ALL || cache == ZFS_CACHE_METADATA) { + if (vd == NULL) + return (B_TRUE); + + if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || + l2arc_exclude_special == 0) + return (B_TRUE); + } + } + + return (B_FALSE); +} + /* * Instantiates the objset_t in-memory structure corresponding to the * objset_phys_t that's pointed to by the specified blkptr_t. @@ -453,7 +483,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - if (DMU_OS_IS_L2CACHEABLE(os)) + if (dmu_os_is_l2cacheable(os)) aflags |= ARC_FLAG_L2CACHE; if (ds != NULL && ds->ds_dir->dd_crypto_obj != 0) { @@ -1661,7 +1691,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) } zio = arc_write(pio, os->os_spa, tx->tx_txg, - blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), + blkptr_copy, os->os_phys_buf, dmu_os_is_l2cacheable(os), &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); From 03fa3ef264ab80296bad9b4ad54af8714a03df95 Mon Sep 17 00:00:00 2001 From: Akash B Date: Wed, 9 Mar 2022 05:50:41 +0530 Subject: [PATCH 15/39] Add physical device size to SIZE column in 'zpool list -v' Add physical device size/capacity only for physical devices in 'zpool list -v' instead of displaying "-" in the SIZE column. This would make it easier to see the individual device capacity and to determine which spares are large enough to replace which devices. 
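For callers that want the new value programmatically, a minimal sketch of reading vs_pspace, modeled directly on the vdev_size_from_config() helper added by patch 12 earlier in this series; the function name is hypothetical, the caller is assumed to already hold an open zpool_handle_t, and error handling plus the surrounding zpool_iter() loop are omitted:

#include <libzfs.h>

/*
 * Sketch only: look up a vdev by path and return its physical capacity
 * from the new vs_pspace stat field.
 */
static uint64_t
leaf_physical_size(zpool_handle_t *zhp, const char *vdev_path)
{
	boolean_t avail_spare, l2cache, log;
	vdev_stat_t *vs = NULL;
	nvlist_t *nvl;
	uint_t c;

	nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log);
	if (nvl == NULL || nvlist_lookup_uint64_array(nvl,
	    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &c) != 0 || vs == NULL)
		return (0);

	return (vs->vs_pspace);	/* stays 0 when the vdev is not a leaf */
}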
Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Reviewed-by: Dipak Ghosh Signed-off-by: Akash B Closes #12561 Closes #13106 --- cmd/zpool/zpool_main.c | 9 +++++++-- include/sys/fs/zfs.h | 1 + module/zfs/vdev.c | 1 + .../functional/pool_checkpoint/checkpoint_lun_expsz.ksh | 6 +++++- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index b2e7dc4a5b4d..3f46bd0513c9 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -6035,6 +6035,7 @@ print_one_column(zpool_prop_t prop, uint64_t value, const char *str, size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); switch (prop) { + case ZPOOL_PROP_SIZE: case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: case ZPOOL_PROP_DEDUPRATIO: @@ -6130,8 +6131,12 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, * 'toplevel' boolean value is passed to the print_one_column() * to indicate that the value is valid. */ - print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, NULL, scripted, - toplevel, format); + if (vs->vs_pspace) + print_one_column(ZPOOL_PROP_SIZE, vs->vs_pspace, NULL, + scripted, B_TRUE, format); + else + print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, NULL, + scripted, toplevel, format); print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 5bbac576df02..c8e199fc679c 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1102,6 +1102,7 @@ typedef struct vdev_stat { uint64_t vs_configured_ashift; /* TLV vdev_ashift */ uint64_t vs_logical_ashift; /* vdev_logical_ashift */ uint64_t vs_physical_ashift; /* vdev_physical_ashift */ + uint64_t vs_pspace; /* physical capacity */ } vdev_stat_t; /* BEGIN CSTYLED */ diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 636bb5005a1e..ccc35adc9f4b 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4406,6 +4406,7 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { + vs->vs_pspace = vd->vdev_psize; vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh index 59f64081a977..a18e634cefa7 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh @@ -48,14 +48,18 @@ log_must zpool checkpoint $NESTEDPOOL log_must truncate -s $EXPSZ $FILEDISK1 log_must zpool online -e $NESTEDPOOL $FILEDISK1 NEWSZ=$(zpool list -v | grep "$FILEDISK1" | awk '{print $2}') +DEXPSZ=$(zpool list -v | grep "$FILEDISK1" | awk '{print $6}') nested_change_state_after_checkpoint log_mustnot [ "$INITSZ" = "$NEWSZ" ] +log_must [ "$DEXPSZ" = "-" ] log_must zpool export $NESTEDPOOL log_must zpool import -d $FILEDISKDIR --rewind-to-checkpoint $NESTEDPOOL nested_verify_pre_checkpoint_state FINSZ=$(zpool list -v | grep "$FILEDISK1" | awk '{print $2}') -log_must [ "$INITSZ" = "$FINSZ" ] +DEXPSZ=$(zpool list -v | grep "$FILEDISK1" | awk '{print $6}') +log_must [ "$EXPSZ" = "$FINSZ" ] +log_must [ "$DEXPSZ" != "-" ] log_pass "LUN expansion rewinded correctly." 
From 3f7c174b50a3430adf68ae2f91146cc4206c5e0c Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Mon, 12 Sep 2022 15:51:17 -0400 Subject: [PATCH 16/39] vdev_draid_lookup_map() should not iterate outside draid_maps Coverity reported this as an out-of-bounds read. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Neal Gompa Signed-off-by: Richard Yao Closes #13865 --- module/zfs/vdev_draid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index db87e69f2057..7e654ca24d20 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -541,7 +541,7 @@ vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp) int vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp) { - for (int i = 0; i <= VDEV_DRAID_MAX_MAPS; i++) { + for (int i = 0; i < VDEV_DRAID_MAX_MAPS; i++) { if (draid_maps[i].dm_children == children) { *mapp = &draid_maps[i]; return (0); From cde04badd1d76e3af050fb0232d06af8550e8b8b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 8 Sep 2022 02:04:15 +0200 Subject: [PATCH 17/39] make DMU_OT_IS_METADATA and DMU_OT_IS_ENCRYPTED return B_TRUE or B_FALSE Without this patch, the ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); at the beginning of dbuf_assign_arcbuf can panic if the object type is a DMU_OT_NEWTYPE that has DMU_OT_METADATA set. While we're at it, fix DMU_OT_IS_ENCRYPTED as well. Reviewed-by: Richard Yao Reviewed-by: Alexander Motin Signed-off-by: Christian Schwarz Closes #13842 --- include/sys/dmu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 0cf4dbc9f925..070d27fde3a9 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -136,7 +136,7 @@ typedef enum dmu_object_byteswap { #endif #define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ - ((ot) & DMU_OT_METADATA) : \ + (((ot) & DMU_OT_METADATA) != 0) : \ DMU_OT_IS_METADATA_IMPL(ot)) #define DMU_OT_IS_DDT(ot) \ @@ -147,7 +147,7 @@ typedef enum dmu_object_byteswap { ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER) #define DMU_OT_IS_ENCRYPTED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ - ((ot) & DMU_OT_ENCRYPTED) : \ + (((ot) & DMU_OT_ENCRYPTED) != 0) : \ DMU_OT_IS_ENCRYPTED_IMPL(ot)) /* From a5b0d42540594c3df5c3c09cb552974c8bed0064 Mon Sep 17 00:00:00 2001 From: Ameer Hamza <106930537+ixhamza@users.noreply.github.com> Date: Sat, 17 Sep 2022 01:52:25 +0500 Subject: [PATCH 18/39] zfs recv hangs if max recordsize is less than received recordsize - Some optimizations for bqueue enqueue/dequeue. - Added a fix to prevent deadlock when both bqueue_enqueue_impl() and bqueue_dequeue() waits for signal to be triggered. 
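A plausible reading of the hang, inferred from the guard added below rather than stated explicitly above: with the old wait condition, a single record larger than bq_maxsize arriving on an empty queue left the producer sleeping for space that could never appear while the consumer slept waiting for data. A hypothetical before/after comparison of just that predicate (the 16 MiB and 32 MiB figures are made-up example values):

#include <stdio.h>
#include <stddef.h>

/* Illustration only, not part of the patch. */
static int
old_must_wait(size_t bq_size, size_t item_size, size_t bq_maxsize)
{
	return (bq_size + item_size > bq_maxsize);
}

static int
new_must_wait(size_t bq_size, size_t item_size, size_t bq_maxsize)
{
	/* an empty queue always admits the item, even an oversized one */
	return (bq_size != 0 && bq_size + item_size > bq_maxsize);
}

int
main(void)
{
	size_t bq_maxsize = 16 << 20;	/* queue sized for small records */
	size_t item_size = 32 << 20;	/* received record exceeds it */

	/* queue empty, consumer asleep in bqueue_dequeue() */
	printf("old predicate blocks producer: %d\n",
	    old_must_wait(0, item_size, bq_maxsize));
	printf("new predicate blocks producer: %d\n",
	    new_must_wait(0, item_size, bq_maxsize));
	return (0);
}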
Reviewed-by: Alexander Motin Reviewed-by: Ryan Moeller Signed-off-by: Ameer Hamza Closes #13855 --- include/sys/bqueue.h | 14 +++++++------- include/sys/spa.h | 6 +++--- module/zfs/bqueue.c | 23 +++++++++++++---------- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/include/sys/bqueue.h b/include/sys/bqueue.h index 797aecd791a3..b9621966027a 100644 --- a/include/sys/bqueue.h +++ b/include/sys/bqueue.h @@ -30,22 +30,22 @@ typedef struct bqueue { kmutex_t bq_lock; kcondvar_t bq_add_cv; kcondvar_t bq_pop_cv; - uint64_t bq_size; - uint64_t bq_maxsize; - uint64_t bq_fill_fraction; + size_t bq_size; + size_t bq_maxsize; + uint_t bq_fill_fraction; size_t bq_node_offset; } bqueue_t; typedef struct bqueue_node { list_node_t bqn_node; - uint64_t bqn_size; + size_t bqn_size; } bqueue_node_t; -int bqueue_init(bqueue_t *, uint64_t, uint64_t, size_t); +int bqueue_init(bqueue_t *, uint_t, size_t, size_t); void bqueue_destroy(bqueue_t *); -void bqueue_enqueue(bqueue_t *, void *, uint64_t); -void bqueue_enqueue_flush(bqueue_t *, void *, uint64_t); +void bqueue_enqueue(bqueue_t *, void *, size_t); +void bqueue_enqueue_flush(bqueue_t *, void *, size_t); void *bqueue_dequeue(bqueue_t *); boolean_t bqueue_empty(bqueue_t *); diff --git a/include/sys/spa.h b/include/sys/spa.h index f168015abffc..3eebcd84fccf 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -78,9 +78,9 @@ struct dsl_crypto_params; * against the cost of COWing a giant block to modify one byte, and the * large latency of reading or writing a large block. * - * Note that although blocks up to 16MB are supported, the recordsize - * property can not be set larger than zfs_max_recordsize (default 1MB). - * See the comment near zfs_max_recordsize in dsl_dataset.c for details. + * The recordsize property can not be set larger than zfs_max_recordsize + * (default 16MB on 64-bit and 1MB on 32-bit). See the comment near + * zfs_max_recordsize in dsl_dataset.c for details. * * Note that although the LSIZE field of the blkptr_t can store sizes up * to 32MB, the dnode's dn_datablkszsec can only store sizes up to diff --git a/module/zfs/bqueue.c b/module/zfs/bqueue.c index 22539efc4e23..ec5ce4388ec8 100644 --- a/module/zfs/bqueue.c +++ b/module/zfs/bqueue.c @@ -42,8 +42,7 @@ obj2node(bqueue_t *q, void *data) * Return 0 on success, or -1 on failure. 
*/ int -bqueue_init(bqueue_t *q, uint64_t fill_fraction, uint64_t size, - size_t node_offset) +bqueue_init(bqueue_t *q, uint_t fill_fraction, size_t size, size_t node_offset) { if (fill_fraction == 0) { return (-1); @@ -78,22 +77,26 @@ bqueue_destroy(bqueue_t *q) } static void -bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size, - boolean_t flush) +bqueue_enqueue_impl(bqueue_t *q, void *data, size_t item_size, boolean_t flush) { ASSERT3U(item_size, >, 0); ASSERT3U(item_size, <=, q->bq_maxsize); mutex_enter(&q->bq_lock); obj2node(q, data)->bqn_size = item_size; - while (q->bq_size + item_size > q->bq_maxsize) { + while (q->bq_size && q->bq_size + item_size > q->bq_maxsize) { + /* + * Wake up bqueue_dequeue() thread if already sleeping in order + * to prevent the deadlock condition + */ + cv_signal(&q->bq_pop_cv); cv_wait_sig(&q->bq_add_cv, &q->bq_lock); } q->bq_size += item_size; list_insert_tail(&q->bq_list, data); - if (q->bq_size >= q->bq_maxsize / q->bq_fill_fraction) - cv_signal(&q->bq_pop_cv); if (flush) cv_broadcast(&q->bq_pop_cv); + else if (q->bq_size >= q->bq_maxsize / q->bq_fill_fraction) + cv_signal(&q->bq_pop_cv); mutex_exit(&q->bq_lock); } @@ -103,7 +106,7 @@ bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size, * > 0. */ void -bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) +bqueue_enqueue(bqueue_t *q, void *data, size_t item_size) { bqueue_enqueue_impl(q, data, item_size, B_FALSE); } @@ -117,7 +120,7 @@ bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) * destroy the condvar before the enqueuing thread is done. */ void -bqueue_enqueue_flush(bqueue_t *q, void *data, uint64_t item_size) +bqueue_enqueue_flush(bqueue_t *q, void *data, size_t item_size) { bqueue_enqueue_impl(q, data, item_size, B_TRUE); } @@ -130,7 +133,7 @@ void * bqueue_dequeue(bqueue_t *q) { void *ret = NULL; - uint64_t item_size; + size_t item_size; mutex_enter(&q->bq_lock); while (q->bq_size == 0) { cv_wait_sig(&q->bq_pop_cv, &q->bq_lock); From 999830a0212e74530d41b9ef3022fa91166c5d58 Mon Sep 17 00:00:00 2001 From: Kevin Jin <33590050+jxdking@users.noreply.github.com> Date: Thu, 1 Jul 2021 11:20:27 -0400 Subject: [PATCH 19/39] Optimize txg_kick() process (#12274) Use dp_dirty_pertxg[] for txg_kick(), instead of dp_dirty_total in original code. Extra parameter "txg" is added for txg_kick(), thus it knows which txg to kick. Also txg_kick() call is moved from dsl_pool_need_dirty_delay() to dsl_pool_dirty_space() so that we can know the txg number assigned for txg_kick(). Some unnecessary code regarding dp_dirty_total in txg_sync_thread() is also cleaned up. Reviewed-by: Brian Behlendorf Reviewed-by: Matthew Ahrens Reviewed-by: Alexander Motin Signed-off-by: jxdking Closes #12274 --- include/sys/txg.h | 2 +- module/zfs/dsl_pool.c | 25 +++++++++++++++++++------ module/zfs/txg.c | 36 ++++++++++++++---------------------- 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/include/sys/txg.h b/include/sys/txg.h index 22158bd1a5e6..f38f0006c040 100644 --- a/include/sys/txg.h +++ b/include/sys/txg.h @@ -78,7 +78,7 @@ extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta, hrtime_t resolution); -extern void txg_kick(struct dsl_pool *dp); +extern void txg_kick(struct dsl_pool *dp, uint64_t txg); /* * Wait until the given transaction group has finished syncing. 
diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 456ef5372e2e..ba1cb96a76fc 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -902,18 +902,26 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - uint64_t dirty_min_bytes = - zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; - uint64_t dirty; mutex_enter(&dp->dp_lock); - dirty = dp->dp_dirty_total; + uint64_t dirty = dp->dp_dirty_total; mutex_exit(&dp->dp_lock); - if (dirty > dirty_min_bytes) - txg_kick(dp); + return (dirty > delay_min_bytes); } +static boolean_t +dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg) +{ + ASSERT(MUTEX_HELD(&dp->dp_lock)); + + uint64_t dirty_min_bytes = + zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; + uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; + + return (dirty > dirty_min_bytes); +} + void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) { @@ -921,7 +929,12 @@ dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) mutex_enter(&dp->dp_lock); dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; dsl_pool_dirty_delta(dp, space); + boolean_t needsync = !dmu_tx_is_syncing(tx) && + dsl_pool_need_dirty_sync(dp, tx->tx_txg); mutex_exit(&dp->dp_lock); + + if (needsync) + txg_kick(dp, tx->tx_txg); } } diff --git a/module/zfs/txg.c b/module/zfs/txg.c index c55b1d8f9601..c9eb84bbdb12 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -498,14 +498,6 @@ txg_wait_callbacks(dsl_pool_t *dp) taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0); } -static boolean_t -txg_is_syncing(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); - return (tx->tx_syncing_txg != 0); -} - static boolean_t txg_is_quiescing(dsl_pool_t *dp) { @@ -539,8 +531,6 @@ txg_sync_thread(void *arg) clock_t timeout = zfs_txg_timeout * hz; clock_t timer; uint64_t txg; - uint64_t dirty_min_bytes = - zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; /* * We sync when we're scanning, there's someone waiting @@ -551,8 +541,7 @@ txg_sync_thread(void *arg) while (!dsl_scan_active(dp->dp_scan) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && - !txg_has_quiesced_to_sync(dp) && - dp->dp_dirty_total < dirty_min_bytes) { + !txg_has_quiesced_to_sync(dp)) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", (u_longlong_t)tx->tx_synced_txg, (u_longlong_t)tx->tx_sync_txg_waiting, dp); @@ -566,6 +555,11 @@ txg_sync_thread(void *arg) * prompting it to do so if necessary. */ while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { + if (txg_is_quiescing(dp)) { + txg_thread_wait(tx, &cpr, + &tx->tx_quiesce_done_cv, 0); + continue; + } if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; cv_broadcast(&tx->tx_quiesce_more_cv); @@ -791,24 +785,22 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce) } /* - * If there isn't a txg syncing or in the pipeline, push another txg through - * the pipeline by quiescing the open txg. + * Pass in the txg number that should be synced. 
*/ void -txg_kick(dsl_pool_t *dp) +txg_kick(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; ASSERT(!dsl_pool_config_held(dp)); + if (tx->tx_sync_txg_waiting >= txg) + return; + mutex_enter(&tx->tx_sync_lock); - if (!txg_is_syncing(dp) && - !txg_is_quiescing(dp) && - tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && - tx->tx_sync_txg_waiting <= tx->tx_synced_txg && - tx->tx_quiesced_txg <= tx->tx_synced_txg) { - tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; - cv_broadcast(&tx->tx_quiesce_more_cv); + if (tx->tx_sync_txg_waiting < txg) { + tx->tx_sync_txg_waiting = txg; + cv_broadcast(&tx->tx_sync_more_cv); } mutex_exit(&tx->tx_sync_lock); } From d05f3039f7749508229e16b981723ec95c8764d1 Mon Sep 17 00:00:00 2001 From: Kevin Jin <33590050+jxdking@users.noreply.github.com> Date: Tue, 20 Jul 2021 11:40:24 -0400 Subject: [PATCH 20/39] Add Module Parameter Regarding Log Size Limit zfs_wrlog_data_max The upper limit of TX_WRITE log data. Once it is reached, write operation is blocked, until log data is cleared out after txg sync. It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. Reviewed-by: Prakash Surya Reviewed-by: Brian Behlendorf Signed-off-by: jxdking Closes #12284 --- include/sys/dmu_tx.h | 1 + include/sys/dsl_pool.h | 7 ++++++ man/man4/zfs.4 | 12 +++++++++ module/zfs/arc.c | 12 +++++++++ module/zfs/dmu_tx.c | 7 ++++++ module/zfs/dsl_pool.c | 57 ++++++++++++++++++++++++++++++++++++++++++ module/zfs/zfs_log.c | 5 ++++ module/zfs/zvol.c | 7 ++++-- 8 files changed, 106 insertions(+), 2 deletions(-) diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h index 60e9ed6e26f5..71a9ac7ca7bf 100644 --- a/include/sys/dmu_tx.h +++ b/include/sys/dmu_tx.h @@ -124,6 +124,7 @@ typedef struct dmu_tx_stats { kstat_named_t dmu_tx_dirty_throttle; kstat_named_t dmu_tx_dirty_delay; kstat_named_t dmu_tx_dirty_over_max; + kstat_named_t dmu_tx_wrlog_over_max; kstat_named_t dmu_tx_dirty_frees_delay; kstat_named_t dmu_tx_quota; } dmu_tx_stats_t; diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index e5eb9a20e9ca..1b4e2924facf 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -40,6 +40,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -58,6 +59,7 @@ struct dsl_deadlist; extern unsigned long zfs_dirty_data_max; extern unsigned long zfs_dirty_data_max_max; +extern unsigned long zfs_wrlog_data_max; extern int zfs_dirty_data_sync_percent; extern int zfs_dirty_data_max_percent; extern int zfs_dirty_data_max_max_percent; @@ -118,6 +120,9 @@ typedef struct dsl_pool { uint64_t dp_mos_compressed_delta; uint64_t dp_mos_uncompressed_delta; + aggsum_t dp_wrlog_pertxg[TXG_SIZE]; + aggsum_t dp_wrlog_total; + /* * Time of most recently scheduled (furthest in the future) * wakeup for delayed transactions. 
@@ -158,6 +163,8 @@ uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy); uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy); uint64_t dsl_pool_deferred_space(dsl_pool_t *dp); +void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg); +boolean_t dsl_pool_wrlog_over_max(dsl_pool_t *dp); void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index c32dd4b1b27f..19c67a61ad74 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1096,6 +1096,18 @@ Start syncing out a transaction group if there's at least this much dirty data This should be less than .Sy zfs_vdev_async_write_active_min_dirty_percent . . +.It Sy zfs_wrlog_data_max Ns = Pq int +The upper limit of write-transaction zil log data size in bytes. +Once it is reached, write operation is blocked, until log data is cleared out +after transaction group sync. Because of some overhead, it should be set +at least 2 times the size of +.Sy zfs_dirty_data_max +.No to prevent harming normal write throughput. +It also should be smaller than the size of the slog device if slog is present. +.Pp +Defaults to +.Sy zfs_dirty_data_max*2 +. .It Sy zfs_fallocate_reserve_percent Ns = Ns Sy 110 Ns % Pq uint Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be preallocated for a file in order to guarantee that later writes will not diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 0ba366f1858f..17193ed079fe 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8062,6 +8062,18 @@ arc_init(void) zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } + + if (zfs_wrlog_data_max == 0) { + + /* + * dp_wrlog_total is reduced for each txg at the end of + * spa_sync(). However, dp_dirty_total is reduced every time + * a block is written out. Thus under normal operation, + * dp_wrlog_total could grow 2 times as big as + * zfs_dirty_data_max. + */ + zfs_wrlog_data_max = zfs_dirty_data_max * 2; + } } void diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 0beb983f992f..5fa516866668 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -53,6 +53,7 @@ dmu_tx_stats_t dmu_tx_stats = { { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, + { "dmu_tx_wrlog_over_max", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_quota", KSTAT_DATA_UINT64 }, }; @@ -884,6 +885,12 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) return (SET_ERROR(ERESTART)); } + if (!tx->tx_dirty_delayed && + dsl_pool_wrlog_over_max(tx->tx_pool)) { + DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max); + return (SET_ERROR(ERESTART)); + } + if (!tx->tx_dirty_delayed && dsl_pool_need_dirty_delay(tx->tx_pool)) { tx->tx_wait_dirty = B_TRUE; diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index ba1cb96a76fc..b91dd4cfa8a6 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -104,6 +104,14 @@ unsigned long zfs_dirty_data_max_max = 0; int zfs_dirty_data_max_percent = 10; int zfs_dirty_data_max_max_percent = 25; +/* + * zfs_wrlog_data_max, the upper limit of TX_WRITE log data. + * Once it is reached, write operation is blocked, + * until log data is cleared out after txg sync. + * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. 
+ */ +unsigned long zfs_wrlog_data_max = 0; + /* * If there's at least this much dirty data (as a percentage of * zfs_dirty_data_max), push out a txg. This should be less than @@ -220,6 +228,11 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); + aggsum_init(&dp->dp_wrlog_total, 0); + for (int i = 0; i < TXG_SIZE; i++) { + aggsum_init(&dp->dp_wrlog_pertxg[i], 0); + } + dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri, boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); @@ -416,6 +429,14 @@ dsl_pool_close(dsl_pool_t *dp) rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); cv_destroy(&dp->dp_spaceavail_cv); + + ASSERT0(aggsum_value(&dp->dp_wrlog_total)); + aggsum_fini(&dp->dp_wrlog_total); + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(aggsum_value(&dp->dp_wrlog_pertxg[i])); + aggsum_fini(&dp->dp_wrlog_pertxg[i]); + } + taskq_destroy(dp->dp_unlinked_drain_taskq); taskq_destroy(dp->dp_zrele_taskq); if (dp->dp_blkstats != NULL) @@ -590,6 +611,36 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) cv_signal(&dp->dp_spaceavail_cv); } +void +dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg) +{ + ASSERT3S(size, >=, 0); + + aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], size); + aggsum_add(&dp->dp_wrlog_total, size); + + /* Choose a value slightly bigger than min dirty sync bytes */ + uint64_t sync_min = + zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100; + if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0) + txg_kick(dp, txg); +} + +boolean_t +dsl_pool_wrlog_over_max(dsl_pool_t *dp) +{ + return (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0); +} + +static void +dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg) +{ + int64_t delta; + delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); + aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta); + aggsum_add(&dp->dp_wrlog_total, delta); +} + #ifdef ZFS_DEBUG static boolean_t dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) @@ -814,6 +865,9 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); dmu_buf_rele(ds->ds_dbuf, zilog); } + + dsl_pool_wrlog_clear(dp, txg); + ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); } @@ -1409,6 +1463,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW, "Determines the dirty space limit"); +ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW, + "The size limit of write-transaction zil log data"); + /* zfs_dirty_data_max_max only applied at module load in arc_init(). 
*/ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD, "zfs_dirty_data_max upper bound in bytes"); diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index fd4c848d57f2..9e52bed77a61 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -538,6 +538,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, itx_wr_state_t write_state; uintptr_t fsync_cnt; uint64_t gen = 0; + ssize_t size = resid; if (zil_replaying(zilog, tx) || zp->z_unlinked || zfs_xattr_owner_unlinked(zp)) { @@ -623,6 +624,10 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, off += len; resid -= len; } + + if (write_state == WR_COPIED || write_state == WR_NEED_COPY) { + dsl_pool_wrlog_count(zilog->zl_dmu_pool, size, tx->tx_txg); + } } /* diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 59b05b4b08d0..7d141a12288b 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -84,10 +84,8 @@ #include #include #include - #include - unsigned int zvol_inhibit_dev = 0; unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; @@ -577,6 +575,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; itx_wr_state_t write_state; + uint64_t sz = size; if (zil_replaying(zilog, tx)) return; @@ -628,6 +627,10 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, offset += len; size -= len; } + + if (write_state == WR_COPIED || write_state == WR_NEED_COPY) { + dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg); + } } /* From ebbbe01e31f6cdc79b32a8de2f50691972d184f7 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Thu, 31 Mar 2022 13:09:18 -0400 Subject: [PATCH 21/39] Ask libtool to stop hiding some errors For #13083, curiously, it did not print the actual error, just that the compile failed with "Error 1". In theory, this flag should cause it to report errors twice sometimes. In practice, I'm pretty okay with reporting some twice if it avoids reporting some never. 
Reviewed-by: Brian Behlendorf Reviewed-by: Damian Szuberski Signed-off-by: Rich Ercolani Closes #13086 --- lib/libavl/Makefile.am | 3 +++ lib/libefi/Makefile.am | 3 +++ lib/libicp/Makefile.am | 2 ++ lib/libnvpair/Makefile.am | 3 +++ lib/libshare/Makefile.am | 3 +++ lib/libspl/Makefile.am | 3 +++ lib/libtpool/Makefile.am | 3 +++ lib/libunicode/Makefile.am | 3 +++ lib/libuutil/Makefile.am | 3 +++ lib/libzfs/Makefile.am | 3 +++ lib/libzfs_core/Makefile.am | 3 +++ lib/libzfsbootenv/Makefile.am | 3 +++ lib/libzpool/Makefile.am | 3 +++ lib/libzstd/Makefile.am | 2 ++ lib/libzutil/Makefile.am | 3 +++ 15 files changed, 43 insertions(+) diff --git a/lib/libavl/Makefile.am b/lib/libavl/Makefile.am index 2e0a431c77fb..de8ba34d5ba0 100644 --- a/lib/libavl/Makefile.am +++ b/lib/libavl/Makefile.am @@ -5,6 +5,9 @@ VPATH = $(top_srcdir)/module/avl/ # Includes kernel code, generate warnings for large stack frames AM_CFLAGS += $(FRAME_LARGER_THAN) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libavl.la KERNEL_C = \ diff --git a/lib/libefi/Makefile.am b/lib/libefi/Makefile.am index b26f7a6dcd5b..5f77ac480a9f 100644 --- a/lib/libefi/Makefile.am +++ b/lib/libefi/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am AM_CFLAGS += $(LIBUUID_CFLAGS) $(ZLIB_CFLAGS) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libefi.la USER_C = \ diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am index e4a9ee862101..9a2510d0d222 100644 --- a/lib/libicp/Makefile.am +++ b/lib/libicp/Makefile.am @@ -6,6 +6,8 @@ VPATH = \ # Includes kernel code, generate warnings for large stack frames AM_CFLAGS += $(FRAME_LARGER_THAN) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress noinst_LTLIBRARIES = libicp.la diff --git a/lib/libnvpair/Makefile.am b/lib/libnvpair/Makefile.am index a3e1fa307f7c..f9f1eb539239 100644 --- a/lib/libnvpair/Makefile.am +++ b/lib/libnvpair/Makefile.am @@ -8,6 +8,9 @@ VPATH = \ # and required CFLAGS for libtirpc AM_CFLAGS += $(FRAME_LARGER_THAN) $(LIBTIRPC_CFLAGS) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + lib_LTLIBRARIES = libnvpair.la include $(top_srcdir)/config/Abigail.am diff --git a/lib/libshare/Makefile.am b/lib/libshare/Makefile.am index 7cef13c3da7c..0fce333506ae 100644 --- a/lib/libshare/Makefile.am +++ b/lib/libshare/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am DEFAULT_INCLUDES += -I$(srcdir) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libshare.la USER_C = \ diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am index 61432225a708..b59919bfb9e9 100644 --- a/lib/libspl/Makefile.am +++ b/lib/libspl/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am SUBDIRS = include +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libspl_assert.la libspl.la libspl_assert_la_SOURCES = \ diff --git a/lib/libtpool/Makefile.am b/lib/libtpool/Makefile.am index 3aff56f05f1e..ce9d03a67919 100644 --- a/lib/libtpool/Makefile.am +++ b/lib/libtpool/Makefile.am @@ -3,6 +3,9 @@ include $(top_srcdir)/config/Rules.am # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61118 AM_CFLAGS += $(NO_CLOBBERED) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libtpool.la USER_C = \ diff --git 
a/lib/libunicode/Makefile.am b/lib/libunicode/Makefile.am index b82975f68efd..5b12b3e916f3 100644 --- a/lib/libunicode/Makefile.am +++ b/lib/libunicode/Makefile.am @@ -5,6 +5,9 @@ VPATH = $(top_srcdir)/module/unicode # Includes kernel code, generate warnings for large stack frames AM_CFLAGS += $(FRAME_LARGER_THAN) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libunicode.la KERNEL_C = \ diff --git a/lib/libuutil/Makefile.am b/lib/libuutil/Makefile.am index 16d5023451bb..05b7ed0db8cb 100644 --- a/lib/libuutil/Makefile.am +++ b/lib/libuutil/Makefile.am @@ -1,5 +1,8 @@ include $(top_srcdir)/config/Rules.am +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + lib_LTLIBRARIES = libuutil.la include $(top_srcdir)/config/Abigail.am diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index 7acaee4183a5..77e12b9e8d8a 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -8,6 +8,9 @@ VPATH = \ # Suppress unused but set variable warnings often due to ASSERTs AM_CFLAGS += $(LIBCRYPTO_CFLAGS) $(ZLIB_CFLAGS) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + pkgconfig_DATA = libzfs.pc lib_LTLIBRARIES = libzfs.la diff --git a/lib/libzfs_core/Makefile.am b/lib/libzfs_core/Makefile.am index 67e554dc8706..33a889a09586 100644 --- a/lib/libzfs_core/Makefile.am +++ b/lib/libzfs_core/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am pkgconfig_DATA = libzfs_core.pc +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + lib_LTLIBRARIES = libzfs_core.la include $(top_srcdir)/config/Abigail.am diff --git a/lib/libzfsbootenv/Makefile.am b/lib/libzfsbootenv/Makefile.am index 984df0b8a353..8a6bb76acfe7 100644 --- a/lib/libzfsbootenv/Makefile.am +++ b/lib/libzfsbootenv/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am pkgconfig_DATA = libzfsbootenv.pc +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + lib_LTLIBRARIES = libzfsbootenv.la include $(top_srcdir)/config/Abigail.am diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index db7c376318d5..4ce3b4cd2f1d 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -24,6 +24,9 @@ AM_CFLAGS += $(ZLIB_CFLAGS) AM_CFLAGS += -DLIB_ZPOOL_BUILD +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + lib_LTLIBRARIES = libzpool.la USER_C = \ diff --git a/lib/libzstd/Makefile.am b/lib/libzstd/Makefile.am index c9ed7e2aafbc..e3bc5c446ee9 100644 --- a/lib/libzstd/Makefile.am +++ b/lib/libzstd/Makefile.am @@ -5,6 +5,8 @@ VPATH = $(top_srcdir)/module/zstd # -fno-tree-vectorize is set for gcc in zstd/common/compiler.h # Set it for other compilers, too. 
AM_CFLAGS += -fno-tree-vectorize +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress noinst_LTLIBRARIES = libzstd.la diff --git a/lib/libzutil/Makefile.am b/lib/libzutil/Makefile.am index 6351e0ebf64b..f55b7798f1c0 100644 --- a/lib/libzutil/Makefile.am +++ b/lib/libzutil/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am AM_CFLAGS += $(LIBBLKID_CFLAGS) $(LIBUDEV_CFLAGS) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + DEFAULT_INCLUDES += -I$(srcdir) noinst_LTLIBRARIES = libzutil.la From 44cec45f729e35275cd479770e8d1402137fe99b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 8 Sep 2022 13:30:53 -0400 Subject: [PATCH 22/39] Improve too large physical ashift handling When iterating through children's physical ashifts for a vdev, prefer ones above the maximum logical ashift that we can actually use, but within the administrator-defined maximum. When selecting the top-level vdev ashift, do not set it to the defined maximum in case the physical ashift is even higher, but just ignore it. Using the maximum does not prevent misaligned writes, but reduces space efficiency. Since ZFS tries to write data sequentially and aggregates the writes, in many cases large misaligned writes may not be as bad as the space penalty otherwise. Allow internal physical ashifts for vdevs higher than ASHIFT_MAX. Maybe one day the allocator or aggregation could benefit from that. Reduce the zfs_vdev_max_auto_ashift default from 16 (64KB) to 14 (16KB), so that ZFS may still use bigger ashifts up to ASHIFT_MAX (64KB), but only if it really has to or is explicitly told to, not as an "optimization". There are some read-intensive NVMe SSDs that report a Preferred Write Alignment of 64KB, and an attempt to build RAIDZ2 of those leads to a space inefficiency that can't be justified. Instead these changes make ZFS fall back to a logical ashift of 12 (4KB) by default and only warn the user that it may be suboptimal for performance. Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #13798 --- include/sys/vdev_impl.h | 1 + man/man4/zfs.4 | 5 ++- module/os/freebsd/zfs/vdev_geom.c | 3 +- module/zfs/vdev.c | 36 +++++++++++++++++-- module/zfs/vdev_draid.c | 10 ++++-- module/zfs/vdev_mirror.c | 10 ++++-- module/zfs/vdev_raidz.c | 10 ++++-- tests/zfs-tests/include/tunables.cfg | 2 ++ .../cli_root/zpool_add/add-o_ashift.ksh | 5 ++- 9 files changed, 69 insertions(+), 13 deletions(-) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 3cfde40a77fe..da846d8504fe 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -642,6 +642,7 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); void vdev_metaslab_group_create(vdev_t *vd); +uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b); /* * Vdev ashift optimization tunables diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 19c67a61ad74..3508ac9c83fd 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -347,9 +347,12 @@ When a vdev is added, target this number of metaslabs per top-level vdev. .It Sy zfs_vdev_default_ms_shift Ns = Ns Sy 29 Po 512MB Pc Pq int Default limit for metaslab size. . -.It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy ASHIFT_MAX Po 16 Pc Pq ulong +.It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy 14 Pq ulong Maximum ashift used when optimizing for logical -> physical sector size on new top-level vdevs.
+May be increased up to +.Sy ASHIFT_MAX Po 16 Pc , +but this may negatively impact pool space efficiency. . .It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq ulong Minimum ashift used when creating new top-level vdevs. diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index 5447eb922062..c8fa2b00c002 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -956,8 +956,7 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; *physical_ashift = 0; if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) && - ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) && - pp->stripeoffset == 0) + ISP2(pp->stripesize) && pp->stripeoffset == 0) *physical_ashift = highbit(pp->stripesize) - 1; /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index ccc35adc9f4b..67fb5bf8f17e 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -134,7 +134,15 @@ int zfs_vdev_standard_sm_blksz = (1 << 17); */ int zfs_nocacheflush = 0; -uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX; +/* + * Maximum and minimum ashift values that can be automatically set based on + * vdev's physical ashift (disk's physical sector size). While ASHIFT_MAX + * is higher than the maximum value, it is intentionally limited here to not + * excessively impact pool space efficiency. Higher ashift values may still + * be forced by vdev logical ashift or by user via ashift property, but won't + * be set automatically as a performance optimization. + */ +uint64_t zfs_vdev_max_auto_ashift = 14; uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; /*PRINTFLIKE2*/ @@ -1835,6 +1843,24 @@ vdev_set_deflate_ratio(vdev_t *vd) } } +/* + * Choose the best of two ashifts, preferring one between logical ashift + * (absolute minimum) and administrator defined maximum, otherwise take + * the biggest of the two. + */ +uint64_t +vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b) +{ + if (a > logical && a <= zfs_vdev_max_auto_ashift) { + if (b <= logical || b > zfs_vdev_max_auto_ashift) + return (a); + else + return (MAX(a, b)); + } else if (b <= logical || b > zfs_vdev_max_auto_ashift) + return (MAX(a, b)); + return (b); +} + /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining @@ -1846,7 +1872,8 @@ vdev_ashift_optimize(vdev_t *vd) { ASSERT(vd == vd->vdev_top); - if (vd->vdev_ashift < vd->vdev_physical_ashift) { + if (vd->vdev_ashift < vd->vdev_physical_ashift && + vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, @@ -4452,7 +4479,10 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_configured_ashift = vd->vdev_top != NULL ? 
vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; - vs->vs_physical_ashift = vd->vdev_physical_ashift; + if (vd->vdev_physical_ashift <= ASHIFT_MAX) + vs->vs_physical_ashift = vd->vdev_physical_ashift; + else + vs->vs_physical_ashift = 0; /* * Report fragmentation and rebuild progress for top-level, diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 7e654ca24d20..10d09517effd 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1496,8 +1496,14 @@ vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep, asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1; max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1; logical_ashift = MAX(logical_ashift, cvd->vdev_ashift); - physical_ashift = MAX(physical_ashift, - cvd->vdev_physical_ashift); + } + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_ops == &vdev_draid_spare_ops) + continue; + physical_ashift = vdev_best_ashift(logical_ashift, + physical_ashift, cvd->vdev_physical_ashift); } *asizep = asize; diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 50b86725b78a..d80a767043a5 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -409,8 +409,14 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); + } + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error) + continue; + *physical_ashift = vdev_best_ashift(*logical_ashift, + *physical_ashift, cvd->vdev_physical_ashift); } if (numerrors == vd->vdev_children) { diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 424de0b33e09..5c25007f17b9 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1426,8 +1426,14 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); + } + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) + continue; + *physical_ashift = vdev_best_ashift(*logical_ashift, + *physical_ashift, cvd->vdev_physical_ashift); } *asize *= vd->vdev_children; diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index fff43e469165..0fd2f48f2c1f 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -81,7 +81,9 @@ TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch TXG_HISTORY txg.history zfs_txg_history TXG_TIMEOUT txg.timeout zfs_txg_timeout UNLINK_SUSPEND_PROGRESS UNSUPPORTED zfs_unlink_suspend_progress +VDEV_FILE_LOGICAL_ASHIFT vdev.file.logical_ashift vdev_file_logical_ashift VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift +VDEV_MAX_AUTO_ASHIFT vdev.max_auto_ashift zfs_vdev_max_auto_ashift VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh 
b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh index 89cc4b0d3082..0fa1c0055b3c 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh @@ -57,7 +57,9 @@ disk2=$TEST_BASE_DIR/disk2 log_must mkfile $SIZE $disk1 log_must mkfile $SIZE $disk2 +logical_ashift=$(get_tunable VDEV_FILE_LOGICAL_ASHIFT) orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT) +max_auto_ashift=$(get_tunable VDEV_MAX_AUTO_ASHIFT) typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} @@ -81,7 +83,8 @@ do log_must zpool create $TESTPOOL $disk1 log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $ashift log_must zpool add $TESTPOOL $disk2 - verify_ashift $disk2 $ashift + exp=$(( (ashift <= max_auto_ashift) ? ashift : logical_ashift )) + verify_ashift $disk2 $exp if [[ $? -ne 0 ]] then log_fail "Device was added without setting ashift value to "\ From faa1e4082d64ccdb8b9b9b40bcff23b259133a75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Thu, 27 May 2021 17:09:32 +0200 Subject: [PATCH 23/39] include: move SPA_MINBLOCKSHIFT and zio_encrypt to sys/fs/zfs.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These are used by userspace, so should live in a public header Reviewed-by: Brian Behlendorf Reviewed-by: Ryan Moeller Signed-off-by: Ahelenia Ziemiańska Closes #12116 --- include/sys/fs/zfs.h | 41 +++++++++++++++++++++++++++++++++++++++++ include/sys/spa.h | 21 --------------------- include/sys/zio.h | 17 ----------------- 3 files changed, 41 insertions(+), 38 deletions(-) diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index c8e199fc679c..be6a6b2074cd 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1615,6 +1615,47 @@ typedef enum { #define ZFS_EV_HIST_DSID "history_dsid" #define ZFS_EV_RESILVER_TYPE "resilver_type" + +/* + * We currently support block sizes from 512 bytes to 16MB. + * The benefits of larger blocks, and thus larger IO, need to be weighed + * against the cost of COWing a giant block to modify one byte, and the + * large latency of reading or writing a large block. + * + * Note that although blocks up to 16MB are supported, the recordsize + * property can not be set larger than zfs_max_recordsize (default 1MB). + * See the comment near zfs_max_recordsize in dsl_dataset.c for details. + * + * Note that although the LSIZE field of the blkptr_t can store sizes up + * to 32MB, the dnode's dn_datablkszsec can only store sizes up to + * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. 
+ */ +#define SPA_MINBLOCKSHIFT 9 +#define SPA_OLD_MAXBLOCKSHIFT 17 +#define SPA_MAXBLOCKSHIFT 24 +#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) +#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) +#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) + + +/* supported encryption algorithms */ +enum zio_encrypt { + ZIO_CRYPT_INHERIT = 0, + ZIO_CRYPT_ON, + ZIO_CRYPT_OFF, + ZIO_CRYPT_AES_128_CCM, + ZIO_CRYPT_AES_192_CCM, + ZIO_CRYPT_AES_256_CCM, + ZIO_CRYPT_AES_128_GCM, + ZIO_CRYPT_AES_192_GCM, + ZIO_CRYPT_AES_256_GCM, + ZIO_CRYPT_FUNCTIONS +}; + +#define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_GCM +#define ZIO_CRYPT_DEFAULT ZIO_CRYPT_OFF + + #ifdef __cplusplus } #endif diff --git a/include/sys/spa.h b/include/sys/spa.h index 3eebcd84fccf..67724a68f0e8 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -72,27 +72,6 @@ struct dsl_pool; struct dsl_dataset; struct dsl_crypto_params; -/* - * We currently support block sizes from 512 bytes to 16MB. - * The benefits of larger blocks, and thus larger IO, need to be weighed - * against the cost of COWing a giant block to modify one byte, and the - * large latency of reading or writing a large block. - * - * The recordsize property can not be set larger than zfs_max_recordsize - * (default 16MB on 64-bit and 1MB on 32-bit). See the comment near - * zfs_max_recordsize in dsl_dataset.c for details. - * - * Note that although the LSIZE field of the blkptr_t can store sizes up - * to 32MB, the dnode's dn_datablkszsec can only store sizes up to - * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. - */ -#define SPA_MINBLOCKSHIFT 9 -#define SPA_OLD_MAXBLOCKSHIFT 17 -#define SPA_MAXBLOCKSHIFT 24 -#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) -#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) -#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) - /* * Alignment Shift (ashift) is an immutable, internal top-level vdev property * which can only be set at vdev creation time. Physical writes are always done diff --git a/include/sys/zio.h b/include/sys/zio.h index 5bb712083458..39de5175b7db 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -108,23 +108,6 @@ enum zio_checksum { #define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 -/* supported encryption algorithms */ -enum zio_encrypt { - ZIO_CRYPT_INHERIT = 0, - ZIO_CRYPT_ON, - ZIO_CRYPT_OFF, - ZIO_CRYPT_AES_128_CCM, - ZIO_CRYPT_AES_192_CCM, - ZIO_CRYPT_AES_256_CCM, - ZIO_CRYPT_AES_128_GCM, - ZIO_CRYPT_AES_192_GCM, - ZIO_CRYPT_AES_256_GCM, - ZIO_CRYPT_FUNCTIONS -}; - -#define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_GCM -#define ZIO_CRYPT_DEFAULT ZIO_CRYPT_OFF - /* macros defining encryption lengths */ #define ZIO_OBJSET_MAC_LEN 32 #define ZIO_DATA_IV_LEN 12 From d5105f068f19f0b1f3aa3e14a76ffd377a72bb58 Mon Sep 17 00:00:00 2001 From: Ameer Hamza <106930537+ixhamza@users.noreply.github.com> Date: Sat, 17 Sep 2022 01:52:25 +0500 Subject: [PATCH 24/39] zfs recv hangs if max recordsize is less than received recordsize - Some optimizations for bqueue enqueue/dequeue. - Added a fix to prevent deadlock when both bqueue_enqueue_impl() and bqueue_dequeue() waits for signal to be triggered. 
Reviewed-by: Alexander Motin Reviewed-by: Ryan Moeller Signed-off-by: Ameer Hamza Closes #13855 --- include/sys/fs/zfs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index be6a6b2074cd..df1cc060130a 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1622,9 +1622,9 @@ typedef enum { * against the cost of COWing a giant block to modify one byte, and the * large latency of reading or writing a large block. * - * Note that although blocks up to 16MB are supported, the recordsize - * property can not be set larger than zfs_max_recordsize (default 1MB). - * See the comment near zfs_max_recordsize in dsl_dataset.c for details. + * The recordsize property can not be set larger than zfs_max_recordsize + * (default 16MB on 64-bit and 1MB on 32-bit). See the comment near + * zfs_max_recordsize in dsl_dataset.c for details. * * Note that although the LSIZE field of the blkptr_t can store sizes up * to 32MB, the dnode's dn_datablkszsec can only store sizes up to From 035e52f59152bbea35653e5fa2215152f81823f8 Mon Sep 17 00:00:00 2001 From: Ameer Hamza <106930537+ixhamza@users.noreply.github.com> Date: Wed, 21 Sep 2022 03:19:05 +0500 Subject: [PATCH 25/39] Delay ZFS_PROP_SHARESMB property to handle it for encrypted raw receive For encrypted raw receive, objset creation is delayed until a call to dmu_recv_stream(). ZFS_PROP_SHARESMB property requires objset to be populated when calling zpl_earlier_version(). To correctly handle the ZFS_PROP_SHARESMB property for encrypted raw receive, this change delays setting the property. Reviewed-by: Alexander Motin Reviewed-by: Ryan Moeller Reviewed-by: Brian Behlendorf Signed-off-by: Ameer Hamza Closes #13878 --- module/zfs/zfs_ioctl.c | 15 +++++++++++++++ .../functional/rsend/send_encrypted_props.ksh | 8 ++++++++ 2 files changed, 23 insertions(+) diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 3336bb783251..3d2492b9b908 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -4787,6 +4787,11 @@ extract_delay_props(nvlist_t *props) static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, ZFS_PROP_KEYLOCATION, + /* + * Setting ZFS_PROP_SHARESMB requires the objset type to be + * known, which is not possible prior to receipt of raw sends. 
+ */ + ZFS_PROP_SHARESMB, 0 }; int i; @@ -4850,6 +4855,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, offset_t off, noff; nvlist_t *local_delayprops = NULL; nvlist_t *recv_delayprops = NULL; + nvlist_t *inherited_delayprops = NULL; nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *origrecvd = NULL; /* existing received properties */ boolean_t first_recvd_props = B_FALSE; @@ -4964,6 +4970,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, local_delayprops = extract_delay_props(oprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, oprops, *errors); + inherited_delayprops = extract_delay_props(xprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, xprops, *errors); @@ -5021,6 +5028,10 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, local_delayprops, *errors); } + if (inherited_delayprops != NULL && error == 0) { + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, + inherited_delayprops, *errors); + } } /* @@ -5040,6 +5051,10 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0); nvlist_free(local_delayprops); } + if (inherited_delayprops != NULL) { + ASSERT(nvlist_merge(localprops, inherited_delayprops, 0) == 0); + nvlist_free(inherited_delayprops); + } *read_bytes = off - noff; #ifdef ZFS_DEBUG diff --git a/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh b/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh index 793904db91ca..c0c7b682def9 100755 --- a/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh @@ -133,6 +133,14 @@ recv_cksum=$(md5digest /$ds/$TESTFILE0) log_must test "$recv_cksum" == "$cksum" log_must zfs destroy -r $ds +# Test that we can override sharesmb property for encrypted raw stream. +log_note "Must be able to override sharesmb property for encrypted raw stream" +ds=$TESTPOOL/recv +log_must eval "zfs send -w $esnap > $sendfile" +log_must eval "zfs recv -o sharesmb=on $ds < $sendfile" +log_must test "$(get_prop 'sharesmb' $ds)" == "on" +log_must zfs destroy -r $ds + # Test that we can override encryption properties on a properties stream # of an unencrypted dataset, turning it into an encryption root. log_note "Must be able to receive stream with props as encryption root" From 5096ed31c8c3d5fb673b34a2e555da6f9e0dc62b Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 15 Sep 2022 19:21:21 -0400 Subject: [PATCH 26/39] Fix incorrect size given to bqueue_enqueue() call in dmu_redact.c We pass sizeof (struct redact_record *) rather than sizeof (struct redact_record). Passing the pointer size is wrong. Coverity caught this in two places. 
Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13885 --- module/zfs/dmu_redact.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c index 7efe423d35f0..5184ef6888df 100644 --- a/module/zfs/dmu_redact.c +++ b/module/zfs/dmu_redact.c @@ -141,7 +141,7 @@ record_merge_enqueue(bqueue_t *q, struct redact_record **build, { if (new->eos_marker) { if (*build != NULL) - bqueue_enqueue(q, *build, sizeof (*build)); + bqueue_enqueue(q, *build, sizeof (**build)); bqueue_enqueue_flush(q, new, sizeof (*new)); return; } @@ -823,7 +823,7 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads, avl_destroy(&end_tree); kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes)); if (current_record != NULL) - bqueue_enqueue(q, current_record, sizeof (current_record)); + bqueue_enqueue(q, current_record, sizeof (*current_record)); return (err); } From b66f8d3c2b79d6a6bd7e2ee850ec9b892dc31093 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 15 Sep 2022 19:22:33 -0400 Subject: [PATCH 27/39] Add zfs_btree_verify_intensity kernel module parameter I see a few issues in the issue tracker that might be aided by being able to turn this on. We have no module parameter for it, so I would like to add one. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13874 --- cmd/zdb/zdb.c | 2 +- man/man4/zfs.4 | 16 ++++++++++++++++ module/zfs/btree.c | 8 +++++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index db8e2200a72b..4e57538d2234 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -112,7 +112,7 @@ extern int zfs_vdev_async_read_max_active; extern boolean_t spa_load_verify_dryrun; extern boolean_t spa_mode_readable_spacemaps; extern int zfs_reconstruct_indirect_combinations_max; -extern int zfs_btree_verify_intensity; +extern uint_t zfs_btree_verify_intensity; static const char cmdname[] = "zdb"; uint8_t dump_opt[256]; diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 3508ac9c83fd..97539bcc5010 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1348,6 +1348,22 @@ _ .TE .Sy \& * No Requires debug build. . +.It Sy zfs_btree_verify_intensity Ns = Ns Sy 0 Pq uint +Enables btree verification. +The following settings are culminative: +.TS +box; +lbz r l l . + Value Description + + 1 Verify height. + 2 Verify pointers from children to parent. + 3 Verify element counts. + 4 Verify element order. (expensive) +* 5 Verify unused memory is poisoned. (expensive) +.TE +.Sy \& * No Requires debug build. +. .It Sy zfs_free_leak_on_eio Ns = Ns Sy 0 Ns | Ns 1 Pq int If destroy encounters an .Sy EIO diff --git a/module/zfs/btree.c b/module/zfs/btree.c index 36755f97929c..e16c4ebef6ba 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -53,7 +53,7 @@ kmem_cache_t *zfs_btree_leaf_cache; * (while the asymptotic complexity of the other steps is the same, the * importance of the constant factors cannot be denied). */ -int zfs_btree_verify_intensity = 0; +uint_t zfs_btree_verify_intensity = 0; /* * Convenience functions to silence warnings from memcpy/memmove's @@ -2171,3 +2171,9 @@ zfs_btree_verify(zfs_btree_t *tree) return; zfs_btree_verify_poison(tree); } + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW, + "Enable btree verification. 
Levels above 4 require ZFS be built " "with debugging"); /* END CSTYLED */ From 91e02156ddea27d980fa8e5f7f3d10dda06139d2 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 19 Sep 2022 11:07:15 -0700 Subject: [PATCH 28/39] Revert "Reduce dbuf_find() lock contention" This reverts commit 34dbc618f50cfcd392f90af80c140398c38cbcd1. While this change resolved the lock contention observed for certain workloads, it inadvertently reduced the maximum hash inserts/removes per second. This appears to be due to the slightly higher acquisition cost of an rwlock vs a mutex. Reviewed-by: Brian Behlendorf --- include/sys/dbuf.h | 7 ++++--- module/zfs/dbuf.c | 26 +++++++++++++------------- module/zfs/dbuf_stats.c | 4 ++-- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 2e7385113ec5..b757b2664178 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -321,12 +321,13 @@ typedef struct dmu_buf_impl { uint8_t db_dirtycnt; } dmu_buf_impl_t; -#define DBUF_RWLOCKS 8192 -#define DBUF_HASH_RWLOCK(h, idx) (&(h)->hash_rwlocks[(idx) & (DBUF_RWLOCKS-1)]) +/* Note: the dbuf hash table is exposed only for the mdb module */ +#define DBUF_MUTEXES 2048 +#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) typedef struct dbuf_hash_table { uint64_t hash_table_mask; dmu_buf_impl_t **hash_table; - krwlock_t hash_rwlocks[DBUF_RWLOCKS] ____cacheline_aligned; + kmutex_t hash_mutexes[DBUF_MUTEXES] ____cacheline_aligned; } dbuf_hash_table_t; typedef void (*dbuf_prefetch_fn)(void *, uint64_t, uint64_t, boolean_t); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 1a022c8b8a07..7ecc2812b4e4 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -339,18 +339,18 @@ dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; - rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_READER); + mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { if (DBUF_EQUAL(db, os, obj, level, blkid)) { mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { - rw_exit(DBUF_HASH_RWLOCK(h, idx)); + mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (db); } mutex_exit(&db->db_mtx); } } - rw_exit(DBUF_HASH_RWLOCK(h, idx)); + mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (NULL); } @@ -393,13 +393,13 @@ dbuf_hash_insert(dmu_buf_impl_t *db) hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; - rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_WRITER); + mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (dbf = h->hash_table[idx], i = 0; dbf != NULL; dbf = dbf->db_hash_next, i++) { if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { mutex_enter(&dbf->db_mtx); if (dbf->db_state != DB_EVICTING) { - rw_exit(DBUF_HASH_RWLOCK(h, idx)); + mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (dbf); } mutex_exit(&dbf->db_mtx); @@ -417,7 +417,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db) mutex_enter(&db->db_mtx); db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; - rw_exit(DBUF_HASH_RWLOCK(h, idx)); + mutex_exit(DBUF_HASH_MUTEX(h, idx)); uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64); DBUF_STAT_MAX(hash_elements_max, he); @@ -474,13 +474,13 @@ dbuf_hash_remove(dmu_buf_impl_t *db) /* * We mustn't hold db_mtx to maintain lock ordering: - * DBUF_HASH_RWLOCK > db_mtx.
*/ ASSERT(zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_state == DB_EVICTING); ASSERT(!MUTEX_HELD(&db->db_mtx)); - rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_WRITER); + mutex_enter(DBUF_HASH_MUTEX(h, idx)); dbp = &h->hash_table[idx]; while ((dbf = *dbp) != db) { dbp = &dbf->db_hash_next; @@ -491,7 +491,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db) if (h->hash_table[idx] && h->hash_table[idx]->db_hash_next == NULL) DBUF_STAT_BUMPDOWN(hash_chains); - rw_exit(DBUF_HASH_RWLOCK(h, idx)); + mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_dec_64(&dbuf_stats.hash_elements.value.ui64); } @@ -914,8 +914,8 @@ dbuf_init(void) sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); - for (i = 0; i < DBUF_RWLOCKS; i++) - rw_init(&h->hash_rwlocks[i], NULL, RW_DEFAULT, NULL); + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); dbuf_stats_init(h); @@ -981,8 +981,8 @@ dbuf_fini(void) dbuf_stats_destroy(); - for (i = 0; i < DBUF_RWLOCKS; i++) - rw_destroy(&h->hash_rwlocks[i]); + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_destroy(&h->hash_mutexes[i]); #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c index 037190a81bb3..12bb568a08cc 100644 --- a/module/zfs/dbuf_stats.c +++ b/module/zfs/dbuf_stats.c @@ -137,7 +137,7 @@ dbuf_stats_hash_table_data(char *buf, size_t size, void *data) if (size) buf[0] = 0; - rw_enter(DBUF_HASH_RWLOCK(h, dsh->idx), RW_READER); + mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) { /* * Returning ENOMEM will cause the data and header functions @@ -158,7 +158,7 @@ dbuf_stats_hash_table_data(char *buf, size_t size, void *data) mutex_exit(&db->db_mtx); } - rw_exit(DBUF_HASH_RWLOCK(h, dsh->idx)); + mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); return (error); } From 33223cbc3cbed37fdcecedc18b4b82406c73c01b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 24 May 2022 12:46:35 -0400 Subject: [PATCH 29/39] Refactor Log Size Limit Original Log Size Limit implementation blocked all writes in case of limit reached until the TXG is committed and the log is freed. It caused huge delays and following speed spikes in application writes. This implementation instead smoothly throttles writes, using exactly the same mechanism as used for dirty data. Reviewed-by: Brian Behlendorf Reviewed-by: jxdking Signed-off-by: Alexander Motin Sponsored-By: iXsystems, Inc. 
Issue #12284 Closes #13476 --- include/sys/dmu_tx.h | 2 +- include/sys/dsl_pool.h | 2 +- man/man4/zfs.4 | 6 ++--- module/zfs/dmu_tx.c | 58 +++++++++++++++++++++++++++--------------- module/zfs/dsl_pool.c | 17 ++++++++----- 5 files changed, 53 insertions(+), 32 deletions(-) diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h index 71a9ac7ca7bf..ad3f1b0e47ca 100644 --- a/include/sys/dmu_tx.h +++ b/include/sys/dmu_tx.h @@ -124,8 +124,8 @@ typedef struct dmu_tx_stats { kstat_named_t dmu_tx_dirty_throttle; kstat_named_t dmu_tx_dirty_delay; kstat_named_t dmu_tx_dirty_over_max; - kstat_named_t dmu_tx_wrlog_over_max; kstat_named_t dmu_tx_dirty_frees_delay; + kstat_named_t dmu_tx_wrlog_delay; kstat_named_t dmu_tx_quota; } dmu_tx_stats_t; diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index 1b4e2924facf..e93bd0557c1e 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -164,7 +164,7 @@ uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy); uint64_t dsl_pool_deferred_space(dsl_pool_t *dp); void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg); -boolean_t dsl_pool_wrlog_over_max(dsl_pool_t *dp); +boolean_t dsl_pool_need_wrlog_delay(dsl_pool_t *dp); void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 97539bcc5010..fcb97d716d54 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1101,9 +1101,9 @@ This should be less than . .It Sy zfs_wrlog_data_max Ns = Pq int The upper limit of write-transaction zil log data size in bytes. -Once it is reached, write operation is blocked, until log data is cleared out -after transaction group sync. Because of some overhead, it should be set -at least 2 times the size of +Write operations are throttled when approaching the limit until log data is +cleared out after transaction group sync. +Because of some overhead, it should be set at least 2 times the size of .Sy zfs_dirty_data_max .No to prevent harming normal write throughput. It also should be smaller than the size of the slog device if slog is present. diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 5fa516866668..1eed0526b51d 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -53,8 +53,8 @@ dmu_tx_stats_t dmu_tx_stats = { { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, - { "dmu_tx_wrlog_over_max", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 }, + { "dmu_tx_wrlog_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_quota", KSTAT_DATA_UINT64 }, }; @@ -781,34 +781,49 @@ static void dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) { dsl_pool_t *dp = tx->tx_pool; - uint64_t delay_min_bytes = + uint64_t delay_min_bytes, wrlog; + hrtime_t wakeup, tx_time = 0, now; + + /* Calculate minimum transaction time for the dirty data amount. */ + delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - hrtime_t wakeup, min_tx_time, now; + if (dirty > delay_min_bytes) { + /* + * The caller has already waited until we are under the max. + * We make them pass us the amount of dirty data so we don't + * have to handle the case of it being >= the max, which + * could cause a divide-by-zero if it's == the max. 
+ */ + ASSERT3U(dirty, <, zfs_dirty_data_max); - if (dirty <= delay_min_bytes) - return; + tx_time = zfs_delay_scale * (dirty - delay_min_bytes) / + (zfs_dirty_data_max - dirty); + } - /* - * The caller has already waited until we are under the max. - * We make them pass us the amount of dirty data so we don't - * have to handle the case of it being >= the max, which could - * cause a divide-by-zero if it's == the max. - */ - ASSERT3U(dirty, <, zfs_dirty_data_max); + /* Calculate minimum transaction time for the TX_WRITE log size. */ + wrlog = aggsum_upper_bound(&dp->dp_wrlog_total); + delay_min_bytes = + zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; + if (wrlog >= zfs_wrlog_data_max) { + tx_time = zfs_delay_max_ns; + } else if (wrlog > delay_min_bytes) { + tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) / + (zfs_wrlog_data_max - wrlog), tx_time); + } + if (tx_time == 0) + return; + + tx_time = MIN(tx_time, zfs_delay_max_ns); now = gethrtime(); - min_tx_time = zfs_delay_scale * - (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); - min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); - if (now > tx->tx_start + min_tx_time) + if (now > tx->tx_start + tx_time) return; DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, - uint64_t, min_tx_time); + uint64_t, tx_time); mutex_enter(&dp->dp_lock); - wakeup = MAX(tx->tx_start + min_tx_time, - dp->dp_last_wakeup + min_tx_time); + wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time); dp->dp_last_wakeup = wakeup; mutex_exit(&dp->dp_lock); @@ -886,8 +901,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) } if (!tx->tx_dirty_delayed && - dsl_pool_wrlog_over_max(tx->tx_pool)) { - DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max); + dsl_pool_need_wrlog_delay(tx->tx_pool)) { + tx->tx_wait_dirty = B_TRUE; + DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay); return (SET_ERROR(ERESTART)); } diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index b91dd4cfa8a6..4036c8671f2d 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -105,9 +105,8 @@ int zfs_dirty_data_max_percent = 10; int zfs_dirty_data_max_max_percent = 25; /* - * zfs_wrlog_data_max, the upper limit of TX_WRITE log data. - * Once it is reached, write operation is blocked, - * until log data is cleared out after txg sync. + * The upper limit of TX_WRITE log data. Write operations are throttled + * when approaching the limit until log data is cleared out after txg sync. * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. 
*/ unsigned long zfs_wrlog_data_max = 0; @@ -621,15 +620,18 @@ dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg) /* Choose a value slightly bigger than min dirty sync bytes */ uint64_t sync_min = - zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100; + zfs_wrlog_data_max * (zfs_dirty_data_sync_percent + 10) / 200; if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0) txg_kick(dp, txg); } boolean_t -dsl_pool_wrlog_over_max(dsl_pool_t *dp) +dsl_pool_need_wrlog_delay(dsl_pool_t *dp) { - return (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0); + uint64_t delay_min_bytes = + zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; + + return (aggsum_compare(&dp->dp_wrlog_total, delay_min_bytes) > 0); } static void @@ -639,6 +641,9 @@ dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg) delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta); aggsum_add(&dp->dp_wrlog_total, delta); + /* Compact per-CPU sums after the big change. */ + (void) aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); + (void) aggsum_value(&dp->dp_wrlog_total); } #ifdef ZFS_DEBUG From 835e03682c22f95a774f1a21d6e96e00f063fef9 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Mon, 26 Sep 2022 19:44:22 -0400 Subject: [PATCH 30/39] Linux: Fix uninitialized variable usage in zio_do_crypt_data() Coverity complained about this. An error from `hkdf_sha512()` before uio initialization will cause pointers to uninitialized memory to be passed to `zio_crypt_destroy_uio()`. This is a regression that was introduced by cf63739191b6cac629d053930a4aea592bca3819. Interestingly, this never affected FreeBSD, since the FreeBSD version never had that patch ported. Since moving uio initialization to the top of this function would slow down the qat_crypt() path, we only move the `memset()` calls to the top of the function. This is sufficient to fix this problem. Reviewed-by: Ryan Moeller Reviewed-by: Neal Gompa Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13944 --- module/os/linux/zfs/zio_crypt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c index 381769eab682..50e93909659f 100644 --- a/module/os/linux/zfs/zio_crypt.c +++ b/module/os/linux/zfs/zio_crypt.c @@ -1900,6 +1900,9 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, crypto_ctx_template_t tmpl; uint8_t *authbuf = NULL; + memset(&puio, 0, sizeof (puio)); + memset(&cuio, 0, sizeof (cuio)); + /* * If the needed key is the current one, just use it. Otherwise we * need to generate a temporary one from the given salt + master key. @@ -1960,9 +1963,6 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, /* If the hardware implementation fails fall back to software */ } - bzero(&puio, sizeof (zfs_uio_t)); - bzero(&cuio, sizeof (zfs_uio_t)); - /* create uios for encryption */ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, From c973929b29bb945d1a1fd9f54a5238360f0e1029 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 27 Sep 2022 19:44:13 -0400 Subject: [PATCH 31/39] LUA: Fix CVE-2014-5461 Apply the fix from upstream. 
http://www.lua.org/bugs.html#5.2.2-1 https://www.opencve.io/cve/CVE-2014-5461 It should be noted that exploiting this requires the `SYS_CONFIG` privilege, and anyone with that privilege likely has other opportunities to do exploits, so it is unlikely that bad actors could exploit this unless system administrators are executing untrusted ZFS Channel Programs. Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #13949 --- module/lua/ldo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/lua/ldo.c b/module/lua/ldo.c index 08a952007d10..a9835c4f571d 100644 --- a/module/lua/ldo.c +++ b/module/lua/ldo.c @@ -406,7 +406,7 @@ int luaD_precall (lua_State *L, StkId func, int nresults) { StkId base; Proto *p = clLvalue(func)->p; n = cast_int(L->top - func) - 1; /* number of real arguments */ - luaD_checkstack(L, p->maxstacksize); + luaD_checkstack(L, p->maxstacksize + p->numparams); for (; n < p->numparams; n++) setnilvalue(L->top++); /* complete missing arguments */ if (!p->is_vararg) { From 8dcd6af62318a85606d664a3ba99d17b411a5892 Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Tue, 2 Aug 2022 19:34:23 -0400 Subject: [PATCH 32/39] FreeBSD: Ignore symlink to i386 includes A symlink to i386 includes is created in the build dir on amd64 since freebsd/freebsd-src@d07600c563039f252becc29ac7d9a454b6b0600d Tell git to ignore it like the other include links. Reviewed-by: Brian Behlendorf Signed-off-by: Ryan Moeller Closes #13719 --- module/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/module/.gitignore b/module/.gitignore index 7a4bd3673e77..0ec6052f1bb0 100644 --- a/module/.gitignore +++ b/module/.gitignore @@ -22,5 +22,6 @@ /export_syms /machine /x86 +/i386 !Makefile.in From 55816c64dadac5fecd858a88f769184283a4808b Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Wed, 14 Sep 2022 15:51:55 -0400 Subject: [PATCH 33/39] FreeBSD: Fix integer conversion for vnlru_free{,_vfsops}() When reviewing #13875, I noticed that our FreeBSD code has an issue where it converts from `int64_t` to `int` when calling `vnlru_free{,_vfsops}()`. The result is that if the int64_t is `1 << 36`, the int will be 0, since the low bits are 0. Even when some low bits are set, a value such as `((1 << 36) + 1)` would truncate to 1, which is wrong. There is protection against this on 32-bit platforms, but on 64-bit platforms, there is no check to protect us, so we add a check. Reviewed-by: Alexander Motin Reviewed-by: Ryan Moeller Signed-off-by: Richard Yao Closes #13882 --- module/os/freebsd/zfs/arc_os.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index 77af092e1ed4..590d1c04b9a5 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -161,6 +161,12 @@ arc_prune_task(void *arg) int64_t nr_scan = (intptr_t)arg; arc_reduce_target_size(ptob(nr_scan)); + +#ifndef __ILP32__ + if (nr_scan > INT_MAX) + nr_scan = INT_MAX; +#endif + #if __FreeBSD_version >= 1300139 sx_xlock(&arc_vnlru_lock); vnlru_free_vfsops(nr_scan, &zfs_vfsops, arc_vnlru_marker); From 2c8e3e4b28a40e6a1b1926ce03da1a725f0e60f8 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 20 Sep 2022 02:17:27 +0200 Subject: [PATCH 34/39] FreeBSD: stop passing LK_INTERLOCK to VOP_LOCK There is an ongoing effort to eliminate this feature. 
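As an aside to the vnlru_free integer-conversion fix above (patch 33), the truncation it guards against can be shown with a minimal standalone program; the values and names below are hypothetical and are not code from that patch:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int64_t nr_scan = (int64_t)1 << 36;

	/* Narrowing to a 32-bit int keeps only the low bits on typical systems. */
	printf("%d\n", (int)nr_scan);		/* 0 */
	printf("%d\n", (int)(nr_scan + 1));	/* 1 */

	/* Clamping before the narrowing call, as the patch does, preserves the intent. */
	if (nr_scan > INT_MAX)
		nr_scan = INT_MAX;
	printf("%d\n", (int)nr_scan);		/* 2147483647 */
	return (0);
}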
Reviewed-by: Alexander Motin Reviewed-by: Ryan Moeller Signed-off-by: Mateusz Guzik Closes #13908 --- module/os/freebsd/zfs/zfs_ctldir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/os/freebsd/zfs/zfs_ctldir.c b/module/os/freebsd/zfs/zfs_ctldir.c index 3a5c9f8caf0a..5bd2e1510ddb 100644 --- a/module/os/freebsd/zfs/zfs_ctldir.c +++ b/module/os/freebsd/zfs/zfs_ctldir.c @@ -976,12 +976,13 @@ zfsctl_snapdir_lookup(struct vop_lookup_args *ap) */ VI_LOCK(*vpp); if (((*vpp)->v_iflag & VI_MOUNT) == 0) { + VI_UNLOCK(*vpp); /* * Upgrade to exclusive lock in order to: * - avoid race conditions * - satisfy the contract of mount_snapshot() */ - err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK); + err = VOP_LOCK(*vpp, LK_TRYUPGRADE); if (err == 0) break; } else { From eec942cc54652254a39f36fb12976bcc3b4a2706 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 21 Sep 2022 00:21:30 +0200 Subject: [PATCH 35/39] FreeBSD: catch up to 1400068 Reviewed-by: Ryan Moeller Signed-off-by: Mateusz Guzik Closes #13909 --- module/os/freebsd/zfs/zfs_vnops_os.c | 41 ++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index f6bc9c0c6afb..ea6388dd515e 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -981,13 +981,17 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, case RENAME: if (error == ENOENT) { error = EJUSTRETURN; +#if __FreeBSD_version < 1400068 cnp->cn_flags |= SAVENAME; +#endif break; } fallthrough; case DELETE: +#if __FreeBSD_version < 1400068 if (error == 0) cnp->cn_flags |= SAVENAME; +#endif break; } } @@ -1337,7 +1341,10 @@ zfs_lookup_internal(znode_t *dzp, const char *name, vnode_t **vpp, cnp->cn_nameptr = __DECONST(char *, name); cnp->cn_namelen = strlen(name); cnp->cn_nameiop = nameiop; - cnp->cn_flags = ISLASTCN | SAVENAME; + cnp->cn_flags = ISLASTCN; +#if __FreeBSD_version < 1400068 + cnp->cn_flags |= SAVENAME; +#endif cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY; cnp->cn_cred = kcred; #if __FreeBSD_version < 1400037 @@ -4642,7 +4649,9 @@ zfs_freebsd_create(struct vop_create_args *ap) znode_t *zp = NULL; int rc, mode; +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif vattr_init_mask(vap); mode = vap->va_mode & ALLPERMS; @@ -4672,7 +4681,9 @@ static int zfs_freebsd_remove(struct vop_remove_args *ap) { +#if __FreeBSD_version < 1400068 ASSERT(ap->a_cnp->cn_flags & SAVENAME); +#endif return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_cred)); @@ -4694,7 +4705,9 @@ zfs_freebsd_mkdir(struct vop_mkdir_args *ap) znode_t *zp = NULL; int rc; +#if __FreeBSD_version < 1400068 ASSERT(ap->a_cnp->cn_flags & SAVENAME); +#endif vattr_init_mask(vap); *ap->a_vpp = NULL; @@ -4720,7 +4733,9 @@ zfs_freebsd_rmdir(struct vop_rmdir_args *ap) { struct componentname *cnp = ap->a_cnp; +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } @@ -4974,8 +4989,10 @@ zfs_freebsd_rename(struct vop_rename_args *ap) vnode_t *tvp = ap->a_tvp; int error; +#if __FreeBSD_version < 1400068 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); +#endif error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, ap->a_tcnp, ap->a_fcnp->cn_cred); @@ -5011,7 +5028,9 @@ zfs_freebsd_symlink(struct vop_symlink_args *ap) #endif int rc; +#if __FreeBSD_version < 
1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ vattr_init_mask(vap); @@ -5105,7 +5124,9 @@ zfs_freebsd_link(struct vop_link_args *ap) if (tdvp->v_mount != vp->v_mount) return (EXDEV); +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif return (zfs_link(VTOZ(tdvp), VTOZ(vp), cnp->cn_nameptr, cnp->cn_cred, 0)); @@ -5364,10 +5385,10 @@ zfs_getextattr_dir(struct vop_getextattr_args *ap, const char *attrname) NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp); #endif error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL); - vp = nd.ni_vp; - NDFREE_PNBUF(&nd); if (error != 0) return (error); + vp = nd.ni_vp; + NDFREE_PNBUF(&nd); if (ap->a_size != NULL) { error = VOP_GETATTR(vp, &va, ap->a_cred); @@ -5481,12 +5502,10 @@ zfs_deleteextattr_dir(struct vop_deleteextattr_args *ap, const char *attrname) UIO_SYSSPACE, attrname, xvp); #endif error = namei(&nd); - vp = nd.ni_vp; - if (error != 0) { - NDFREE_PNBUF(&nd); + if (error != 0) return (error); - } + vp = nd.ni_vp; error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); NDFREE_PNBUF(&nd); @@ -5612,10 +5631,10 @@ zfs_setextattr_dir(struct vop_setextattr_args *ap, const char *attrname) #endif error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred, NULL); - vp = nd.ni_vp; - NDFREE_PNBUF(&nd); if (error != 0) return (error); + vp = nd.ni_vp; + NDFREE_PNBUF(&nd); VATTR_NULL(&va); va.va_size = 0; @@ -5767,10 +5786,10 @@ zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix) UIO_SYSSPACE, ".", xvp); #endif error = namei(&nd); - vp = nd.ni_vp; - NDFREE_PNBUF(&nd); if (error != 0) return (error); + vp = nd.ni_vp; + NDFREE_PNBUF(&nd); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; From 63d4838b4ac5557510db70aff334ec7caa96114c Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 21 Sep 2022 00:22:32 +0200 Subject: [PATCH 36/39] FreeBSD: handle V_PCATCH See https://cgit.FreeBSD.org/src/commit/?id=a75d1ddd74312f5dd79bc1e965f7077679659f2e Reviewed-by: Ryan Moeller Reviewed-by: Alexander Motin Signed-off-by: Mateusz Guzik Closes #13910 --- module/os/freebsd/zfs/zfs_file_os.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/module/os/freebsd/zfs/zfs_file_os.c b/module/os/freebsd/zfs/zfs_file_os.c index fd86a75416e6..60c9ff0581e0 100644 --- a/module/os/freebsd/zfs/zfs_file_os.c +++ b/module/os/freebsd/zfs/zfs_file_os.c @@ -226,7 +226,11 @@ zfs_vop_fsync(vnode_t *vp) struct mount *mp; int error; +#if __FreeBSD_version < 1400068 if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) +#else + if ((error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH)) != 0) +#endif goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(vp, MNT_WAIT, curthread); From a2705b1dd5f8d186db02091b96efdd5f87e38090 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Fri, 23 Sep 2022 10:24:19 -0700 Subject: [PATCH 37/39] zpool: Don't print "repairing" on force faulted drives If you force fault a drive that's resilvering, its scan stats can get frozen in time, giving the false impression that it's being resilvered. This commit checks the vdev state to see if the vdev is healthy before reporting "resilvering" or "repairing" in zpool status. 
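The guard described above can be distilled into a small standalone illustration; the enums and the report_scan() helper below are simplified stand-ins assumed for this sketch, not the actual zpool interfaces:

#include <stdbool.h>
#include <stdio.h>

enum vdev_state { VDEV_FAULTED, VDEV_HEALTHY };
enum scan_state { SCAN_NONE, SCAN_SCANNING };

/*
 * Only report scan activity for a healthy leaf vdev, so frozen scan
 * stats on a force-faulted drive are not shown as "(resilvering)".
 */
static bool
report_scan(enum scan_state ss, unsigned children, enum vdev_state vs,
    unsigned long long processed)
{
	return (ss == SCAN_SCANNING && children == 0 &&
	    vs == VDEV_HEALTHY && processed != 0);
}

int
main(void)
{
	/* Faulted leaf with stale scan stats: not reported. */
	printf("%d\n", report_scan(SCAN_SCANNING, 0, VDEV_FAULTED, 4096));
	/* Healthy leaf actually being resilvered: reported. */
	printf("%d\n", report_scan(SCAN_SCANNING, 0, VDEV_HEALTHY, 4096));
	return (0);
}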
Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #13927 Closes #13930 --- cmd/zpool/zpool_main.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 3f46bd0513c9..54464731b52e 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2438,7 +2438,14 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, (void) nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); - if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0) { + /* + * If you force fault a drive that's resilvering, its scan stats can + * get frozen in time, giving the false impression that it's + * being resilvered. That's why we check the state to see if the vdev + * is healthy before reporting "resilvering" or "repairing". + */ + if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0 && + vs->vs_state == VDEV_STATE_HEALTHY) { if (vs->vs_scan_processed != 0) { (void) printf(gettext(" (%s)"), (ps->pss_func == POOL_SCAN_RESILVER) ? @@ -2450,7 +2457,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, /* The top-level vdevs have the rebuild stats */ if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE && - children == 0) { + children == 0 && vs->vs_state == VDEV_STATE_HEALTHY) { if (vs->vs_rebuild_processed != 0) { (void) printf(gettext(" (resilvering)")); } From 566e908fa01eb91e0637347987bc61772d47aee1 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 27 Sep 2022 15:36:58 -0400 Subject: [PATCH 38/39] Fix bad free in skein code Clang's static analyzer found a bad free caused by skein_mac_atomic(). It will allocate a context on the stack and then pass it to skein_final(), which attempts to free it. Upon inspection, skein_digest_atomic() also has the same problem. These functions were created to match the OpenSolaris ICP API, so I was curious how we avoided this in other providers and looked at the SHA2 code. It appears that SHA2 has a SHA2Final() helper function that is called by the exported sha2_mac_final()/sha2_digest_final() as well as the sha2_mac_atomic() and sha2_digest_atomic() functions. The real work is done in SHA2Final() while some checks and the free are done in sha2_mac_final()/sha2_digest_final(). We fix the use after free in the skein code by taking inspiration from the SHA2 code. We introduce a skein_final_nofree() that does most of the work, and make skein_final() into a function that calls it and then frees the memory. 
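The split described above, a finalizer that does the work without freeing plus a thin wrapper that frees the heap-allocated multi-part context, can be sketched as a standalone program; the toy digest and every name below are assumptions made for illustration and are not the ICP code:

#include <stdio.h>
#include <stdlib.h>

typedef struct toy_ctx { unsigned long acc; } toy_ctx_t;

static void
toy_update(toy_ctx_t *c, const char *buf, size_t len)
{
	for (size_t i = 0; i < len; i++)
		c->acc = c->acc * 31 + (unsigned char)buf[i];
}

/* Does the real work and never frees, so stack contexts are safe. */
static int
toy_final_nofree(toy_ctx_t *c, unsigned long *out)
{
	*out = c->acc;
	return (0);
}

/* Multi-part path: the context was heap-allocated, so free it here. */
static int
toy_final(toy_ctx_t *c, unsigned long *out)
{
	int err = toy_final_nofree(c, out);

	free(c);
	return (err);
}

/* One-shot ("atomic") path: stack context, so use the nofree variant. */
static int
toy_digest_atomic(const char *buf, size_t len, unsigned long *out)
{
	toy_ctx_t c = { 0 };

	toy_update(&c, buf, len);
	/* Calling toy_final() here would free stack memory. */
	return (toy_final_nofree(&c, out));
}

int
main(void)
{
	toy_ctx_t *c = calloc(1, sizeof (*c));
	unsigned long d1, d2;

	if (c == NULL)
		return (1);
	toy_update(c, "abc", 3);
	(void) toy_final(c, &d1);		/* heap context is freed */
	(void) toy_digest_atomic("abc", 3, &d2);	/* stack context is not */
	printf("%lu %lu\n", d1, d2);
	return (0);
}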
Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Signed-off-by: Richard Yao Closes #13954 --- module/icp/io/skein_mod.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/module/icp/io/skein_mod.c b/module/icp/io/skein_mod.c index 5ee36af12bcb..8992c5895e5b 100644 --- a/module/icp/io/skein_mod.c +++ b/module/icp/io/skein_mod.c @@ -494,7 +494,8 @@ skein_update(crypto_ctx_t *ctx, crypto_data_t *data, crypto_req_handle_t req) */ /*ARGSUSED*/ static int -skein_final(crypto_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req) +skein_final_nofree(crypto_ctx_t *ctx, crypto_data_t *digest, + crypto_req_handle_t req) { int error = CRYPTO_SUCCESS; @@ -525,6 +526,17 @@ skein_final(crypto_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req) else digest->cd_length = 0; + return (error); +} + +static int +skein_final(crypto_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req) +{ + int error = skein_final_nofree(ctx, digest, req); + + if (error == CRYPTO_BUFFER_TOO_SMALL) + return (error); + bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); kmem_free(SKEIN_CTX(ctx), sizeof (*(SKEIN_CTX(ctx)))); SKEIN_CTX_LVALUE(ctx) = NULL; @@ -560,7 +572,7 @@ skein_digest_atomic(crypto_provider_handle_t provider, if ((error = skein_update(&ctx, data, digest)) != CRYPTO_SUCCESS) goto out; - if ((error = skein_final(&ctx, data, digest)) != CRYPTO_SUCCESS) + if ((error = skein_final_nofree(&ctx, data, digest)) != CRYPTO_SUCCESS) goto out; out: @@ -669,7 +681,7 @@ skein_mac_atomic(crypto_provider_handle_t provider, if ((error = skein_update(&ctx, data, req)) != CRYPTO_SUCCESS) goto errout; - if ((error = skein_final(&ctx, mac, req)) != CRYPTO_SUCCESS) + if ((error = skein_final_nofree(&ctx, mac, req)) != CRYPTO_SUCCESS) goto errout; return (CRYPTO_SUCCESS); From 6a6bd493988c75331deab06e5352a9bed035a87d Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 19 Sep 2022 09:50:46 -0700 Subject: [PATCH 39/39] Tag zfs-2.1.6 META file and changelog updated. Signed-off-by: Tony Hutter --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index baf85fe89de0..7dd5b311d0c2 100644 --- a/META +++ b/META @@ -1,7 +1,7 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.1.5 +Version: 2.1.6 Release: 1 Release-Tags: relext License: CDDL