diff --git a/META b/META index baf85fe89de0..7dd5b311d0c2 100644 --- a/META +++ b/META @@ -1,7 +1,7 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.1.5 +Version: 2.1.6 Release: 1 Release-Tags: relext License: CDDL diff --git a/Makefile.am b/Makefile.am index e28369eac88f..7bafc53392e5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -114,6 +114,11 @@ commitcheck: ${top_srcdir}/scripts/commitcheck.sh; \ fi +if HAVE_PARALLEL +cstyle_line = -print0 | parallel -X0 ${top_srcdir}/scripts/cstyle.pl -cpP {} +else +cstyle_line = -exec ${top_srcdir}/scripts/cstyle.pl -cpP {} + +endif PHONY += cstyle cstyle: @find ${top_srcdir} -name build -prune -o -name zfsd -prune \ @@ -123,7 +128,7 @@ cstyle: ! -name 'opt_global.h' ! -name '*_if*.h' \ ! -name 'zstd_compat_wrapper.h' \ ! -path './module/zstd/lib/*' \ - -exec ${top_srcdir}/scripts/cstyle.pl -cpP {} \+ + $(cstyle_line) filter_executable = -exec test -x '{}' \; -print diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index db8e2200a72b..4e57538d2234 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -112,7 +112,7 @@ extern int zfs_vdev_async_read_max_active; extern boolean_t spa_load_verify_dryrun; extern boolean_t spa_mode_readable_spacemaps; extern int zfs_reconstruct_indirect_combinations_max; -extern int zfs_btree_verify_intensity; +extern uint_t zfs_btree_verify_intensity; static const char cmdname[] = "zdb"; uint8_t dump_opt[256]; diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index a510d646e1f9..a4e23ca1a3b0 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -894,14 +894,90 @@ zfs_deliver_check(nvlist_t *nvl) return (0); } +/* + * Given a path to a vdev, lookup the vdev's physical size from its + * config nvlist. + * + * Returns the vdev's physical size in bytes on success, 0 on error. + */ +static uint64_t +vdev_size_from_config(zpool_handle_t *zhp, const char *vdev_path) +{ + nvlist_t *nvl = NULL; + boolean_t avail_spare, l2cache, log; + vdev_stat_t *vs = NULL; + uint_t c; + + nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log); + if (!nvl) + return (0); + + verify(nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + if (!vs) { + zed_log_msg(LOG_INFO, "%s: no nvlist for '%s'", __func__, + vdev_path); + return (0); + } + + return (vs->vs_pspace); +} + +/* + * Given a path to a vdev, lookup if the vdev is a "whole disk" in the + * config nvlist. "whole disk" means that ZFS was passed a whole disk + * at pool creation time, which it partitioned up and has full control over. + * Thus a partition with wholedisk=1 set tells us that zfs created the + * partition at creation time. A partition without whole disk set would have + * been created by externally (like with fdisk) and passed to ZFS. + * + * Returns the whole disk value (either 0 or 1). + */ +static uint64_t +vdev_whole_disk_from_config(zpool_handle_t *zhp, const char *vdev_path) +{ + nvlist_t *nvl = NULL; + boolean_t avail_spare, l2cache, log; + uint64_t wholedisk; + + nvl = zpool_find_vdev(zhp, vdev_path, &avail_spare, &l2cache, &log); + if (!nvl) + return (0); + + verify(nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk) == 0); + + return (wholedisk); +} + +/* + * If the device size grew more than 1% then return true. 
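+ * In integer arithmetic, (newsize / (newsize - oldsize)) <= 100 holds when
+ * the growth (newsize - oldsize) is at least roughly 1% of newsize; e.g. a
+ * 100 GiB vdev that udev now reports as 102 GiB qualifies, while a few
+ * megabytes of slack on the same disk does not.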
+ */ +#define DEVICE_GREW(oldsize, newsize) \ + ((newsize > oldsize) && \ + ((newsize / (newsize - oldsize)) <= 100)) + static int zfsdle_vdev_online(zpool_handle_t *zhp, void *data) { - char *devname = data; boolean_t avail_spare, l2cache; + nvlist_t *udev_nvl = data; nvlist_t *tgt; int error; + char *tmp_devname, devname[MAXPATHLEN]; + uint64_t guid; + + if (nvlist_lookup_uint64(udev_nvl, ZFS_EV_VDEV_GUID, &guid) == 0) { + sprintf(devname, "%llu", (u_longlong_t)guid); + } else if (nvlist_lookup_string(udev_nvl, DEV_PHYS_PATH, + &tmp_devname) == 0) { + strlcpy(devname, tmp_devname, MAXPATHLEN); + zfs_append_partition(devname, MAXPATHLEN); + } else { + zed_log_msg(LOG_INFO, "%s: no guid or physpath", __func__); + } + zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'", devname, zpool_get_name(zhp)); @@ -953,12 +1029,75 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data) vdev_state_t newstate; if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { - error = zpool_vdev_online(zhp, fullpath, 0, - &newstate); - zed_log_msg(LOG_INFO, "zfsdle_vdev_online: " - "setting device '%s' to ONLINE state " - "in pool '%s': %d", fullpath, - zpool_get_name(zhp), error); + /* + * If this disk size has not changed, then + * there's no need to do an autoexpand. To + * check we look at the disk's size in its + * config, and compare it to the disk size + * that udev is reporting. + */ + uint64_t udev_size = 0, conf_size = 0, + wholedisk = 0, udev_parent_size = 0; + + /* + * Get the size of our disk that udev is + * reporting. + */ + if (nvlist_lookup_uint64(udev_nvl, DEV_SIZE, + &udev_size) != 0) { + udev_size = 0; + } + + /* + * Get the size of our disk's parent device + * from udev (where sda1's parent is sda). + */ + if (nvlist_lookup_uint64(udev_nvl, + DEV_PARENT_SIZE, &udev_parent_size) != 0) { + udev_parent_size = 0; + } + + conf_size = vdev_size_from_config(zhp, + fullpath); + + wholedisk = vdev_whole_disk_from_config(zhp, + fullpath); + + /* + * Only attempt an autoexpand if the vdev size + * changed. There are two different cases + * to consider. + * + * 1. wholedisk=1 + * If you do a 'zpool create' on a whole disk + * (like /dev/sda), then zfs will create + * partitions on the disk (like /dev/sda1). In + * that case, wholedisk=1 will be set in the + * partition's nvlist config. So zed will need + * to see if your parent device (/dev/sda) + * expanded in size, and if so, then attempt + * the autoexpand. + * + * 2. wholedisk=0 + * If you do a 'zpool create' on an existing + * partition, or a device that doesn't allow + * partitions, then wholedisk=0, and you will + * simply need to check if the device itself + * expanded in size. 
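+ *
+ * For example, a pool created on /dev/sda (wholedisk=1) stores the vdev
+ * path as the partition /dev/sda1, so growth shows up in udev's parent
+ * device size (udev_parent_size); a pool created directly on /dev/sda1
+ * (wholedisk=0) is compared against udev_size for the partition itself.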
+ */ + if (DEVICE_GREW(conf_size, udev_size) || + (wholedisk && DEVICE_GREW(conf_size, + udev_parent_size))) { + error = zpool_vdev_online(zhp, fullpath, + 0, &newstate); + + zed_log_msg(LOG_INFO, + "%s: autoexpanding '%s' from %llu" + " to %llu bytes in pool '%s': %d", + __func__, fullpath, conf_size, + MAX(udev_size, udev_parent_size), + zpool_get_name(zhp), error); + } } } zpool_close(zhp); @@ -989,7 +1128,7 @@ zfs_deliver_dle(nvlist_t *nvl) zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath"); } - if (zpool_iter(g_zfshdl, zfsdle_vdev_online, name) != 1) { + if (zpool_iter(g_zfshdl, zfsdle_vdev_online, nvl) != 1) { zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not " "found", name); return (1); diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c index 52b80d8c4c93..e31ec4cfc7e7 100644 --- a/cmd/zed/zed_disk_event.c +++ b/cmd/zed/zed_disk_event.c @@ -78,6 +78,8 @@ zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl) zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval); if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0) zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval); + if (nvlist_lookup_uint64(nvl, DEV_PARENT_SIZE, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_PARENT_SIZE, numval); if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0) zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval); if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0) @@ -130,6 +132,20 @@ dev_event_nvlist(struct udev_device *dev) numval *= strtoull(value, NULL, 10); (void) nvlist_add_uint64(nvl, DEV_SIZE, numval); + + /* + * If the device has a parent, then get the parent block + * device's size as well. For example, /dev/sda1's parent + * is /dev/sda. + */ + struct udev_device *parent_dev = udev_device_get_parent(dev); + if ((value = udev_device_get_sysattr_value(parent_dev, "size")) + != NULL) { + uint64_t numval = DEV_BSIZE; + + numval *= strtoull(value, NULL, 10); + (void) nvlist_add_uint64(nvl, DEV_PARENT_SIZE, numval); + } } /* @@ -169,7 +185,7 @@ zed_udev_monitor(void *arg) while (1) { struct udev_device *dev; const char *action, *type, *part, *sectors; - const char *bus, *uuid; + const char *bus, *uuid, *devpath; const char *class, *subclass; nvlist_t *nvl; boolean_t is_zfs = B_FALSE; @@ -208,6 +224,12 @@ zed_udev_monitor(void *arg) * if this is a disk and it is partitioned, then the * zfs label will reside in a DEVTYPE=partition and * we can skip passing this event + * + * Special case: Blank disks are sometimes reported with + * an erroneous 'atari' partition, and should not be + * excluded from being used as an autoreplace disk: + * + * https://github.com/openzfs/zfs/issues/13497 */ type = udev_device_get_property_value(dev, "DEVTYPE"); part = udev_device_get_property_value(dev, @@ -215,14 +237,23 @@ zed_udev_monitor(void *arg) if (type != NULL && type[0] != '\0' && strcmp(type, "disk") == 0 && part != NULL && part[0] != '\0') { - zed_log_msg(LOG_INFO, - "%s: skip %s since it has a %s partition already", - __func__, - udev_device_get_property_value(dev, "DEVNAME"), - part); - /* skip and wait for partition event */ - udev_device_unref(dev); - continue; + const char *devname = + udev_device_get_property_value(dev, "DEVNAME"); + + if (strcmp(part, "atari") == 0) { + zed_log_msg(LOG_INFO, + "%s: %s is reporting an atari partition, " + "but we're going to assume it's a false " + "positive and still use it (issue #13497)", + __func__, devname); + } else { + zed_log_msg(LOG_INFO, + "%s: skip %s since it has a %s 
partition " + "already", __func__, devname, part); + /* skip and wait for partition event */ + udev_device_unref(dev); + continue; + } } /* @@ -248,10 +279,19 @@ zed_udev_monitor(void *arg) * device id string is required in the message schema * for matching with vdevs. Preflight here for expected * udev information. + * + * Special case: + * NVMe devices don't have ID_BUS set (at least on RHEL 7-8), + * but they are valid for autoreplace. Add a special case for + * them by searching for "/nvme/" in the udev DEVPATH: + * + * DEVPATH=/devices/pci0000:00/0000:00:1e.0/nvme/nvme2/nvme2n1 */ bus = udev_device_get_property_value(dev, "ID_BUS"); uuid = udev_device_get_property_value(dev, "DM_UUID"); - if (!is_zfs && (bus == NULL && uuid == NULL)) { + devpath = udev_device_get_devpath(dev); + if (!is_zfs && (bus == NULL && uuid == NULL && + strstr(devpath, "/nvme/") == NULL)) { zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid " "source", udev_device_get_devnode(dev)); udev_device_unref(dev); diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index b93a6196beea..54464731b52e 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2438,7 +2438,14 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, (void) nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); - if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0) { + /* + * If you force fault a drive that's resilvering, its scan stats can + * get frozen in time, giving the false impression that it's + * being resilvered. That's why we check the state to see if the vdev + * is healthy before reporting "resilvering" or "repairing". + */ + if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0 && + vs->vs_state == VDEV_STATE_HEALTHY) { if (vs->vs_scan_processed != 0) { (void) printf(gettext(" (%s)"), (ps->pss_func == POOL_SCAN_RESILVER) ? @@ -2450,7 +2457,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, /* The top-level vdevs have the rebuild stats */ if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE && - children == 0) { + children == 0 && vs->vs_state == VDEV_STATE_HEALTHY) { if (vs->vs_rebuild_processed != 0) { (void) printf(gettext(" (resilvering)")); } @@ -5458,8 +5465,8 @@ get_namewidth_iostat(zpool_handle_t *zhp, void *data) * get_namewidth() returns the maximum width of any name in that column * for any pool/vdev/device line that will be output. */ - width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, - cb->cb_verbose); + width = get_namewidth(zhp, cb->cb_namewidth, + cb->cb_name_flags | VDEV_NAME_TYPE_ID, cb->cb_verbose); /* * The width we are calculating is the width of the header and also the @@ -6035,6 +6042,7 @@ print_one_column(zpool_prop_t prop, uint64_t value, const char *str, size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); switch (prop) { + case ZPOOL_PROP_SIZE: case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: case ZPOOL_PROP_DEDUPRATIO: @@ -6130,8 +6138,12 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, * 'toplevel' boolean value is passed to the print_one_column() * to indicate that the value is valid. 
*/ - print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, NULL, scripted, - toplevel, format); + if (vs->vs_pspace) + print_one_column(ZPOOL_PROP_SIZE, vs->vs_pspace, NULL, + scripted, B_TRUE, format); + else + print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, NULL, + scripted, toplevel, format); print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, NULL, scripted, toplevel, format); print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, @@ -6282,8 +6294,8 @@ get_namewidth_list(zpool_handle_t *zhp, void *data) list_cbdata_t *cb = data; int width; - width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, - cb->cb_verbose); + width = get_namewidth(zhp, cb->cb_namewidth, + cb->cb_name_flags | VDEV_NAME_TYPE_ID, cb->cb_verbose); if (width < 9) width = 9; diff --git a/config/always-parallel.m4 b/config/always-parallel.m4 new file mode 100644 index 000000000000..c1f1ae78e7e7 --- /dev/null +++ b/config/always-parallel.m4 @@ -0,0 +1,8 @@ +dnl # +dnl # Check if GNU parallel is available. +dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PARALLEL], [ + AC_CHECK_PROG([PARALLEL], [parallel], [yes]) + + AM_CONDITIONAL([HAVE_PARALLEL], [test "x$PARALLEL" = "xyes"]) +]) diff --git a/config/kernel-xattr-handler.m4 b/config/kernel-xattr-handler.m4 index 00b1e74a9ccb..b6cbfa155007 100644 --- a/config/kernel-xattr-handler.m4 +++ b/config/kernel-xattr-handler.m4 @@ -100,6 +100,19 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_GET], [ .get = get, }; ],[]) + + ZFS_LINUX_TEST_SRC([xattr_handler_get_dentry_inode_flags], [ + #include + + int get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, + size_t size, int flags) { return 0; } + static const struct xattr_handler + xops __attribute__ ((unused)) = { + .get = get, + }; + ],[]) ]) AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET], [ @@ -142,7 +155,21 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET], [ AC_DEFINE(HAVE_XATTR_GET_DENTRY, 1, [xattr_handler->get() wants dentry]) ],[ - ZFS_LINUX_TEST_ERROR([xattr get()]) + dnl # + dnl # Android API change, + dnl # The xattr_handler->get() callback was + dnl # changed to take dentry, inode and flags. 
+ dnl # + AC_MSG_RESULT(no) + AC_MSG_CHECKING( + [whether xattr_handler->get() wants dentry and inode and flags]) + ZFS_LINUX_TEST_RESULT([xattr_handler_get_dentry_inode_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_XATTR_GET_DENTRY_INODE_FLAGS, 1, + [xattr_handler->get() wants dentry and inode and flags]) + ],[ + ZFS_LINUX_TEST_ERROR([xattr get()]) + ]) ]) ]) ]) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 8ca596ecf06b..bd8e3ac80201 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -226,6 +226,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ ZFS_AC_CONFIG_ALWAYS_SED ZFS_AC_CONFIG_ALWAYS_CPPCHECK ZFS_AC_CONFIG_ALWAYS_SHELLCHECK + ZFS_AC_CONFIG_ALWAYS_PARALLEL ]) AC_DEFUN([ZFS_AC_CONFIG], [ diff --git a/contrib/truenas/changelog b/contrib/truenas/changelog index 3b64660d90bc..a426dbae76eb 100644 --- a/contrib/truenas/changelog +++ b/contrib/truenas/changelog @@ -1,3 +1,14 @@ +openzfs (2.1.6-0) unstable; urgency=medium + + * Merge tag zfs-2.1.6 + * zed: mark disks as REMOVED when they are removed + * Provide kfpu_begin/end from spl + * Add snapshots_changed as property + * Add createtxg sort support for simple snapshot iterator + * Expose ZFS dataset case sensitivity setting via sb_opts + + -- Ryan Moeller Wed, 22 Jun 2022 16:00:00 -0500 + openzfs (2.1.5-0) unstable; urgency=medium * Merged tag zfs-2.1.5 diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index 5695abee7b85..46ea2d15ac6e 100644 --- a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -52,7 +52,7 @@ #define ZFS_MODULE_PARAM_CALL_IMPL(parent, name, perm, args, desc) \ SYSCTL_DECL(parent); \ - SYSCTL_PROC(parent, OID_AUTO, name, perm | args, desc) + SYSCTL_PROC(parent, OID_AUTO, name, CTLFLAG_MPSAFE | perm | args, desc) #define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, func, _, perm, desc) \ ZFS_MODULE_PARAM_CALL_IMPL(_vfs_ ## scope_prefix, name, perm, func ## _args(name_prefix ## name), desc) diff --git a/include/os/linux/kernel/linux/xattr_compat.h b/include/os/linux/kernel/linux/xattr_compat.h index 54690727eab9..30403fe87397 100644 --- a/include/os/linux/kernel/linux/xattr_compat.h +++ b/include/os/linux/kernel/linux/xattr_compat.h @@ -115,6 +115,20 @@ fn(struct dentry *dentry, const char *name, void *buffer, size_t size, \ { \ return (__ ## fn(dentry->d_inode, name, buffer, size)); \ } +/* + * Android API change, + * The xattr_handler->get() callback was changed to take a dentry and inode + * and flags, because the dentry might not be attached to an inode yet. 
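+ * The wrapper below simply forwards the inode it is given to __fn() and
+ * ignores both the dentry and the new flags argument.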
+ */ +#elif defined(HAVE_XATTR_GET_DENTRY_INODE_FLAGS) +#define ZPL_XATTR_GET_WRAPPER(fn) \ +static int \ +fn(const struct xattr_handler *handler, struct dentry *dentry, \ + struct inode *inode, const char *name, void *buffer, \ + size_t size, int flags) \ +{ \ + return (__ ## fn(inode, name, buffer, size)); \ +} #else #error "Unsupported kernel" #endif diff --git a/include/sys/arc.h b/include/sys/arc.h index a3241f3685a6..5d8176894e60 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -85,6 +85,7 @@ typedef void arc_prune_func_t(int64_t bytes, void *priv); /* Shared module parameters */ extern int zfs_arc_average_blocksize; +extern int l2arc_exclude_special; /* generic arc_done_func_t's which you can use */ arc_read_done_func_t arc_bcopy_func; diff --git a/include/sys/bqueue.h b/include/sys/bqueue.h index 797aecd791a3..b9621966027a 100644 --- a/include/sys/bqueue.h +++ b/include/sys/bqueue.h @@ -30,22 +30,22 @@ typedef struct bqueue { kmutex_t bq_lock; kcondvar_t bq_add_cv; kcondvar_t bq_pop_cv; - uint64_t bq_size; - uint64_t bq_maxsize; - uint64_t bq_fill_fraction; + size_t bq_size; + size_t bq_maxsize; + uint_t bq_fill_fraction; size_t bq_node_offset; } bqueue_t; typedef struct bqueue_node { list_node_t bqn_node; - uint64_t bqn_size; + size_t bqn_size; } bqueue_node_t; -int bqueue_init(bqueue_t *, uint64_t, uint64_t, size_t); +int bqueue_init(bqueue_t *, uint_t, size_t, size_t); void bqueue_destroy(bqueue_t *); -void bqueue_enqueue(bqueue_t *, void *, uint64_t); -void bqueue_enqueue_flush(bqueue_t *, void *, uint64_t); +void bqueue_enqueue(bqueue_t *, void *, size_t); +void bqueue_enqueue_flush(bqueue_t *, void *, size_t); void *bqueue_dequeue(bqueue_t *); boolean_t bqueue_empty(bqueue_t *); diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 93d80066be82..b757b2664178 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -321,12 +321,13 @@ typedef struct dmu_buf_impl { uint8_t db_dirtycnt; } dmu_buf_impl_t; -#define DBUF_RWLOCKS 8192 -#define DBUF_HASH_RWLOCK(h, idx) (&(h)->hash_rwlocks[(idx) & (DBUF_RWLOCKS-1)]) +/* Note: the dbuf hash table is exposed only for the mdb module */ +#define DBUF_MUTEXES 2048 +#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) typedef struct dbuf_hash_table { uint64_t hash_table_mask; dmu_buf_impl_t **hash_table; - krwlock_t hash_rwlocks[DBUF_RWLOCKS] ____cacheline_aligned; + kmutex_t hash_mutexes[DBUF_MUTEXES] ____cacheline_aligned; } dbuf_hash_table_t; typedef void (*dbuf_prefetch_fn)(void *, uint64_t, uint64_t, boolean_t); @@ -441,16 +442,7 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg) (dbuf_is_metadata(_db) && \ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) -#define DBUF_IS_L2CACHEABLE(_db) \ - ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ - (dbuf_is_metadata(_db) && \ - ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) - -#define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level) \ - ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || \ - (((_level) > 0 || \ - DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) && \ - ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA))) +boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db); #ifdef ZFS_DEBUG diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 0cf4dbc9f925..070d27fde3a9 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -136,7 +136,7 @@ typedef enum dmu_object_byteswap { #endif #define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? 
\ - ((ot) & DMU_OT_METADATA) : \ + (((ot) & DMU_OT_METADATA) != 0) : \ DMU_OT_IS_METADATA_IMPL(ot)) #define DMU_OT_IS_DDT(ot) \ @@ -147,7 +147,7 @@ typedef enum dmu_object_byteswap { ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER) #define DMU_OT_IS_ENCRYPTED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ - ((ot) & DMU_OT_ENCRYPTED) : \ + (((ot) & DMU_OT_ENCRYPTED) != 0) : \ DMU_OT_IS_ENCRYPTED_IMPL(ot)) /* diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index e89ee64ea686..7ade2dc91247 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -200,10 +200,6 @@ struct objset { #define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode) #define DMU_PROJECTUSED_DNODE(os) ((os)->os_projectused_dnode.dnh_dnode) -#define DMU_OS_IS_L2CACHEABLE(os) \ - ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ - (os)->os_secondary_cache == ZFS_CACHE_METADATA) - /* called from zpl */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h index 60e9ed6e26f5..ad3f1b0e47ca 100644 --- a/include/sys/dmu_tx.h +++ b/include/sys/dmu_tx.h @@ -125,6 +125,7 @@ typedef struct dmu_tx_stats { kstat_named_t dmu_tx_dirty_delay; kstat_named_t dmu_tx_dirty_over_max; kstat_named_t dmu_tx_dirty_frees_delay; + kstat_named_t dmu_tx_wrlog_delay; kstat_named_t dmu_tx_quota; } dmu_tx_stats_t; diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h index e5eb9a20e9ca..e93bd0557c1e 100644 --- a/include/sys/dsl_pool.h +++ b/include/sys/dsl_pool.h @@ -40,6 +40,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -58,6 +59,7 @@ struct dsl_deadlist; extern unsigned long zfs_dirty_data_max; extern unsigned long zfs_dirty_data_max_max; +extern unsigned long zfs_wrlog_data_max; extern int zfs_dirty_data_sync_percent; extern int zfs_dirty_data_max_percent; extern int zfs_dirty_data_max_max_percent; @@ -118,6 +120,9 @@ typedef struct dsl_pool { uint64_t dp_mos_compressed_delta; uint64_t dp_mos_uncompressed_delta; + aggsum_t dp_wrlog_pertxg[TXG_SIZE]; + aggsum_t dp_wrlog_total; + /* * Time of most recently scheduled (furthest in the future) * wakeup for delayed transactions. @@ -158,6 +163,8 @@ uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy); uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy); uint64_t dsl_pool_deferred_space(dsl_pool_t *dp); +void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg); +boolean_t dsl_pool_need_wrlog_delay(dsl_pool_t *dp); void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 556831fc7993..d227e9f07ae1 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1103,6 +1103,7 @@ typedef struct vdev_stat { uint64_t vs_configured_ashift; /* TLV vdev_ashift */ uint64_t vs_logical_ashift; /* vdev_logical_ashift */ uint64_t vs_physical_ashift; /* vdev_physical_ashift */ + uint64_t vs_pspace; /* physical capacity */ } vdev_stat_t; /* BEGIN CSTYLED */ @@ -1650,6 +1651,44 @@ typedef enum { #define ZFS_EV_HIST_DSID "history_dsid" #define ZFS_EV_RESILVER_TYPE "resilver_type" +/* + * We currently support block sizes from 512 bytes to 16MB. 
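+ * (With SPA_MINBLOCKSHIFT = 9 and SPA_MAXBLOCKSHIFT = 24 below, that is
+ * 1 << 9 = 512 bytes up to 1 << 24 = 16 MiB.)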
+ * The benefits of larger blocks, and thus larger IO, need to be weighed + * against the cost of COWing a giant block to modify one byte, and the + * large latency of reading or writing a large block. + * + * The recordsize property can not be set larger than zfs_max_recordsize + * (default 16MB on 64-bit and 1MB on 32-bit). See the comment near + * zfs_max_recordsize in dsl_dataset.c for details. + * + * Note that although the LSIZE field of the blkptr_t can store sizes up + * to 32MB, the dnode's dn_datablkszsec can only store sizes up to + * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. + */ +#define SPA_MINBLOCKSHIFT 9 +#define SPA_OLD_MAXBLOCKSHIFT 17 +#define SPA_MAXBLOCKSHIFT 24 +#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) +#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) +#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) + +/* supported encryption algorithms */ +enum zio_encrypt { + ZIO_CRYPT_INHERIT = 0, + ZIO_CRYPT_ON, + ZIO_CRYPT_OFF, + ZIO_CRYPT_AES_128_CCM, + ZIO_CRYPT_AES_192_CCM, + ZIO_CRYPT_AES_256_CCM, + ZIO_CRYPT_AES_128_GCM, + ZIO_CRYPT_AES_192_GCM, + ZIO_CRYPT_AES_256_GCM, + ZIO_CRYPT_FUNCTIONS +}; + +#define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_GCM +#define ZIO_CRYPT_DEFAULT ZIO_CRYPT_OFF + /* * xattr namespace prefixes. These are forbidden in xattr names. * diff --git a/include/sys/spa.h b/include/sys/spa.h index ad58539d2ddd..1dacd0f4328f 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -72,27 +72,6 @@ struct dsl_pool; struct dsl_dataset; struct dsl_crypto_params; -/* - * We currently support block sizes from 512 bytes to 16MB. - * The benefits of larger blocks, and thus larger IO, need to be weighed - * against the cost of COWing a giant block to modify one byte, and the - * large latency of reading or writing a large block. - * - * Note that although blocks up to 16MB are supported, the recordsize - * property can not be set larger than zfs_max_recordsize (default 1MB). - * See the comment near zfs_max_recordsize in dsl_dataset.c for details. - * - * Note that although the LSIZE field of the blkptr_t can store sizes up - * to 32MB, the dnode's dn_datablkszsec can only store sizes up to - * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. - */ -#define SPA_MINBLOCKSHIFT 9 -#define SPA_OLD_MAXBLOCKSHIFT 17 -#define SPA_MAXBLOCKSHIFT 24 -#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) -#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) -#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) - /* * Alignment Shift (ashift) is an immutable, internal top-level vdev property * which can only be set at vdev creation time. 
Physical writes are always done diff --git a/include/sys/sysevent/dev.h b/include/sys/sysevent/dev.h index 1117538d822d..2418bbad469d 100644 --- a/include/sys/sysevent/dev.h +++ b/include/sys/sysevent/dev.h @@ -244,6 +244,9 @@ extern "C" { #define DEV_PATH "path" #define DEV_IS_PART "is_slice" #define DEV_SIZE "dev_size" + +/* Size of the whole parent block device (if dev is a partition) */ +#define DEV_PARENT_SIZE "dev_parent_size" #endif /* __linux__ */ #define EV_V1 1 diff --git a/include/sys/txg.h b/include/sys/txg.h index 22158bd1a5e6..f38f0006c040 100644 --- a/include/sys/txg.h +++ b/include/sys/txg.h @@ -78,7 +78,7 @@ extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta, hrtime_t resolution); -extern void txg_kick(struct dsl_pool *dp); +extern void txg_kick(struct dsl_pool *dp, uint64_t txg); /* * Wait until the given transaction group has finished syncing. diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index fea1f0bf42fd..9d4a8062b2d9 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -645,6 +645,7 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); void vdev_metaslab_group_create(vdev_t *vd); +uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b); /* * Vdev ashift optimization tunables diff --git a/include/sys/zio.h b/include/sys/zio.h index 5bb712083458..39de5175b7db 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -108,23 +108,6 @@ enum zio_checksum { #define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 -/* supported encryption algorithms */ -enum zio_encrypt { - ZIO_CRYPT_INHERIT = 0, - ZIO_CRYPT_ON, - ZIO_CRYPT_OFF, - ZIO_CRYPT_AES_128_CCM, - ZIO_CRYPT_AES_192_CCM, - ZIO_CRYPT_AES_256_CCM, - ZIO_CRYPT_AES_128_GCM, - ZIO_CRYPT_AES_192_GCM, - ZIO_CRYPT_AES_256_GCM, - ZIO_CRYPT_FUNCTIONS -}; - -#define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_GCM -#define ZIO_CRYPT_DEFAULT ZIO_CRYPT_OFF - /* macros defining encryption lengths */ #define ZIO_OBJSET_MAC_LEN 32 #define ZIO_DATA_IV_LEN 12 diff --git a/lib/libavl/Makefile.am b/lib/libavl/Makefile.am index 2e0a431c77fb..de8ba34d5ba0 100644 --- a/lib/libavl/Makefile.am +++ b/lib/libavl/Makefile.am @@ -5,6 +5,9 @@ VPATH = $(top_srcdir)/module/avl/ # Includes kernel code, generate warnings for large stack frames AM_CFLAGS += $(FRAME_LARGER_THAN) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libavl.la KERNEL_C = \ diff --git a/lib/libefi/Makefile.am b/lib/libefi/Makefile.am index b26f7a6dcd5b..5f77ac480a9f 100644 --- a/lib/libefi/Makefile.am +++ b/lib/libefi/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am AM_CFLAGS += $(LIBUUID_CFLAGS) $(ZLIB_CFLAGS) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libefi.la USER_C = \ diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am index e4a9ee862101..9a2510d0d222 100644 --- a/lib/libicp/Makefile.am +++ b/lib/libicp/Makefile.am @@ -6,6 +6,8 @@ VPATH = \ # Includes kernel code, generate warnings for large stack frames AM_CFLAGS += $(FRAME_LARGER_THAN) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress noinst_LTLIBRARIES = libicp.la diff --git a/lib/libnvpair/Makefile.am b/lib/libnvpair/Makefile.am index a3e1fa307f7c..f9f1eb539239 100644 --- a/lib/libnvpair/Makefile.am +++ 
b/lib/libnvpair/Makefile.am @@ -8,6 +8,9 @@ VPATH = \ # and required CFLAGS for libtirpc AM_CFLAGS += $(FRAME_LARGER_THAN) $(LIBTIRPC_CFLAGS) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + lib_LTLIBRARIES = libnvpair.la include $(top_srcdir)/config/Abigail.am diff --git a/lib/libshare/Makefile.am b/lib/libshare/Makefile.am index 7cef13c3da7c..0fce333506ae 100644 --- a/lib/libshare/Makefile.am +++ b/lib/libshare/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am DEFAULT_INCLUDES += -I$(srcdir) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libshare.la USER_C = \ diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am index 61432225a708..b59919bfb9e9 100644 --- a/lib/libspl/Makefile.am +++ b/lib/libspl/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am SUBDIRS = include +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libspl_assert.la libspl.la libspl_assert_la_SOURCES = \ diff --git a/lib/libtpool/Makefile.am b/lib/libtpool/Makefile.am index 3aff56f05f1e..ce9d03a67919 100644 --- a/lib/libtpool/Makefile.am +++ b/lib/libtpool/Makefile.am @@ -3,6 +3,9 @@ include $(top_srcdir)/config/Rules.am # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61118 AM_CFLAGS += $(NO_CLOBBERED) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libtpool.la USER_C = \ diff --git a/lib/libunicode/Makefile.am b/lib/libunicode/Makefile.am index b82975f68efd..5b12b3e916f3 100644 --- a/lib/libunicode/Makefile.am +++ b/lib/libunicode/Makefile.am @@ -5,6 +5,9 @@ VPATH = $(top_srcdir)/module/unicode # Includes kernel code, generate warnings for large stack frames AM_CFLAGS += $(FRAME_LARGER_THAN) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + noinst_LTLIBRARIES = libunicode.la KERNEL_C = \ diff --git a/lib/libuutil/Makefile.am b/lib/libuutil/Makefile.am index 16d5023451bb..05b7ed0db8cb 100644 --- a/lib/libuutil/Makefile.am +++ b/lib/libuutil/Makefile.am @@ -1,5 +1,8 @@ include $(top_srcdir)/config/Rules.am +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + lib_LTLIBRARIES = libuutil.la include $(top_srcdir)/config/Abigail.am diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index afece485f011..32ab6be9d775 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -8,6 +8,9 @@ VPATH = \ # Suppress unused but set variable warnings often due to ASSERTs AM_CFLAGS += $(LIBCRYPTO_CFLAGS) $(ZLIB_CFLAGS) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + pkgconfig_DATA = libzfs.pc lib_LTLIBRARIES = libzfs.la diff --git a/lib/libzfs_core/Makefile.am b/lib/libzfs_core/Makefile.am index 67e554dc8706..33a889a09586 100644 --- a/lib/libzfs_core/Makefile.am +++ b/lib/libzfs_core/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am pkgconfig_DATA = libzfs_core.pc +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + lib_LTLIBRARIES = libzfs_core.la include $(top_srcdir)/config/Abigail.am diff --git a/lib/libzfsbootenv/Makefile.am b/lib/libzfsbootenv/Makefile.am index 984df0b8a353..8a6bb76acfe7 100644 --- a/lib/libzfsbootenv/Makefile.am +++ b/lib/libzfsbootenv/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am pkgconfig_DATA = libzfsbootenv.pc +# See 
https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + lib_LTLIBRARIES = libzfsbootenv.la include $(top_srcdir)/config/Abigail.am diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index db7c376318d5..4ce3b4cd2f1d 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -24,6 +24,9 @@ AM_CFLAGS += $(ZLIB_CFLAGS) AM_CFLAGS += -DLIB_ZPOOL_BUILD +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + lib_LTLIBRARIES = libzpool.la USER_C = \ diff --git a/lib/libzstd/Makefile.am b/lib/libzstd/Makefile.am index c9ed7e2aafbc..e3bc5c446ee9 100644 --- a/lib/libzstd/Makefile.am +++ b/lib/libzstd/Makefile.am @@ -5,6 +5,8 @@ VPATH = $(top_srcdir)/module/zstd # -fno-tree-vectorize is set for gcc in zstd/common/compiler.h # Set it for other compilers, too. AM_CFLAGS += -fno-tree-vectorize +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress noinst_LTLIBRARIES = libzstd.la diff --git a/lib/libzutil/Makefile.am b/lib/libzutil/Makefile.am index 6351e0ebf64b..f55b7798f1c0 100644 --- a/lib/libzutil/Makefile.am +++ b/lib/libzutil/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/config/Rules.am AM_CFLAGS += $(LIBBLKID_CFLAGS) $(LIBUDEV_CFLAGS) +# See https://debbugs.gnu.org/cgi/bugreport.cgi?bug=54020 +AM_CFLAGS += -no-suppress + DEFAULT_INCLUDES += -I$(srcdir) noinst_LTLIBRARIES = libzutil.la diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index f6f125e7a5df..1658215199f2 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -1660,6 +1660,8 @@ zpool_find_import_cached(libpc_handle_t *hdl, importargs_t *iarg) * caller. */ nvpair_t *pair = nvlist_next_nvpair(nv, NULL); + if (pair == NULL) + continue; fnvlist_add_nvlist(pools, nvpair_name(pair), fnvpair_value_nvlist(pair)); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 70742f7f16f8..390054c2ef8b 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -109,6 +109,11 @@ A value of .Sy 100 disables this feature. . +.It Sy l2arc_exclude_special Ns = Ns Sy 0 Ns | Ns 1 Pq int +Controls whether buffers present on special vdevs are eligibile for caching +into L2ARC. +If set to 1, exclude dbufs on special vdevs from being cached to L2ARC. +. .It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq int Controls whether only MFU metadata and data are cached from ARC into L2ARC. This may be desired to avoid wasting space on L2ARC when reading/writing large @@ -342,9 +347,12 @@ When a vdev is added, target this number of metaslabs per top-level vdev. .It Sy zfs_vdev_default_ms_shift Ns = Ns Sy 29 Po 512MB Pc Pq int Default limit for metaslab size. . -.It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy ASHIFT_MAX Po 16 Pc Pq ulong +.It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy 14 Pq ulong Maximum ashift used when optimizing for logical -> physical sector size on new top-level vdevs. +May be increased up to +.Sy ASHIFT_MAX Po 16 Pc , +but this may negatively impact pool space efficiency. . .It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq ulong Minimum ashift used when creating new top-level vdevs. @@ -1091,6 +1099,18 @@ Start syncing out a transaction group if there's at least this much dirty data This should be less than .Sy zfs_vdev_async_write_active_min_dirty_percent . . +.It Sy zfs_wrlog_data_max Ns = Pq int +The upper limit of write-transaction zil log data size in bytes. +Write operations are throttled when approaching the limit until log data is +cleared out after transaction group sync. 
+Because of some overhead, it should be set at least 2 times the size of +.Sy zfs_dirty_data_max +.No to prevent harming normal write throughput. +It also should be smaller than the size of the slog device if slog is present. +.Pp +Defaults to +.Sy zfs_dirty_data_max*2 +. .It Sy zfs_fallocate_reserve_percent Ns = Ns Sy 110 Ns % Pq uint Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be preallocated for a file in order to guarantee that later writes will not @@ -1328,6 +1348,22 @@ _ .TE .Sy \& * No Requires debug build. . +.It Sy zfs_btree_verify_intensity Ns = Ns Sy 0 Pq uint +Enables btree verification. +The following settings are culminative: +.TS +box; +lbz r l l . + Value Description + + 1 Verify height. + 2 Verify pointers from children to parent. + 3 Verify element counts. + 4 Verify element order. (expensive) +* 5 Verify unused memory is poisoned. (expensive) +.TE +.Sy \& * No Requires debug build. +. .It Sy zfs_free_leak_on_eio Ns = Ns Sy 0 Ns | Ns 1 Pq int If destroy encounters an .Sy EIO diff --git a/module/.gitignore b/module/.gitignore index 7a4bd3673e77..0ec6052f1bb0 100644 --- a/module/.gitignore +++ b/module/.gitignore @@ -22,5 +22,6 @@ /export_syms /machine /x86 +/i386 !Makefile.in diff --git a/module/icp/io/skein_mod.c b/module/icp/io/skein_mod.c index 5ee36af12bcb..8992c5895e5b 100644 --- a/module/icp/io/skein_mod.c +++ b/module/icp/io/skein_mod.c @@ -494,7 +494,8 @@ skein_update(crypto_ctx_t *ctx, crypto_data_t *data, crypto_req_handle_t req) */ /*ARGSUSED*/ static int -skein_final(crypto_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req) +skein_final_nofree(crypto_ctx_t *ctx, crypto_data_t *digest, + crypto_req_handle_t req) { int error = CRYPTO_SUCCESS; @@ -525,6 +526,17 @@ skein_final(crypto_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req) else digest->cd_length = 0; + return (error); +} + +static int +skein_final(crypto_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req) +{ + int error = skein_final_nofree(ctx, digest, req); + + if (error == CRYPTO_BUFFER_TOO_SMALL) + return (error); + bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); kmem_free(SKEIN_CTX(ctx), sizeof (*(SKEIN_CTX(ctx)))); SKEIN_CTX_LVALUE(ctx) = NULL; @@ -560,7 +572,7 @@ skein_digest_atomic(crypto_provider_handle_t provider, if ((error = skein_update(&ctx, data, digest)) != CRYPTO_SUCCESS) goto out; - if ((error = skein_final(&ctx, data, digest)) != CRYPTO_SUCCESS) + if ((error = skein_final_nofree(&ctx, data, digest)) != CRYPTO_SUCCESS) goto out; out: @@ -669,7 +681,7 @@ skein_mac_atomic(crypto_provider_handle_t provider, if ((error = skein_update(&ctx, data, req)) != CRYPTO_SUCCESS) goto errout; - if ((error = skein_final(&ctx, mac, req)) != CRYPTO_SUCCESS) + if ((error = skein_final_nofree(&ctx, mac, req)) != CRYPTO_SUCCESS) goto errout; return (CRYPTO_SUCCESS); diff --git a/module/lua/ldo.c b/module/lua/ldo.c index 08a952007d10..a9835c4f571d 100644 --- a/module/lua/ldo.c +++ b/module/lua/ldo.c @@ -406,7 +406,7 @@ int luaD_precall (lua_State *L, StkId func, int nresults) { StkId base; Proto *p = clLvalue(func)->p; n = cast_int(L->top - func) - 1; /* number of real arguments */ - luaD_checkstack(L, p->maxstacksize); + luaD_checkstack(L, p->maxstacksize + p->numparams); for (; n < p->numparams; n++) setnilvalue(L->top++); /* complete missing arguments */ if (!p->is_vararg) { diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index fddb1f0e87cb..590d1c04b9a5 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ 
b/module/os/freebsd/zfs/arc_os.c @@ -161,6 +161,12 @@ arc_prune_task(void *arg) int64_t nr_scan = (intptr_t)arg; arc_reduce_target_size(ptob(nr_scan)); + +#ifndef __ILP32__ + if (nr_scan > INT_MAX) + nr_scan = INT_MAX; +#endif + #if __FreeBSD_version >= 1300139 sx_xlock(&arc_vnlru_lock); vnlru_free_vfsops(nr_scan, &zfs_vfsops, arc_vnlru_marker); @@ -223,7 +229,10 @@ arc_lowmem(void *arg __unused, int howto __unused) arc_warm = B_TRUE; arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); free_memory = arc_available_memory(); - to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0); + int64_t can_free = arc_c - arc_c_min; + if (can_free <= 0) + return; + to_free = (can_free >> arc_shrink_shift) - MIN(free_memory, 0); DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free); arc_reduce_target_size(to_free); diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index 5447eb922062..c8fa2b00c002 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -956,8 +956,7 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; *physical_ashift = 0; if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) && - ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) && - pp->stripeoffset == 0) + ISP2(pp->stripesize) && pp->stripeoffset == 0) *physical_ashift = highbit(pp->stripesize) - 1; /* diff --git a/module/os/freebsd/zfs/zfs_ctldir.c b/module/os/freebsd/zfs/zfs_ctldir.c index 3a5c9f8caf0a..5bd2e1510ddb 100644 --- a/module/os/freebsd/zfs/zfs_ctldir.c +++ b/module/os/freebsd/zfs/zfs_ctldir.c @@ -976,12 +976,13 @@ zfsctl_snapdir_lookup(struct vop_lookup_args *ap) */ VI_LOCK(*vpp); if (((*vpp)->v_iflag & VI_MOUNT) == 0) { + VI_UNLOCK(*vpp); /* * Upgrade to exclusive lock in order to: * - avoid race conditions * - satisfy the contract of mount_snapshot() */ - err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK); + err = VOP_LOCK(*vpp, LK_TRYUPGRADE); if (err == 0) break; } else { diff --git a/module/os/freebsd/zfs/zfs_file_os.c b/module/os/freebsd/zfs/zfs_file_os.c index fd86a75416e6..60c9ff0581e0 100644 --- a/module/os/freebsd/zfs/zfs_file_os.c +++ b/module/os/freebsd/zfs/zfs_file_os.c @@ -226,7 +226,11 @@ zfs_vop_fsync(vnode_t *vp) struct mount *mp; int error; +#if __FreeBSD_version < 1400068 if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) +#else + if ((error = vn_start_write(vp, &mp, V_WAIT | V_PCATCH)) != 0) +#endif goto drop; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(vp, MNT_WAIT, curthread); diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index cdd762dcbcbf..05d41d4e3b2a 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -1845,7 +1845,8 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) return (SET_ERROR(EINVAL)); } - if (fidp->fid_len == LONG_FID_LEN && (fid_gen > 1 || setgen != 0)) { + if (fidp->fid_len == LONG_FID_LEN && setgen != 0) { + ZFS_EXIT(zfsvfs); dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n", (u_longlong_t)fid_gen, (u_longlong_t)setgen); return (SET_ERROR(EINVAL)); diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index be6e4b9cc435..760f30d56b7e 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -981,13 +981,17 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, case RENAME: if (error == ENOENT) { 
error = EJUSTRETURN; +#if __FreeBSD_version < 1400068 cnp->cn_flags |= SAVENAME; +#endif break; } fallthrough; case DELETE: +#if __FreeBSD_version < 1400068 if (error == 0) cnp->cn_flags |= SAVENAME; +#endif break; } } @@ -1337,7 +1341,10 @@ zfs_lookup_internal(znode_t *dzp, const char *name, vnode_t **vpp, cnp->cn_nameptr = __DECONST(char *, name); cnp->cn_namelen = strlen(name); cnp->cn_nameiop = nameiop; - cnp->cn_flags = ISLASTCN | SAVENAME; + cnp->cn_flags = ISLASTCN; +#if __FreeBSD_version < 1400068 + cnp->cn_flags |= SAVENAME; +#endif cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY; cnp->cn_cred = kcred; #if __FreeBSD_version < 1400037 @@ -4642,7 +4649,9 @@ zfs_freebsd_create(struct vop_create_args *ap) znode_t *zp = NULL; int rc, mode; +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif vattr_init_mask(vap); mode = vap->va_mode & ALLPERMS; @@ -4672,7 +4681,9 @@ static int zfs_freebsd_remove(struct vop_remove_args *ap) { +#if __FreeBSD_version < 1400068 ASSERT(ap->a_cnp->cn_flags & SAVENAME); +#endif return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_cred)); @@ -4694,7 +4705,9 @@ zfs_freebsd_mkdir(struct vop_mkdir_args *ap) znode_t *zp = NULL; int rc; +#if __FreeBSD_version < 1400068 ASSERT(ap->a_cnp->cn_flags & SAVENAME); +#endif vattr_init_mask(vap); *ap->a_vpp = NULL; @@ -4720,7 +4733,9 @@ zfs_freebsd_rmdir(struct vop_rmdir_args *ap) { struct componentname *cnp = ap->a_cnp; +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } @@ -4974,8 +4989,10 @@ zfs_freebsd_rename(struct vop_rename_args *ap) vnode_t *tvp = ap->a_tvp; int error; +#if __FreeBSD_version < 1400068 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); +#endif error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, ap->a_tcnp, ap->a_fcnp->cn_cred); @@ -5011,7 +5028,9 @@ zfs_freebsd_symlink(struct vop_symlink_args *ap) #endif int rc; +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. 
*/ vattr_init_mask(vap); @@ -5105,7 +5124,9 @@ zfs_freebsd_link(struct vop_link_args *ap) if (tdvp->v_mount != vp->v_mount) return (EXDEV); +#if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); +#endif return (zfs_link(VTOZ(tdvp), VTOZ(vp), cnp->cn_nameptr, cnp->cn_cred, 0)); @@ -5377,10 +5398,10 @@ zfs_getextattr_dir(struct vop_getextattr_args *ap, const char *attrname) NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp); #endif error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL); - vp = nd.ni_vp; - NDFREE_PNBUF(&nd); if (error != 0) return (SET_ERROR(error)); + vp = nd.ni_vp; + NDFREE_PNBUF(&nd); if (ap->a_size != NULL) { error = VOP_GETATTR(vp, &va, ap->a_cred); @@ -5522,12 +5543,10 @@ zfs_deleteextattr_dir(struct vop_deleteextattr_args *ap, const char *attrname) UIO_SYSSPACE, attrname, xvp); #endif error = namei(&nd); - vp = nd.ni_vp; - if (error != 0) { - NDFREE_PNBUF(&nd); + if (error != 0) return (SET_ERROR(error)); - } + vp = nd.ni_vp; error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); NDFREE_PNBUF(&nd); @@ -5667,10 +5686,10 @@ zfs_setextattr_dir(struct vop_setextattr_args *ap, const char *attrname) #endif error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred, NULL); - vp = nd.ni_vp; - NDFREE_PNBUF(&nd); if (error != 0) return (SET_ERROR(error)); + vp = nd.ni_vp; + NDFREE_PNBUF(&nd); VATTR_NULL(&va); va.va_size = 0; @@ -5854,10 +5873,10 @@ zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix) UIO_SYSSPACE, ".", xvp); #endif error = namei(&nd); - vp = nd.ni_vp; - NDFREE_PNBUF(&nd); if (error != 0) return (SET_ERROR(error)); + vp = nd.ni_vp; + NDFREE_PNBUF(&nd); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c index 381769eab682..50e93909659f 100644 --- a/module/os/linux/zfs/zio_crypt.c +++ b/module/os/linux/zfs/zio_crypt.c @@ -1900,6 +1900,9 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, crypto_ctx_template_t tmpl; uint8_t *authbuf = NULL; + memset(&puio, 0, sizeof (puio)); + memset(&cuio, 0, sizeof (cuio)); + /* * If the needed key is the current one, just use it. Otherwise we * need to generate a temporary one from the given salt + master key. @@ -1960,9 +1963,6 @@ zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, /* If the hardware implementation fails fall back to software */ } - bzero(&puio, sizeof (zfs_uio_t)); - bzero(&cuio, sizeof (zfs_uio_t)); - /* create uios for encryption */ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 8b5806fca8e6..fedcc8ac13cc 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -877,6 +877,14 @@ static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, #define l2arc_hdr_arcstats_decrement_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE) +/* + * l2arc_exclude_special : A zfs module parameter that controls whether buffers + * present on special vdevs are eligibile for caching in L2ARC. If + * set to 1, exclude dbufs on special vdevs from being cached to + * L2ARC. + */ +int l2arc_exclude_special = 0; + /* * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU * metadata and data are cached from ARC into L2ARC. 
@@ -5046,10 +5054,11 @@ arc_reap_cb(void *arg, zthr_t *zthr) */ free_memory = arc_available_memory(); - int64_t to_free = - (arc_c >> arc_shrink_shift) - free_memory; - if (to_free > 0) { - arc_reduce_target_size(to_free); + int64_t can_free = arc_c - arc_c_min; + if (can_free > 0) { + int64_t to_free = (can_free >> arc_shrink_shift) - free_memory; + if (to_free > 0) + arc_reduce_target_size(to_free); } spl_fstrans_unmark(cookie); } @@ -8063,6 +8072,18 @@ arc_init(void) zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } + + if (zfs_wrlog_data_max == 0) { + + /* + * dp_wrlog_total is reduced for each txg at the end of + * spa_sync(). However, dp_dirty_total is reduced every time + * a block is written out. Thus under normal operation, + * dp_wrlog_total could grow 2 times as big as + * zfs_dirty_data_max. + */ + zfs_wrlog_data_max = zfs_dirty_data_max * 2; + } } void @@ -11145,6 +11166,10 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, "Cache only MFU data from ARC into L2ARC"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW, + "If set to 1 exclude dbufs on special vdevs from being cached to " + "L2ARC."); + ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes"); diff --git a/module/zfs/bqueue.c b/module/zfs/bqueue.c index 22539efc4e23..ec5ce4388ec8 100644 --- a/module/zfs/bqueue.c +++ b/module/zfs/bqueue.c @@ -42,8 +42,7 @@ obj2node(bqueue_t *q, void *data) * Return 0 on success, or -1 on failure. */ int -bqueue_init(bqueue_t *q, uint64_t fill_fraction, uint64_t size, - size_t node_offset) +bqueue_init(bqueue_t *q, uint_t fill_fraction, size_t size, size_t node_offset) { if (fill_fraction == 0) { return (-1); @@ -78,22 +77,26 @@ bqueue_destroy(bqueue_t *q) } static void -bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size, - boolean_t flush) +bqueue_enqueue_impl(bqueue_t *q, void *data, size_t item_size, boolean_t flush) { ASSERT3U(item_size, >, 0); ASSERT3U(item_size, <=, q->bq_maxsize); mutex_enter(&q->bq_lock); obj2node(q, data)->bqn_size = item_size; - while (q->bq_size + item_size > q->bq_maxsize) { + while (q->bq_size && q->bq_size + item_size > q->bq_maxsize) { + /* + * Wake up bqueue_dequeue() thread if already sleeping in order + * to prevent the deadlock condition + */ + cv_signal(&q->bq_pop_cv); cv_wait_sig(&q->bq_add_cv, &q->bq_lock); } q->bq_size += item_size; list_insert_tail(&q->bq_list, data); - if (q->bq_size >= q->bq_maxsize / q->bq_fill_fraction) - cv_signal(&q->bq_pop_cv); if (flush) cv_broadcast(&q->bq_pop_cv); + else if (q->bq_size >= q->bq_maxsize / q->bq_fill_fraction) + cv_signal(&q->bq_pop_cv); mutex_exit(&q->bq_lock); } @@ -103,7 +106,7 @@ bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size, * > 0. */ void -bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) +bqueue_enqueue(bqueue_t *q, void *data, size_t item_size) { bqueue_enqueue_impl(q, data, item_size, B_FALSE); } @@ -117,7 +120,7 @@ bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size) * destroy the condvar before the enqueuing thread is done. 
*/ void -bqueue_enqueue_flush(bqueue_t *q, void *data, uint64_t item_size) +bqueue_enqueue_flush(bqueue_t *q, void *data, size_t item_size) { bqueue_enqueue_impl(q, data, item_size, B_TRUE); } @@ -130,7 +133,7 @@ void * bqueue_dequeue(bqueue_t *q) { void *ret = NULL; - uint64_t item_size; + size_t item_size; mutex_enter(&q->bq_lock); while (q->bq_size == 0) { cv_wait_sig(&q->bq_pop_cv, &q->bq_lock); diff --git a/module/zfs/btree.c b/module/zfs/btree.c index 03c46473c1ec..e16c4ebef6ba 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -53,7 +53,7 @@ kmem_cache_t *zfs_btree_leaf_cache; * (while the asymptotic complexity of the other steps is the same, the * importance of the constant factors cannot be denied). */ -int zfs_btree_verify_intensity = 0; +uint_t zfs_btree_verify_intensity = 0; /* * Convenience functions to silence warnings from memcpy/memmove's @@ -1608,8 +1608,8 @@ zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count, 1); new_rm_hdr->bth_count = 0; - zfs_btree_node_destroy(tree, new_rm_hdr); zfs_btree_remove_from_node(tree, parent, new_rm_hdr); + zfs_btree_node_destroy(tree, new_rm_hdr); } /* Remove the element at the specific location. */ @@ -1817,10 +1817,10 @@ zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) /* Move our elements to the left neighbor. */ bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep, k_count + 1); - zfs_btree_node_destroy(tree, rm_hdr); /* Remove the emptied node from the parent. */ zfs_btree_remove_from_node(tree, parent, rm_hdr); + zfs_btree_node_destroy(tree, rm_hdr); zfs_btree_verify(tree); } @@ -2171,3 +2171,9 @@ zfs_btree_verify(zfs_btree_t *tree) return; zfs_btree_verify_poison(tree); } + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, btree_verify_intensity, UINT, ZMOD_RW, + "Enable btree verification. 
Levels above 4 require ZFS be built " + "with debugging"); +/* END CSTYLED */ diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index e687d96501ed..7ecc2812b4e4 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -53,6 +53,7 @@ #include #include #include +#include kstat_t *dbuf_ksp; @@ -338,18 +339,18 @@ dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; - rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_READER); + mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { if (DBUF_EQUAL(db, os, obj, level, blkid)) { mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { - rw_exit(DBUF_HASH_RWLOCK(h, idx)); + mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (db); } mutex_exit(&db->db_mtx); } } - rw_exit(DBUF_HASH_RWLOCK(h, idx)); + mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (NULL); } @@ -392,13 +393,13 @@ dbuf_hash_insert(dmu_buf_impl_t *db) hv = dbuf_hash(os, obj, level, blkid); idx = hv & h->hash_table_mask; - rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_WRITER); + mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (dbf = h->hash_table[idx], i = 0; dbf != NULL; dbf = dbf->db_hash_next, i++) { if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { mutex_enter(&dbf->db_mtx); if (dbf->db_state != DB_EVICTING) { - rw_exit(DBUF_HASH_RWLOCK(h, idx)); + mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (dbf); } mutex_exit(&dbf->db_mtx); @@ -416,7 +417,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db) mutex_enter(&db->db_mtx); db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; - rw_exit(DBUF_HASH_RWLOCK(h, idx)); + mutex_exit(DBUF_HASH_MUTEX(h, idx)); uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64); DBUF_STAT_MAX(hash_elements_max, he); @@ -473,13 +474,13 @@ dbuf_hash_remove(dmu_buf_impl_t *db) /* * We mustn't hold db_mtx to maintain lock ordering: - * DBUF_HASH_RWLOCK > db_mtx. + * DBUF_HASH_MUTEX > db_mtx. */ ASSERT(zfs_refcount_is_zero(&db->db_holds)); ASSERT(db->db_state == DB_EVICTING); ASSERT(!MUTEX_HELD(&db->db_mtx)); - rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_WRITER); + mutex_enter(DBUF_HASH_MUTEX(h, idx)); dbp = &h->hash_table[idx]; while ((dbf = *dbp) != db) { dbp = &dbf->db_hash_next; @@ -490,7 +491,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db) if (h->hash_table[idx] && h->hash_table[idx]->db_hash_next == NULL) DBUF_STAT_BUMPDOWN(hash_chains); - rw_exit(DBUF_HASH_RWLOCK(h, idx)); + mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_dec_64(&dbuf_stats.hash_elements.value.ui64); } @@ -594,6 +595,68 @@ dbuf_is_metadata(dmu_buf_impl_t *db) } } +/* + * We want to exclude buffers that are on a special allocation class from + * L2ARC. 
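+ *
+ * A buffer stays eligible when the dataset's secondarycache setting allows
+ * it and either its block does not map to a special or dedup vdev or
+ * l2arc_exclude_special is 0; buffers whose vdev cannot be resolved from
+ * the block pointer also stay eligible, while holes and buffers without a
+ * block pointer are never considered eligible here.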
+ */ +boolean_t +dbuf_is_l2cacheable(dmu_buf_impl_t *db) +{ + vdev_t *vd = NULL; + zfs_cache_type_t cache = db->db_objset->os_secondary_cache; + blkptr_t *bp = db->db_blkptr; + + if (bp != NULL && !BP_IS_HOLE(bp)) { + uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); + vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev; + + if (vdev < rvd->vdev_children) + vd = rvd->vdev_child[vdev]; + + if (cache == ZFS_CACHE_ALL || + (dbuf_is_metadata(db) && cache == ZFS_CACHE_METADATA)) { + if (vd == NULL) + return (B_TRUE); + + if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || + l2arc_exclude_special == 0) + return (B_TRUE); + } + } + + return (B_FALSE); +} + +static inline boolean_t +dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level) +{ + vdev_t *vd = NULL; + zfs_cache_type_t cache = dn->dn_objset->os_secondary_cache; + + if (bp != NULL && !BP_IS_HOLE(bp)) { + uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); + vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev; + + if (vdev < rvd->vdev_children) + vd = rvd->vdev_child[vdev]; + + if (cache == ZFS_CACHE_ALL || ((level > 0 || + DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)) && + cache == ZFS_CACHE_METADATA)) { + if (vd == NULL) + return (B_TRUE); + + if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || + l2arc_exclude_special == 0) + return (B_TRUE); + } + } + + return (B_FALSE); +} + /* * This function *must* return indices evenly distributed between all @@ -851,8 +914,8 @@ dbuf_init(void) sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); - for (i = 0; i < DBUF_RWLOCKS; i++) - rw_init(&h->hash_rwlocks[i], NULL, RW_DEFAULT, NULL); + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); dbuf_stats_init(h); @@ -918,8 +981,8 @@ dbuf_fini(void) dbuf_stats_destroy(); - for (i = 0; i < DBUF_RWLOCKS; i++) - rw_destroy(&h->hash_rwlocks[i]); + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_destroy(&h->hash_mutexes[i]); #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages @@ -1523,7 +1586,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, DTRACE_SET_STATE(db, "read issued"); mutex_exit(&db->db_mtx); - if (DBUF_IS_L2CACHEABLE(db)) + if (dbuf_is_l2cacheable(db)) aflags |= ARC_FLAG_L2CACHE; dbuf_add_ref(db, NULL); @@ -3372,7 +3435,7 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, dpa->dpa_arg = arg; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) + if (dnode_level_is_l2cacheable(&bp, dn, level)) dpa->dpa_aflags |= ARC_FLAG_L2CACHE; /* @@ -3390,7 +3453,7 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid, zbookmark_phys_t zb; /* flag if L2ARC eligible, l2arc_noprefetch then decides */ - if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) + if (dnode_level_is_l2cacheable(&bp, dn, level)) iter_aflags |= ARC_FLAG_L2CACHE; SET_BOOKMARK(&zb, ds != NULL ? 
ds->ds_object : DMU_META_OBJSET, @@ -4989,7 +5052,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) children_ready_cb = dbuf_write_children_ready; dr->dr_zio = arc_write(pio, os->os_spa, txg, - &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), + &dr->dr_bp_copy, data, dbuf_is_l2cacheable(db), &zp, dbuf_write_ready, children_ready_cb, dbuf_write_physdone, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c index 037190a81bb3..12bb568a08cc 100644 --- a/module/zfs/dbuf_stats.c +++ b/module/zfs/dbuf_stats.c @@ -137,7 +137,7 @@ dbuf_stats_hash_table_data(char *buf, size_t size, void *data) if (size) buf[0] = 0; - rw_enter(DBUF_HASH_RWLOCK(h, dsh->idx), RW_READER); + mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx)); for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) { /* * Returning ENOMEM will cause the data and header functions @@ -158,7 +158,7 @@ dbuf_stats_hash_table_data(char *buf, size_t size, void *data) mutex_exit(&db->db_mtx); } - rw_exit(DBUF_HASH_RWLOCK(h, dsh->idx)); + mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx)); return (error); } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 4e7127bd1bab..e38c9b452a28 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1846,7 +1846,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, - zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), + zgd->zgd_bp, dr->dt.dl.dr_data, dbuf_is_l2cacheable(db), &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index b9380890230c..a8975797e8af 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -63,6 +63,8 @@ #include #include #include "zfs_namecheck.h" +#include +#include /* * Needed to close a window in dnode_move() that allows the objset to be freed @@ -411,6 +413,34 @@ dnode_multilist_index_func(multilist_t *ml, void *obj) multilist_get_num_sublists(ml)); } +static inline boolean_t +dmu_os_is_l2cacheable(objset_t *os) +{ + vdev_t *vd = NULL; + zfs_cache_type_t cache = os->os_secondary_cache; + blkptr_t *bp = os->os_rootbp; + + if (bp != NULL && !BP_IS_HOLE(bp)) { + uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); + vdev_t *rvd = os->os_spa->spa_root_vdev; + + if (vdev < rvd->vdev_children) + vd = rvd->vdev_child[vdev]; + + if (cache == ZFS_CACHE_ALL || cache == ZFS_CACHE_METADATA) { + if (vd == NULL) + return (B_TRUE); + + if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && + vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || + l2arc_exclude_special == 0) + return (B_TRUE); + } + } + + return (B_FALSE); +} + /* * Instantiates the objset_t in-memory structure corresponding to the * objset_phys_t that's pointed to by the specified blkptr_t. @@ -453,7 +483,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, SET_BOOKMARK(&zb, ds ? 
ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - if (DMU_OS_IS_L2CACHEABLE(os)) + if (dmu_os_is_l2cacheable(os)) aflags |= ARC_FLAG_L2CACHE; if (ds != NULL && ds->ds_dir->dd_crypto_obj != 0) { @@ -1661,7 +1691,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) } zio = arc_write(pio, os->os_spa, tx->tx_txg, - blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), + blkptr_copy, os->os_phys_buf, dmu_os_is_l2cacheable(os), &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c index 7efe423d35f0..5184ef6888df 100644 --- a/module/zfs/dmu_redact.c +++ b/module/zfs/dmu_redact.c @@ -141,7 +141,7 @@ record_merge_enqueue(bqueue_t *q, struct redact_record **build, { if (new->eos_marker) { if (*build != NULL) - bqueue_enqueue(q, *build, sizeof (*build)); + bqueue_enqueue(q, *build, sizeof (**build)); bqueue_enqueue_flush(q, new, sizeof (*new)); return; } @@ -823,7 +823,7 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads, avl_destroy(&end_tree); kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes)); if (current_record != NULL) - bqueue_enqueue(q, current_record, sizeof (current_record)); + bqueue_enqueue(q, current_record, sizeof (*current_record)); return (err); } diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 0beb983f992f..1eed0526b51d 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -54,6 +54,7 @@ dmu_tx_stats_t dmu_tx_stats = { { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 }, + { "dmu_tx_wrlog_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_quota", KSTAT_DATA_UINT64 }, }; @@ -780,34 +781,49 @@ static void dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) { dsl_pool_t *dp = tx->tx_pool; - uint64_t delay_min_bytes = + uint64_t delay_min_bytes, wrlog; + hrtime_t wakeup, tx_time = 0, now; + + /* Calculate minimum transaction time for the dirty data amount. */ + delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - hrtime_t wakeup, min_tx_time, now; + if (dirty > delay_min_bytes) { + /* + * The caller has already waited until we are under the max. + * We make them pass us the amount of dirty data so we don't + * have to handle the case of it being >= the max, which + * could cause a divide-by-zero if it's == the max. + */ + ASSERT3U(dirty, <, zfs_dirty_data_max); - if (dirty <= delay_min_bytes) - return; + tx_time = zfs_delay_scale * (dirty - delay_min_bytes) / + (zfs_dirty_data_max - dirty); + } - /* - * The caller has already waited until we are under the max. - * We make them pass us the amount of dirty data so we don't - * have to handle the case of it being >= the max, which could - * cause a divide-by-zero if it's == the max. - */ - ASSERT3U(dirty, <, zfs_dirty_data_max); + /* Calculate minimum transaction time for the TX_WRITE log size. 
*/ + wrlog = aggsum_upper_bound(&dp->dp_wrlog_total); + delay_min_bytes = + zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; + if (wrlog >= zfs_wrlog_data_max) { + tx_time = zfs_delay_max_ns; + } else if (wrlog > delay_min_bytes) { + tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) / + (zfs_wrlog_data_max - wrlog), tx_time); + } + if (tx_time == 0) + return; + + tx_time = MIN(tx_time, zfs_delay_max_ns); now = gethrtime(); - min_tx_time = zfs_delay_scale * - (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); - min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); - if (now > tx->tx_start + min_tx_time) + if (now > tx->tx_start + tx_time) return; DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, - uint64_t, min_tx_time); + uint64_t, tx_time); mutex_enter(&dp->dp_lock); - wakeup = MAX(tx->tx_start + min_tx_time, - dp->dp_last_wakeup + min_tx_time); + wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time); dp->dp_last_wakeup = wakeup; mutex_exit(&dp->dp_lock); @@ -884,6 +900,13 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) return (SET_ERROR(ERESTART)); } + if (!tx->tx_dirty_delayed && + dsl_pool_need_wrlog_delay(tx->tx_pool)) { + tx->tx_wait_dirty = B_TRUE; + DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay); + return (SET_ERROR(ERESTART)); + } + if (!tx->tx_dirty_delayed && dsl_pool_need_dirty_delay(tx->tx_pool)) { tx->tx_wait_dirty = B_TRUE; diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 456ef5372e2e..4036c8671f2d 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -104,6 +104,13 @@ unsigned long zfs_dirty_data_max_max = 0; int zfs_dirty_data_max_percent = 10; int zfs_dirty_data_max_max_percent = 25; +/* + * The upper limit of TX_WRITE log data. Write operations are throttled + * when approaching the limit until log data is cleared out after txg sync. + * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. + */ +unsigned long zfs_wrlog_data_max = 0; + /* * If there's at least this much dirty data (as a percentage of * zfs_dirty_data_max), push out a txg. 
This should be less than @@ -220,6 +227,11 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); + aggsum_init(&dp->dp_wrlog_total, 0); + for (int i = 0; i < TXG_SIZE; i++) { + aggsum_init(&dp->dp_wrlog_pertxg[i], 0); + } + dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri, boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); @@ -416,6 +428,14 @@ dsl_pool_close(dsl_pool_t *dp) rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); cv_destroy(&dp->dp_spaceavail_cv); + + ASSERT0(aggsum_value(&dp->dp_wrlog_total)); + aggsum_fini(&dp->dp_wrlog_total); + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(aggsum_value(&dp->dp_wrlog_pertxg[i])); + aggsum_fini(&dp->dp_wrlog_pertxg[i]); + } + taskq_destroy(dp->dp_unlinked_drain_taskq); taskq_destroy(dp->dp_zrele_taskq); if (dp->dp_blkstats != NULL) @@ -590,6 +610,42 @@ dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) cv_signal(&dp->dp_spaceavail_cv); } +void +dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg) +{ + ASSERT3S(size, >=, 0); + + aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], size); + aggsum_add(&dp->dp_wrlog_total, size); + + /* Choose a value slightly bigger than min dirty sync bytes */ + uint64_t sync_min = + zfs_wrlog_data_max * (zfs_dirty_data_sync_percent + 10) / 200; + if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0) + txg_kick(dp, txg); +} + +boolean_t +dsl_pool_need_wrlog_delay(dsl_pool_t *dp) +{ + uint64_t delay_min_bytes = + zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; + + return (aggsum_compare(&dp->dp_wrlog_total, delay_min_bytes) > 0); +} + +static void +dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg) +{ + int64_t delta; + delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); + aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta); + aggsum_add(&dp->dp_wrlog_total, delta); + /* Compact per-CPU sums after the big change. 
*/ + (void) aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); + (void) aggsum_value(&dp->dp_wrlog_total); +} + #ifdef ZFS_DEBUG static boolean_t dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) @@ -814,6 +870,9 @@ dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); dmu_buf_rele(ds->ds_dbuf, zilog); } + + dsl_pool_wrlog_clear(dp, txg); + ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); } @@ -902,18 +961,26 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; - uint64_t dirty_min_bytes = - zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; - uint64_t dirty; mutex_enter(&dp->dp_lock); - dirty = dp->dp_dirty_total; + uint64_t dirty = dp->dp_dirty_total; mutex_exit(&dp->dp_lock); - if (dirty > dirty_min_bytes) - txg_kick(dp); + return (dirty > delay_min_bytes); } +static boolean_t +dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg) +{ + ASSERT(MUTEX_HELD(&dp->dp_lock)); + + uint64_t dirty_min_bytes = + zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; + uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; + + return (dirty > dirty_min_bytes); +} + void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) { @@ -921,7 +988,12 @@ dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) mutex_enter(&dp->dp_lock); dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; dsl_pool_dirty_delta(dp, space); + boolean_t needsync = !dmu_tx_is_syncing(tx) && + dsl_pool_need_dirty_sync(dp, tx->tx_txg); mutex_exit(&dp->dp_lock); + + if (needsync) + txg_kick(dp, tx->tx_txg); } } @@ -1396,6 +1468,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW, "Determines the dirty space limit"); +ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW, + "The size limit of write-transaction zil log data"); + /* zfs_dirty_data_max_max only applied at module load in arc_init(). */ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD, "zfs_dirty_data_max upper bound in bytes"); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index c55b1d8f9601..c9eb84bbdb12 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -498,14 +498,6 @@ txg_wait_callbacks(dsl_pool_t *dp) taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0); } -static boolean_t -txg_is_syncing(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); - return (tx->tx_syncing_txg != 0); -} - static boolean_t txg_is_quiescing(dsl_pool_t *dp) { @@ -539,8 +531,6 @@ txg_sync_thread(void *arg) clock_t timeout = zfs_txg_timeout * hz; clock_t timer; uint64_t txg; - uint64_t dirty_min_bytes = - zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; /* * We sync when we're scanning, there's someone waiting @@ -551,8 +541,7 @@ txg_sync_thread(void *arg) while (!dsl_scan_active(dp->dp_scan) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && - !txg_has_quiesced_to_sync(dp) && - dp->dp_dirty_total < dirty_min_bytes) { + !txg_has_quiesced_to_sync(dp)) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", (u_longlong_t)tx->tx_synced_txg, (u_longlong_t)tx->tx_sync_txg_waiting, dp); @@ -566,6 +555,11 @@ txg_sync_thread(void *arg) * prompting it to do so if necessary. 
*/ while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { + if (txg_is_quiescing(dp)) { + txg_thread_wait(tx, &cpr, + &tx->tx_quiesce_done_cv, 0); + continue; + } if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; cv_broadcast(&tx->tx_quiesce_more_cv); @@ -791,24 +785,22 @@ txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce) } /* - * If there isn't a txg syncing or in the pipeline, push another txg through - * the pipeline by quiescing the open txg. + * Pass in the txg number that should be synced. */ void -txg_kick(dsl_pool_t *dp) +txg_kick(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; ASSERT(!dsl_pool_config_held(dp)); + if (tx->tx_sync_txg_waiting >= txg) + return; + mutex_enter(&tx->tx_sync_lock); - if (!txg_is_syncing(dp) && - !txg_is_quiescing(dp) && - tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && - tx->tx_sync_txg_waiting <= tx->tx_synced_txg && - tx->tx_quiesced_txg <= tx->tx_synced_txg) { - tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; - cv_broadcast(&tx->tx_quiesce_more_cv); + if (tx->tx_sync_txg_waiting < txg) { + tx->tx_sync_txg_waiting = txg; + cv_broadcast(&tx->tx_sync_more_cv); } mutex_exit(&tx->tx_sync_lock); } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index f209899d96a4..145db2dd51d4 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -134,7 +134,15 @@ int zfs_vdev_standard_sm_blksz = (1 << 17); */ int zfs_nocacheflush = 0; -uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX; +/* + * Maximum and minimum ashift values that can be automatically set based on + * vdev's physical ashift (disk's physical sector size). While ASHIFT_MAX + * is higher than the maximum value, it is intentionally limited here to not + * excessively impact pool space efficiency. Higher ashift values may still + * be forced by vdev logical ashift or by user via ashift property, but won't + * be set automatically as a performance optimization. + */ +uint64_t zfs_vdev_max_auto_ashift = 14; uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; /*PRINTFLIKE2*/ @@ -1835,6 +1843,24 @@ vdev_set_deflate_ratio(vdev_t *vd) } } +/* + * Choose the best of two ashifts, preferring one between logical ashift + * (absolute minimum) and administrator defined maximum, otherwise take + * the biggest of the two. + */ +uint64_t +vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b) +{ + if (a > logical && a <= zfs_vdev_max_auto_ashift) { + if (b <= logical || b > zfs_vdev_max_auto_ashift) + return (a); + else + return (MAX(a, b)); + } else if (b <= logical || b > zfs_vdev_max_auto_ashift) + return (MAX(a, b)); + return (b); +} + /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining @@ -1846,7 +1872,8 @@ vdev_ashift_optimize(vdev_t *vd) { ASSERT(vd == vd->vdev_top); - if (vd->vdev_ashift < vd->vdev_physical_ashift) { + if (vd->vdev_ashift < vd->vdev_physical_ashift && + vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, @@ -4465,6 +4492,7 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { + vs->vs_pspace = vd->vdev_psize; vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* @@ -4510,7 +4538,10 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) vs->vs_configured_ashift = vd->vdev_top != NULL ? 
vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; - vs->vs_physical_ashift = vd->vdev_physical_ashift; + if (vd->vdev_physical_ashift <= ASHIFT_MAX) + vs->vs_physical_ashift = vd->vdev_physical_ashift; + else + vs->vs_physical_ashift = 0; /* * Report fragmentation and rebuild progress for top-level, diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index db87e69f2057..10d09517effd 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -541,7 +541,7 @@ vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp) int vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp) { - for (int i = 0; i <= VDEV_DRAID_MAX_MAPS; i++) { + for (int i = 0; i < VDEV_DRAID_MAX_MAPS; i++) { if (draid_maps[i].dm_children == children) { *mapp = &draid_maps[i]; return (0); @@ -1496,8 +1496,14 @@ vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep, asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1; max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1; logical_ashift = MAX(logical_ashift, cvd->vdev_ashift); - physical_ashift = MAX(physical_ashift, - cvd->vdev_physical_ashift); + } + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_ops == &vdev_draid_spare_ops) + continue; + physical_ashift = vdev_best_ashift(logical_ashift, + physical_ashift, cvd->vdev_physical_ashift); } *asizep = asize; diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 50b86725b78a..d80a767043a5 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -409,8 +409,14 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); + } + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error) + continue; + *physical_ashift = vdev_best_ashift(*logical_ashift, + *physical_ashift, cvd->vdev_physical_ashift); } if (numerrors == vd->vdev_children) { diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 424de0b33e09..5c25007f17b9 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1426,8 +1426,14 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); - *physical_ashift = MAX(*physical_ashift, - cvd->vdev_physical_ashift); + } + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) + continue; + *physical_ashift = vdev_best_ashift(*logical_ashift, + *physical_ashift, cvd->vdev_physical_ashift); } *asize *= vd->vdev_children; diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 94a4afc23ce2..1951a3fecebd 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -4793,6 +4793,11 @@ extract_delay_props(nvlist_t *props) static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, ZFS_PROP_KEYLOCATION, + /* + * Setting ZFS_PROP_SHARESMB requires the objset type to be + * known, which is not possible prior to receipt of raw sends. 
+ */ + ZFS_PROP_SHARESMB, 0 }; int i; @@ -4856,6 +4861,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, offset_t off, noff; nvlist_t *local_delayprops = NULL; nvlist_t *recv_delayprops = NULL; + nvlist_t *inherited_delayprops = NULL; nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *origrecvd = NULL; /* existing received properties */ boolean_t first_recvd_props = B_FALSE; @@ -4970,6 +4976,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, local_delayprops = extract_delay_props(oprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, oprops, *errors); + inherited_delayprops = extract_delay_props(xprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, xprops, *errors); @@ -5027,6 +5034,10 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, local_delayprops, *errors); } + if (inherited_delayprops != NULL && error == 0) { + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, + inherited_delayprops, *errors); + } } /* @@ -5046,6 +5057,10 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0); nvlist_free(local_delayprops); } + if (inherited_delayprops != NULL) { + ASSERT(nvlist_merge(localprops, inherited_delayprops, 0) == 0); + nvlist_free(inherited_delayprops); + } *read_bytes = off - noff; #ifdef ZFS_DEBUG diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index fd4c848d57f2..9e52bed77a61 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -538,6 +538,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, itx_wr_state_t write_state; uintptr_t fsync_cnt; uint64_t gen = 0; + ssize_t size = resid; if (zil_replaying(zilog, tx) || zp->z_unlinked || zfs_xattr_owner_unlinked(zp)) { @@ -623,6 +624,10 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, off += len; resid -= len; } + + if (write_state == WR_COPIED || write_state == WR_NEED_COPY) { + dsl_pool_wrlog_count(zilog->zl_dmu_pool, size, tx->tx_txg); + } } /* diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 59b05b4b08d0..7d141a12288b 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -84,10 +84,8 @@ #include #include #include - #include - unsigned int zvol_inhibit_dev = 0; unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; @@ -577,6 +575,7 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; itx_wr_state_t write_state; + uint64_t sz = size; if (zil_replaying(zilog, tx)) return; @@ -628,6 +627,10 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, offset += len; size -= len; } + + if (write_state == WR_COPIED || write_state == WR_NEED_COPY) { + dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg); + } } /* diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in index 55f0f1cf5249..920b90e88912 100644 --- a/rpm/generic/zfs-dkms.spec.in +++ b/rpm/generic/zfs-dkms.spec.in @@ -31,7 +31,7 @@ Requires(post): gcc, make, perl, diffutils %if 0%{?rhel}%{?fedora}%{?mageia}%{?suse_version} Requires: kernel-devel >= @ZFS_META_KVER_MIN@, kernel-devel <= @ZFS_META_KVER_MAX@.999 Requires(post): kernel-devel >= @ZFS_META_KVER_MIN@, kernel-devel <= @ZFS_META_KVER_MAX@.999 -Obsoletes: spl-dkms +Obsoletes: spl-dkms <= %{version} %endif Provides: %{module}-kmod = %{version} AutoReqProv: no diff --git a/rpm/generic/zfs.spec.in 
b/rpm/generic/zfs.spec.in index 1cd3f6b520ea..8cab1c3d70bb 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -120,13 +120,13 @@ License: @ZFS_META_LICENSE@ URL: https://github.com/openzfs/zfs Source0: %{name}-%{version}.tar.gz BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) -Requires: libzpool5 = %{version} -Requires: libnvpair3 = %{version} -Requires: libuutil3 = %{version} -Requires: libzfs5 = %{version} +Requires: libzpool5%{?_isa} = %{version}-%{release} +Requires: libnvpair3%{?_isa} = %{version}-%{release} +Requires: libuutil3%{?_isa} = %{version}-%{release} +Requires: libzfs5%{?_isa} = %{version}-%{release} Requires: %{name}-kmod = %{version} -Provides: %{name}-kmod-common = %{version} -Obsoletes: spl +Provides: %{name}-kmod-common = %{version}-%{release} +Obsoletes: spl <= %{version} # zfs-fuse provides the same commands and man pages that OpenZFS does. # Renaming those on either side would conflict with all available documentation. @@ -178,8 +178,8 @@ This package contains the core ZFS command line utilities. %package -n libzpool5 Summary: Native ZFS pool library for Linux Group: System Environment/Kernel -Obsoletes: libzpool2 -Obsoletes: libzpool4 +Obsoletes: libzpool2 <= %{version} +Obsoletes: libzpool4 <= %{version} %description -n libzpool5 This package contains the zpool library, which provides support @@ -195,7 +195,7 @@ for managing zpools %package -n libnvpair3 Summary: Solaris name-value library for Linux Group: System Environment/Kernel -Obsoletes: libnvpair1 +Obsoletes: libnvpair1 <= %{version} %description -n libnvpair3 This package contains routines for packing and unpacking name-value @@ -213,7 +213,7 @@ to write self describing data structures on disk. %package -n libuutil3 Summary: Solaris userland utility library for Linux Group: System Environment/Kernel -Obsoletes: libuutil1 +Obsoletes: libuutil1 <= %{version} %description -n libuutil3 This library provides a variety of compatibility functions for OpenZFS: @@ -239,8 +239,8 @@ This library provides a variety of compatibility functions for OpenZFS: %package -n libzfs5 Summary: Native ZFS filesystem library for Linux Group: System Environment/Kernel -Obsoletes: libzfs2 -Obsoletes: libzfs4 +Obsoletes: libzfs2 <= %{version} +Obsoletes: libzfs4 <= %{version} %description -n libzfs5 This package provides support for managing ZFS filesystems @@ -255,16 +255,16 @@ This package provides support for managing ZFS filesystems %package -n libzfs5-devel Summary: Development headers Group: System Environment/Kernel -Requires: libzfs5 = %{version} -Requires: libzpool5 = %{version} -Requires: libnvpair3 = %{version} -Requires: libuutil3 = %{version} -Provides: libzpool5-devel -Provides: libnvpair3-devel -Provides: libuutil3-devel -Obsoletes: zfs-devel -Obsoletes: libzfs2-devel -Obsoletes: libzfs4-devel +Requires: libzfs5%{?_isa} = %{version}-%{release} +Requires: libzpool5%{?_isa} = %{version}-%{release} +Requires: libnvpair3%{?_isa} = %{version}-%{release} +Requires: libuutil3%{?_isa} = %{version}-%{release} +Provides: libzpool5-devel = %{version}-%{release} +Provides: libnvpair3-devel = %{version}-%{release} +Provides: libuutil3-devel = %{version}-%{release} +Obsoletes: zfs-devel <= %{version} +Obsoletes: libzfs2-devel <= %{version} +Obsoletes: libzfs4-devel <= %{version} %description -n libzfs5-devel This package contains the header files needed for building additional @@ -313,8 +313,8 @@ Summary: Python %{python_version} wrapper for libzfs_core Group: Development/Languages/Python 
License: Apache-2.0 BuildArch: noarch -Requires: libzfs5 = %{version} -Requires: libnvpair3 = %{version} +Requires: libzfs5 = %{version}-%{release} +Requires: libnvpair3 = %{version}-%{release} Requires: libffi Requires: python%{__python_pkg_version} Requires: %{__python_cffi_pkg} @@ -339,7 +339,6 @@ This package provides a python wrapper for the libzfs_core C library. Summary: Initramfs module Group: System Environment/Kernel Requires: %{name}%{?_isa} = %{version}-%{release} -Requires: %{name} = %{version}-%{release} Requires: initramfs-tools %description initramfs diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index eea2af2edcf0..eaaf998e8b76 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -81,7 +81,9 @@ TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch TXG_HISTORY txg.history zfs_txg_history TXG_TIMEOUT txg.timeout zfs_txg_timeout UNLINK_SUSPEND_PROGRESS UNSUPPORTED zfs_unlink_suspend_progress +VDEV_FILE_LOGICAL_ASHIFT vdev.file.logical_ashift vdev_file_logical_ashift VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift +VDEV_MAX_AUTO_ASHIFT vdev.max_auto_ashift zfs_vdev_max_auto_ashift VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh index 89cc4b0d3082..0fa1c0055b3c 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh @@ -57,7 +57,9 @@ disk2=$TEST_BASE_DIR/disk2 log_must mkfile $SIZE $disk1 log_must mkfile $SIZE $disk2 +logical_ashift=$(get_tunable VDEV_FILE_LOGICAL_ASHIFT) orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT) +max_auto_ashift=$(get_tunable VDEV_MAX_AUTO_ASHIFT) typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} @@ -81,7 +83,8 @@ do log_must zpool create $TESTPOOL $disk1 log_must set_tunable64 VDEV_FILE_PHYSICAL_ASHIFT $ashift log_must zpool add $TESTPOOL $disk2 - verify_ashift $disk2 $ashift + exp=$(( (ashift <= max_auto_ashift) ? ashift : logical_ashift )) + verify_ashift $disk2 $exp if [[ $? -ne 0 ]] then log_fail "Device was added without setting ashift value to "\ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh index 6bbd46289f7c..8760f48dd2a4 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh @@ -54,6 +54,14 @@ verify_runnable "global" +# We override $org_size and $exp_size from zpool_expand.cfg to make sure we get +# an expected free space value every time. Otherwise, if we left it +# configurable, the free space ratio to pool size ratio would diverge too much +# much at low $org_size values. +# +org_size=$((1024 * 1024 * 1024)) +exp_size=$(($org_size * 2)) + function cleanup { poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1 @@ -68,11 +76,35 @@ function cleanup unload_scsi_debug } +# Wait for the size of a pool to autoexpand to $1 and the total free space to +# expand to $2 (both values allowing a 10% tolerance). 
+# +# Wait for up to 10 seconds for this to happen (typically takes 1-2 seconds) +# +function wait_for_autoexpand +{ + typeset exp_new_size=$1 + typeset exp_new_free=$2 + + for i in $(seq 1 10) ; do + typeset new_size=$(get_pool_prop size $TESTPOOL1) + typeset new_free=$(get_prop avail $TESTPOOL1) + # Values need to be within 90% of each other (10% tolerance) + if within_percent $new_size $exp_new_size 90 > /dev/null && \ + within_percent $new_free $exp_new_free 90 > /dev/null ; then + return + fi + sleep 1 + done + log_fail "$TESTPOOL never expanded to $exp_new_size with $exp_new_free" \ + " free space (got $new_size with $new_free free space)" +} + log_onexit cleanup log_assert "zpool can be autoexpanded after set autoexpand=on on vdev expansion" -for type in " " mirror raidz draid:1s; do +for type in " " mirror raidz; do log_note "Setting up loopback, scsi_debug, and file vdevs" log_must truncate -s $org_size $FILE_LO DEV1=$(losetup -f) @@ -105,72 +137,38 @@ for type in " " mirror raidz draid:1s; do log_note "Expanding loopback, scsi_debug, and file vdevs" log_must truncate -s $exp_size $FILE_LO log_must losetup -c $DEV1 - sleep 3 echo "2" > /sys/bus/pseudo/drivers/scsi_debug/virtual_gb echo "1" > /sys/class/block/$DEV2/device/rescan block_device_wait - sleep 3 log_must truncate -s $exp_size $FILE_RAW log_must zpool online -e $TESTPOOL1 $FILE_RAW - typeset expand_size=$(get_pool_prop size $TESTPOOL1) - typeset zfs_expand_size=$(get_prop avail $TESTPOOL1) - - log_note "$TESTPOOL1 $type has previous size: $prev_size and " \ - "expanded size: $expand_size" - # compare available pool size from zfs - if [[ $zfs_expand_size -gt $zfs_prev_size ]]; then - # check for zpool history for the pool size expansion - if [[ $type == " " ]]; then - typeset expansion_size=$(($exp_size-$org_size)) - typeset size_addition=$(zpool history -il $TESTPOOL1 |\ - grep "pool '$TESTPOOL1' size:" | \ - grep "vdev online" | \ - grep "(+${expansion_size}" | wc -l) - - if [[ $size_addition -ne 3 ]]; then - log_fail "pool $TESTPOOL1 has not expanded, " \ - "$size_addition/3 vdevs expanded" - fi - elif [[ $type == "mirror" ]]; then - typeset expansion_size=$(($exp_size-$org_size)) - zpool history -il $TESTPOOL1 | \ - grep "pool '$TESTPOOL1' size:" | \ - grep "vdev online" | \ - grep "(+${expansion_size})" >/dev/null 2>&1 - - if [[ $? -ne 0 ]] ; then - log_fail "pool $TESTPOOL1 has not expanded" - fi - elif [[ $type == "draid:1s" ]]; then - typeset expansion_size=$((2*($exp_size-$org_size))) - zpool history -il $TESTPOOL1 | \ - grep "pool '$TESTPOOL1' size:" | \ - grep "vdev online" | \ - grep "(+${expansion_size})" >/dev/null 2>&1 - - if [[ $? -ne 0 ]]; then - log_fail "pool $TESTPOOL has not expanded" - fi - else - typeset expansion_size=$((3*($exp_size-$org_size))) - zpool history -il $TESTPOOL1 | \ - grep "pool '$TESTPOOL1' size:" | \ - grep "vdev online" | \ - grep "(+${expansion_size})" >/dev/null 2>&1 - - if [[ $? -ne 0 ]]; then - log_fail "pool $TESTPOOL has not expanded" - fi - fi - else - log_fail "pool $TESTPOOL1 is not autoexpanded after vdev " \ - "expansion. Previous size: $zfs_prev_size and expanded " \ - "size: $zfs_expand_size" + + # The expected free space values below were observed at the time of + # this commit. However, we know ZFS overhead will change over time, + # and thus we do not do an exact comparison to these values in + # wait_for_autoexpand. Rather, we make sure the free space + # is within some small percentage threshold of these values. 
+ typeset exp_new_size=$(($prev_size * 2)) + if [[ "$type" == " " ]] ; then + exp_new_free=6045892608 + elif [[ "$type" == "mirror" ]] ; then + exp_new_free=1945997312 + elif [[ "$type" == "raidz" ]] ; then + exp_new_free=3977637338 + elif [[ "$type" == "draid:1s" ]] then + exp_new_free=1946000384 fi + wait_for_autoexpand $exp_new_size $exp_new_free + + expand_size=$(get_pool_prop size $TESTPOOL1) + + log_note "$TESTPOOL1 '$type' grew from $prev_size -> $expand_size with" \ + "free space from $zfs_prev_size -> $(get_prop avail $TESTPOOL1)" + cleanup done diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh index 59f64081a977..a18e634cefa7 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh @@ -48,14 +48,18 @@ log_must zpool checkpoint $NESTEDPOOL log_must truncate -s $EXPSZ $FILEDISK1 log_must zpool online -e $NESTEDPOOL $FILEDISK1 NEWSZ=$(zpool list -v | grep "$FILEDISK1" | awk '{print $2}') +DEXPSZ=$(zpool list -v | grep "$FILEDISK1" | awk '{print $6}') nested_change_state_after_checkpoint log_mustnot [ "$INITSZ" = "$NEWSZ" ] +log_must [ "$DEXPSZ" = "-" ] log_must zpool export $NESTEDPOOL log_must zpool import -d $FILEDISKDIR --rewind-to-checkpoint $NESTEDPOOL nested_verify_pre_checkpoint_state FINSZ=$(zpool list -v | grep "$FILEDISK1" | awk '{print $2}') -log_must [ "$INITSZ" = "$FINSZ" ] +DEXPSZ=$(zpool list -v | grep "$FILEDISK1" | awk '{print $6}') +log_must [ "$EXPSZ" = "$FINSZ" ] +log_must [ "$DEXPSZ" != "-" ] log_pass "LUN expansion rewinded correctly." diff --git a/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh b/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh index 793904db91ca..c0c7b682def9 100755 --- a/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh @@ -133,6 +133,14 @@ recv_cksum=$(md5digest /$ds/$TESTFILE0) log_must test "$recv_cksum" == "$cksum" log_must zfs destroy -r $ds +# Test that we can override sharesmb property for encrypted raw stream. +log_note "Must be able to override sharesmb property for encrypted raw stream" +ds=$TESTPOOL/recv +log_must eval "zfs send -w $esnap > $sendfile" +log_must eval "zfs recv -o sharesmb=on $ds < $sendfile" +log_must test "$(get_prop 'sharesmb' $ds)" == "on" +log_must zfs destroy -r $ds + # Test that we can override encryption properties on a properties stream # of an unencrypted dataset, turning it into an encryption root. log_note "Must be able to receive stream with props as encryption root"
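
Several of the mechanisms changed above can be exercised in isolation outside the kernel. The dbuf.c hunks replace the per-bucket reader/writer locks (DBUF_HASH_RWLOCK) with plain per-bucket mutexes (DBUF_HASH_MUTEX) for the dbuf hash table. The following is a minimal userland sketch of that bucket-locking shape, not the dbuf code itself; the names (buf_t, hash_find, NBUCKETS) and the hash function are made up for illustration.

#include <pthread.h>
#include <stddef.h>
#include <stdint.h>

#define NBUCKETS 256    /* hypothetical table size (power of two) */

typedef struct buf {
    uint64_t b_key;
    struct buf *b_next;     /* hash chain link */
} buf_t;

static buf_t *hash_table[NBUCKETS];
static pthread_mutex_t hash_mutex[NBUCKETS];    /* one plain mutex per bucket */

static void
hash_init(void)
{
    for (int i = 0; i < NBUCKETS; i++)
        pthread_mutex_init(&hash_mutex[i], NULL);
}

static uint64_t
hash(uint64_t key)
{
    return (key * 0x9e3779b97f4a7c15ULL);   /* toy mixing function */
}

/*
 * Look up a key: take only the mutex for the bucket that can hold it,
 * walk the chain, and drop the mutex before returning.
 */
static buf_t *
hash_find(uint64_t key)
{
    uint64_t idx = hash(key) & (NBUCKETS - 1);
    buf_t *b;

    pthread_mutex_lock(&hash_mutex[idx]);
    for (b = hash_table[idx]; b != NULL; b = b->b_next) {
        if (b->b_key == key)
            break;
    }
    pthread_mutex_unlock(&hash_mutex[idx]);
    return (b);
}

/* Insertion is symmetric: lock the bucket, push onto the chain, unlock. */
static void
hash_insert(buf_t *b)
{
    uint64_t idx = hash(b->b_key) & (NBUCKETS - 1);

    pthread_mutex_lock(&hash_mutex[idx]);
    b->b_next = hash_table[idx];
    hash_table[idx] = b;
    pthread_mutex_unlock(&hash_mutex[idx]);
}

int
main(void)
{
    static buf_t a = { .b_key = 42 };

    hash_init();
    hash_insert(&a);
    return (hash_find(42) == &a ? 0 : 1);
}

Because each bucket has its own lock, lookups and inserts that hash to different buckets never contend, which is the property the dbuf hash table relies on.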
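
dbuf_is_l2cacheable() and its dnode-level counterpart, added in the dbuf.c hunk above, decide L2ARC eligibility from the dataset's secondarycache policy and, when l2arc_exclude_special is set, from whether the block's first DVA lands on a special or dedup allocation class vdev. Below is a condensed sketch of that decision with the pool structures reduced to plain flags; cache_mode_t, on_special_vdev and exclude_special are illustrative names, not ZFS types.

#include <stdbool.h>
#include <stdio.h>

typedef enum { CACHE_NONE, CACHE_METADATA, CACHE_ALL } cache_mode_t;

/*
 * Sketch of the eligibility rule: the buffer must match the
 * secondarycache policy, and if the block lives on a special/dedup
 * class vdev it is only eligible while exclusion is disabled.
 */
static bool
l2cacheable(cache_mode_t mode, bool is_metadata, bool on_special_vdev,
    bool exclude_special)
{
    if (mode != CACHE_ALL && !(mode == CACHE_METADATA && is_metadata))
        return (false);
    if (on_special_vdev && exclude_special)
        return (false);
    return (true);
}

int
main(void)
{
    /*
     * secondarycache=all, but the block sits on a special-class vdev
     * and exclusion is enabled: not eligible.
     */
    printf("%d\n", l2cacheable(CACHE_ALL, false, true, true));
    /* secondarycache=metadata and the buffer is metadata: eligible. */
    printf("%d\n", l2cacheable(CACHE_METADATA, true, false, true));
    return (0);
}

The kernel functions additionally require a non-hole block pointer and treat an unresolvable vdev as eligible; the sketch folds those cases into the two flags.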
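
The dmu_tx.c hunk above reworks dmu_tx_delay() so the throttle is the larger of two terms: the existing dirty-data curve and a new curve driven by outstanding TX_WRITE log bytes against zfs_wrlog_data_max, saturating at zfs_delay_max_ns once the log limit is reached. The arithmetic can be checked with a small userland model; the tunable values below are example numbers, so check the module parameters for the real defaults.

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Example tunables for the model. */
static const uint64_t dirty_data_max = 4ULL << 30;  /* 4 GiB */
static const uint64_t wrlog_data_max = 8ULL << 30;  /* 8 GiB */
static const uint64_t delay_min_pct = 60;           /* zfs_delay_min_dirty_percent */
static const uint64_t delay_scale = 500000;         /* zfs_delay_scale */
static const uint64_t delay_max_ns = 100000000;     /* zfs_delay_max_ns, 100 ms */

/*
 * Model of the reworked dmu_tx_delay(): take the larger of the
 * dirty-data delay and the wrlog delay, capped at delay_max_ns.
 * Callers are assumed to keep dirty below dirty_data_max, as in the
 * kernel; the extra guard here only avoids a division by zero.
 */
static uint64_t
tx_delay_ns(uint64_t dirty, uint64_t wrlog)
{
    uint64_t min_bytes, tx_time = 0;

    min_bytes = dirty_data_max * delay_min_pct / 100;
    if (dirty > min_bytes && dirty < dirty_data_max) {
        tx_time = delay_scale * (dirty - min_bytes) /
            (dirty_data_max - dirty);
    }

    min_bytes = wrlog_data_max * delay_min_pct / 100;
    if (wrlog >= wrlog_data_max) {
        tx_time = delay_max_ns;
    } else if (wrlog > min_bytes) {
        tx_time = MAX(delay_scale * (wrlog - min_bytes) /
            (wrlog_data_max - wrlog), tx_time);
    }

    if (tx_time == 0)
        return (0);
    return (MIN(tx_time, delay_max_ns));
}

int
main(void)
{
    printf("dirty 3.0 GiB, wrlog 0:      %llu ns\n",
        (unsigned long long)tx_delay_ns(3ULL << 30, 0));
    printf("dirty 0,      wrlog 7.5 GiB: %llu ns\n",
        (unsigned long long)tx_delay_ns(0, 7680ULL << 20));
    printf("dirty 0,      wrlog 8.0 GiB: %llu ns\n",
        (unsigned long long)tx_delay_ns(0, 8ULL << 30));
    return (0);
}

As the model shows, a transaction can be delayed purely by a large outstanding ZIL write log even when little dirty data is queued, which is the point of the new term.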
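
dsl_pool.c now accounts WR_COPIED and WR_NEED_COPY log bytes per txg (dp_wrlog_pertxg) plus a pool-wide total, fed from zfs_log_write() and zvol_log_write() through dsl_pool_wrlog_count(); once a txg's share passes a threshold derived from zfs_wrlog_data_max, that specific txg is kicked, matching the new txg_kick(dp, txg) signature, which only ratchets tx_sync_txg_waiting forward. Below is a toy model of the ring of per-txg counters and the kick decision, with plain integers standing in for the aggsum counters and example tunable values.

#include <stdint.h>
#include <stdio.h>

#define TXG_SIZE 4
#define TXG_MASK (TXG_SIZE - 1)

/* Example tunables for the model. */
static const uint64_t wrlog_data_max = 8ULL << 30;
static const uint64_t dirty_data_sync_pct = 20;

/* Plain counters stand in for the aggsum_t counters in dsl_pool_t. */
static uint64_t wrlog_pertxg[TXG_SIZE];
static uint64_t wrlog_total;
static uint64_t sync_txg_waiting;   /* stands in for tx_sync_txg_waiting */

/* Model of the new txg_kick(dp, txg): only ratchet the waiting txg up. */
static void
txg_kick(uint64_t txg)
{
    if (sync_txg_waiting >= txg)
        return;
    sync_txg_waiting = txg;
    printf("kick txg %llu\n", (unsigned long long)txg);
}

/* Model of dsl_pool_wrlog_count(): account the bytes, kick if needed. */
static void
wrlog_count(uint64_t size, uint64_t txg)
{
    wrlog_pertxg[txg & TXG_MASK] += size;
    wrlog_total += size;

    /* Threshold slightly above the minimum dirty sync bytes. */
    uint64_t sync_min =
        wrlog_data_max * (dirty_data_sync_pct + 10) / 200;
    if (wrlog_pertxg[txg & TXG_MASK] > sync_min)
        txg_kick(txg);
}

/* Model of dsl_pool_wrlog_clear(): drop a synced txg's contribution. */
static void
wrlog_clear(uint64_t txg)
{
    wrlog_total -= wrlog_pertxg[txg & TXG_MASK];
    wrlog_pertxg[txg & TXG_MASK] = 0;
}

int
main(void)
{
    uint64_t txg = 27;

    for (int i = 0; i < 16; i++)
        wrlog_count(128ULL << 20, txg);     /* 128 MiB per write */
    wrlog_clear(txg);
    printf("total after clear: %llu\n", (unsigned long long)wrlog_total);
    return (0);
}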
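
The vdev changes cap automatically selected ashift values at the new zfs_vdev_max_auto_ashift default of 14, and the mirror, raidz and dRAID open paths now combine child physical ashifts through vdev_best_ashift(), preferring a candidate that lies above the logical ashift and at or below the cap. That selection logic is small enough to exercise standalone; the program below reproduces it as a sketch, where best_ashift and max_auto_ashift are local stand-ins rather than the kernel symbols.

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

static const uint64_t max_auto_ashift = 14;     /* zfs_vdev_max_auto_ashift */

/*
 * Prefer an ashift that lies above the logical ashift and at or below
 * the auto cap; if both or neither candidate qualifies, take the larger.
 */
static uint64_t
best_ashift(uint64_t logical, uint64_t a, uint64_t b)
{
    if (a > logical && a <= max_auto_ashift) {
        if (b <= logical || b > max_auto_ashift)
            return (a);
        return (MAX(a, b));
    } else if (b <= logical || b > max_auto_ashift) {
        return (MAX(a, b));
    }
    return (b);
}

int
main(void)
{
    /* A 4 KiB physical ashift (12) wins over a bogus 64 KiB report (16). */
    printf("%llu\n", (unsigned long long)best_ashift(9, 12, 16));
    /* Neither candidate is usable; fall back to the larger one. */
    printf("%llu\n", (unsigned long long)best_ashift(9, 9, 17));
    return (0);
}

The first case shows the intended effect: one device reporting an unreasonably large physical sector size no longer drags the whole top-level vdev up to that ashift.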