
Add subcommand to wait for background zfs activity to complete

Currently the best way to wait for the completion of a long-running
operation in a pool, like a scrub or device removal, is to poll 'zpool
status' and parse its output, which is neither efficient nor convenient.

This change adds a 'wait' subcommand to the zpool command. When invoked,
'zpool wait' will block until a specified type of background activity
completes. Currently, this subcommand can wait for any of the following:

 - Scrubs or resilvers to complete
 - Devices to be initialized
 - Devices to be replaced
 - Devices to be removed
 - Checkpoints to be discarded
 - Background freeing to complete

For example, an in-progress scrub can be waited for by running

    zpool wait -t scrub <pool>

This also adds a -w flag to the attach, checkpoint, initialize, replace,
remove, and scrub subcommands. When used, this flag makes the operations
kicked off by these subcommands synchronous instead of asynchronous.
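
For instance, assuming a pool named 'tank' with no scrub already
running, the following two command sequences should behave
equivalently:

    zpool scrub -w tank
    zpool scrub tank && zpool wait -t scrub tank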

This functionality is implemented using a new ioctl. The type of
activity to wait for is provided as input to the ioctl, and the ioctl
blocks until all activity of that type has completed. An ioctl was
chosen over other methods of kernel-userspace communication primarily
for the sake of portability.
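
As an illustrative sketch of the userspace side, using only the
libzfs_core wrapper added by this change (the helper name and error
handling here are ours, not part of the change):

    #include <stdio.h>
    #include <libzfs_core.h>
    #include <sys/fs/zfs.h>

    /*
     * Block until any in-progress scrub in 'pool' completes. A real
     * caller must have called libzfs_core_init() first.
     */
    static int
    wait_for_scrub(const char *pool)
    {
            boolean_t waited;
            int error = lzc_wait(pool, ZPOOL_WAIT_SCRUB, &waited);

            if (error == 0)
                    (void) printf("waited: %s\n", waited ? "yes" : "no");
            return (error);
    }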

Porting Notes:
This is ported from Delphix OS change DLPX-44432. The following changes
were made while porting:

 - Added ZoL-style ioctl input declaration.
 - Reorganized error handling in zpool_initialize in libzfs to integrate
   better with changes made for TRIM support.
 - Fixed the check for whether a checkpoint discard is in progress.
   Previously, it also waited whenever the pool merely had a checkpoint,
   rather than only when a checkpoint was actually being discarded.
 - Exposed zfs_initialize_chunk_size as a ZoL-style tunable.
 - Updated additional existing tests, ones that don't exist in Delphix
   OS, to make use of the new 'zpool wait' functionality.
 - Used existing ZoL tunable zfs_scan_suspend_progress, together with
   zinject, in place of a new tunable zfs_scan_max_blks_per_txg.
 - Added support for a non-integral interval argument to zpool wait.
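   As an example of that last item: the interval is a positional
   argument per the updated zpool.8 page, so something like the
   following (pool name illustrative) should report progress every
   half second while waiting:

       zpool wait -t scrub tank 0.5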

Future work:
ZoL has support for trimming devices, which Delphix OS does not. In the
future, 'zpool wait' could be extended to add the ability to wait for
trim operations to complete.

Signed-off-by: John Gallagher <john.gallagher@delphix.com>
jgallag88 committed Aug 9, 2019
1 parent c81f179 commit f0ff6c4f6f5dc5c5d19d7b5fefd92f4ccab66204
Showing 2,574 additions and 130 deletions.
  1. +481 −28 cmd/zpool/zpool_main.c
  2. +2 −0 configure.ac
  3. +7 −0 include/libzfs.h
  4. +3 −0 include/libzfs_core.h
  5. +20 −1 include/sys/fs/zfs.h
  6. +8 −0 include/sys/spa.h
  7. +7 −0 include/sys/spa_impl.h
  8. +1 −0 include/sys/vdev.h
  9. +100 −20 lib/libzfs/libzfs_pool.c
  10. +36 −0 lib/libzfs_core/libzfs_core.c
  11. +12 −0 man/man5/zfs-module-parameters.5
  12. +92 −9 man/man8/zpool.8
  13. +13 −3 module/zfs/bpobj.c
  14. +6 −1 module/zfs/dsl_scan.c
  15. +277 −0 module/zfs/spa.c
  16. +1 −0 module/zfs/spa_checkpoint.c
  17. +7 −0 module/zfs/spa_misc.c
  18. +29 −0 module/zfs/vdev.c
  19. +9 −1 module/zfs/vdev_initialize.c
  20. +1 −0 module/zfs/vdev_removal.c
  21. +55 −0 module/zfs/zfs_ioctl.c
  22. +13 −0 tests/runfiles/linux.run
  23. +8 −1 tests/zfs-tests/include/libtest.shlib
  24. +2 −1 tests/zfs-tests/tests/functional/cli_root/Makefile.am
  25. +2 −11 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib
  26. +1 −5 tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh
  27. +1 −3 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh
  28. +2 −10 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh
  29. +1 −5 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_encrypted_unloaded.ksh
  30. +19 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/Makefile.am
  31. +20 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/cleanup.ksh
  32. +10 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am
  33. +20 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/cleanup.ksh
  34. +32 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/setup.ksh
  35. +71 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace.ksh
  36. +64 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh
  37. +64 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_resilver.ksh
  38. +49 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_scrub_basic.ksh
  39. +66 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_scrub_cancel.ksh
  40. +52 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_scrub_flag.ksh
  41. +23 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/setup.ksh
  42. +124 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib
  43. +87 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_discard.ksh
  44. +112 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_freeing.ksh
  45. +63 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_initialize_basic.ksh
  46. +77 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_initialize_cancel.ksh
  47. +88 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_initialize_flag.ksh
  48. +83 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_multiple.ksh
  49. +52 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_no_activity.ksh
  50. +85 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_remove.ksh
  51. +62 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_remove_cancel.ksh
  52. +47 −0 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_usage.ksh
  53. +1 −3 tests/zfs-tests/tests/functional/events/events_002_pos.ksh
  54. +2 −8 tests/zfs-tests/tests/functional/online_offline/online_offline_002_neg.ksh
  55. +1 −8 tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib
  56. +1 −6 tests/zfs-tests/tests/functional/redundancy/redundancy_004_neg.ksh
  57. +1 −3 tests/zfs-tests/tests/functional/removal/removal.kshlib
  58. +1 −3 tests/zfs-tests/tests/functional/removal/removal_with_errors.ksh


@@ -270,6 +270,8 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_wait/Makefile
tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile
tests/zfs-tests/tests/functional/cli_user/Makefile
tests/zfs-tests/tests/functional/cli_user/misc/Makefile
tests/zfs-tests/tests/functional/cli_user/zfs_list/Makefile
@@ -194,6 +194,10 @@ typedef struct zfs_handle zfs_handle_t;
typedef struct zpool_handle zpool_handle_t;
typedef struct libzfs_handle libzfs_handle_t;

extern int zpool_wait(zpool_handle_t *, zpool_wait_activity_t);
extern int zpool_wait_status(zpool_handle_t *, zpool_wait_activity_t,
boolean_t *, boolean_t *);

/*
* Library initialization
*/
@@ -275,6 +279,8 @@ typedef struct trimflags {
extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t);
extern int zpool_initialize(zpool_handle_t *, pool_initialize_func_t,
nvlist_t *);
extern int zpool_initialize_wait(zpool_handle_t *, pool_initialize_func_t,
nvlist_t *);
extern int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *,
trimflags_t *);

@@ -317,6 +323,7 @@ extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *,
size_t proplen, zprop_source_t *, boolean_t literal);
extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t,
zprop_source_t *);
extern int zpool_props_refresh(zpool_handle_t *);

extern const char *zpool_prop_to_name(zpool_prop_t);
extern const char *zpool_prop_values(zpool_prop_t);
@@ -130,6 +130,9 @@ int lzc_reopen(const char *, boolean_t);
int lzc_pool_checkpoint(const char *);
int lzc_pool_checkpoint_discard(const char *);

int lzc_wait(const char *, zpool_wait_activity_t, boolean_t *);
int lzc_wait_tag(const char *, zpool_wait_activity_t, uint64_t, boolean_t *);

#ifdef __cplusplus
}
#endif
@@ -1031,7 +1031,7 @@ typedef struct vdev_stat {
uint64_t vs_fragmentation; /* device fragmentation */
uint64_t vs_initialize_bytes_done; /* bytes initialized */
uint64_t vs_initialize_bytes_est; /* total bytes to initialize */
-uint64_t vs_initialize_state; /* vdev_initialzing_state_t */
+uint64_t vs_initialize_state; /* vdev_initializing_state_t */
uint64_t vs_initialize_action_time; /* time_t */
uint64_t vs_checkpoint_space; /* checkpoint-consumed space */
uint64_t vs_resilver_deferred; /* resilver deferred */
@@ -1277,6 +1277,7 @@ typedef enum zfs_ioc {
ZFS_IOC_POOL_TRIM, /* 0x5a50 */
ZFS_IOC_REDACT, /* 0x5a51 */
ZFS_IOC_GET_BOOKMARK_PROPS, /* 0x5a52 */
ZFS_IOC_WAIT, /* 0x5a53 */

/*
* Linux - 3/64 numbers reserved.
@@ -1340,6 +1341,17 @@ typedef enum {
SPA_LOAD_CREATE /* creation in progress */
} spa_load_state_t;

typedef enum {
ZPOOL_WAIT_CKPT_DISCARD,
ZPOOL_WAIT_FREE,
ZPOOL_WAIT_INITIALIZE,
ZPOOL_WAIT_REPLACE,
ZPOOL_WAIT_REMOVE,
ZPOOL_WAIT_RESILVER,
ZPOOL_WAIT_SCRUB,
ZPOOL_WAIT_NUM_ACTIVITIES
} zpool_wait_activity_t;

/*
* Bookmark name values.
*/
@@ -1390,6 +1402,13 @@ typedef enum {
#define ZPOOL_TRIM_RATE "trim_rate"
#define ZPOOL_TRIM_SECURE "trim_secure"

/*
* The following are names used when invoking ZFS_IOC_WAIT.
*/
#define ZPOOL_WAIT_ACTIVITY "wait_activity"
#define ZPOOL_WAIT_TAG "wait_tag"
#define ZPOOL_WAIT_WAITED "wait_waited"

/*
* Flags for ZFS_IOC_VDEV_SET_STATE
*/
@@ -1205,6 +1205,14 @@ extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t);
extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl,
const char *name);

/* waiting for pool activities to complete */
extern int spa_wait(const char *pool, zpool_wait_activity_t activity,
boolean_t *waited);
extern int spa_wait_tag(const char *name, zpool_wait_activity_t activity,
uint64_t tag, boolean_t *waited);
extern void spa_notify_waiters(spa_t *spa);
extern void spa_wake_waiters(spa_t *spa);

#ifdef ZFS_DEBUG
#define dprintf_bp(bp, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
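
A rough sketch of how the new ioctl handler in zfs_ioctl.c plausibly
ties these interfaces to the nvlist names defined above; this is a
simplified reconstruction with most input validation omitted, not the
exact code from the commit:

    static int
    zfs_ioc_wait(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
    {
            int32_t activity;
            uint64_t tag;
            boolean_t waited;
            int error;

            if (nvlist_lookup_int32(innvl, ZPOOL_WAIT_ACTIVITY,
                &activity) != 0)
                    return (EINVAL);

            /* The tag, when present, identifies one vdev's activity. */
            if (nvlist_lookup_uint64(innvl, ZPOOL_WAIT_TAG, &tag) == 0)
                    error = spa_wait_tag(name, activity, tag, &waited);
            else
                    error = spa_wait(name, activity, &waited);

            if (error == 0)
                    fnvlist_add_boolean_value(outnvl, ZPOOL_WAIT_WAITED,
                        waited);

            return (error);
    }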
@@ -410,6 +410,13 @@ struct spa {
list_t spa_leaf_list; /* list of leaf vdevs */
uint64_t spa_leaf_list_gen; /* track leaf_list changes */

/* synchronization for threads in spa_wait */
kmutex_t spa_activities_lock;
kcondvar_t spa_activities_cv;
kcondvar_t spa_waiters_cv;
int spa_waiters; /* number of waiting threads */
boolean_t spa_waiters_cancel; /* waiters should return */

/*
* spa_refcount & spa_config_lock must be the last elements
* because zfs_refcount_t changes size based on compilation options.
@@ -85,6 +85,7 @@ extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset,
uint64_t size);
extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev,
uint64_t offset, uint64_t size, dmu_tx_t *tx);
extern boolean_t vdev_replace_in_progress(vdev_t *vdev);

extern void vdev_hold(vdev_t *);
extern void vdev_rele(vdev_t *);
@@ -101,7 +101,7 @@ zpool_get_all_props(zpool_handle_t *zhp)
return (0);
}

-static int
+int
zpool_props_refresh(zpool_handle_t *zhp)
{
nvlist_t *old_props;
@@ -2158,10 +2158,9 @@ xlate_init_err(int err)
* blocks) for the given vdevs in the given pool.
*/
int
-zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
-nvlist_t *vds)
+zpool_initialize_impl(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
+nvlist_t *vds, boolean_t wait)
{
-char msg[1024];
int err;

nvlist_t *vdev_guids = fnvlist_alloc();
@@ -2173,26 +2172,46 @@ zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
err = zpool_translate_vdev_guids(zhp, vds, vdev_guids,
guids_to_paths, &vd_errlist);

-if (err == 0) {
-err = lzc_initialize(zhp->zpool_name, cmd_type,
-vdev_guids, &errlist);
-if (err == 0) {
-fnvlist_free(vdev_guids);
-fnvlist_free(guids_to_paths);
-return (0);
-}
+if (err != 0) {
+verify(vd_errlist != NULL);
+goto list_errors;
+}

+err = lzc_initialize(zhp->zpool_name, cmd_type,
+vdev_guids, &errlist);

+if (err != 0) {
if (errlist != NULL) {
vd_errlist = fnvlist_lookup_nvlist(errlist,
ZPOOL_INITIALIZE_VDEVS);
+goto list_errors;
}

-(void) snprintf(msg, sizeof (msg),
+(void) zpool_standard_error(zhp->zpool_hdl, err,
dgettext(TEXT_DOMAIN, "operation failed"));
-} else {
-verify(vd_errlist != NULL);
+goto out;
}

+if (wait) {
+for (elem = nvlist_next_nvpair(vdev_guids, NULL); elem != NULL;
+elem = nvlist_next_nvpair(vdev_guids, elem)) {

+uint64_t guid = fnvpair_value_uint64(elem);

+err = lzc_wait_tag(zhp->zpool_name,
+ZPOOL_WAIT_INITIALIZE, guid, NULL);
+if (err != 0) {
+(void) zpool_standard_error_fmt(zhp->zpool_hdl,
+err, dgettext(TEXT_DOMAIN, "error "
+"waiting for '%s' to initialize"),
+nvpair_name(elem));

+goto out;
+}
+}
+}
+goto out;

+list_errors:
for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL;
elem = nvlist_next_nvpair(vd_errlist, elem)) {
int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem));
@@ -2206,15 +2225,28 @@ zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
"cannot initialize '%s'", path);
}

+out:
fnvlist_free(vdev_guids);
fnvlist_free(guids_to_paths);

-if (vd_errlist != NULL) {
+if (vd_errlist != NULL)
fnvlist_free(vd_errlist);
-return (-1);
-}

-return (zpool_standard_error(zhp->zpool_hdl, err, msg));
+return (err == 0 ? 0 : -1);
}

+int
+zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
+nvlist_t *vds)
+{
+return (zpool_initialize_impl(zhp, cmd_type, vds, B_FALSE));
+}

+int
+zpool_initialize_wait(zpool_handle_t *zhp, pool_initialize_func_t cmd_type,
+nvlist_t *vds)
+{
+return (zpool_initialize_impl(zhp, cmd_type, vds, B_TRUE));
+}

static int
@@ -4782,3 +4814,51 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)

return (0);
}

/*
* Wait while the specified activity is in progress in the pool.
*/
int
zpool_wait(zpool_handle_t *zhp, zpool_wait_activity_t activity)
{
boolean_t missing;

int error = zpool_wait_status(zhp, activity, &missing, NULL);

if (missing) {
(void) zpool_standard_error_fmt(zhp->zpool_hdl, ENOENT,
dgettext(TEXT_DOMAIN, "error waiting in pool '%s'"),
zhp->zpool_name);
return (ENOENT);
} else {
return (error);
}
}

/*
* Wait for the given activity and return the status of the wait (whether or not
* any waiting was done) in the 'waited' parameter. Non-existent pools are
* reported via the 'missing' parameter, rather than by printing an error
* message. This is convenient when this function is called in a loop over a
* long period of time (as it is, for example, by zpool's wait cmd). In that
* scenario, a pool being exported or destroyed should be considered a normal
* event, so we don't want to print an error when we find that the pool doesn't
* exist.
*/
int
zpool_wait_status(zpool_handle_t *zhp, zpool_wait_activity_t activity,
boolean_t *missing, boolean_t *waited)
{
int error = lzc_wait(zhp->zpool_name, activity, waited);
*missing = (error == ENOENT);
if (*missing)
return (0);

if (error != 0) {
(void) zpool_standard_error_fmt(zhp->zpool_hdl, error,
dgettext(TEXT_DOMAIN, "error waiting in pool '%s'"),
zhp->zpool_name);
}

return (error);
}
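
The comment above describes calling zpool_wait_status in a loop over a
long period; a minimal sketch of that pattern (the helper name is
hypothetical, not from this commit):

    static void
    wait_while_active(zpool_handle_t *zhp, zpool_wait_activity_t activity)
    {
            boolean_t missing = B_FALSE, waited = B_TRUE;

            /*
             * Each call blocks until the current activity, if any,
             * completes. Stop once the pool is gone, an error occurs,
             * or a call returns without having waited, meaning no
             * activity of this type remains in progress.
             */
            while (!missing && waited) {
                    if (zpool_wait_status(zhp, activity, &missing,
                        &waited) != 0)
                            break;
            }
    }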
@@ -1579,3 +1579,39 @@ lzc_redact(const char *snapshot, const char *bookname, nvlist_t *snapnv)
fnvlist_free(args);
return (error);
}

static int
wait_common(const char *pool, zpool_wait_activity_t activity, boolean_t use_tag,
uint64_t tag, boolean_t *waited)
{
nvlist_t *args = fnvlist_alloc();
nvlist_t *result = NULL;

fnvlist_add_int32(args, ZPOOL_WAIT_ACTIVITY, activity);
if (use_tag)
fnvlist_add_uint64(args, ZPOOL_WAIT_TAG, tag);

int error = lzc_ioctl(ZFS_IOC_WAIT, pool, args, &result);

if (error == 0 && waited != NULL)
*waited = fnvlist_lookup_boolean_value(result,
ZPOOL_WAIT_WAITED);

fnvlist_free(args);
fnvlist_free(result);

return (error);
}

int
lzc_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
{
return (wait_common(pool, activity, B_FALSE, 0, waited));
}

int
lzc_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
boolean_t *waited)
{
return (wait_common(pool, activity, B_TRUE, tag, waited));
}
@@ -1925,6 +1925,18 @@ Pattern written to vdev free space by \fBzpool initialize\fR.
Default value: \fB16,045,690,984,833,335,022\fR (0xdeadbeefdeadbeee).
.RE

.sp
.ne 2
.na
\fBzfs_initialize_chunk_size\fR (ulong)
.ad
.RS 12n
Size of writes used by \fBzpool initialize\fR.
This option is used by the test suite to facilitate testing.
.sp
Default value: \fB1,048,576\fR
.RE

.sp
.ne 2
.na
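
As with other ZoL module parameters, the new tunable can presumably be
adjusted at runtime through the standard sysfs path, e.g.:

    echo 2097152 > /sys/module/zfs/parameters/zfs_initialize_chunk_size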
