Skip to content
Permalink
Browse files

Fast Clone Deletion

Deleting a clone requires finding blocks are clone-only, not shared
with the snapshot. This was done by traversing the entire block tree
which results in a large performance penalty for sparsely
written clones.

This is new method keeps track of clone blocks when they are
modified in a "Livelist" so that, when it’s time to delete,
the clone-specific blocks are already at hand.

We see performance improvements because now deletion work is
proportional to the number of clone-modified blocks, not the size
of the original dataset.

Signed-off-by: sara hartse <sara.hartse@delphix.com>
  • Loading branch information...
shartse committed Feb 13, 2018
1 parent f378f42 commit 78c785bb2871347d7bd94d1a17ae19cab35cd442
Showing with 2,465 additions and 195 deletions.
  1. +263 −65 cmd/zdb/zdb.c
  2. +2 −0 include/sys/bplist.h
  3. +15 −4 include/sys/bpobj.h
  4. +2 −0 include/sys/dmu.h
  5. +1 −1 include/sys/dmu_objset.h
  6. +29 −2 include/sys/dsl_deadlist.h
  7. +1 −0 include/sys/dsl_destroy.h
  8. +13 −2 include/sys/dsl_dir.h
  9. +2 −1 include/sys/dsl_pool.h
  10. +9 −0 include/sys/spa.h
  11. +6 −0 include/sys/spa_impl.h
  12. +2 −0 include/sys/zthr.h
  13. +1 −0 include/zfeature_common.h
  14. +20 −0 man/man5/zpool-features.5
  15. +5 −0 module/zcommon/zfeature_common.c
  16. +15 −1 module/zfs/bplist.c
  17. +73 −22 module/zfs/bpobj.c
  18. +32 −1 module/zfs/dbuf.c
  19. +221 −9 module/zfs/dsl_dataset.c
  20. +325 −11 module/zfs/dsl_deadlist.c
  21. +136 −47 module/zfs/dsl_destroy.c
  22. +99 −1 module/zfs/dsl_dir.c
  23. +2 −1 module/zfs/dsl_pool.c
  24. +17 −4 module/zfs/dsl_scan.c
  25. +468 −13 module/zfs/spa.c
  26. +1 −2 module/zfs/spa_history.c
  27. +84 −3 module/zfs/zthr.c
  28. +5 −2 tests/runfiles/linux.run
  29. +19 −1 tests/zfs-tests/include/libtest.shlib
  30. +6 −1 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/Makefile.am
  31. +136 −0 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_and_disable.ksh
  32. +116 −0 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_races.ksh
  33. +140 −0 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_clone_livelist.ksh
  34. +41 −1 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib
  35. +66 −0 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_dev_removal.ksh
  36. +91 −0 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_dev_removal_condense.ksh
  37. +1 −0 tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg

Large diffs are not rendered by default.

Oops, something went wrong.
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018 by Delphix. All rights reserved.
*/

#ifndef _SYS_BPLIST_H
@@ -49,6 +50,7 @@ void bplist_destroy(bplist_t *bpl);
void bplist_append(bplist_t *bpl, const blkptr_t *bp);
void bplist_iterate(bplist_t *bpl, bplist_itor_t *func,
void *arg, dmu_tx_t *tx);
void bplist_clear(bplist_t *bpl);

#ifdef __cplusplus
}
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2015, 2018 by Delphix. All rights reserved.
*/

#ifndef _SYS_BPOBJ_H
@@ -31,6 +31,7 @@
#include <sys/txg.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/bplist.h>

#ifdef __cplusplus
extern "C" {
@@ -48,10 +49,12 @@ typedef struct bpobj_phys {
uint64_t bpo_uncomp;
uint64_t bpo_subobjs;
uint64_t bpo_num_subobjs;
uint64_t bpo_num_freed;
} bpobj_phys_t;

#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t))
#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t))
#define BPOBJ_SIZE_V2 (6 * sizeof (uint64_t))

typedef struct bpobj {
kmutex_t bpo_lock;
@@ -60,12 +63,14 @@ typedef struct bpobj {
int bpo_epb;
uint8_t bpo_havecomp;
uint8_t bpo_havesubobj;
uint8_t bpo_havefreed;
bpobj_phys_t *bpo_phys;
dmu_buf_t *bpo_dbuf;
dmu_buf_t *bpo_cached_dbuf;
} bpobj_t;

typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, boolean_t free,
dmu_tx_t *tx);

uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx);
@@ -77,17 +82,23 @@ void bpobj_close(bpobj_t *bpo);
boolean_t bpobj_is_open(const bpobj_t *bpo);

int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *);
int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func,
void *arg, int64_t start);

void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t free,
dmu_tx_t *tx);

int bpobj_space(bpobj_t *bpo,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
boolean_t bpobj_is_empty(bpobj_t *bpo);

int bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t free,
dmu_tx_t *tx);

#ifdef __cplusplus
}
#endif
@@ -383,6 +383,7 @@ typedef struct dmu_buf {
#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj"
#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect"
#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint"
#define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones"

/*
* Allocate an object from this objset. The range of object numbers
@@ -1004,6 +1005,7 @@ extern uint64_t dmu_objset_id(objset_t *os);
extern uint64_t dmu_objset_dnodesize(objset_t *os);
extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
extern int dmu_objset_blksize(objset_t *os);
extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val);
@@ -126,7 +126,7 @@ struct objset {
zfs_cache_type_t os_secondary_cache;
zfs_sync_type_t os_sync;
zfs_redundant_metadata_type_t os_redundant_metadata;
int os_recordsize;
uint64_t os_recordsize;
/*
* The next four values are used as a cache of whatever's on disk, and
* are initialized the first time these properties are queried. Before
@@ -20,20 +20,22 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015 by Delphix. All rights reserved.
* Copyright (c) 2018 by Delphix. All rights reserved.
*/

#ifndef _SYS_DSL_DEADLIST_H
#define _SYS_DSL_DEADLIST_H

#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/zthr.h>

#ifdef __cplusplus
extern "C" {
#endif

struct dmu_buf;
struct dsl_pool;
struct dsl_dataset;

typedef struct dsl_deadlist_phys {
@@ -63,13 +65,34 @@ typedef struct dsl_deadlist_entry {
bpobj_t dle_bpobj;
} dsl_deadlist_entry_t;

typedef struct livelist_condense_entry {
struct dsl_dataset *ds;
dsl_deadlist_entry_t *first;
dsl_deadlist_entry_t *next;
boolean_t syncing;
boolean_t cancelled;
} livelist_condense_entry_t;

extern unsigned long zfs_livelist_max_entries;
extern int zfs_livelist_min_percent_shared;

typedef int deadlist_iter_t(void *args, dsl_deadlist_entry_t *dle);

void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
void dsl_deadlist_close(dsl_deadlist_t *dl);
void dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *arg);
uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp,
boolean_t free, dmu_tx_t *tx);
int dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
int dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
void dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg,
dmu_tx_t *tx);
dsl_deadlist_entry_t *dsl_deadlist_first(dsl_deadlist_t *dl);
dsl_deadlist_entry_t *dsl_deadlist_last(dsl_deadlist_t *dl);
uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
uint64_t mrs_obj, dmu_tx_t *tx);
void dsl_deadlist_space(dsl_deadlist_t *dl,
@@ -81,6 +104,10 @@ void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
dmu_tx_t *tx);
boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl);
int dsl_process_sub_livelist(bpobj_t *bpobj, struct bplist *to_free,
zthr_t *t);
void dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
dmu_tx_t *tx);

#ifdef __cplusplus
}
@@ -33,6 +33,7 @@ extern "C" {

struct nvlist;
struct dsl_dataset;
struct dsl_pool;
struct dmu_tx;

int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t,
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
@@ -29,18 +29,20 @@
#define _SYS_DSL_DIR_H

#include <sys/dmu.h>
#include <sys/dsl_deadlist.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/refcount.h>
#include <sys/zfs_context.h>
#include <sys/dsl_crypt.h>
#include <sys/bplist.h>

#ifdef __cplusplus
extern "C" {
#endif

struct dsl_dataset;

struct zthr;
/*
* DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object.
* They should be of the format <reverse-dns>:<field>.
@@ -50,6 +52,8 @@ struct dsl_dataset;
#define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count"
#define DD_FIELD_CRYPTO_KEY_OBJ "com.datto:crypto_key_obj"
#define DD_FIELD_LAST_REMAP_TXG "com.delphix:last_remap_txg"
#define DD_FIELD_LIVELIST "com.delphix:livelist"


typedef enum dd_used {
DD_USED_HEAD,
@@ -115,6 +119,10 @@ struct dsl_dir {
/* amount of space we expect to write; == amount of dirty data */
int64_t dd_space_towrite[TXG_SIZE];

dsl_deadlist_t dd_livelist;
bplist_t dd_pending_frees;
bplist_t dd_pending_allocs;

/* protected by dd_lock; keep at end of struct for better locality */
char dd_myname[ZFS_MAX_DATASET_NAME_LEN];
};
@@ -185,6 +193,9 @@ void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
dmu_tx_t *tx);
void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx);
boolean_t dsl_dir_is_zapified(dsl_dir_t *dd);
void dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj);
void dsl_dir_livelist_close(dsl_dir_t *dd);
void dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total);

/* internal reserved dir name */
#define MOS_DIR_NAME "$MOS"
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013, 2018 by Delphix. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
*/

@@ -54,6 +54,7 @@ struct dsl_pool;
struct dmu_tx;
struct dsl_scan;
struct dsl_crypto_params;
struct dsl_deadlist;

extern unsigned long zfs_dirty_data_max;
extern unsigned long zfs_dirty_data_max_max;
@@ -62,6 +62,8 @@ typedef struct ddt ddt_t;
typedef struct ddt_entry ddt_entry_t;
typedef struct zbookmark_phys zbookmark_phys_t;

struct bpobj;
struct bplist;
struct dsl_pool;
struct dsl_dataset;
struct dsl_crypto_params;
@@ -524,6 +526,9 @@ _NOTE(CONSTCOND) } while (0)
#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1)
#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)

#define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1)
#define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x)

#define BP_PHYSICAL_BIRTH(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
@@ -638,6 +643,7 @@ _NOTE(CONSTCOND) } while (0)
* 'func' is either snprintf() or mdb_snprintf().
* 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
*/

#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
{ \
static const char *copyname[] = \
@@ -781,6 +787,8 @@ extern spa_t *spa_inject_addref(char *pool);
extern void spa_inject_delref(spa_t *spa);
extern void spa_scan_stat_init(spa_t *spa);
extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);

#define SPA_ASYNC_CONFIG_UPDATE 0x01
#define SPA_ASYNC_REMOVE 0x02
@@ -1106,6 +1114,7 @@ extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
extern boolean_t spa_multihost(spa_t *spa);
extern unsigned long spa_get_hostid(void);
extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
extern boolean_t spa_livelist_delete_check(spa_t *spa);

extern int spa_mode(spa_t *spa);
extern uint64_t zfs_strtonum(const char *str, char **nptr);
@@ -48,6 +48,7 @@
#include <sys/dsl_crypt.h>
#include <sys/zfeature.h>
#include <sys/zthr.h>
#include <sys/dsl_deadlist.h>
#include <zfeature_common.h>

#ifdef __cplusplus
@@ -307,6 +308,11 @@ struct spa {
spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */
zthr_t *spa_checkpoint_discard_zthr;

zthr_t *spa_livelist_delete_zthr; /* deleting livelists */
zthr_t *spa_livelist_condense_zthr; /* condensing livelists */
uint64_t spa_livelists_to_delete; /* set of livelists to free */
livelist_condense_entry_t spa_to_condense; /* next to condense */

char *spa_root; /* alternate root directory */
uint64_t spa_ena; /* spa-wide ereport ENA */
int spa_last_open_failed; /* error if last open failed */
@@ -33,7 +33,9 @@ extern void zthr_destroy(zthr_t *t);
extern void zthr_wakeup(zthr_t *t);
extern void zthr_cancel(zthr_t *t);
extern void zthr_resume(zthr_t *t);
extern void zthr_finish(zthr_t *t);

extern boolean_t zthr_iscancelled(zthr_t *t);
extern boolean_t zthr_isfinishing(zthr_t *t);

#endif /* _SYS_ZTHR_H */
@@ -67,6 +67,7 @@ typedef enum spa_feature {
SPA_FEATURE_ALLOCATION_CLASSES,
SPA_FEATURE_RESILVER_DEFER,
SPA_FEATURE_BOOKMARK_V2,
SPA_FEATURE_LIVELIST,
SPA_FEATURES
} spa_feature_t;

@@ -823,5 +823,25 @@ The feature will only return back to being \fBenabled\fR when the pool
is rewound or the checkpoint has been discarded.
.RE

.sp
.ne 2
.na
\fB\fBlivelist\fR\fR
.ad
.RS 4n
.TS
l l .
GUID com.delphix:livelist
READ\-ONLY COMPATIBLE yes
DEPENDENCIES none
.TE

This feature allows clones to be deleted faster than the traditional method
when there are large number of random/sparse writes made to the clone.
All blocks allocated and freed after a clone is created are tracked by the
the clone's livelist which is referenced during the deletion of the clone.
The feature is activated when a clone is created and remains active until all
clones have been destroyed.

.SH "SEE ALSO"
zpool(8)
@@ -348,6 +348,11 @@ zpool_feature_init(void)
ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
ZFEATURE_TYPE_BOOLEAN, NULL);

zfeature_register(SPA_FEATURE_LIVELIST,
"com.delphix:livelist", "livelist",
"Improved clone deletion performance.",
ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);

{
static const spa_feature_t large_blocks_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
Oops, something went wrong.

0 comments on commit 78c785b

Please sign in to comment.
You can’t perform that action at this time.