Skip to content
Permalink
Browse files

add fast path for zfs_ioc_space_snaps() handling of empty_bpobj

When there are many snapshots, calls to zfs_ioc_space_snaps() (e.g. from
`zfs destroy -nv pool/fs@snap1%snap10000`) can be very slow, resulting
in poor performance because we are holding the dp_config_rwlock the
entire time, blocking spa_sync() from continuing.  With around ten
thousand snapshots, we've seen up to 500 seconds in this ioctl,
iterating over up to 50,000,000 bpobjs, ~99% of which are the empty
bpobj.

By creating a fast path for zfs_ioc_space_snaps() handling of the
empty_bpobj, we can achieve a ~5x performance improvement of this ioctl
(when there are many snapshots, and the deadlist is mostly
empty_bpobj's).

External-issue: DLPX-58348
  • Loading branch information...
ahrens committed May 24, 2018
1 parent 737c202 commit 50e54e2ee26448ea8979e7e1c16b2aed533ae8cf
Showing with 190 additions and 26 deletions.
  1. +13 −1 include/sys/dsl_deadlist.h
  2. +170 −25 module/zfs/dsl_deadlist.c
  3. +7 −0 module/zfs/dsl_destroy.c
@@ -48,8 +48,10 @@ typedef struct dsl_deadlist_phys {
typedef struct dsl_deadlist {
objset_t *dl_os;
uint64_t dl_object;
avl_tree_t dl_tree;
avl_tree_t dl_tree; /* contains dsl_deadlist_entry_t */
avl_tree_t dl_cache; /* contains dsl_deadlist_cache_entry_t */
boolean_t dl_havetree;
boolean_t dl_havecache;
struct dmu_buf *dl_dbuf;
dsl_deadlist_phys_t *dl_phys;
kmutex_t dl_lock;
@@ -59,6 +61,15 @@ typedef struct dsl_deadlist {
boolean_t dl_oldfmt;
} dsl_deadlist_t;

/*
 * One node of a deadlist's dl_cache AVL tree. Entries are created only for
 * ZAP entries whose bpobj is not the pool's empty_bpobj, and carry the
 * bpobj's space totals so they can be summed without reopening the bpobj.
 */
typedef struct dsl_deadlist_cache_entry {
avl_node_t dlce_node; /* linkage in dl_cache */
uint64_t dlce_mintxg; /* sort key; parsed from the ZAP entry name */
uint64_t dlce_bpobj; /* bpobj object number (ZAP entry value) */
uint64_t dlce_bytes; /* first value reported by bpobj_space() */
uint64_t dlce_comp; /* second value reported by bpobj_space() */
uint64_t dlce_uncomp; /* third value reported by bpobj_space() */
} dsl_deadlist_cache_entry_t;

typedef struct dsl_deadlist_entry {
avl_node_t dle_node;
uint64_t dle_mintxg;
@@ -108,6 +119,7 @@ int dsl_process_sub_livelist(bpobj_t *bpobj, struct bplist *to_free,
zthr_t *t);
void dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
dmu_tx_t *tx);
void dsl_deadlist_discard_tree(dsl_deadlist_t *dl);

#ifdef __cplusplus
}
@@ -112,6 +112,7 @@ unsigned long zfs_livelist_max_entries = 500000;
*/
int zfs_livelist_min_percent_shared = 75;

/*
 * When B_TRUE, loading a deadlist's dl_tree or dl_cache first issues a
 * dmu_prefetch() for every bpobj it is about to open, so that the i/o is
 * done in parallel, and only opens the bpobjs in a second pass.
 */
boolean_t dsl_deadlist_prefetch_bpobj = B_TRUE;

static int
dsl_deadlist_compare(const void *arg1, const void *arg2)
@@ -122,6 +123,20 @@ dsl_deadlist_compare(const void *arg1, const void *arg2)
return (AVL_CMP(dle1->dle_mintxg, dle2->dle_mintxg));
}

/*
 * AVL comparator for dl_cache. Orders dsl_deadlist_cache_entry_t nodes by
 * ascending dlce_mintxg, using the same AVL_CMP helper as
 * dsl_deadlist_compare() so both deadlist trees sort identically.
 */
static int
dsl_deadlist_cache_compare(const void *arg1, const void *arg2)
{
	const dsl_deadlist_cache_entry_t *dlce1 = arg1;
	const dsl_deadlist_cache_entry_t *dlce2 = arg2;

	return (AVL_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg));
}

static void
dsl_deadlist_load_tree(dsl_deadlist_t *dl)
{
@@ -131,6 +146,23 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl)
ASSERT(MUTEX_HELD(&dl->dl_lock));

ASSERT(!dl->dl_oldfmt);
if (dl->dl_havecache) {
/*
* After loading the tree, the caller may modify the tree,
* e.g. to add or remove nodes, or to make a node no longer
* refer to the empty_bpobj. These changes would make the
* dl_cache incorrect. Therefore we discard the cache here,
* so that it can't become incorrect.
*/
dsl_deadlist_cache_entry_t *dlce;
void *cookie = NULL;
while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
!= NULL) {
kmem_free(dlce, sizeof (*dlce));
}
avl_destroy(&dl->dl_cache);
dl->dl_havecache = B_FALSE;
}
if (dl->dl_havetree)
return;

@@ -142,14 +174,118 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl)
zap_cursor_advance(&zc)) {
dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
dle->dle_mintxg = zfs_strtonum(za.za_name, NULL);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os,
za.za_first_integer));

/*
* Prefetch all the bpobj's so that we do that i/o
* in parallel. Then open them all in a second pass.
*/
dle->dle_bpobj.bpo_object = za.za_first_integer;
if (dsl_deadlist_prefetch_bpobj) {
dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object,
0, 0, 0, ZIO_PRIORITY_SYNC_READ);
}

avl_add(&dl->dl_tree, dle);
}
zap_cursor_fini(&zc);

for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree);
dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) {
VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os,
dle->dle_bpobj.bpo_object));
}
dl->dl_havetree = B_TRUE;
}

/*
 * Populate dl_cache, a reduced analog of dl_tree that holds only the ZAP
 * entries whose bpobj is not the pool's empty_bpobj. The cache is used
 * solely for gathering space statistics, and has two advantages over the
 * dl_tree:
 *
 * 1. Loading the dl_cache is ~5x faster than loading the dl_tree (if it's
 * mostly empty_bpobj's), due to less CPU overhead to open the empty_bpobj
 * many times and to inquire about its (zero) space stats many times.
 *
 * 2. The dl_cache uses less memory than the dl_tree. We only need to load
 * the dl_tree of snapshots when deleting a snapshot, after which we free
 * the dl_tree with dsl_deadlist_discard_tree().
 *
 * The caller must hold dl_lock. This is a no-op if the cache is already
 * loaded.
 */
static void
dsl_deadlist_load_cache(dsl_deadlist_t *dl)
{
	zap_cursor_t cursor;
	zap_attribute_t attr;
	dsl_deadlist_cache_entry_t *ent;

	ASSERT(MUTEX_HELD(&dl->dl_lock));

	ASSERT(!dl->dl_oldfmt);
	if (dl->dl_havecache)
		return;

	uint64_t empty_obj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj;

	avl_create(&dl->dl_cache, dsl_deadlist_cache_compare,
	    sizeof (dsl_deadlist_cache_entry_t),
	    offsetof(dsl_deadlist_cache_entry_t, dlce_node));

	/*
	 * First pass: record every non-empty bpobj and prefetch it, so
	 * that the i/o for all of them is done in parallel.
	 */
	for (zap_cursor_init(&cursor, dl->dl_os, dl->dl_object);
	    zap_cursor_retrieve(&cursor, &attr) == 0;
	    zap_cursor_advance(&cursor)) {
		if (attr.za_first_integer != empty_obj) {
			ent = kmem_alloc(sizeof (*ent), KM_SLEEP);
			ent->dlce_mintxg = zfs_strtonum(attr.za_name, NULL);
			ent->dlce_bpobj = attr.za_first_integer;
			if (dsl_deadlist_prefetch_bpobj) {
				dmu_prefetch(dl->dl_os, ent->dlce_bpobj,
				    0, 0, 0, ZIO_PRIORITY_SYNC_READ);
			}
			avl_add(&dl->dl_cache, ent);
		}
	}
	zap_cursor_fini(&cursor);

	/*
	 * Second pass: open each recorded bpobj just long enough to copy
	 * its space totals into the cache entry.
	 */
	ent = avl_first(&dl->dl_cache);
	while (ent != NULL) {
		bpobj_t bpo;

		VERIFY0(bpobj_open(&bpo, dl->dl_os, ent->dlce_bpobj));
		VERIFY0(bpobj_space(&bpo,
		    &ent->dlce_bytes, &ent->dlce_comp, &ent->dlce_uncomp));
		bpobj_close(&bpo);

		ent = AVL_NEXT(&dl->dl_cache, ent);
	}
	dl->dl_havecache = B_TRUE;
}

/*
 * Release the in-memory dl_tree to save memory: close every entry's bpobj,
 * free the entry, destroy the AVL tree and clear dl_havetree. Does nothing
 * if the tree is not currently loaded. Takes and drops dl_lock itself.
 */
void
dsl_deadlist_discard_tree(dsl_deadlist_t *dl)
{
	mutex_enter(&dl->dl_lock);

	if (dl->dl_havetree) {
		dsl_deadlist_entry_t *entry;
		void *walk = NULL;

		for (entry = avl_destroy_nodes(&dl->dl_tree, &walk);
		    entry != NULL;
		    entry = avl_destroy_nodes(&dl->dl_tree, &walk)) {
			bpobj_close(&entry->dle_bpobj);
			kmem_free(entry, sizeof (*entry));
		}
		avl_destroy(&dl->dl_tree);

		dl->dl_havetree = B_FALSE;
	}

	mutex_exit(&dl->dl_lock);
}

void
dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args)
{
@@ -188,6 +324,7 @@ dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
dl->dl_oldfmt = B_FALSE;
dl->dl_phys = dl->dl_dbuf->db_data;
dl->dl_havetree = B_FALSE;
dl->dl_havecache = B_FALSE;
}

boolean_t
@@ -199,9 +336,6 @@ dsl_deadlist_is_open(dsl_deadlist_t *dl)
void
dsl_deadlist_close(dsl_deadlist_t *dl)
{
void *cookie = NULL;
dsl_deadlist_entry_t *dle;

ASSERT(dsl_deadlist_is_open(dl));
mutex_destroy(&dl->dl_lock);

@@ -214,13 +348,24 @@ dsl_deadlist_close(dsl_deadlist_t *dl)
}

if (dl->dl_havetree) {
dsl_deadlist_entry_t *dle;
void *cookie = NULL;
while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
!= NULL) {
bpobj_close(&dle->dle_bpobj);
kmem_free(dle, sizeof (*dle));
}
avl_destroy(&dl->dl_tree);
}
if (dl->dl_havecache) {
dsl_deadlist_cache_entry_t *dlce;
void *cookie = NULL;
while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
!= NULL) {
kmem_free(dlce, sizeof (*dlce));
}
avl_destroy(&dl->dl_cache);
}
dmu_buf_rele(dl->dl_dbuf, dl);
dl->dl_dbuf = NULL;
dl->dl_phys = NULL;
@@ -438,6 +583,7 @@ dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
avl_remove(&dl->dl_tree, dle);
VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx));
VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
dmu_buf_will_dirty(dl->dl_dbuf, tx);
dl->dl_phys->dl_used -= used;
dl->dl_phys->dl_comp -= comp;
dl->dl_phys->dl_uncomp -= uncomp;
@@ -466,6 +612,7 @@ dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
mutex_enter(&dl->dl_lock);
VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx));
VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
dmu_buf_will_dirty(dl->dl_dbuf, tx);
dl->dl_phys->dl_used -= used;
dl->dl_phys->dl_comp -= comp;
dl->dl_phys->dl_uncomp -= uncomp;
@@ -601,8 +748,8 @@ void
dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
dsl_deadlist_entry_t *dle;
dsl_deadlist_entry_t dle_tofind;
dsl_deadlist_cache_entry_t *dlce;
dsl_deadlist_cache_entry_t dlce_tofind;
avl_index_t where;

if (dl->dl_oldfmt) {
@@ -614,27 +761,25 @@ dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
*usedp = *compp = *uncompp = 0;

mutex_enter(&dl->dl_lock);
dsl_deadlist_load_tree(dl);
dle_tofind.dle_mintxg = mintxg;
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
dsl_deadlist_load_cache(dl);
dlce_tofind.dlce_mintxg = mintxg;
dlce = avl_find(&dl->dl_cache, &dlce_tofind, &where);

/*
* If we don't find this mintxg, there shouldn't be anything
* after it either.
* If this mintxg doesn't exist, it may be an empty_bpobj which
* is omitted from the dl_cache. Start at the next non-empty
* entry.
*/
ASSERT(dle != NULL ||
avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);

for (; dle && dle->dle_mintxg < maxtxg;
dle = AVL_NEXT(&dl->dl_tree, dle)) {
uint64_t used, comp, uncomp;

VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
&used, &comp, &uncomp));

*usedp += used;
*compp += comp;
*uncompp += uncomp;
if (dlce == NULL)
	dlce = avl_nearest(&dl->dl_cache, where, AVL_AFTER);

/*
 * dlce is a dl_cache node, so its successor must be looked up in
 * dl_cache (not dl_tree, which holds dsl_deadlist_entry_t nodes).
 */
for (; dlce && dlce->dlce_mintxg < maxtxg;
    dlce = AVL_NEXT(&dl->dl_cache, dlce)) {
	*usedp += dlce->dlce_bytes;
	*compp += dlce->dlce_comp;
	*uncompp += dlce->dlce_uncomp;
}

mutex_exit(&dl->dl_lock);
}

@@ -394,6 +394,13 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
/* Merge our deadlist into next's and free it. */
dsl_deadlist_merge(&ds_next->ds_deadlist,
dsl_dataset_phys(ds)->ds_deadlist_obj, tx);

/*
* We are done with the deadlist tree (generated/used
* by dsl_deadlist_move_bpobj() and dsl_deadlist_merge()).
* Discard it to save memory.
*/
dsl_deadlist_discard_tree(&ds_next->ds_deadlist);
}

dsl_deadlist_close(&ds->ds_deadlist);

0 comments on commit 50e54e2

Please sign in to comment.
You can’t perform that action at this time.