fat zap should prefetch when iterating
When iterating over a ZAP object, we're almost always certain to iterate
over the entire object. If there are multiple leaf blocks, we can
realize a performance win by issuing reads for all the leaf blocks in
parallel when the iteration begins.

For example, if we have 10,000 snapshots, "zfs destroy -nv
pool/fs@1%9999" can take 30 minutes when the cache is cold. This change
provides a >3x performance improvement, by issuing the reads for all ~64
blocks of each ZAP object in parallel.

External-issue: DLPX-58347
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
ahrens committed Jun 11, 2019
1 parent 5662fd5 commit 2a2d60a
Showing 6 changed files with 140 additions and 9 deletions.
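For context, the access pattern this change speeds up is the standard full walk of a ZAP object through the cursor API. A minimal sketch of that pattern (illustrative only, not part of this diff; "os" and "zapobj" stand for the caller's objset and ZAP object, and error handling is omitted):

    zap_cursor_t zc;
    zap_attribute_t za;

    /*
     * zap_cursor_init() now prefetches all leaf blocks of a fat ZAP up
     * front (subject to dmu_prefetch_max), so the zap_cursor_retrieve()
     * calls below mostly hit already-cached blocks instead of each
     * issuing a synchronous read.
     */
    for (zap_cursor_init(&zc, os, zapobj);
        zap_cursor_retrieve(&zc, &za) == 0;
        zap_cursor_advance(&zc)) {
            /* consume za.za_name, za.za_first_integer, ... */
    }
    zap_cursor_fini(&zc);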
7 changes: 5 additions & 2 deletions include/sys/zap.h
@@ -21,7 +21,7 @@

/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
*/

@@ -350,6 +350,7 @@ typedef struct zap_cursor {
uint64_t zc_serialized;
uint64_t zc_hash;
uint32_t zc_cd;
boolean_t zc_prefetch;
} zap_cursor_t;

typedef struct {
@@ -375,7 +376,9 @@ typedef struct {
* Initialize a zap cursor, pointing to the "first" attribute of the
* zapobj. You must _fini the cursor when you are done with it.
*/
void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj);
void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj);
void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os,
uint64_t zapobj);
void zap_cursor_fini(zap_cursor_t *zc);

/*
25 changes: 25 additions & 0 deletions man/man5/zfs-module-parameters.5
@@ -104,6 +104,18 @@ to a log2 fraction of the target arc size.
Default value: \fB6\fR.
.RE

.sp
.ne 2
.na
\fBdmu_prefetch_max\fR (int)
.ad
.RS 12n
Limit the amount of data (in bytes) that a single prefetch call can request.
This helps to limit the amount of memory that can be used by prefetching.
.sp
Default value: \fB134,217,728\fR (128MB).
.RE

.sp
.ne 2
.na
@@ -502,6 +514,19 @@ regular reads (but there's no reason it has to be the same).
Default value: \fB32,768\fR.
.RE

.sp
.ne 2
.na
\fBzap_iterate_prefetch\fR (int)
.ad
.RS 12n
If this is set, when we start iterating over a ZAP object, zfs will prefetch
the entire object (all leaf blocks). However, this is limited by
\fBdmu_prefetch_max\fR.
.sp
Use \fB1\fR for on (default) and \fB0\fR for off.
.RE

.sp
.ne 2
.na
14 changes: 13 additions & 1 deletion module/zfs/ddt_zap.c
@@ -21,6 +21,7 @@

/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018 by Delphix. All rights reserved.
*/

#include <sys/zfs_context.h>
@@ -117,7 +118,18 @@ ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
zap_attribute_t za;
int error;

zap_cursor_init_serialized(&zc, os, object, *walk);
if (*walk == 0) {
/*
* We don't want to prefetch the entire ZAP object, because
* it can be enormous. Also the primary use of DDT iteration
* is for scrubbing, in which case we will be issuing many
* scrub I/Os for each ZAP block that we read in, so
* reading the ZAP is unlikely to be the bottleneck.
*/
zap_cursor_init_noprefetch(&zc, os, object);
} else {
zap_cursor_init_serialized(&zc, os, object, *walk);
}
if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
uchar_t cbuf[sizeof (dde->dde_phys) + 1];
uint64_t csize = za.za_num_integers;
16 changes: 16 additions & 0 deletions module/zfs/dmu.c
@@ -81,6 +81,13 @@ int zfs_dmu_offset_next_sync = 0;
*/
int zfs_object_remap_one_indirect_delay_ms = 0;

/*
* Limit the amount of data that a single prefetch call can request. This
* helps to limit the amount of memory that can be used by prefetching.
* Larger objects should be prefetched a bit at a time.
*/
int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
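/*
 * Illustrative arithmetic (not in the original source): assuming the usual
 * 16MB SPA_MAXBLOCKSIZE, this default works out to 8 * 16,777,216 =
 * 134,217,728 bytes, i.e. the 128MB default documented in
 * zfs-module-parameters.5 above.
 */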

const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" },
{DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" },
@@ -667,6 +674,11 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
return;
}

/*
* See comment before the definition of dmu_prefetch_max.
*/
len = MIN(len, dmu_prefetch_max);

/*
* XXX - Note, if the dnode for the requested object is not
* already cached, we will do a *synchronous* read in the
@@ -2629,6 +2641,10 @@ module_param(zfs_dmu_offset_next_sync, int, 0644);
MODULE_PARM_DESC(zfs_dmu_offset_next_sync,
"Enable forcing txg sync to find holes");

module_param(dmu_prefetch_max, int, 0644);
MODULE_PARM_DESC(dmu_prefetch_max,
"Limit one prefetch call to this size");

/* END CSTYLED */

#endif
56 changes: 55 additions & 1 deletion module/zfs/zap.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/

@@ -49,6 +49,36 @@
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>

/*
* If zap_iterate_prefetch is set, we will prefetch the entire ZAP object
* (all leaf blocks) when we start iterating over it.
*
* For zap_cursor_init(), the callers all intend to iterate through all the
* entries. There are a few cases where an error (typically i/o error) could
* cause it to bail out early.
*
* For zap_cursor_init_serialized(), there are callers that do the iteration
* outside of ZFS. Typically they would iterate over everything, but we
* don't have control of that. E.g. zfs_ioc_snapshot_list_next(),
* zcp_snapshots_iter(), and other iterators over things in the MOS - these
* are called by /sbin/zfs and channel programs. The other example is
* zfs_readdir() which iterates over directory entries for the getdents()
* syscall. /sbin/ls iterates to the end (unless it receives a signal), but
* userland doesn't have to.
*
* Given that the ZAP entries aren't returned in a specific order, the only
* legitimate use cases for partial iteration would be:
*
* 1. Pagination: e.g. you only want to display 100 entries at a time, so you
* get the first 100 and then wait for the user to hit "next page", which
* they may never do.
*
* 2. You want to know if there are more than X entries, without relying on
* the zfs-specific implementation of the directory's st_size (which is
* the number of entries).
*/
int zap_iterate_prefetch = B_TRUE;

int fzap_default_block_shift = 14; /* 16k blocksize */

extern inline zap_phys_t *zap_f_phys(zap_t *zap);
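On the partial-iteration cases described in the new comment above: such a caller would typically park its position between batches using the serialized-cursor interface. A rough sketch (illustrative only, not part of this change; resume_cookie is a hypothetical caller-side variable that starts at 0):

    zap_cursor_t zc;
    zap_attribute_t za;
    int n = 0;

    /* Resume where the previous batch left off (0 means the beginning). */
    zap_cursor_init_serialized(&zc, os, zapobj, resume_cookie);
    while (n < 100 && zap_cursor_retrieve(&zc, &za) == 0) {
            /* display za.za_name ... */
            zap_cursor_advance(&zc);
            n++;
    }
    /* Remember where to pick up if the caller ever asks for more. */
    resume_cookie = zap_cursor_serialize(&zc);
    zap_cursor_fini(&zc);

Note that, per the zap_micro.c hunk below, zap_cursor_init_serialized() also passes B_TRUE for prefetch; only zap_cursor_init_noprefetch() opts out.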
@@ -1189,6 +1219,21 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
/* retrieve the next entry at or after zc_hash/zc_cd */
/* if no entry, return ENOENT */

/*
* If we are reading from the beginning, we're almost certain to
* iterate over the entire ZAP object. If there are multiple leaf
* blocks (freeblk > 2), prefetch the whole object (up to
* dmu_prefetch_max bytes), so that we read the leaf blocks
* concurrently. (Unless noprefetch was requested via
* zap_cursor_init_noprefetch()).
*/
if (zc->zc_hash == 0 && zap_iterate_prefetch &&
zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0,
zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
ZIO_PRIORITY_ASYNC_READ);
}
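/*
 * Illustration (not part of this change): with the default 16K fat-zap
 * block size (FZAP_BLOCK_SHIFT(zap) == 14) and the ~64 blocks per ZAP
 * object cited in the commit message (zap_freeblk == 64), this issues a
 * single 64 << 14 == 1MB prefetch request, far below the default 128MB
 * dmu_prefetch_max.
 */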

if (zc->zc_leaf &&
(ZAP_HASH_IDX(zc->zc_hash,
zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
@@ -1333,3 +1378,12 @@
}
}
}

#if defined(_KERNEL)
/* BEGIN CSTYLED */
module_param(zap_iterate_prefetch, int, 0644);
MODULE_PARM_DESC(zap_iterate_prefetch,
"When iterating a ZAP object, prefetch it");

/* END CSTYLED */
#endif
31 changes: 26 additions & 5 deletions module/zfs/zap_micro.c
@@ -21,7 +21,7 @@

/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2017 by Delphix. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright 2017 Nexenta Systems, Inc.
*/
@@ -1472,9 +1472,9 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
* Routines for iterating over the attributes.
*/

void
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
uint64_t serialized)
static void
zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
uint64_t serialized, boolean_t prefetch)
{
zc->zc_objset = os;
zc->zc_zap = NULL;
@@ -1483,12 +1483,33 @@ zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
zc->zc_serialized = serialized;
zc->zc_hash = 0;
zc->zc_cd = 0;
zc->zc_prefetch = prefetch;
}
void
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
uint64_t serialized)
{
zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
}

/*
* Initialize a cursor at the beginning of the ZAP object. The entire
* ZAP object will be prefetched.
*/
void
zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
zap_cursor_init_serialized(zc, os, zapobj, 0);
zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
}

/*
* Initialize a cursor at the beginning, but request that we not prefetch
* the entire ZAP object.
*/
void
zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
}

void
