Permalink
Browse files

Improved dnode allocation and dmu_hold_impl() (#6611)

Refactor dmu_object_alloc_dnsize() and dnode_hold_impl() to simplify the
code, fix errors introduced by commit dbeb879 (PR #6117) interacting
badly with large dnodes, and improve performance.

* When allocating a new dnode in dmu_object_alloc_dnsize(), update the
percpu object ID for the core's metadnode chunk immediately.  This
eliminates most lock contention when taking the hold and creating the
dnode.

* Correct detection of the chunk boundary to work properly with large
dnodes.

* Separate the dmu_hold_impl() code for the FREE case from the code for
the ALLOCATED case to make it easier to read.

* Fully populate the dnode handle array immediately after reading a
block of the metadnode from disk.  Subsequently the dnode handle array
provides enough information to determine which dnode slots are in use
and which are free.

* Add several kstats to allow the behavior of the code to be examined.

* Verify dnode packing in large_dnode_008_pos.ksh.  Since the test is
purely creates, it should leave very few holes in the metadnode.

* Add test large_dnode_009_pos.ksh, which performs concurrent creates
and deletes, to complement existing test which does only creates.

With the above fixes, there is very little contention in a test of about
200,000 racing dnode allocations produced by tests 'large_dnode_008_pos'
and 'large_dnode_009_pos'.

name                            type data
dnode_hold_dbuf_hold            4    0
dnode_hold_dbuf_read            4    0
dnode_hold_alloc_hits           4    3804690
dnode_hold_alloc_misses         4    216
dnode_hold_alloc_interior       4    3
dnode_hold_alloc_lock_retry     4    0
dnode_hold_alloc_lock_misses    4    0
dnode_hold_alloc_type_none      4    0
dnode_hold_free_hits            4    203105
dnode_hold_free_misses          4    4
dnode_hold_free_lock_misses     4    0
dnode_hold_free_lock_retry      4    0
dnode_hold_free_overflow        4    0
dnode_hold_free_refcount        4    57
dnode_hold_free_txg             4    0
dnode_allocate                  4    203154
dnode_reallocate                4    0
dnode_buf_evict                 4    23918
dnode_alloc_next_chunk          4    4887
dnode_alloc_race                4    0
dnode_alloc_next_block          4    18

The performance is slightly improved for concurrent creates with
16+ threads, and unchanged for low thread counts.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
  • Loading branch information...
dinatale2 authored and tonyhutter committed Sep 13, 2017
1 parent 8995072 commit 45d1abc74d6bd4b09c573dd8db0d2571eb82220d
@@ -1933,7 +1933,8 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
};

static void
dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header,
uint64_t *dnode_slots_used)
{
dmu_buf_t *db = NULL;
dmu_object_info_t doi;
@@ -1965,6 +1966,9 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
}
dmu_object_info_from_dnode(dn, &doi);

if (dnode_slots_used)
*dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;

zdb_nicenum(doi.doi_metadata_block_size, iblk);
zdb_nicenum(doi.doi_data_block_size, dblk);
zdb_nicenum(doi.doi_max_offset, lsize);
@@ -2072,6 +2076,9 @@ dump_dir(objset_t *os)
int verbosity = dump_opt['d'];
int print_header = 1;
int i, error;
uint64_t total_slots_used = 0;
uint64_t max_slot_used = 0;
uint64_t dnode_slots;

dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
dmu_objset_fast_stat(os, &dds);
@@ -2112,7 +2119,7 @@ dump_dir(objset_t *os)
if (zopt_objects != 0) {
for (i = 0; i < zopt_objects; i++)
dump_object(os, zopt_object[i], verbosity,
&print_header);
&print_header, NULL);
(void) printf("\n");
return;
}
@@ -2129,24 +2136,39 @@ dump_dir(objset_t *os)
if (BP_IS_HOLE(os->os_rootbp))
return;

dump_object(os, 0, verbosity, &print_header);
dump_object(os, 0, verbosity, &print_header, NULL);
object_count = 0;
if (DMU_USERUSED_DNODE(os) != NULL &&
DMU_USERUSED_DNODE(os)->dn_type != 0) {
dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
NULL);
dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
NULL);
}

object = 0;
while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
dump_object(os, object, verbosity, &print_header);
dump_object(os, object, verbosity, &print_header, &dnode_slots);
object_count++;
total_slots_used += dnode_slots;
max_slot_used = object + dnode_slots - 1;
}

ASSERT3U(object_count, ==, usedobjs);

(void) printf("\n");

(void) printf(" Dnode slots:\n");
(void) printf("\tTotal used: %10llu\n",
(u_longlong_t)total_slots_used);
(void) printf("\tMax used: %10llu\n",
(u_longlong_t)max_slot_used);
(void) printf("\tPercent empty: %10lf\n",
(double)(max_slot_used - total_slots_used)*100 /
(double)max_slot_used);

(void) printf("\n");

if (error != ESRCH) {
(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
abort();
@@ -2610,7 +2632,7 @@ dump_path_impl(objset_t *os, uint64_t obj, char *name)
return (dump_path_impl(os, child_obj, s + 1));
/*FALLTHROUGH*/
case DMU_OT_PLAIN_FILE_CONTENTS:
dump_object(os, child_obj, dump_opt['v'], &header);
dump_object(os, child_obj, dump_opt['v'], &header, NULL);
return (0);
default:
(void) fprintf(stderr, "object %llu has non-file/directory "
@@ -100,6 +100,13 @@ extern "C" {
#define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1)
#define DN_KILL_SPILLBLK (1)

#define DN_SLOT_UNINIT ((void *)NULL) /* Uninitialized */
#define DN_SLOT_FREE ((void *)1UL) /* Free slot */
#define DN_SLOT_ALLOCATED ((void *)2UL) /* Allocated slot */
#define DN_SLOT_INTERIOR ((void *)3UL) /* Interior allocated slot */
#define DN_SLOT_IS_PTR(dn) ((void *)dn > DN_SLOT_INTERIOR)
#define DN_SLOT_IS_VALID(dn) ((void *)dn != NULL)

#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)

@@ -363,6 +370,135 @@ void dnode_evict_bonus(dnode_t *dn);
((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
(_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)

/*
* Used for dnodestats kstat.
*/
typedef struct dnode_stats {
/*
* Number of failed attempts to hold a meta dnode dbuf.
*/
kstat_named_t dnode_hold_dbuf_hold;
/*
* Number of failed attempts to read a meta dnode dbuf.
*/
kstat_named_t dnode_hold_dbuf_read;
/*
* Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was able
* to hold the requested object number which was allocated. This is
* the common case when looking up any allocated object number.
*/
kstat_named_t dnode_hold_alloc_hits;
/*
* Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
* able to hold the request object number because it was not allocated.
*/
kstat_named_t dnode_hold_alloc_misses;
/*
* Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
* able to hold the request object number because the object number
* refers to an interior large dnode slot.
*/
kstat_named_t dnode_hold_alloc_interior;
/*
* Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) needed
* to retry acquiring slot zrl locks due to contention.
*/
kstat_named_t dnode_hold_alloc_lock_retry;
/*
* Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) did not
* need to create the dnode because another thread did so after
* dropping the read lock but before acquiring the write lock.
*/
kstat_named_t dnode_hold_alloc_lock_misses;
/*
* Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) found
* a free dnode instantiated by dnode_create() but not yet allocated
* by dnode_allocate().
*/
kstat_named_t dnode_hold_alloc_type_none;
/*
* Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was able
* to hold the requested range of free dnode slots.
*/
kstat_named_t dnode_hold_free_hits;
/*
* Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
* able to hold the requested range of free dnode slots because
* at least one slot was allocated.
*/
kstat_named_t dnode_hold_free_misses;
/*
* Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
* able to hold the requested range of free dnode slots because
* after acquiring the zrl lock at least one slot was allocated.
*/
kstat_named_t dnode_hold_free_lock_misses;
/*
* Number of times dnode_hold(..., DNODE_MUST_BE_FREE) needed
* to retry acquiring slot zrl locks due to contention.
*/
kstat_named_t dnode_hold_free_lock_retry;
/*
* Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
* a range of dnode slots which were held by another thread.
*/
kstat_named_t dnode_hold_free_refcount;
/*
* Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
* a range of dnode slots which would overflow the dnode_phys_t.
*/
kstat_named_t dnode_hold_free_overflow;
/*
* Number of times a dnode_hold(...) was attempted on a dnode
* which had already been unlinked in an earlier txg.
*/
kstat_named_t dnode_hold_free_txg;
/*
* Number of new dnodes allocated by dnode_allocate().
*/
kstat_named_t dnode_allocate;
/*
* Number of dnodes re-allocated by dnode_reallocate().
*/
kstat_named_t dnode_reallocate;
/*
* Number of meta dnode dbufs evicted.
*/
kstat_named_t dnode_buf_evict;
/*
* Number of times dmu_object_alloc*() reached the end of the existing
* object ID chunk and advanced to a new one.
*/
kstat_named_t dnode_alloc_next_chunk;
/*
* Number of times multiple threads attempted to allocate a dnode
* from the same block of free dnodes.
*/
kstat_named_t dnode_alloc_race;
/*
* Number of times dmu_object_alloc*() was forced to advance to the
* next meta dnode dbuf due to an error from dmu_object_next().
*/
kstat_named_t dnode_alloc_next_block;
/*
* Statistics for tracking dnodes which have been moved.
*/
kstat_named_t dnode_move_invalid;
kstat_named_t dnode_move_recheck1;
kstat_named_t dnode_move_recheck2;
kstat_named_t dnode_move_special;
kstat_named_t dnode_move_handle;
kstat_named_t dnode_move_rwlock;
kstat_named_t dnode_move_active;
} dnode_stats_t;

extern dnode_stats_t dnode_stats;

#define DNODE_STAT_INCR(stat, val) \
atomic_add_64(&dnode_stats.stat.value.ui64, (val));
#define DNODE_STAT_BUMP(stat) \
DNODE_STAT_INCR(stat, 1);

#ifdef ZFS_DEBUG

/*
@@ -72,8 +72,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
if (db->db_buf)
arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index);

if (dn)
__dmu_object_info_from_dnode(dn, &doi);
__dmu_object_info_from_dnode(dn, &doi);

nwritten = snprintf(buf, size,
"%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | "
@@ -93,7 +93,10 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
* If we finished a chunk of dnodes, get a new one from
* the global allocator.
*/
if (P2PHASE(object, dnodes_per_chunk) == 0) {
if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
(P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
dn_slots)) {
DNODE_STAT_BUMP(dnode_alloc_next_chunk);
mutex_enter(&os->os_obj_lock);
ASSERT0(P2PHASE(os->os_obj_next_chunk,
dnodes_per_chunk));
@@ -157,6 +160,13 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
mutex_exit(&os->os_obj_lock);
}

/*
* The value of (*cpuobj) before adding dn_slots is the object
* ID assigned to us. The value afterwards is the object ID
* assigned to whoever wants to do an allocation next.
*/
object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;

/*
* XXX We should check for an i/o error here and return
* up to our caller. Actually we should pre-read it in
@@ -177,21 +187,20 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
rw_exit(&dn->dn_struct_rwlock);
dmu_tx_add_new_object(tx, dn);
dnode_rele(dn, FTAG);

(void) atomic_swap_64(cpuobj,
object + dn_slots);
return (object);
}
rw_exit(&dn->dn_struct_rwlock);
dnode_rele(dn, FTAG);
DNODE_STAT_BUMP(dnode_alloc_race);
}

/*
* Skip to next known valid starting point on error. This
* is the start of the next block of dnodes.
*/
if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
/*
* Skip to next known valid starting point for a
* dnode.
*/
object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
DNODE_STAT_BUMP(dnode_alloc_next_block);
}
(void) atomic_swap_64(cpuobj, object);
}
@@ -304,24 +313,37 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
if (*objectp == 0) {
start_obj = 1;
} else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
uint64_t i = *objectp + 1;
uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
dmu_object_info_t doi;

/*
* For large_dnode datasets, scan from the beginning of the
* dnode block to find the starting offset. This is needed
* because objectp could be part of a large dnode so we can't
* assume it's a hole even if dmu_object_info() returns ENOENT.
* Scan through the remaining meta dnode block. The contents
* of each slot in the block are known so it can be quickly
* checked. If the block is exhausted without a match then
* hand off to dnode_next_offset() for further scanning.
*/
int epb = DNODE_BLOCK_SIZE >> DNODE_SHIFT;
int skip;
uint64_t i;

for (i = *objectp & ~(epb - 1); i <= *objectp; i += skip) {
dmu_object_info_t doi;

while (i <= last_obj) {
error = dmu_object_info(os, i, &doi);
if (error)
skip = 1;
else
skip = doi.doi_dnodesize >> DNODE_SHIFT;
if (error == ENOENT) {
if (hole) {
*objectp = i;
return (0);
} else {
i++;
}
} else if (error == EEXIST) {
i++;
} else if (error == 0) {
if (hole) {
i += doi.doi_dnodesize >> DNODE_SHIFT;
} else {
*objectp = i;
return (0);
}
} else {
return (error);
}
}

start_obj = i;
Oops, something went wrong.

0 comments on commit 45d1abc

Please sign in to comment.