Skip to content

Commit

Permalink
Illumos 5269 - zpool import slow
Browse files Browse the repository at this point in the history
5269 zpool import slow
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Dan McDonald <danmcd@omniti.com>

References:
  https://www.illumos.org/issues/5269
  illumos/illumos-gate@12380e1e

Ported-by: DHE <git@dehacked.net>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3396
  • Loading branch information
Arne Jansen authored and behlendorf committed Jun 9, 2015
1 parent d050c62 commit 9c43027
Show file tree
Hide file tree
Showing 10 changed files with 240 additions and 60 deletions.
1 change: 1 addition & 0 deletions include/sys/dmu.h
Expand Up @@ -240,6 +240,7 @@ void zfs_znode_byteswap(void *buf, size_t size);

#define DS_FIND_SNAPSHOTS (1<<0)
#define DS_FIND_CHILDREN (1<<1)
#define DS_FIND_SERIALIZE (1<<2)

/*
* The maximum number of bytes that can be accessed as part of one
Expand Down
2 changes: 2 additions & 0 deletions include/sys/dmu_objset.h
Expand Up @@ -142,6 +142,8 @@ struct objset {
int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
int dmu_objset_own(const char *name, dmu_objset_type_t type,
boolean_t readonly, void *tag, objset_t **osp);
int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj,
dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp);
void dmu_objset_refresh_ownership(objset_t *os, void *tag);
void dmu_objset_rele(objset_t *os, void *tag);
void dmu_objset_disown(objset_t *os, void *tag);
Expand Down
1 change: 1 addition & 0 deletions include/sys/dsl_pool.h
Expand Up @@ -158,6 +158,7 @@ boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp);
void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
void dsl_pool_config_exit(dsl_pool_t *dp, void *tag);
boolean_t dsl_pool_config_held(dsl_pool_t *dp);
boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp);

taskq_t *dsl_pool_iput_taskq(dsl_pool_t *dp);

Expand Down
1 change: 1 addition & 0 deletions include/sys/vdev.h
Expand Up @@ -59,6 +59,7 @@ extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
extern boolean_t vdev_is_bootable(vdev_t *vd);
extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
extern int vdev_count_leaves(spa_t *spa);
extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
uint64_t txg, uint64_t size);
extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
Expand Down
9 changes: 7 additions & 2 deletions include/sys/zil.h
Expand Up @@ -37,6 +37,9 @@
extern "C" {
#endif

struct dsl_pool;
struct dsl_dataset;

/*
* Intent log format:
*
Expand Down Expand Up @@ -466,8 +469,10 @@ extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
extern void zil_commit(zilog_t *zilog, uint64_t oid);

extern int zil_vdev_offline(const char *osname, void *txarg);
extern int zil_claim(const char *osname, void *txarg);
extern int zil_check_log_chain(const char *osname, void *txarg);
extern int zil_claim(struct dsl_pool *dp,
struct dsl_dataset *ds, void *txarg);
extern int zil_check_log_chain(struct dsl_pool *dp,
struct dsl_dataset *ds, void *tx);
extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);

Expand Down
221 changes: 182 additions & 39 deletions module/zfs/dmu_objset.c
Expand Up @@ -24,6 +24,7 @@
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
*/

/* Portions Copyright 2010 Robert Milkowski */
Expand All @@ -48,13 +49,24 @@
#include <sys/sa.h>
#include <sys/zfs_onexit.h>
#include <sys/dsl_destroy.h>
#include <sys/vdev.h>

/*
* Needed to close a window in dnode_move() that allows the objset to be freed
* before it can be safely accessed.
*/
krwlock_t os_lock;

/*
* Tunable to overwrite the maximum number of threads for the parallization
* of dmu_objset_find_dp, needed to speed up the import of pools with many
* datasets.
* Default is 4 times the number of leaf vdevs.
*/
int dmu_find_threads = 0;

static void dmu_objset_find_dp_cb(void *arg);

void
dmu_objset_init(void)
{
Expand Down Expand Up @@ -504,6 +516,25 @@ dmu_objset_hold(const char *name, void *tag, objset_t **osp)
return (err);
}

static int
dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
boolean_t readonly, void *tag, objset_t **osp)
{
int err;

err = dmu_objset_from_ds(ds, osp);
if (err != 0) {
dsl_dataset_disown(ds, tag);
} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
dsl_dataset_disown(ds, tag);
return (SET_ERROR(EINVAL));
} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
dsl_dataset_disown(ds, tag);
return (SET_ERROR(EROFS));
}
return (err);
}

/*
* dsl_pool must not be held when this is called.
* Upon successful return, there will be a longhold on the dataset,
Expand All @@ -525,21 +556,26 @@ dmu_objset_own(const char *name, dmu_objset_type_t type,
dsl_pool_rele(dp, FTAG);
return (err);
}

err = dmu_objset_from_ds(ds, osp);
err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
dsl_pool_rele(dp, FTAG);
if (err != 0) {
dsl_dataset_disown(ds, tag);
} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
dsl_dataset_disown(ds, tag);
return (SET_ERROR(EINVAL));
} else if (!readonly && ds->ds_is_snapshot) {
dsl_dataset_disown(ds, tag);
return (SET_ERROR(EROFS));
}

return (err);
}

int
dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
boolean_t readonly, void *tag, objset_t **osp)
{
dsl_dataset_t *ds;
int err;

err = dsl_dataset_own_obj(dp, obj, tag, &ds);
if (err != 0)
return (err);

return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
}

void
dmu_objset_rele(objset_t *os, void *tag)
{
Expand Down Expand Up @@ -1618,30 +1654,41 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name,
return (0);
}

/*
* Find objsets under and including ddobj, call func(ds) on each.
*/
int
dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
typedef struct dmu_objset_find_ctx {
taskq_t *dc_tq;
dsl_pool_t *dc_dp;
uint64_t dc_ddobj;
int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
void *dc_arg;
int dc_flags;
kmutex_t *dc_error_lock;
int *dc_error;
} dmu_objset_find_ctx_t;

static void
dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
{
dsl_pool_t *dp = dcp->dc_dp;
dmu_objset_find_ctx_t *child_dcp;
dsl_dir_t *dd;
dsl_dataset_t *ds;
zap_cursor_t zc;
zap_attribute_t *attr;
uint64_t thisobj;
int err;
int err = 0;

ASSERT(dsl_pool_config_held(dp));
/* don't process if there already was an error */
if (*dcp->dc_error != 0)
goto out;

err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd);
if (err != 0)
return (err);
goto out;

/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
if (dd->dd_myname[0] == '$') {
dsl_dir_rele(dd, FTAG);
return (0);
goto out;
}

thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
Expand All @@ -1650,7 +1697,7 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
/*
* Iterate over all children.
*/
if (flags & DS_FIND_CHILDREN) {
if (dcp->dc_flags & DS_FIND_CHILDREN) {
for (zap_cursor_init(&zc, dp->dp_meta_objset,
dsl_dir_phys(dd)->dd_child_dir_zapobj);
zap_cursor_retrieve(&zc, attr) == 0;
Expand All @@ -1659,24 +1706,22 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
sizeof (uint64_t));
ASSERT3U(attr->za_num_integers, ==, 1);

err = dmu_objset_find_dp(dp, attr->za_first_integer,
func, arg, flags);
if (err != 0)
break;
child_dcp = kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
*child_dcp = *dcp;
child_dcp->dc_ddobj = attr->za_first_integer;
if (dcp->dc_tq != NULL)
(void) taskq_dispatch(dcp->dc_tq,
dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
else
dmu_objset_find_dp_impl(child_dcp);
}
zap_cursor_fini(&zc);

if (err != 0) {
dsl_dir_rele(dd, FTAG);
kmem_free(attr, sizeof (zap_attribute_t));
return (err);
}
}

/*
* Iterate over all snapshots.
*/
if (flags & DS_FIND_SNAPSHOTS) {
if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
dsl_dataset_t *ds;
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);

Expand All @@ -1697,7 +1742,7 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
attr->za_first_integer, FTAG, &ds);
if (err != 0)
break;
err = func(dp, ds, arg);
err = dcp->dc_func(dp, ds, dcp->dc_arg);
dsl_dataset_rele(ds, FTAG);
if (err != 0)
break;
Expand All @@ -1710,17 +1755,115 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
kmem_free(attr, sizeof (zap_attribute_t));

if (err != 0)
return (err);
goto out;

/*
* Apply to self.
*/
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
if (err != 0)
return (err);
err = func(dp, ds, arg);
goto out;
err = dcp->dc_func(dp, ds, dcp->dc_arg);
dsl_dataset_rele(ds, FTAG);
return (err);

out:
if (err != 0) {
mutex_enter(dcp->dc_error_lock);
/* only keep first error */
if (*dcp->dc_error == 0)
*dcp->dc_error = err;
mutex_exit(dcp->dc_error_lock);
}

kmem_free(dcp, sizeof (*dcp));
}

static void
dmu_objset_find_dp_cb(void *arg)
{
dmu_objset_find_ctx_t *dcp = arg;
dsl_pool_t *dp = dcp->dc_dp;

dsl_pool_config_enter(dp, FTAG);

dmu_objset_find_dp_impl(dcp);

dsl_pool_config_exit(dp, FTAG);
}

/*
* Find objsets under and including ddobj, call func(ds) on each.
* The order for the enumeration is completely undefined.
* func is called with dsl_pool_config held.
*/
int
dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
{
int error = 0;
taskq_t *tq = NULL;
int ntasks;
dmu_objset_find_ctx_t *dcp;
kmutex_t err_lock;

mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
dcp->dc_tq = NULL;
dcp->dc_dp = dp;
dcp->dc_ddobj = ddobj;
dcp->dc_func = func;
dcp->dc_arg = arg;
dcp->dc_flags = flags;
dcp->dc_error_lock = &err_lock;
dcp->dc_error = &error;

if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
/*
* In case a write lock is held we can't make use of
* parallelism, as down the stack of the worker threads
* the lock is asserted via dsl_pool_config_held.
* In case of a read lock this is solved by getting a read
* lock in each worker thread, which isn't possible in case
* of a writer lock. So we fall back to the synchronous path
* here.
* In the future it might be possible to get some magic into
* dsl_pool_config_held in a way that it returns true for
* the worker threads so that a single lock held from this
* thread suffices. For now, stay single threaded.
*/
dmu_objset_find_dp_impl(dcp);

return (error);
}

ntasks = dmu_find_threads;
if (ntasks == 0)
ntasks = vdev_count_leaves(dp->dp_spa) * 4;
tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
INT_MAX, 0);
if (tq == NULL) {
kmem_free(dcp, sizeof (*dcp));
return (SET_ERROR(ENOMEM));
}
dcp->dc_tq = tq;

/* dcp will be freed by task */
(void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);

/*
* PORTING: this code relies on the property of taskq_wait to wait
* until no more tasks are queued and no more tasks are active. As
* we always queue new tasks from within other tasks, task_wait
* reliably waits for the full recursion to finish, even though we
* enqueue new tasks after taskq_wait has been called.
* On platforms other than illumos, taskq_wait may not have this
* property.
*/
taskq_wait(tq);
taskq_destroy(tq);
mutex_destroy(&err_lock);

return (error);
}

/*
Expand Down

0 comments on commit 9c43027

Please sign in to comment.