Skip to content

Commit 9c43027

Browse files
Arne Jansen and behlendorf
authored and committed
Illumos 5269 - zpool import slow
5269 zpool import slow Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: George Wilson <george@delphix.com> Reviewed by: Dan McDonald <danmcd@omniti.com> Approved by: Dan McDonald <danmcd@omniti.com> References: https://www.illumos.org/issues/5269 illumos/illumos-gate@12380e1e Ported-by: DHE <git@dehacked.net> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #3396
1 parent d050c62 commit 9c43027

File tree

10 files changed

+240
-60
lines changed

10 files changed

+240
-60
lines changed

include/sys/dmu.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ void zfs_znode_byteswap(void *buf, size_t size);
240240

241241
#define DS_FIND_SNAPSHOTS (1<<0)
242242
#define DS_FIND_CHILDREN (1<<1)
243+
#define DS_FIND_SERIALIZE (1<<2)
243244

244245
/*
245246
* The maximum number of bytes that can be accessed as part of one

include/sys/dmu_objset.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,8 @@ struct objset {
142142
int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
143143
int dmu_objset_own(const char *name, dmu_objset_type_t type,
144144
boolean_t readonly, void *tag, objset_t **osp);
145+
int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj,
146+
dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp);
145147
void dmu_objset_refresh_ownership(objset_t *os, void *tag);
146148
void dmu_objset_rele(objset_t *os, void *tag);
147149
void dmu_objset_disown(objset_t *os, void *tag);

include/sys/dsl_pool.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp);
158158
void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
159159
void dsl_pool_config_exit(dsl_pool_t *dp, void *tag);
160160
boolean_t dsl_pool_config_held(dsl_pool_t *dp);
161+
boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp);
161162

162163
taskq_t *dsl_pool_iput_taskq(dsl_pool_t *dp);
163164

include/sys/vdev.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio);
5959
extern boolean_t vdev_is_bootable(vdev_t *vd);
6060
extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
6161
extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
62+
extern int vdev_count_leaves(spa_t *spa);
6263
extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
6364
uint64_t txg, uint64_t size);
6465
extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,

include/sys/zil.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@
3737
extern "C" {
3838
#endif
3939

40+
struct dsl_pool;
41+
struct dsl_dataset;
42+
4043
/*
4144
* Intent log format:
4245
*
@@ -466,8 +469,10 @@ extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
466469
extern void zil_commit(zilog_t *zilog, uint64_t oid);
467470

468471
extern int zil_vdev_offline(const char *osname, void *txarg);
469-
extern int zil_claim(const char *osname, void *txarg);
470-
extern int zil_check_log_chain(const char *osname, void *txarg);
472+
extern int zil_claim(struct dsl_pool *dp,
473+
struct dsl_dataset *ds, void *txarg);
474+
extern int zil_check_log_chain(struct dsl_pool *dp,
475+
struct dsl_dataset *ds, void *tx);
471476
extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
472477
extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);
473478

module/zfs/dmu_objset.c

Lines changed: 182 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
2525
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
2626
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
27+
* Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
2728
*/
2829

2930
/* Portions Copyright 2010 Robert Milkowski */
@@ -48,13 +49,24 @@
4849
#include <sys/sa.h>
4950
#include <sys/zfs_onexit.h>
5051
#include <sys/dsl_destroy.h>
52+
#include <sys/vdev.h>
5153

5254
/*
5355
* Needed to close a window in dnode_move() that allows the objset to be freed
5456
* before it can be safely accessed.
5557
*/
5658
krwlock_t os_lock;
5759

60+
/*
61+
* Tunable to overwrite the maximum number of threads for the parallization
62+
* of dmu_objset_find_dp, needed to speed up the import of pools with many
63+
* datasets.
64+
* Default is 4 times the number of leaf vdevs.
65+
*/
66+
int dmu_find_threads = 0;
67+
68+
static void dmu_objset_find_dp_cb(void *arg);
69+
5870
void
5971
dmu_objset_init(void)
6072
{
@@ -504,6 +516,25 @@ dmu_objset_hold(const char *name, void *tag, objset_t **osp)
504516
return (err);
505517
}
506518

519+
static int
520+
dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
521+
boolean_t readonly, void *tag, objset_t **osp)
522+
{
523+
int err;
524+
525+
err = dmu_objset_from_ds(ds, osp);
526+
if (err != 0) {
527+
dsl_dataset_disown(ds, tag);
528+
} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
529+
dsl_dataset_disown(ds, tag);
530+
return (SET_ERROR(EINVAL));
531+
} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
532+
dsl_dataset_disown(ds, tag);
533+
return (SET_ERROR(EROFS));
534+
}
535+
return (err);
536+
}
537+
507538
/*
508539
* dsl_pool must not be held when this is called.
509540
* Upon successful return, there will be a longhold on the dataset,
@@ -525,21 +556,26 @@ dmu_objset_own(const char *name, dmu_objset_type_t type,
525556
dsl_pool_rele(dp, FTAG);
526557
return (err);
527558
}
528-
529-
err = dmu_objset_from_ds(ds, osp);
559+
err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
530560
dsl_pool_rele(dp, FTAG);
531-
if (err != 0) {
532-
dsl_dataset_disown(ds, tag);
533-
} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
534-
dsl_dataset_disown(ds, tag);
535-
return (SET_ERROR(EINVAL));
536-
} else if (!readonly && ds->ds_is_snapshot) {
537-
dsl_dataset_disown(ds, tag);
538-
return (SET_ERROR(EROFS));
539-
}
561+
540562
return (err);
541563
}
542564

565+
int
566+
dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
567+
boolean_t readonly, void *tag, objset_t **osp)
568+
{
569+
dsl_dataset_t *ds;
570+
int err;
571+
572+
err = dsl_dataset_own_obj(dp, obj, tag, &ds);
573+
if (err != 0)
574+
return (err);
575+
576+
return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
577+
}
578+
543579
void
544580
dmu_objset_rele(objset_t *os, void *tag)
545581
{
@@ -1618,30 +1654,41 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name,
16181654
return (0);
16191655
}
16201656

1621-
/*
1622-
* Find objsets under and including ddobj, call func(ds) on each.
1623-
*/
1624-
int
1625-
dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
1626-
int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
1657+
typedef struct dmu_objset_find_ctx {
1658+
taskq_t *dc_tq;
1659+
dsl_pool_t *dc_dp;
1660+
uint64_t dc_ddobj;
1661+
int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
1662+
void *dc_arg;
1663+
int dc_flags;
1664+
kmutex_t *dc_error_lock;
1665+
int *dc_error;
1666+
} dmu_objset_find_ctx_t;
1667+
1668+
static void
1669+
dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
16271670
{
1671+
dsl_pool_t *dp = dcp->dc_dp;
1672+
dmu_objset_find_ctx_t *child_dcp;
16281673
dsl_dir_t *dd;
16291674
dsl_dataset_t *ds;
16301675
zap_cursor_t zc;
16311676
zap_attribute_t *attr;
16321677
uint64_t thisobj;
1633-
int err;
1678+
int err = 0;
16341679

1635-
ASSERT(dsl_pool_config_held(dp));
1680+
/* don't process if there already was an error */
1681+
if (*dcp->dc_error != 0)
1682+
goto out;
16361683

1637-
err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
1684+
err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd);
16381685
if (err != 0)
1639-
return (err);
1686+
goto out;
16401687

16411688
/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
16421689
if (dd->dd_myname[0] == '$') {
16431690
dsl_dir_rele(dd, FTAG);
1644-
return (0);
1691+
goto out;
16451692
}
16461693

16471694
thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
@@ -1650,7 +1697,7 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
16501697
/*
16511698
* Iterate over all children.
16521699
*/
1653-
if (flags & DS_FIND_CHILDREN) {
1700+
if (dcp->dc_flags & DS_FIND_CHILDREN) {
16541701
for (zap_cursor_init(&zc, dp->dp_meta_objset,
16551702
dsl_dir_phys(dd)->dd_child_dir_zapobj);
16561703
zap_cursor_retrieve(&zc, attr) == 0;
@@ -1659,24 +1706,22 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
16591706
sizeof (uint64_t));
16601707
ASSERT3U(attr->za_num_integers, ==, 1);
16611708

1662-
err = dmu_objset_find_dp(dp, attr->za_first_integer,
1663-
func, arg, flags);
1664-
if (err != 0)
1665-
break;
1709+
child_dcp = kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
1710+
*child_dcp = *dcp;
1711+
child_dcp->dc_ddobj = attr->za_first_integer;
1712+
if (dcp->dc_tq != NULL)
1713+
(void) taskq_dispatch(dcp->dc_tq,
1714+
dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
1715+
else
1716+
dmu_objset_find_dp_impl(child_dcp);
16661717
}
16671718
zap_cursor_fini(&zc);
1668-
1669-
if (err != 0) {
1670-
dsl_dir_rele(dd, FTAG);
1671-
kmem_free(attr, sizeof (zap_attribute_t));
1672-
return (err);
1673-
}
16741719
}
16751720

16761721
/*
16771722
* Iterate over all snapshots.
16781723
*/
1679-
if (flags & DS_FIND_SNAPSHOTS) {
1724+
if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
16801725
dsl_dataset_t *ds;
16811726
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
16821727

@@ -1697,7 +1742,7 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
16971742
attr->za_first_integer, FTAG, &ds);
16981743
if (err != 0)
16991744
break;
1700-
err = func(dp, ds, arg);
1745+
err = dcp->dc_func(dp, ds, dcp->dc_arg);
17011746
dsl_dataset_rele(ds, FTAG);
17021747
if (err != 0)
17031748
break;
@@ -1710,17 +1755,115 @@ dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
17101755
kmem_free(attr, sizeof (zap_attribute_t));
17111756

17121757
if (err != 0)
1713-
return (err);
1758+
goto out;
17141759

17151760
/*
17161761
* Apply to self.
17171762
*/
17181763
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
17191764
if (err != 0)
1720-
return (err);
1721-
err = func(dp, ds, arg);
1765+
goto out;
1766+
err = dcp->dc_func(dp, ds, dcp->dc_arg);
17221767
dsl_dataset_rele(ds, FTAG);
1723-
return (err);
1768+
1769+
out:
1770+
if (err != 0) {
1771+
mutex_enter(dcp->dc_error_lock);
1772+
/* only keep first error */
1773+
if (*dcp->dc_error == 0)
1774+
*dcp->dc_error = err;
1775+
mutex_exit(dcp->dc_error_lock);
1776+
}
1777+
1778+
kmem_free(dcp, sizeof (*dcp));
1779+
}
1780+
1781+
static void
1782+
dmu_objset_find_dp_cb(void *arg)
1783+
{
1784+
dmu_objset_find_ctx_t *dcp = arg;
1785+
dsl_pool_t *dp = dcp->dc_dp;
1786+
1787+
dsl_pool_config_enter(dp, FTAG);
1788+
1789+
dmu_objset_find_dp_impl(dcp);
1790+
1791+
dsl_pool_config_exit(dp, FTAG);
1792+
}
1793+
1794+
/*
1795+
* Find objsets under and including ddobj, call func(ds) on each.
1796+
* The order for the enumeration is completely undefined.
1797+
* func is called with dsl_pool_config held.
1798+
*/
1799+
int
1800+
dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
1801+
int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
1802+
{
1803+
int error = 0;
1804+
taskq_t *tq = NULL;
1805+
int ntasks;
1806+
dmu_objset_find_ctx_t *dcp;
1807+
kmutex_t err_lock;
1808+
1809+
mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
1810+
dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
1811+
dcp->dc_tq = NULL;
1812+
dcp->dc_dp = dp;
1813+
dcp->dc_ddobj = ddobj;
1814+
dcp->dc_func = func;
1815+
dcp->dc_arg = arg;
1816+
dcp->dc_flags = flags;
1817+
dcp->dc_error_lock = &err_lock;
1818+
dcp->dc_error = &error;
1819+
1820+
if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
1821+
/*
1822+
* In case a write lock is held we can't make use of
1823+
* parallelism, as down the stack of the worker threads
1824+
* the lock is asserted via dsl_pool_config_held.
1825+
* In case of a read lock this is solved by getting a read
1826+
* lock in each worker thread, which isn't possible in case
1827+
* of a writer lock. So we fall back to the synchronous path
1828+
* here.
1829+
* In the future it might be possible to get some magic into
1830+
* dsl_pool_config_held in a way that it returns true for
1831+
* the worker threads so that a single lock held from this
1832+
* thread suffices. For now, stay single threaded.
1833+
*/
1834+
dmu_objset_find_dp_impl(dcp);
1835+
1836+
return (error);
1837+
}
1838+
1839+
ntasks = dmu_find_threads;
1840+
if (ntasks == 0)
1841+
ntasks = vdev_count_leaves(dp->dp_spa) * 4;
1842+
tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
1843+
INT_MAX, 0);
1844+
if (tq == NULL) {
1845+
kmem_free(dcp, sizeof (*dcp));
1846+
return (SET_ERROR(ENOMEM));
1847+
}
1848+
dcp->dc_tq = tq;
1849+
1850+
/* dcp will be freed by task */
1851+
(void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
1852+
1853+
/*
1854+
* PORTING: this code relies on the property of taskq_wait to wait
1855+
* until no more tasks are queued and no more tasks are active. As
1856+
* we always queue new tasks from within other tasks, task_wait
1857+
* reliably waits for the full recursion to finish, even though we
1858+
* enqueue new tasks after taskq_wait has been called.
1859+
* On platforms other than illumos, taskq_wait may not have this
1860+
* property.
1861+
*/
1862+
taskq_wait(tq);
1863+
taskq_destroy(tq);
1864+
mutex_destroy(&err_lock);
1865+
1866+
return (error);
17241867
}
17251868

17261869
/*

0 commit comments

Comments
 (0)