
port async unlinked drain from illumos-nexenta

This patch is an async implementation of the existing sync
zfs_unlinked_drain() function. This function is called at mount time and
is responsible for freeing znodes that we didn't get to free before.
We don't have to hold up mounting of the dataset until the unlinked set
is fully drained, as is done now. Since we can process the unlinked set
asynchronously, this results in a better user experience when mounting a
dataset with entries in the unlinked set.

Signed-off-by: Alek Pinchuk <apinchuk@datto.com>
alek-p committed Dec 6, 2018
1 parent 9042f60 commit 33585da281652798b9766dd25518c5c6b8f12376
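For orientation before the diff: the patch marks the filesystem as draining, tries to hand zfs_unlinked_drain_task() to a dedicated taskq, and falls back to the old synchronous drain only if dispatch fails. A minimal userspace sketch of that dispatch-or-fall-back shape, with a pthread standing in for the kernel taskq (the sketch's names are illustrative, not part of the patch):

#include <pthread.h>
#include <stdio.h>

/*
 * Toy model of zfs_unlinked_drain()'s dispatch-or-fall-back logic:
 * hand the drain to a worker (the kernel uses a dedicated taskq) and
 * run it inline only if that fails, as the sync version always did.
 */
static void *drain_task(void *arg)
{
	int *entries = arg;

	while (*entries > 0)
		(*entries)--;	/* stands in for freeing one znode */
	return (NULL);
}

int main(void)
{
	pthread_t task;
	int entries = 1000;

	if (pthread_create(&task, NULL, drain_task, &entries) != 0) {
		/* Async dispatch failed: drain synchronously instead. */
		drain_task(&entries);
	} else {
		pthread_join(task, NULL);
	}
	printf("unlinked entries left: %d\n", entries);
	return (0);
}

Build with cc -pthread; either path leaves the set fully drained, which is exactly the property the mount path relies on.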
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2018 Datto Inc.
  */

 #ifndef _SYS_DATASET_KSTATS_H
@@ -35,6 +36,8 @@ typedef struct dataset_aggsum_stats_t {
 	aggsum_t das_nwritten;
 	aggsum_t das_reads;
 	aggsum_t das_nread;
+	aggsum_t das_nunlinks;
+	aggsum_t das_nunlinked;
 } dataset_aggsum_stats_t;

 typedef struct dataset_kstat_values {
@@ -43,6 +46,8 @@ typedef struct dataset_kstat_values {
 	kstat_named_t dkv_nwritten;
 	kstat_named_t dkv_reads;
 	kstat_named_t dkv_nread;
+	kstat_named_t dkv_nunlinks;	/* num znodes added to unlinked set */
+	kstat_named_t dkv_nunlinked;	/* num znodes currently in unlinked set */
 } dataset_kstat_values_t;

 typedef struct dataset_kstats {
@@ -56,4 +61,7 @@ void dataset_kstats_destroy(dataset_kstats_t *);
 void dataset_kstats_update_write_kstats(dataset_kstats_t *, int64_t);
 void dataset_kstats_update_read_kstats(dataset_kstats_t *, int64_t);

+void dataset_kstats_inc_nunlinks_kstat(dataset_kstats_t *);
+void dataset_kstats_update_nunlinked_kstat(dataset_kstats_t *, int64_t);
+
 #endif /* _SYS_DATASET_KSTATS_H */
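Both counters are aggsums rather than plain integers, so the hot unlink/free paths pay only an aggsum_add() and the totals are folded together when the kstat is read (see the dataset_kstats.c hunks below). A toy model of that init/add/value lifecycle, using one C11 atomic where the real aggsum shards across CPUs (the toy_* names are stand-ins, not ZFS API):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Toy aggsum: a single atomic; the real one shards across CPUs. */
typedef struct { atomic_llong value; } toy_aggsum_t;

static void toy_aggsum_init(toy_aggsum_t *as, int64_t v)
{
	atomic_store(&as->value, v);
}

static void toy_aggsum_add(toy_aggsum_t *as, int64_t delta)
{
	atomic_fetch_add(&as->value, delta);
}

static int64_t toy_aggsum_value(toy_aggsum_t *as)
{
	return (atomic_load(&as->value));
}

int main(void)
{
	toy_aggsum_t nunlinks, nunlinked;

	toy_aggsum_init(&nunlinks, 0);
	toy_aggsum_init(&nunlinked, 0);
	/* An unlink adds to the set; a later free drains one entry. */
	toy_aggsum_add(&nunlinks, 1);
	toy_aggsum_add(&nunlinked, 1);
	toy_aggsum_add(&nunlinked, -1);
	printf("nunlinks=%lld nunlinked=%lld\n",
	    (long long)toy_aggsum_value(&nunlinks),
	    (long long)toy_aggsum_value(&nunlinked));
	return (0);
}

Note the shapes differ: nunlinks only ever grows, while nunlinked (seeded with the set's size at mount, +1 on add, -1 on free, as the later hunks show) tracks how many entries are currently in the set.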
@@ -96,6 +96,7 @@ typedef struct dsl_pool {
 	struct dsl_dataset *dp_origin_snap;
 	uint64_t dp_root_dir_obj;
 	struct taskq *dp_iput_taskq;
+	struct taskq *dp_unlinked_drain_taskq;

 	/* No lock needed - sync context only */
 	blkptr_t dp_meta_rootbp;
@@ -176,6 +177,7 @@ boolean_t dsl_pool_config_held(dsl_pool_t *dp);
 boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp);
 taskq_t *dsl_pool_iput_taskq(dsl_pool_t *dp);
+taskq_t *dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp);

 int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj,
     const char *tag, uint64_t now, dmu_tx_t *tx);
@@ -64,6 +64,7 @@ extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old);
 extern boolean_t zfs_dirempty(znode_t *);
 extern void zfs_unlinked_add(znode_t *, dmu_tx_t *);
 extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs);
+extern void zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs);
 extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr);
 extern int zfs_get_xattrdir(znode_t *, struct inode **, cred_t *, int);
 extern int zfs_make_xattrdir(znode_t *, vattr_t *, struct inode **, cred_t *);
@@ -132,6 +132,10 @@ struct zfsvfs {
 	uint64_t z_hold_size;		/* znode hold array size */
 	avl_tree_t *z_hold_trees;	/* znode hold trees */
 	kmutex_t *z_hold_locks;		/* znode hold locks */
+	/* for controlling async zfs_unlinked_drain */
+	taskqid_t z_drain_task;		/* task id for the unlink drain task */
+	boolean_t z_draining;		/* true when drain is active */
+	boolean_t z_drain_continue;	/* used to signal the drain to stop */
 };

 #define ZSB_XATTR	0x0001		/* Enable user xattrs */
@@ -1149,6 +1149,18 @@ Rate limit IO delay events to this many per second.
Default value: 20
.RE
.sp
.ne 2
.na
\fBzfs_keep_unlinked\fR (uint)
.ad
.RS 12n
When enabled prevents freeing of files until unlinked drain is called.
This option is used by the test suite to facilitate testing.
.sp
Uses \fB0\fR (default) to disable and \fB1\fR to enable.
.RE
.sp
.ne 2
.na
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2018 Datto Inc.
  */

 #include <sys/dataset_kstats.h>
@@ -34,6 +35,8 @@ static dataset_kstat_values_t empty_dataset_kstats = {
 	{ "nwritten",	KSTAT_DATA_UINT64 },
 	{ "reads",	KSTAT_DATA_UINT64 },
 	{ "nread",	KSTAT_DATA_UINT64 },
+	{ "nunlinks",	KSTAT_DATA_UINT64 },
+	{ "nunlinked",	KSTAT_DATA_UINT64 },
 };

 static int
@@ -54,6 +57,10 @@ dataset_kstats_update(kstat_t *ksp, int rw)
 	    aggsum_value(&dk->dk_aggsums.das_reads);
 	dkv->dkv_nread.value.ui64 =
 	    aggsum_value(&dk->dk_aggsums.das_nread);
+	dkv->dkv_nunlinks.value.ui64 =
+	    aggsum_value(&dk->dk_aggsums.das_nunlinks);
+	dkv->dkv_nunlinked.value.ui64 =
+	    aggsum_value(&dk->dk_aggsums.das_nunlinked);

 	return (0);
 }
@@ -136,6 +143,8 @@ dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset)
 	aggsum_init(&dk->dk_aggsums.das_nwritten, 0);
 	aggsum_init(&dk->dk_aggsums.das_reads, 0);
 	aggsum_init(&dk->dk_aggsums.das_nread, 0);
+	aggsum_init(&dk->dk_aggsums.das_nunlinks, 0);
+	aggsum_init(&dk->dk_aggsums.das_nunlinked, 0);
 }

 void
@@ -156,6 +165,8 @@ dataset_kstats_destroy(dataset_kstats_t *dk)
 	aggsum_fini(&dk->dk_aggsums.das_nwritten);
 	aggsum_fini(&dk->dk_aggsums.das_reads);
 	aggsum_fini(&dk->dk_aggsums.das_nread);
+	aggsum_fini(&dk->dk_aggsums.das_nunlinks);
+	aggsum_fini(&dk->dk_aggsums.das_nunlinked);
 }

 void
@@ -183,3 +194,21 @@ dataset_kstats_update_read_kstats(dataset_kstats_t *dk,
 	aggsum_add(&dk->dk_aggsums.das_reads, 1);
 	aggsum_add(&dk->dk_aggsums.das_nread, nread);
 }
+
+void
+dataset_kstats_inc_nunlinks_kstat(dataset_kstats_t *dk)
+{
+	if (dk->dk_kstats == NULL)
+		return;
+
+	aggsum_add(&dk->dk_aggsums.das_nunlinks, 1);
+}
+
+void
+dataset_kstats_update_nunlinked_kstat(dataset_kstats_t *dk, int64_t delta)
+{
+	if (dk->dk_kstats == NULL)
+		return;
+
+	aggsum_add(&dk->dk_aggsums.das_nunlinked, delta);
+}
@@ -223,6 +223,9 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
 	dp->dp_iput_taskq = taskq_create("z_iput", max_ncpus, defclsyspri,
 	    max_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+	dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain",
+	    max_ncpus, defclsyspri, max_ncpus, INT_MAX,
+	    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);

 	return (dp);
 }
@@ -413,6 +416,7 @@ dsl_pool_close(dsl_pool_t *dp)
 	rrw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
 	cv_destroy(&dp->dp_spaceavail_cv);
+	taskq_destroy(dp->dp_unlinked_drain_taskq);
 	taskq_destroy(dp->dp_iput_taskq);
 	if (dp->dp_blkstats != NULL) {
 		mutex_destroy(&dp->dp_blkstats->zab_lock);
@@ -1095,6 +1099,12 @@ dsl_pool_iput_taskq(dsl_pool_t *dp)
 	return (dp->dp_iput_taskq);
 }

+taskq_t *
+dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp)
+{
+	return (dp->dp_unlinked_drain_taskq);
+}
+
 /*
  * Walk through the pool-wide zap object of temporary snapshot user holds
  * and release them.
@@ -458,26 +458,32 @@ zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
 	VERIFY3U(0, ==,
 	    zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+	dataset_kstats_inc_nunlinks_kstat(&zfsvfs->z_kstat);
+	dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);
 }

 /*
  * Clean up any znodes that had no links when we either crashed or
  * (force) umounted the file system.
  */
-void
-zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+static void
+zfs_unlinked_drain_task(void *zvfs)
 {
+	zfsvfs_t *zfsvfs = zvfs;
 	zap_cursor_t zc;
 	zap_attribute_t zap;
 	dmu_object_info_t doi;
 	znode_t *zp;
 	int error;

+	ASSERT3B(zfsvfs->z_draining, ==, B_TRUE);
+
 	/*
 	 * Iterate over the contents of the unlinked set.
 	 */
 	for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
-	    zap_cursor_retrieve(&zc, &zap) == 0;
+	    zap_cursor_retrieve(&zc, &zap) == 0 && zfsvfs->z_drain_continue;
 	    zap_cursor_advance(&zc)) {

 		/*
@@ -508,8 +514,50 @@ zfs_unlinked_drain(zfsvfs_t *zfsvfs)
 		zp->z_unlinked = B_TRUE;
 		iput(ZTOI(zp));
+
+		ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
 	}
 	zap_cursor_fini(&zc);
+
+	zfsvfs->z_draining = B_FALSE;
+	zfsvfs->z_drain_task = TASKQID_INVALID;
 }
+
+/*
+ * Sets z_draining, then tries to dispatch an async unlinked drain.
+ * If the dispatch fails, runs the unlinked drain synchronously.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+	ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+	ASSERT3B(zfsvfs->z_draining, ==, B_FALSE);
+
+	zfsvfs->z_draining = B_TRUE;
+	zfsvfs->z_drain_continue = B_TRUE;
+
+	if ((zfsvfs->z_drain_task = taskq_dispatch(dsl_pool_unlinked_drain_taskq
+	    (dmu_objset_pool(zfsvfs->z_os)), zfs_unlinked_drain_task, zfsvfs,
+	    TQ_SLEEP)) == TASKQID_INVALID) {
+		zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");
+		zfs_unlinked_drain_task(zfsvfs);
+	}
+}
+
+/*
+ * Wait for the unlinked drain taskq task to stop. This will interrupt
+ * the processing of the unlinked set if it is in progress.
+ */
+void
+zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)
+{
+	ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+
+	if (zfsvfs->z_draining) {
+		zfsvfs->z_drain_continue = B_FALSE;
+		taskq_wait_id(dsl_pool_unlinked_drain_taskq(
+		    dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task);
+	}
+
+	ASSERT3B(zfsvfs->z_draining, ==, B_FALSE);
+}

 /*
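The stop path above is a cooperative handshake: zfs_unlinked_drain_stop_wait() clears z_drain_continue and then blocks in taskq_wait_id() until the drain loop notices the flag and the task exits. Nothing is lost on interruption, since the unlinked set is persistent and gets drained again on the next mount or in zfs_resume_fs() (see the later hunk). A userspace analogue of the handshake, with C11 atomics and a pthread in place of the taskq (illustrative names only):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool drain_continue = true;

/* Drain loop: give up as soon as the stop flag is observed. */
static void *drain_task(void *arg)
{
	long *left = arg;

	while (*left > 0 && atomic_load(&drain_continue))
		(*left)--;	/* free one entry from the unlinked set */
	return (NULL);
}

int main(void)
{
	pthread_t task;
	long left = 1L << 30;	/* big enough that we usually interrupt it */

	if (pthread_create(&task, NULL, drain_task, &left) != 0)
		return (1);
	/* stop_wait analogue: signal the task, then wait for it to exit. */
	atomic_store(&drain_continue, false);
	pthread_join(task, NULL);
	printf("interrupted with %ld entries left\n", left);
	return (0);
}

The kernel version gets away with a plain boolean_t for the flag because taskq_wait_id() provides the synchronization at exit; the sketch uses an atomic since userspace has no such backstop.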
@@ -684,6 +732,8 @@ zfs_rmnode(znode_t *zp)
 	VERIFY3U(0, ==,
 	    zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));

+	dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, -1);
+
 	zfs_znode_delete(zp, tx);

 	dmu_tx_commit(tx);
@@ -1184,6 +1184,10 @@ zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
 		return (error);
 	}

+	zfsvfs->z_drain_task = TASKQID_INVALID;
+	zfsvfs->z_draining = B_FALSE;
+	zfsvfs->z_drain_continue = B_FALSE;
+
 	*zfvp = zfsvfs;
 	return (0);
 }
@@ -1206,14 +1210,27 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 	 * operations out since we closed the ZIL.
 	 */
 	if (mounting) {
+		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+		dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
+
 		/*
 		 * During replay we remove the read only flag to
 		 * allow replays to succeed.
 		 */
-		if (readonly != 0)
+		if (readonly != 0) {
 			readonly_changed_cb(zfsvfs, B_FALSE);
-		else
+		} else {
+			zap_stats_t zs;
+
+			if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
+			    &zs) == 0) {
+				dataset_kstats_update_nunlinked_kstat(
+				    &zfsvfs->z_kstat, zs.zs_num_entries);
+				dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
+				    "zfsvfs_setup() num_entries in unlinked "
+				    "set: %llu", zs.zs_num_entries);
+			}
+
 			zfs_unlinked_drain(zfsvfs);
+		}

 		/*
		 * Parse and replay the intent log.
@@ -1256,9 +1273,6 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 		/* restore readonly bit */
 		if (readonly != 0)
 			readonly_changed_cb(zfsvfs, B_TRUE);
-
-		ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
-		dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
 	}
/*
@@ -1637,6 +1651,8 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
{
znode_t *zp;
zfs_unlinked_drain_stop_wait(zfsvfs);
/*
* If someone has not already unmounted this file system,
* drain the iput_taskq to ensure all active references to the
@@ -1888,6 +1904,7 @@ zfs_preumount(struct super_block *sb)
 	/* zfsvfs is NULL when zfs_domount fails during mount */
 	if (zfsvfs) {
+		zfs_unlinked_drain_stop_wait(zfsvfs);
 		zfsctl_destroy(sb->s_fs_info);
 		/*
 		 * Wait for iput_async before entering evict_inodes in
@@ -2163,6 +2180,15 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
 	}
 	mutex_exit(&zfsvfs->z_znodes_lock);

+	if (!zfs_is_readonly(zfsvfs) && !zfsvfs->z_unmounted) {
+		/*
+		 * zfs_suspend_fs() could have interrupted freeing
+		 * of dnodes. We need to restart this freeing so
+		 * that we don't "leak" the space.
+		 */
+		zfs_unlinked_drain(zfsvfs);
+	}
+
 bail:
 	/* release the VFS ops */
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
@@ -91,6 +91,12 @@ static kmem_cache_t *znode_cache = NULL;
 static kmem_cache_t *znode_hold_cache = NULL;
 unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;

+/*
+ * Tunable used in zfs tests to prevent znodes from being freed, thereby
+ * keeping them in the unlinked set.
+ */
+unsigned int zfs_keep_unlinked = 0;
+
 /*
  * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
  * z_rangelock. It will modify the offset and length of the lock to reflect
@@ -1339,7 +1345,7 @@ zfs_zinactive(znode_t *zp)
 	 */
 	if (zp->z_unlinked) {
 		ASSERT(!zfsvfs->z_issnap);
-		if (!zfs_is_readonly(zfsvfs)) {
+		if (!zfs_is_readonly(zfsvfs) && zfs_keep_unlinked == 0) {
 			mutex_exit(&zp->z_lock);
 			zfs_znode_hold_exit(zfsvfs, zh);
 			zfs_rmnode(zp);
@@ -2214,4 +2220,6 @@ EXPORT_SYMBOL(zfs_obj_to_path);
 /* CSTYLED */
 module_param(zfs_object_mutex_size, uint, 0644);
 MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
+module_param(zfs_keep_unlinked, uint, 0644);
+MODULE_PARM_DESC(zfs_keep_unlinked, "Leak space into the unlinked set");
 #endif
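Once mounted, the new counters should be visible wherever this ZFS version exposes dataset kstats; on 0.8-era ZFS on Linux that is an objset-* node under /proc/spl/kstat/zfs/<pool>/ (path layout assumed here, not specified by this commit). A minimal reader to eyeball the nunlinks/nunlinked rows:

#include <stdio.h>

/*
 * Dump a dataset's kstat file, e.g.
 *   ./kstat-dump /proc/spl/kstat/zfs/tank/objset-0x36
 * and look for the nunlinks / nunlinked rows. The path is an
 * assumption based on 0.8-era ZFS on Linux, not part of this commit.
 */
int main(int argc, char **argv)
{
	char line[256];
	FILE *fp;

	if (argc != 2 || (fp = fopen(argv[1], "r")) == NULL) {
		fprintf(stderr, "usage: %s <objset kstat path>\n", argv[0]);
		return (1);
	}
	while (fgets(line, sizeof (line), fp) != NULL)
		fputs(line, stdout);
	fclose(fp);
	return (0);
}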