Skip to content

Commit

Permalink
Illumos #3104: eliminate empty bpobjs
Browse files Browse the repository at this point in the history
3104 eliminate empty bpobjs
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <chris.siden@delphix.com>
Reviewed by: Garrett D'Amore <garrett@damore.org>
Approved by: Eric Schrock <eric.schrock@delphix.com>

References:
  illumos/illumos-gate@f174573
  illumos changeset: 13782:8f78aae28a63
  https://www.illumos.org/issues/3104

Ported-by: Brian Behlendorf <behlendorf1@llnl.gov>
  • Loading branch information
ahrens authored and behlendorf committed Jan 8, 2013
1 parent 9157970 commit 753c383
Show file tree
Hide file tree
Showing 12 changed files with 166 additions and 11 deletions.
3 changes: 3 additions & 0 deletions include/sys/bpobj.h
Expand Up @@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/

#ifndef _SYS_BPOBJ_H
Expand Down Expand Up @@ -67,7 +68,9 @@ typedef struct bpobj {
typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);

uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx);
void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx);

int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
void bpobj_close(bpobj_t *bpo);
Expand Down
1 change: 1 addition & 0 deletions include/sys/dmu.h
Expand Up @@ -309,6 +309,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
#define DMU_POOL_SCAN "scan"
#define DMU_POOL_FREE_BPOBJ "free_bpobj"
#define DMU_POOL_BPTREE_OBJ "bptree_obj"
#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj"

/*
* Allocate an object from this objset. The range of object numbers
Expand Down
1 change: 1 addition & 0 deletions include/sys/dsl_pool.h
Expand Up @@ -96,6 +96,7 @@ typedef struct dsl_pool {
uint64_t dp_tmp_userrefs_obj;
bpobj_t dp_free_bpobj;
uint64_t dp_bptree_obj;
uint64_t dp_empty_bpobj;

struct dsl_scan *dp_scan;

Expand Down
2 changes: 2 additions & 0 deletions include/sys/zap.h
Expand Up @@ -300,6 +300,8 @@ int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
/* Here the key is an int and the value is a different int. */
int zap_add_int_key(objset_t *os, uint64_t obj,
uint64_t key, uint64_t value, dmu_tx_t *tx);
int zap_update_int_key(objset_t *os, uint64_t obj,
uint64_t key, uint64_t value, dmu_tx_t *tx);
int zap_lookup_int_key(objset_t *os, uint64_t obj,
uint64_t key, uint64_t *valuep);

Expand Down
1 change: 1 addition & 0 deletions include/zfeature_common.h
Expand Up @@ -51,6 +51,7 @@ typedef int (zfeature_func_t)(zfeature_info_t *fi, void *arg);

typedef enum spa_feature {
SPA_FEATURE_ASYNC_DESTROY,
SPA_FEATURE_EMPTY_BPOBJ,
SPA_FEATURES
} spa_feature_t;

Expand Down
28 changes: 28 additions & 0 deletions man/man5/zpool-features.5
Expand Up @@ -169,5 +169,33 @@ through the \fBfreeing\fR property.

This feature is only \fBactive\fR while \fBfreeing\fR is non\-zero.
.RE

.sp
.ne 2
.na
\fB\fBempty_bpobj\fR\fR
.ad
.RS 4n
.TS
l l .
GUID com.delphix:empty_bpobj
READ\-ONLY COMPATIBLE yes
DEPENDENCIES none
.TE

This feature increases the performance of creating and using a large
number of snapshots of a single filesystem or volume, and also reduces
the disk space required.

When there are many snapshots, each snapshot uses many Block Pointer
Objects (bpobjs) to track blocks associated with that snapshot.
However, in common use cases, most of these bpobjs are empty. This
feature allows us to create each bpobj on demand, thus eliminating the
empty bpobjs.

This feature is \fBactive\fR while there are any filesystems, volumes,
or snapshots which were created after enabling this feature.
.RE

.SH "SEE ALSO"
\fBzpool\fR(1M)
58 changes: 57 additions & 1 deletion module/zfs/bpobj.c
Expand Up @@ -20,13 +20,61 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/

#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/refcount.h>
#include <sys/dsl_pool.h>
#include <sys/zfeature.h>
#include <sys/zap.h>

/*
 * Return the object number of a bpobj for a caller that needs an empty one.
 *
 * When the empty_bpobj feature is not enabled, a new bpobj is allocated
 * with the requested blocksize and returned.
 *
 * When the feature is enabled, the pool-wide dummy object
 * (dp->dp_empty_bpobj) is returned instead and spa_feature_incr() is called
 * for the feature.  If the feature is not yet active, the dummy object is
 * first allocated (always with SPA_MAXBLOCKSIZE; 'blocksize' is not used on
 * this path) and recorded in the pool directory under DMU_POOL_EMPTY_BPOBJ.
 * See bpobj_decr_empty() for the matching decrement.
 */
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
	zfeature_info_t *feat = &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
	spa_t *spa = dmu_objset_spa(os);
	dsl_pool_t *dp = dmu_objset_pool(os);

	/* Feature disabled: fall back to a private, freshly allocated bpobj. */
	if (!spa_feature_is_enabled(spa, feat))
		return (bpobj_alloc(os, blocksize, tx));

	/* Feature enabled but not yet active: create the shared dummy. */
	if (!spa_feature_is_active(spa, feat)) {
		ASSERT3U(dp->dp_empty_bpobj, ==, 0);
		dp->dp_empty_bpobj = bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
		VERIFY(zap_add(os,
		    DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
		    &dp->dp_empty_bpobj, tx) == 0);
	}

	spa_feature_incr(spa, feat, tx);
	ASSERT(dp->dp_empty_bpobj != 0);

	return (dp->dp_empty_bpobj);
}

/*
 * Release one use of the pool-wide empty bpobj.
 *
 * Calls spa_feature_decr() for the empty_bpobj feature.  If the feature is
 * no longer active afterwards, the DMU_POOL_EMPTY_BPOBJ entry is removed
 * from the pool directory, the dummy object is freed, and
 * dp->dp_empty_bpobj is reset to 0.
 */
void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
	zfeature_info_t *feat = &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
	dsl_pool_t *dp = dmu_objset_pool(os);
	spa_t *spa = dmu_objset_spa(os);

	spa_feature_decr(spa, feat, tx);

	/* Still active after the decrement: the dummy object must stay. */
	if (spa_feature_is_active(spa, feat))
		return;

	/* Feature went inactive: tear down the shared dummy object. */
	VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_EMPTY_BPOBJ, tx));
	VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
	dp->dp_empty_bpobj = 0;
}

uint64_t
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
Expand All @@ -53,6 +101,7 @@ bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
int epb;
dmu_buf_t *dbuf = NULL;

ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));

mutex_enter(&bpo.bpo_lock);
Expand Down Expand Up @@ -320,6 +369,12 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)

ASSERT(bpo->bpo_havesubobj);
ASSERT(bpo->bpo_havecomp);
ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
bpobj_decr_empty(bpo->bpo_os, tx);
return;
}

VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
Expand Down Expand Up @@ -388,6 +443,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
blkptr_t *bparray;

ASSERT(!BP_IS_HOLE(bp));
ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

/* We never need the fill count. */
stored_bp.blk_fill = 0;
Expand Down
54 changes: 45 additions & 9 deletions module/zfs/dsl_deadlist.c
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
*/

#include <sys/dsl_dataset.h>
Expand Down Expand Up @@ -165,12 +165,49 @@ dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)

for (zap_cursor_init(&zc, os, dlobj);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc))
bpobj_free(os, za.za_first_integer, tx);
zap_cursor_advance(&zc)) {
uint64_t obj = za.za_first_integer;
if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
bpobj_decr_empty(os, tx);
else
bpobj_free(os, obj, tx);
}
zap_cursor_fini(&zc);
VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
}

/*
 * Enqueue 'bp' on the bpobj belonging to deadlist entry 'dle'.
 *
 * If the entry currently refers to the pool's shared empty bpobj, it is
 * first switched over to a newly allocated bpobj: the new object is
 * allocated, the entry's handle is closed, the empty-bpobj use is released
 * via bpobj_decr_empty(), the handle is reopened on the new object, and the
 * deadlist's entry keyed by dle_mintxg is updated to the new object number.
 */
static void
dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
    const blkptr_t *bp, dmu_tx_t *tx)
{
	uint64_t empty_obj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj;

	if (dle->dle_bpobj.bpo_object == empty_obj) {
		/* Give this entry a bpobj of its own before enqueueing. */
		uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);

		bpobj_close(&dle->dle_bpobj);
		bpobj_decr_empty(dl->dl_os, tx);
		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
		VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
		    dle->dle_mintxg, obj, tx));
	}

	bpobj_enqueue(&dle->dle_bpobj, bp, tx);
}

/*
 * Attach bpobj 'obj' to deadlist entry 'dle'.
 *
 * If the entry has a bpobj of its own, 'obj' is enqueued on it as a
 * sub-object.  If the entry currently refers to the pool's shared empty
 * bpobj, 'obj' simply takes its place: the entry's handle is closed, the
 * empty-bpobj use is released via bpobj_decr_empty(), the handle is
 * reopened on 'obj', and the deadlist's entry keyed by dle_mintxg is
 * updated to 'obj'.
 */
static void
dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
    uint64_t obj, dmu_tx_t *tx)
{
	uint64_t empty_obj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj;

	if (dle->dle_bpobj.bpo_object == empty_obj) {
		/* Replace the shared empty bpobj with 'obj' itself. */
		bpobj_close(&dle->dle_bpobj);
		bpobj_decr_empty(dl->dl_os, tx);
		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
		VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
		    dle->dle_mintxg, obj, tx));
		return;
	}

	bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
}

void
dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
{
Expand Down Expand Up @@ -199,7 +236,7 @@ dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
else
dle = AVL_PREV(&dl->dl_tree, dle);
bpobj_enqueue(&dle->dle_bpobj, bp, tx);
dle_enqueue(dl, dle, bp, tx);
}

/*
Expand All @@ -219,7 +256,7 @@ dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)

dle = kmem_alloc(sizeof (*dle), KM_PUSHPAGE);
dle->dle_mintxg = mintxg;
obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
avl_add(&dl->dl_tree, dle);

Expand All @@ -245,8 +282,7 @@ dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
dle_prev = AVL_PREV(&dl->dl_tree, dle);

bpobj_enqueue_subobj(&dle_prev->dle_bpobj,
dle->dle_bpobj.bpo_object, tx);
dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);

avl_remove(&dl->dl_tree, dle);
bpobj_close(&dle->dle_bpobj);
Expand Down Expand Up @@ -304,7 +340,7 @@ dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
if (dle->dle_mintxg >= maxtxg)
break;

obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
dle->dle_mintxg, obj, tx));
}
Expand Down Expand Up @@ -402,7 +438,7 @@ dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
if (dle == NULL)
dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
dle_enqueue_subobj(dl, dle, obj, tx);
}

static int
Expand Down
9 changes: 9 additions & 0 deletions module/zfs/dsl_pool.c
Expand Up @@ -322,6 +322,15 @@ dsl_pool_open(dsl_pool_t *dp)
goto out;
}

if (spa_feature_is_active(dp->dp_spa,
&spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) {
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
&dp->dp_empty_bpobj);
if (err != 0)
goto out;
}

err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
&dp->dp_tmp_userrefs_obj);
Expand Down
10 changes: 10 additions & 0 deletions module/zfs/zap.c
Expand Up @@ -1093,6 +1093,16 @@ zap_add_int_key(objset_t *os, uint64_t obj,
return (zap_add(os, obj, name, 8, 1, &value, tx));
}

/*
 * Integer-key variant of zap_update(): the entry name is 'key' formatted
 * as lowercase hexadecimal, and 'value' is passed as a single integer of
 * size 8.  Returns whatever zap_update() returns.
 */
int
zap_update_int_key(objset_t *os, uint64_t obj,
    uint64_t key, uint64_t value, dmu_tx_t *tx)
{
	char keystr[20];
	int err;

	/* A 64-bit key is at most 16 hex digits, so it fits with the NUL. */
	(void) snprintf(keystr, sizeof (keystr), "%llx", (longlong_t)key);
	err = zap_update(os, obj, keystr, 8, 1, &value, tx);

	return (err);
}

int
zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
{
Expand Down
7 changes: 6 additions & 1 deletion module/zfs/zfeature.c
Expand Up @@ -229,7 +229,12 @@ feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj,
uint64_t refcount;
uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj;

ASSERT(0 != zapobj);
/*
* If the pool is currently being created, the feature objects may not
* have been allocated yet. Act as though all features are disabled.
*/
if (zapobj == 0)
return (ENOTSUP);

err = zap_lookup(os, zapobj, feature->fi_guid, sizeof (uint64_t), 1,
&refcount);
Expand Down
3 changes: 3 additions & 0 deletions module/zfs/zfeature_common.c
Expand Up @@ -157,4 +157,7 @@ zpool_feature_init(void)
zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
"com.delphix:async_destroy", "async_destroy",
"Destroy filesystems asynchronously.", B_TRUE, B_FALSE, NULL);
zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
"com.delphix:empty_bpobj", "empty_bpobj",
"Snapshots use less space.", B_TRUE, B_FALSE, NULL);
}

0 comments on commit 753c383

Please sign in to comment.