Skip to content

Commit b663a23

Browse files
ahrensbehlendorf
authored andcommitted
Illumos #4047
4047 panic from dbuf_free_range() from dmu_free_object() while doing zfs receive Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Approved by: Dan McDonald <danmcd@nexenta.com> References: https://www.illumos.org/issues/4047 illumos/illumos-gate@713d6c2 Ported-by: Richard Yao <ryao@gentoo.org> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #1775 Porting notes: 1. The exported symbol dmu_free_object() was renamed to dmu_free_long_object() in Illumos.
1 parent 46ba1e5 commit b663a23

File tree

8 files changed

+99
-89
lines changed

8 files changed

+99
-89
lines changed

include/sys/dmu.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
*/
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23-
* Copyright (c) 2012 by Delphix. All rights reserved.
23+
* Copyright (c) 2013 by Delphix. All rights reserved.
2424
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
2525
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
2626
*/
@@ -579,7 +579,7 @@ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
579579
uint64_t size, dmu_tx_t *tx);
580580
int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
581581
uint64_t size);
582-
int dmu_free_object(objset_t *os, uint64_t object);
582+
int dmu_free_long_object(objset_t *os, uint64_t object);
583583

584584
/*
585585
* Convenience functions.

include/sys/dnode.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
*/
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23-
* Copyright (c) 2012 by Delphix. All rights reserved.
23+
* Copyright (c) 2013 by Delphix. All rights reserved.
2424
*/
2525

2626
#ifndef _SYS_DNODE_H
@@ -188,6 +188,8 @@ typedef struct dnode {
188188

189189
/* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
190190
uint32_t dn_dbufs_count; /* count of dn_dbufs */
191+
/* There are no level-0 blocks of this blkid or higher in dn_dbufs */
192+
uint64_t dn_unlisted_l0_blkid;
191193

192194
/* protected by os_lock: */
193195
list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */

module/zfs/dbuf.c

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,12 @@ static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh,
6464
void *tag, dmu_buf_impl_t **dbp, int depth);
6565
static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh);
6666

67+
/*
68+
* Number of times that zfs_free_range() took the slow path while doing
69+
* a zfs receive. A nonzero value indicates a potential performance problem.
70+
*/
71+
uint64_t zfs_free_range_recv_miss;
72+
6773
static void dbuf_destroy(dmu_buf_impl_t *db);
6874
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
6975
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
@@ -869,20 +875,22 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
869875
}
870876
dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
871877

872-
if (dmu_objset_is_receiving(dn->dn_objset)) {
878+
mutex_enter(&dn->dn_dbufs_mtx);
879+
if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
880+
/* There can't be any dbufs in this range; no need to search. */
881+
mutex_exit(&dn->dn_dbufs_mtx);
882+
return;
883+
} else if (dmu_objset_is_receiving(dn->dn_objset)) {
873884
/*
874-
* When processing a free record from a zfs receive,
875-
* there should have been no previous modifications to the
876-
* data in this range. Therefore there should be no dbufs
877-
* in the range. Searching dn_dbufs for these non-existent
878-
* dbufs can be very expensive, so simply ignore this.
885+
* If we are receiving, we expect there to be no dbufs in
886+
* the range to be freed, because receive modifies each
887+
* block at most once, and in offset order. If this is
888+
* not the case, it can lead to performance problems,
889+
* so note that we unexpectedly took the slow path.
879890
*/
880-
VERIFY3P(dbuf_find(dn, 0, start), ==, NULL);
881-
VERIFY3P(dbuf_find(dn, 0, end), ==, NULL);
882-
return;
891+
atomic_inc_64(&zfs_free_range_recv_miss);
883892
}
884893

885-
mutex_enter(&dn->dn_dbufs_mtx);
886894
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
887895
db_next = list_next(&dn->dn_dbufs, db);
888896
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
@@ -1781,6 +1789,9 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
17811789
return (odb);
17821790
}
17831791
list_insert_head(&dn->dn_dbufs, db);
1792+
if (db->db_level == 0 && db->db_blkid >=
1793+
dn->dn_unlisted_l0_blkid)
1794+
dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
17841795
db->db_state = DB_UNCACHED;
17851796
mutex_exit(&dn->dn_dbufs_mtx);
17861797
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);

module/zfs/dmu.c

Lines changed: 63 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -568,98 +568,95 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
568568
* the end so that the file gets shorter over time (if we crashes in the
569569
* middle, this will leave us in a better state). We find allocated file
570570
* data by simply searching the allocated level 1 indirects.
571+
*
572+
* On input, *start should be the first offset that does not need to be
573+
* freed (e.g. "offset + length"). On return, *start will be the first
574+
* offset that should be freed.
571575
*/
572576
static int
573-
get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit)
577+
get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
574578
{
575-
uint64_t len = *start - limit;
576-
uint64_t blkcnt = 0;
577-
uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1));
579+
uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
580+
/* bytes of data covered by a level-1 indirect block */
578581
uint64_t iblkrange =
579582
dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
583+
uint64_t blks;
580584

581-
ASSERT(limit <= *start);
585+
ASSERT3U(minimum, <=, *start);
582586

583-
if (len <= iblkrange * maxblks) {
584-
*start = limit;
587+
if (*start - minimum <= iblkrange * maxblks) {
588+
*start = minimum;
585589
return (0);
586590
}
587591
ASSERT(ISP2(iblkrange));
588592

589-
while (*start > limit && blkcnt < maxblks) {
593+
for (blks = 0; *start > minimum && blks < maxblks; blks++) {
590594
int err;
591595

592-
/* find next allocated L1 indirect */
596+
/*
597+
* dnode_next_offset(BACKWARDS) will find an allocated L1
598+
* indirect block at or before the input offset. We must
599+
* decrement *start so that it is at the end of the region
600+
* to search.
601+
*/
602+
(*start)--;
593603
err = dnode_next_offset(dn,
594604
DNODE_FIND_BACKWARDS, start, 2, 1, 0);
595605

596-
/* if there are no more, then we are done */
606+
/* if there are no indirect blocks before start, we are done */
597607
if (err == ESRCH) {
598-
*start = limit;
599-
return (0);
600-
} else if (err) {
608+
*start = minimum;
609+
break;
610+
} else if (err != 0) {
601611
return (err);
602612
}
603-
blkcnt += 1;
604613

605-
/* reset offset to end of "next" block back */
614+
/* set start to the beginning of this L1 indirect */
606615
*start = P2ALIGN(*start, iblkrange);
607-
if (*start <= limit)
608-
*start = limit;
609-
else
610-
*start -= 1;
611616
}
617+
if (*start < minimum)
618+
*start = minimum;
612619
return (0);
613620
}
614621

615622
static int
616623
dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
617-
uint64_t length, boolean_t free_dnode)
624+
uint64_t length)
618625
{
619-
dmu_tx_t *tx;
620-
uint64_t object_size, start, end, len;
621-
boolean_t trunc = (length == DMU_OBJECT_END);
622-
int align, err;
623-
624-
align = 1 << dn->dn_datablkshift;
625-
ASSERT(align > 0);
626-
object_size = align == 1 ? dn->dn_datablksz :
627-
(dn->dn_maxblkid + 1) << dn->dn_datablkshift;
628-
629-
end = offset + length;
630-
if (trunc || end > object_size)
631-
end = object_size;
632-
if (end <= offset)
626+
uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
627+
int err;
628+
629+
if (offset >= object_size)
633630
return (0);
634-
length = end - offset;
635631

636-
while (length) {
637-
start = end;
638-
/* assert(offset <= start) */
639-
err = get_next_chunk(dn, &start, offset);
632+
if (length == DMU_OBJECT_END || offset + length > object_size)
633+
length = object_size - offset;
634+
635+
while (length != 0) {
636+
uint64_t chunk_end, chunk_begin;
637+
dmu_tx_t *tx;
638+
639+
chunk_end = chunk_begin = offset + length;
640+
641+
/* move chunk_begin backwards to the beginning of this chunk */
642+
err = get_next_chunk(dn, &chunk_begin, offset);
640643
if (err)
641644
return (err);
642-
len = trunc ? DMU_OBJECT_END : end - start;
645+
ASSERT3U(chunk_begin, >=, offset);
646+
ASSERT3U(chunk_begin, <=, chunk_end);
643647

644648
tx = dmu_tx_create(os);
645-
dmu_tx_hold_free(tx, dn->dn_object, start, len);
649+
dmu_tx_hold_free(tx, dn->dn_object,
650+
chunk_begin, chunk_end - chunk_begin);
646651
err = dmu_tx_assign(tx, TXG_WAIT);
647652
if (err) {
648653
dmu_tx_abort(tx);
649654
return (err);
650655
}
651-
652-
dnode_free_range(dn, start, trunc ? -1 : len, tx);
653-
654-
if (start == 0 && free_dnode) {
655-
ASSERT(trunc);
656-
dnode_free(dn, tx);
657-
}
658-
659-
length -= end - start;
660-
656+
dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx);
661657
dmu_tx_commit(tx);
662-
end = start;
658+
659+
length -= chunk_end - chunk_begin;
663660
}
664661
return (0);
665662
}
@@ -674,38 +671,32 @@ dmu_free_long_range(objset_t *os, uint64_t object,
674671
err = dnode_hold(os, object, FTAG, &dn);
675672
if (err != 0)
676673
return (err);
677-
err = dmu_free_long_range_impl(os, dn, offset, length, FALSE);
674+
err = dmu_free_long_range_impl(os, dn, offset, length);
678675
dnode_rele(dn, FTAG);
679676
return (err);
680677
}
681678

682679
int
683-
dmu_free_object(objset_t *os, uint64_t object)
680+
dmu_free_long_object(objset_t *os, uint64_t object)
684681
{
685-
dnode_t *dn;
686682
dmu_tx_t *tx;
687683
int err;
688684

689-
err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
690-
FTAG, &dn);
685+
err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
691686
if (err != 0)
692687
return (err);
693-
if (dn->dn_nlevels == 1) {
694-
tx = dmu_tx_create(os);
695-
dmu_tx_hold_bonus(tx, object);
696-
dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END);
697-
err = dmu_tx_assign(tx, TXG_WAIT);
698-
if (err == 0) {
699-
dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
700-
dnode_free(dn, tx);
701-
dmu_tx_commit(tx);
702-
} else {
703-
dmu_tx_abort(tx);
704-
}
688+
689+
tx = dmu_tx_create(os);
690+
dmu_tx_hold_bonus(tx, object);
691+
dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
692+
err = dmu_tx_assign(tx, TXG_WAIT);
693+
if (err == 0) {
694+
err = dmu_object_free(os, object, tx);
695+
dmu_tx_commit(tx);
705696
} else {
706-
err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE);
697+
dmu_tx_abort(tx);
707698
}
708-
dnode_rele(dn, FTAG);
699+
709700
return (err);
710701
}
711702

@@ -2042,7 +2033,7 @@ EXPORT_SYMBOL(dmu_buf_rele_array);
20422033
EXPORT_SYMBOL(dmu_prefetch);
20432034
EXPORT_SYMBOL(dmu_free_range);
20442035
EXPORT_SYMBOL(dmu_free_long_range);
2045-
EXPORT_SYMBOL(dmu_free_object);
2036+
EXPORT_SYMBOL(dmu_free_long_object);
20462037
EXPORT_SYMBOL(dmu_read);
20472038
EXPORT_SYMBOL(dmu_write);
20482039
EXPORT_SYMBOL(dmu_prealloc);

module/zfs/dmu_send.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1262,7 +1262,7 @@ restore_freeobjects(struct restorearg *ra, objset_t *os,
12621262
if (dmu_object_info(os, obj, NULL) != 0)
12631263
continue;
12641264

1265-
err = dmu_free_object(os, obj);
1265+
err = dmu_free_long_object(os, obj);
12661266
if (err != 0)
12671267
return (err);
12681268
}

module/zfs/dmu_tx.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -632,7 +632,8 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
632632
* if they are blocksize-aligned.
633633
*/
634634
if (dn->dn_datablkshift == 0) {
635-
dmu_tx_count_write(txh, off, len);
635+
if (off != 0 || len < dn->dn_datablksz)
636+
dmu_tx_count_write(txh, off, len);
636637
} else {
637638
/* first block will be modified if it is not aligned */
638639
if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))

module/zfs/dnode.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
117117
dn->dn_id_flags = 0;
118118

119119
dn->dn_dbufs_count = 0;
120+
dn->dn_unlisted_l0_blkid = 0;
120121
list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
121122
offsetof(dmu_buf_impl_t, db_link));
122123

@@ -169,6 +170,7 @@ dnode_dest(void *arg, void *unused)
169170
ASSERT0(dn->dn_id_flags);
170171

171172
ASSERT0(dn->dn_dbufs_count);
173+
ASSERT0(dn->dn_unlisted_l0_blkid);
172174
list_destroy(&dn->dn_dbufs);
173175
}
174176

@@ -472,6 +474,7 @@ dnode_destroy(dnode_t *dn)
472474
dn->dn_newuid = 0;
473475
dn->dn_newgid = 0;
474476
dn->dn_id_flags = 0;
477+
dn->dn_unlisted_l0_blkid = 0;
475478

476479
dmu_zfetch_rele(&dn->dn_zfetch);
477480
kmem_cache_free(dnode_cache, dn);
@@ -703,6 +706,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
703706
ASSERT(list_is_empty(&ndn->dn_dbufs));
704707
list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
705708
ndn->dn_dbufs_count = odn->dn_dbufs_count;
709+
ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
706710
ndn->dn_bonus = odn->dn_bonus;
707711
ndn->dn_have_spill = odn->dn_have_spill;
708712
ndn->dn_zio = odn->dn_zio;
@@ -737,6 +741,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
737741
list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
738742
offsetof(dmu_buf_impl_t, db_link));
739743
odn->dn_dbufs_count = 0;
744+
odn->dn_unlisted_l0_blkid = 0;
740745
odn->dn_bonus = NULL;
741746
odn->dn_zfetch.zf_dnode = NULL;
742747

@@ -1524,7 +1529,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
15241529
blkshift = dn->dn_datablkshift;
15251530
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
15261531

1527-
if (len == -1ULL) {
1532+
if (len == DMU_OBJECT_END) {
15281533
len = UINT64_MAX - off;
15291534
trunc = TRUE;
15301535
}

module/zfs/dsl_destroy.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -905,7 +905,7 @@ dsl_destroy_head(const char *name)
905905
for (obj = 0; error == 0;
906906
error = dmu_object_next(os, &obj, FALSE,
907907
prev_snap_txg))
908-
(void) dmu_free_object(os, obj);
908+
(void) dmu_free_long_object(os, obj);
909909
/* sync out all frees */
910910
txg_wait_synced(dmu_objset_pool(os), 0);
911911
dmu_objset_disown(os, FTAG);

0 commit comments

Comments
 (0)