Skip to content

Commit 4bda3bd

Browse files
ahrensbehlendorf
authored andcommitted
Illumos 5911 - ZFS "hangs" while deleting file
5911 ZFS "hangs" while deleting file Reviewed by: Bayard Bell <buffer.g.overflow@gmail.com> Reviewed by: Alek Pinchuk <alek@nexenta.com> Reviewed by: Simon Klinkert <simon.klinkert@gmail.com> Reviewed by: Dan McDonald <danmcd@omniti.com> Approved by: Richard Lowe <richlowe@richlowe.net> References: https://www.illumos.org/issues/5911 illumos/illumos-gate@46e1baa Porting notes: Resolved ISO C90 forbids mixed declarations and code wanting in the dnode_free_range() function. Ported-by: kernelOfTruth kerneloftruth@gmail.com Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #3554
1 parent 5e8cd5d commit 4bda3bd

File tree

5 files changed

+91
-34
lines changed

5 files changed

+91
-34
lines changed

include/sys/dbuf.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
*/
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23-
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
23+
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
2424
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
2525
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
2626
*/
@@ -287,7 +287,7 @@ void dbuf_clear(dmu_buf_impl_t *db);
287287
void dbuf_evict(dmu_buf_impl_t *db);
288288

289289
void dbuf_unoverride(dbuf_dirty_record_t *dr);
290-
void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
290+
void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
291291
void dbuf_release_bp(dmu_buf_impl_t *db);
292292

293293
void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,

module/zfs/dbuf.c

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
2323
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24-
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
24+
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
2525
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
2626
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
2727
*/
@@ -1455,6 +1455,16 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
14551455
dbuf_dirty_record_t *dr, **drp;
14561456

14571457
ASSERT(txg != 0);
1458+
1459+
/*
1460+
* Due to our use of dn_nlevels below, this can only be called
1461+
* in open context, unless we are operating on the MOS.
1462+
* From syncing context, dn_nlevels may be different from the
1463+
* dn_nlevels used when dbuf was dirtied.
1464+
*/
1465+
ASSERT(db->db_objset ==
1466+
dmu_objset_pool(db->db_objset)->dp_meta_objset ||
1467+
txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
14581468
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
14591469
ASSERT0(db->db_level);
14601470
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1477,11 +1487,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
14771487

14781488
ASSERT(db->db.db_size != 0);
14791489

1480-
/*
1481-
* Any space we accounted for in dp_dirty_* will be cleaned up by
1482-
* dsl_pool_sync(). This is relatively rare so the discrepancy
1483-
* is not a big deal.
1484-
*/
1490+
dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
1491+
dr->dr_accounted, txg);
14851492

14861493
*drp = dr->dr_next;
14871494

@@ -1496,7 +1503,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
14961503
list_remove(&dr->dr_parent->dt.di.dr_children, dr);
14971504
mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
14981505
} else if (db->db_blkid == DMU_SPILL_BLKID ||
1499-
db->db_level+1 == dn->dn_nlevels) {
1506+
db->db_level + 1 == dn->dn_nlevels) {
15001507
ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
15011508
mutex_enter(&dn->dn_mtx);
15021509
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
@@ -1513,11 +1520,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
15131520
VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
15141521
}
15151522

1516-
if (db->db_level != 0) {
1517-
mutex_destroy(&dr->dt.di.dr_mtx);
1518-
list_destroy(&dr->dt.di.dr_children);
1519-
}
1520-
15211523
kmem_free(dr, sizeof (dbuf_dirty_record_t));
15221524

15231525
ASSERT(db->db_dirtycnt > 0);
@@ -2603,7 +2605,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
26032605

26042606
zio = dr->dr_zio;
26052607
mutex_enter(&dr->dt.di.dr_mtx);
2606-
dbuf_sync_list(&dr->dt.di.dr_children, tx);
2608+
dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
26072609
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
26082610
mutex_exit(&dr->dt.di.dr_mtx);
26092611
zio_nowait(zio);
@@ -2754,7 +2756,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
27542756
}
27552757

27562758
void
2757-
dbuf_sync_list(list_t *list, dmu_tx_t *tx)
2759+
dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
27582760
{
27592761
dbuf_dirty_record_t *dr;
27602762

@@ -2771,6 +2773,10 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx)
27712773
DMU_META_DNODE_OBJECT);
27722774
break;
27732775
}
2776+
if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
2777+
dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
2778+
VERIFY3U(dr->dr_dbuf->db_level, ==, level);
2779+
}
27742780
list_remove(list, dr);
27752781
if (dr->dr_dbuf->db_level > 0)
27762782
dbuf_sync_indirect(dr, tx);

module/zfs/dmu_tx.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
2323
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24-
* Copyright (c) 2013 by Delphix. All rights reserved.
24+
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
2525
*/
2626

2727
#include <sys/dmu.h>
@@ -679,7 +679,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
679679
uint64_t ibyte = i << shift;
680680
err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
681681
i = ibyte >> shift;
682-
if (err == ESRCH)
682+
if (err == ESRCH || i > end)
683683
break;
684684
if (err) {
685685
tx->tx_err = err;

module/zfs/dnode.c

Lines changed: 65 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
*/
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23-
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
23+
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
2424
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
2525
*/
2626

@@ -1517,6 +1517,16 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
15171517
rw_downgrade(&dn->dn_struct_rwlock);
15181518
}
15191519

1520+
static void
1521+
dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
1522+
{
1523+
dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
1524+
if (db != NULL) {
1525+
dmu_buf_will_dirty(&db->db, tx);
1526+
dbuf_rele(db, FTAG);
1527+
}
1528+
}
1529+
15201530
void
15211531
dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
15221532
{
@@ -1637,27 +1647,68 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
16371647
nblks += 1;
16381648

16391649
/*
1640-
* Dirty the first and last indirect blocks, as they (and/or their
1641-
* parents) will need to be written out if they were only
1642-
* partially freed. Interior indirect blocks will be themselves freed,
1643-
* by free_children(), so they need not be dirtied. Note that these
1644-
* interior blocks have already been prefetched by dmu_tx_hold_free().
1650+
* Dirty all the indirect blocks in this range. Note that only
1651+
* the first and last indirect blocks can actually be written
1652+
* (if they were partially freed) -- they must be dirtied, even if
1653+
* they do not exist on disk yet. The interior blocks will
1654+
* be freed by free_children(), so they will not actually be written.
1655+
* Even though these interior blocks will not be written, we
1656+
* dirty them for two reasons:
1657+
*
1658+
* - It ensures that the indirect blocks remain in memory until
1659+
* syncing context. (They have already been prefetched by
1660+
* dmu_tx_hold_free(), so we don't have to worry about reading
1661+
* them serially here.)
1662+
*
1663+
* - The dirty space accounting will put pressure on the txg sync
1664+
* mechanism to begin syncing, and to delay transactions if there
1665+
* is a large amount of freeing. Even though these indirect
1666+
* blocks will not be written, we could need to write the same
1667+
* amount of space if we copy the freed BPs into deadlists.
16451668
*/
16461669
if (dn->dn_nlevels > 1) {
1647-
uint64_t first, last;
1670+
uint64_t first, last, i, ibyte;
1671+
int shift, err;
16481672

16491673
first = blkid >> epbs;
1650-
if ((db = dbuf_hold_level(dn, 1, first, FTAG))) {
1651-
dmu_buf_will_dirty(&db->db, tx);
1652-
dbuf_rele(db, FTAG);
1653-
}
1674+
dnode_dirty_l1(dn, first, tx);
16541675
if (trunc)
16551676
last = dn->dn_maxblkid >> epbs;
16561677
else
16571678
last = (blkid + nblks - 1) >> epbs;
1658-
if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
1659-
dmu_buf_will_dirty(&db->db, tx);
1660-
dbuf_rele(db, FTAG);
1679+
if (last != first)
1680+
dnode_dirty_l1(dn, last, tx);
1681+
1682+
shift = dn->dn_datablkshift + dn->dn_indblkshift -
1683+
SPA_BLKPTRSHIFT;
1684+
for (i = first + 1; i < last; i++) {
1685+
/*
1686+
* Set i to the blockid of the next non-hole
1687+
* level-1 indirect block at or after i. Note
1688+
* that dnode_next_offset() operates in terms of
1689+
* level-0-equivalent bytes.
1690+
*/
1691+
ibyte = i << shift;
1692+
err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
1693+
&ibyte, 2, 1, 0);
1694+
i = ibyte >> shift;
1695+
if (i >= last)
1696+
break;
1697+
1698+
/*
1699+
* Normally we should not see an error, either
1700+
* from dnode_next_offset() or dbuf_hold_level()
1701+
* (except for ESRCH from dnode_next_offset).
1702+
* If there is an i/o error, then when we read
1703+
* this block in syncing context, it will use
1704+
* ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
1705+
* to the "failmode" property. dnode_next_offset()
1706+
* doesn't have a flag to indicate MUSTSUCCEED.
1707+
*/
1708+
if (err != 0)
1709+
break;
1710+
1711+
dnode_dirty_l1(dn, i, tx);
16611712
}
16621713
}
16631714

module/zfs/dnode_sync.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222
/*
2323
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24-
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
24+
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
2525
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
2626
*/
2727

@@ -718,7 +718,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
718718
mutex_exit(&dn->dn_mtx);
719719
}
720720

721-
dbuf_sync_list(list, tx);
721+
dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);
722722

723723
if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
724724
ASSERT3P(list_head(list), ==, NULL);

0 commit comments

Comments
 (0)