
Commit e49f1e2

wca authored and behlendorf committed
Illumos #3741
3741 zfs needs better comments
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Eric Schrock <eric.schrock@delphix.com>
Approved by: Christopher Siden <christopher.siden@delphix.com>
References:
  https://www.illumos.org/issues/3741
  illumos/illumos-gate@3e30c24
Ported-by: Richard Yao <ryao@gentoo.org>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #1775
1 parent b1118ac commit e49f1e2

File tree

12 files changed: +154, -11 lines


include/sys/dmu.h

Lines changed: 8 additions & 0 deletions
@@ -407,6 +407,8 @@ void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
  * object must be held in an assigned transaction before calling
  * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
  * buffer as well. You must release what you hold with dmu_buf_rele().
+ *
+ * Returns ENOENT, EIO, or 0.
  */
 int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
 int dmu_bonus_max(void);
@@ -662,8 +664,14 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
  */
 int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
 void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+/* Like dmu_object_info, but faster if you have a held dnode in hand. */
 void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
 void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
+/*
+ * Like dmu_object_info_from_db, but faster still when you only care about
+ * the size. This is specifically optimized for zfs_getattr().
+ */
 void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
     u_longlong_t *nblk512);
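The added comment documents dmu_bonus_hold()'s return values. As a rough, hypothetical consumer sketch (not part of this commit; only dmu_bonus_hold() and dmu_buf_rele() are the real APIs from this header), the hold/release discipline described above looks like:

/* Hypothetical consumer; error handling follows the new comment. */
static int
example_read_bonus(objset_t *os, uint64_t object, void *tag)
{
	dmu_buf_t *db;
	int err;

	/* Per the comment: returns ENOENT, EIO, or 0. */
	err = dmu_bonus_hold(os, object, tag, &db);
	if (err != 0)
		return (err);	/* ENOENT: no such object; EIO: read failed */

	/* ... inspect db->db_data (the bonus buffer) here ... */

	/* Release what was held, using the same tag. */
	dmu_buf_rele(db, tag);
	return (0);
}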

lib/libzfs/libzfs_dataset.c

Lines changed: 5 additions & 0 deletions
@@ -4791,6 +4791,11 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
 	return (err);
 }
 
+/*
+ * Convert the zvol's volume size to an appropriate reservation.
+ * Note: If this routine is updated, it is necessary to update the ZFS test
+ * suite's shell version in reservation.kshlib.
+ */
 uint64_t
 zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
 {
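zvol_volsize_to_reservation() is the libzfs routine that sizes the refreservation of a non-sparse zvol from its volsize and the properties that affect metadata overhead. A hedged usage sketch follows; the helper and the choice of properties are assumptions, while fnvlist_*(), zfs_prop_to_name(), and zvol_volsize_to_reservation() are existing libzfs/libnvpair calls:

#include <libzfs.h>
#include <libnvpair.h>

/* Hypothetical helper: estimate the refreservation for a new zvol. */
static uint64_t
example_zvol_resv(uint64_t volsize, uint64_t volblocksize)
{
	nvlist_t *props = fnvlist_alloc();
	uint64_t resv;

	/* volblocksize drives the indirect-block overhead estimate. */
	fnvlist_add_uint64(props,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), volblocksize);

	/* Per the new comment, keep this in sync with reservation.kshlib. */
	resv = zvol_volsize_to_reservation(volsize, props);

	fnvlist_free(props);
	return (resv);
}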

module/zfs/arc.c

Lines changed: 17 additions & 2 deletions
@@ -260,7 +260,18 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_mfu_ghost_hits;
 	kstat_named_t arcstat_deleted;
 	kstat_named_t arcstat_recycle_miss;
+	/*
+	 * Number of buffers that could not be evicted because the hash lock
+	 * was held by another thread. The lock may not necessarily be held
+	 * by something using the same buffer, since hash locks are shared
+	 * by multiple buffers.
+	 */
 	kstat_named_t arcstat_mutex_miss;
+	/*
+	 * Number of buffers skipped because they have I/O in progress, are
+	 * indirect prefetch buffers that have not lived long enough, or are
+	 * not from the spa we're trying to evict from.
+	 */
 	kstat_named_t arcstat_evict_skip;
 	kstat_named_t arcstat_evict_l2_cached;
 	kstat_named_t arcstat_evict_l2_eligible;
@@ -3174,6 +3185,10 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
 
 		mutex_exit(hash_lock);
 
+		/*
+		 * At this point, we have a level 1 cache miss. Try again in
+		 * L2ARC if possible.
+		 */
 		ASSERT3U(hdr->b_size, ==, size);
 		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
 		    uint64_t, size, zbookmark_t *, zb);
@@ -3445,8 +3460,8 @@ arc_buf_evict(arc_buf_t *buf)
 }
 
 /*
- * Release this buffer from the cache. This must be done
- * after a read and prior to modifying the buffer contents.
+ * Release this buffer from the cache, making it an anonymous buffer. This
+ * must be done after a read and prior to modifying the buffer contents.
  * If the buffer has more than one reference, we must make
  * a new hdr for the buffer.
  */
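To make the distinction between arcstat_mutex_miss and arcstat_evict_skip concrete, here is a heavily simplified sketch of an eviction-list walk. It is illustrative only, not the actual arc_evict() code; the real skip conditions are more involved:

	for (hdr = list_tail(list); hdr != NULL; hdr = prev) {
		prev = list_prev(list, hdr);
		hash_lock = HDR_LOCK(hdr);

		if (!mutex_tryenter(hash_lock)) {
			/* The (shared) hash lock is held by someone else. */
			ARCSTAT_BUMP(arcstat_mutex_miss);
			continue;
		}
		if (HDR_IO_IN_PROGRESS(hdr) ||
		    (spa != 0 && hdr->b_spa != spa)) {
			/* Busy, short-lived prefetch, or wrong spa: skip. */
			ARCSTAT_BUMP(arcstat_evict_skip);
			mutex_exit(hash_lock);
			continue;
		}
		/* ... the buffer's data is actually evicted here ... */
		mutex_exit(hash_lock);
	}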

module/zfs/dbuf.c

Lines changed: 23 additions & 1 deletion
@@ -691,6 +691,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		if (!havepzio)
 			err = zio_wait(zio);
 	} else {
+		/*
+		 * Another reader came in while the dbuf was in flight
+		 * between UNCACHED and CACHED. Either a writer will finish
+		 * writing the buffer (sending the dbuf to CACHED) or the
+		 * first reader's request will reach the read_done callback
+		 * and send the dbuf to CACHED. Otherwise, a failure
+		 * occurred and the dbuf went to UNCACHED.
+		 */
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
 			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
@@ -699,6 +707,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 		rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
 
+		/* Skip the wait per the caller's request. */
 		mutex_enter(&db->db_mtx);
 		if ((flags & DB_RF_NEVERWAIT) == 0) {
 			while (db->db_state == DB_READ ||
@@ -1313,7 +1322,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 }
 
 /*
- * Return TRUE if this evicted the dbuf.
+ * Undirty a buffer in the transaction group referenced by the given
+ * transaction. Return whether this evicted the dbuf.
  */
 static boolean_t
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
@@ -2324,6 +2334,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 	ASSERT(db->db_level > 0);
 	DBUF_VERIFY(db);
 
+	/* Read the block if it hasn't been read yet. */
 	if (db->db_buf == NULL) {
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
@@ -2334,10 +2345,12 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
+	/* Indirect block size must match what the dnode thinks it is. */
 	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 	dbuf_check_blkptr(dn, db);
 	DB_DNODE_EXIT(db);
 
+	/* Provide the pending dirty record to child dbufs */
 	db->db_data_pending = dr;
 
 	mutex_exit(&db->db_mtx);
@@ -2728,6 +2741,7 @@ dbuf_write_override_done(zio_t *zio)
 	dbuf_write_done(zio, NULL, db);
 }
 
+/* Issue I/O to commit a dirty buffer to disk. */
 static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
@@ -2762,11 +2776,19 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 	}
 
 	if (parent != dn->dn_dbuf) {
+		/* Our parent is an indirect block. */
+		/* We have a dirty parent that has been scheduled for write. */
 		ASSERT(parent && parent->db_data_pending);
+		/* Our parent's buffer is one level closer to the dnode. */
 		ASSERT(db->db_level == parent->db_level-1);
+		/*
+		 * We're about to modify our parent's db_data by modifying
+		 * our block pointer, so the parent must be released.
+		 */
 		ASSERT(arc_released(parent->db_buf));
 		zio = parent->db_data_pending->dr_zio;
 	} else {
+		/* Our parent is the dnode itself. */
 		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
 		    db->db_blkid != DMU_SPILL_BLKID) ||
 		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
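The new dbuf_read() comments are easiest to follow against the dbuf state machine (DB_UNCACHED, DB_READ, DB_FILL, DB_CACHED). Below is a simplified sketch of the wait that DB_RF_NEVERWAIT skips; it paraphrases the code around the hunk above rather than quoting it verbatim:

	mutex_enter(&db->db_mtx);
	if ((flags & DB_RF_NEVERWAIT) == 0) {
		/* Wait for the in-flight read or fill to settle. */
		while (db->db_state == DB_READ || db->db_state == DB_FILL)
			cv_wait(&db->db_changed, &db->db_mtx);
		/* A failed read leaves the dbuf UNCACHED, as noted above. */
		if (db->db_state == DB_UNCACHED)
			err = EIO;
	}
	mutex_exit(&db->db_mtx);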

module/zfs/dmu.c

Lines changed: 1 addition & 1 deletion
@@ -1965,7 +1965,7 @@ dmu_init(void)
 void
 dmu_fini(void)
 {
-	arc_fini();
+	arc_fini(); /* arc depends on l2arc, so arc must go first */
 	l2arc_fini();
 	dmu_tx_fini();
 	zfetch_fini();

module/zfs/dmu_tx.c

Lines changed: 8 additions & 0 deletions
@@ -1040,6 +1040,10 @@ dmu_tx_unassign(dmu_tx_t *tx)
 
 	txg_rele_to_quiesce(&tx->tx_txgh);
 
+	/*
+	 * Walk the transaction's hold list, removing the hold on the
+	 * associated dnode, and notifying waiters if the refcount drops to 0.
+	 */
 	for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
 	    txh = list_next(&tx->tx_holds, txh)) {
 		dnode_t *dn = txh->txh_dnode;
@@ -1157,6 +1161,10 @@ dmu_tx_commit(dmu_tx_t *tx)
 
 	ASSERT(tx->tx_txg != 0);
 
+	/*
+	 * Go through the transaction's hold list and remove holds on
+	 * associated dnodes, notifying waiters if no holds remain.
+	 */
 	while ((txh = list_head(&tx->tx_holds))) {
 		dnode_t *dn = txh->txh_dnode;
 
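The hold list walked in both hunks is built up by the usual DMU transaction lifecycle. A hedged consumer sketch of that lifecycle (the function, object, and range are made up; dmu_tx_create/hold_write/assign/commit/abort are the standard dmu.h API):

static int
example_write_tx(objset_t *os, uint64_t object, uint64_t off, int len)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	/* Each hold declared here adds a txh to tx->tx_holds. */
	dmu_tx_hold_write(tx, object, off, len);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);	/* drops the holds without committing */
		return (err);
	}

	/* ... dmu_write(os, object, off, len, buf, tx) would go here ... */

	dmu_tx_commit(tx);	/* walks tx_holds as described above */
	return (0);
}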

module/zfs/dmu_zfetch.c

Lines changed: 6 additions & 6 deletions
@@ -48,11 +48,11 @@ unsigned int zfetch_block_cap = 256;
 unsigned long zfetch_array_rd_sz = 1024 * 1024;
 
 /* forward decls for static routines */
-static int dmu_zfetch_colinear(zfetch_t *, zstream_t *);
+static boolean_t dmu_zfetch_colinear(zfetch_t *, zstream_t *);
 static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
 static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
 static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
-static int dmu_zfetch_find(zfetch_t *, zstream_t *, int);
+static boolean_t dmu_zfetch_find(zfetch_t *, zstream_t *, int);
 static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
 static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *);
 static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
@@ -104,9 +104,9 @@ kstat_t *zfetch_ksp;
  * last stream, then we are probably in a strided access pattern. So
  * combine the two sequential streams into a single strided stream.
  *
- * If no co-linear streams are found, return NULL.
+ * Returns whether co-linear streams were found.
  */
-static int
+static boolean_t
 dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
 {
 	zstream_t *z_walk;
@@ -326,7 +326,7 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
  * for this block read. If so, it starts a prefetch for the stream it
  * located and returns true, otherwise it returns false
  */
-static int
+static boolean_t
 dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
 {
 	zstream_t *zs;
@@ -639,7 +639,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
 {
 	zstream_t zst;
 	zstream_t *newstream;
-	int fetched;
+	boolean_t fetched;
 	int inserted;
 	unsigned int blkshft;
 	uint64_t blksz;

module/zfs/spa.c

Lines changed: 2 additions & 0 deletions
@@ -26,6 +26,8 @@
  */
 
 /*
+ * SPA: Storage Pool Allocator
+ *
  * This file contains all the routines used when modifying on-disk SPA state.
  * This includes opening, importing, destroying, exporting a pool, and syncing a
  * pool.

module/zfs/txg.c

Lines changed: 13 additions & 1 deletion
@@ -354,6 +354,12 @@ txg_rele_to_sync(txg_handle_t *th)
 	th->th_cpu = NULL;	/* defensive */
 }
 
+/*
+ * Blocks until all transactions in the group are committed.
+ *
+ * On return, the transaction group has reached a stable state in which it can
+ * then be passed off to the syncing context.
+ */
 static void
 txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 {
@@ -409,6 +415,9 @@ txg_do_callbacks(list_t *cb_list)
 
 /*
  * Dispatch the commit callbacks registered on this txg to worker threads.
+ *
+ * If no callbacks are registered for a given TXG, nothing happens.
+ * This function creates a taskq for the associated pool, if needed.
  */
 static void
 txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
@@ -419,7 +428,10 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
 
 	for (c = 0; c < max_ncpus; c++) {
 		tx_cpu_t *tc = &tx->tx_cpu[c];
-		/* No need to lock tx_cpu_t at this point */
+		/*
+		 * No need to lock tx_cpu_t at this point, since this can
+		 * only be called once a txg has been synced.
+		 */
 
 		int g = txg & TXG_MASK;
 
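The callbacks dispatched here are registered by DMU consumers on a transaction. A hedged illustration (the callback and caller below are hypothetical; dmu_tx_callback_register() is the registration API declared in dmu.h):

/* Hypothetical callback; error is nonzero if the txg never committed. */
static void
example_commit_cb(void *arg, int error)
{
	kmem_free(arg, sizeof (uint64_t));
}

/* Hypothetical caller, run while the transaction is assigned. */
static void
example_register_cb(dmu_tx_t *tx, void *arg)
{
	dmu_tx_callback_register(tx, example_commit_cb, arg);
	/*
	 * After dmu_tx_commit(), the callback runs from the pool's
	 * commit-callback taskq once the txg syncs (or is aborted).
	 */
}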

module/zfs/vdev_label.c

Lines changed: 1 addition & 0 deletions
@@ -1035,6 +1035,7 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
 	zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
 }
 
+/* Sync the uberblocks to all vdevs in svd[] */
 int
 vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
 {
