Skip to content

Commit 63fd3c6

Browse files
Adam Leventhal authored and
behlendorf committed
Illumos #3582, #3584
3582 zfs_delay() should support a variable resolution 3584 DTrace sdt probes for ZFS txg states Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Christopher Siden <christopher.siden@delphix.com> Reviewed by: Dan McDonald <danmcd@nexenta.com> Reviewed by: Richard Elling <richard.elling@dey-sys.com> Approved by: Garrett D'Amore <garrett@damore.org> References: https://www.illumos.org/issues/3582 illumos/illumos-gate@0689f76 Ported by: Ned Bass <bass6@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #1775
1 parent c1fabe7 commit 63fd3c6

File tree

10 files changed

+87
-33
lines changed

10 files changed

+87
-33
lines changed

include/sys/txg.h

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -74,13 +74,8 @@ extern void txg_rele_to_quiesce(txg_handle_t *txghp);
7474
extern void txg_rele_to_sync(txg_handle_t *txghp);
7575
extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
7676

77-
/*
78-
* Delay the caller by the specified number of ticks or until
79-
* the txg closes (whichever comes first). This is intended
80-
* to be used to throttle writers when the system nears its
81-
* capacity.
82-
*/
83-
extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks);
77+
extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta,
78+
hrtime_t resolution);
8479

8580
/*
8681
* Wait until the given transaction group has finished syncing.

include/sys/txg_impl.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ struct tx_cpu {
7070
kmutex_t tc_open_lock; /* protects tx_open_txg */
7171
kmutex_t tc_lock; /* protects the rest of this struct */
7272
kcondvar_t tc_cv[TXG_SIZE];
73-
uint64_t tc_count[TXG_SIZE];
73+
uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */
7474
list_t tc_callbacks[TXG_SIZE]; /* commit cb list */
7575
char tc_pad[8]; /* pad to fill 3 cache lines */
7676
};
@@ -87,8 +87,8 @@ struct tx_cpu {
8787
* every cpu (see txg_quiesce()).
8888
*/
8989
typedef struct tx_state {
90-
tx_cpu_t *tx_cpu; /* protects right to enter txg */
91-
kmutex_t tx_sync_lock; /* protects tx_state_t */
90+
tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */
91+
kmutex_t tx_sync_lock; /* protects the rest of this struct */
9292
uint64_t tx_open_txg; /* currently open txg id */
9393
uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
9494
uint64_t tx_syncing_txg; /* currently syncing txg id */

include/sys/zfs_context.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,8 @@ extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg);
338338
extern void cv_destroy(kcondvar_t *cv);
339339
extern void cv_wait(kcondvar_t *cv, kmutex_t *mp);
340340
extern clock_t cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime);
341+
extern clock_t cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
342+
hrtime_t res, int flag);
341343
extern void cv_signal(kcondvar_t *cv);
342344
extern void cv_broadcast(kcondvar_t *cv);
343345
#define cv_timedwait_interruptible(cv, mp, at) cv_timedwait(cv, mp, at)

lib/libspl/include/sys/time.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,14 @@
5050
#define NSEC_PER_USEC 1000L
5151
#endif
5252

53+
#ifndef MSEC2NSEC
54+
#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC))
55+
#endif
56+
57+
#ifndef NSEC2MSEC
58+
#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC))
59+
#endif
60+
5361
extern hrtime_t gethrtime(void);
5462
extern void gethrestime(timestruc_t *);
5563

lib/libzpool/kernel.c

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,41 @@ cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
528528
return (1);
529529
}
530530

531+
/*ARGSUSED*/
532+
clock_t
533+
cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
534+
int flag)
535+
{
536+
int error;
537+
timestruc_t ts;
538+
hrtime_t delta;
539+
540+
ASSERT(flag == 0);
541+
542+
top:
543+
delta = tim - gethrtime();
544+
if (delta <= 0)
545+
return (-1);
546+
547+
ts.tv_sec = delta / NANOSEC;
548+
ts.tv_nsec = delta % NANOSEC;
549+
550+
ASSERT(mutex_owner(mp) == curthread);
551+
mp->m_owner = NULL;
552+
error = pthread_cond_timedwait(&cv->cv, &mp->m_lock, &ts);
553+
mp->m_owner = curthread;
554+
555+
if (error == ETIME)
556+
return (-1);
557+
558+
if (error == EINTR)
559+
goto top;
560+
561+
ASSERT(error == 0);
562+
563+
return (1);
564+
}
565+
531566
void
532567
cv_signal(kcondvar_t *cv)
533568
{

module/zfs/dsl_dir.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -743,7 +743,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
743743
err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
744744
} else {
745745
if (err == EAGAIN) {
746-
txg_delay(dd->dd_pool, tx->tx_txg, 1);
746+
txg_delay(dd->dd_pool, tx->tx_txg,
747+
MSEC2NSEC(10), MSEC2NSEC(10));
747748
err = SET_ERROR(ERESTART);
748749
}
749750
dsl_pool_memory_pressure(dd->dd_pool);

module/zfs/dsl_pool.c

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ kmutex_t zfs_write_limit_lock;
5858

5959
static pgcnt_t old_physmem = 0;
6060

61+
hrtime_t zfs_throttle_delay = MSEC2NSEC(10);
62+
hrtime_t zfs_throttle_resolution = MSEC2NSEC(10);
63+
6164
int
6265
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
6366
{
@@ -512,12 +515,13 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
512515
* Weight the throughput calculation towards the current value:
513516
* thru = 3/4 old_thru + 1/4 new_thru
514517
*
515-
* Note: write_time is in nanosecs, so write_time/MICROSEC
516-
* yields millisecs
518+
* Note: write_time is in nanosecs while dp_throughput is expressed in
519+
* bytes per millisecond.
517520
*/
518521
ASSERT(zfs_write_limit_min > 0);
519-
if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
520-
uint64_t throughput = data_written / (write_time / MICROSEC);
522+
if (data_written > zfs_write_limit_min / 8 &&
523+
write_time > MSEC2NSEC(1)) {
524+
uint64_t throughput = data_written / NSEC2MSEC(write_time);
521525

522526
if (dp->dp_throughput)
523527
dp->dp_throughput = throughput / 4 +
@@ -617,8 +621,10 @@ dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
617621
* the caller by zfs_throttle_delay. This will slow down the "fill"
618622
* rate until the sync process can catch up with us.
619623
*/
620-
if (reserved && reserved > (write_limit - (write_limit >> 3)))
621-
txg_delay(dp, tx->tx_txg, 1);
624+
if (reserved && reserved > (write_limit - (write_limit >> 3))) {
625+
txg_delay(dp, tx->tx_txg, zfs_throttle_delay,
626+
zfs_throttle_resolution);
627+
}
622628

623629
return (0);
624630
}

module/zfs/dsl_scan.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,7 @@ dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
409409
zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
410410
elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
411411
if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
412-
(elapsed_nanosecs / MICROSEC > mintime &&
412+
(NSEC2MSEC(elapsed_nanosecs) > mintime &&
413413
txg_sync_waiting(scn->scn_dp)) ||
414414
spa_shutting_down(scn->scn_dp->dp_spa)) {
415415
if (zb) {
@@ -1335,7 +1335,7 @@ dsl_scan_free_should_pause(dsl_scan_t *scn)
13351335

13361336
elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
13371337
return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
1338-
(elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
1338+
(NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
13391339
txg_sync_waiting(scn->scn_dp)) ||
13401340
spa_shutting_down(scn->scn_dp->dp_spa));
13411341
}
@@ -1459,7 +1459,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
14591459
"free_bpobj/bptree txg %llu",
14601460
(longlong_t)scn->scn_visited_this_txg,
14611461
(longlong_t)
1462-
(gethrtime() - scn->scn_sync_start_time) / MICROSEC,
1462+
NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
14631463
(longlong_t)tx->tx_txg);
14641464
scn->scn_visited_this_txg = 0;
14651465
/*
@@ -1507,7 +1507,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
15071507

15081508
zfs_dbgmsg("visited %llu blocks in %llums",
15091509
(longlong_t)scn->scn_visited_this_txg,
1510-
(longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
1510+
(longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
15111511

15121512
if (!scn->scn_pausing) {
15131513
/* finished with scan. */

module/zfs/spa_misc.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -490,8 +490,8 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
490490
spa->spa_proc = &p0;
491491
spa->spa_proc_state = SPA_PROC_NONE;
492492

493-
spa->spa_deadman_synctime = zfs_deadman_synctime *
494-
zfs_txg_synctime_ms * MICROSEC;
493+
spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime *
494+
zfs_txg_synctime_ms);
495495

496496
refcount_create(&spa->spa_refcount);
497497
spa_config_lock_init(spa);

module/zfs/txg.c

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
236236
}
237237

238238
static void
239-
txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time)
239+
txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
240240
{
241241
CALLB_CPR_SAFE_BEGIN(cpr);
242242

@@ -373,6 +373,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
373373
spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, gethrtime());
374374
spa_txg_history_add(dp->dp_spa, tx->tx_open_txg);
375375

376+
DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
377+
DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
378+
376379
/*
377380
* Now that we've incremented tx_open_txg, we can let threads
378381
* enter the next transaction group.
@@ -531,6 +534,7 @@ txg_sync_thread(dsl_pool_t *dp)
531534
txg = tx->tx_quiesced_txg;
532535
tx->tx_quiesced_txg = 0;
533536
tx->tx_syncing_txg = txg;
537+
DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
534538
cv_broadcast(&tx->tx_quiesce_more_cv);
535539

536540
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
@@ -544,6 +548,7 @@ txg_sync_thread(dsl_pool_t *dp)
544548
mutex_enter(&tx->tx_sync_lock);
545549
tx->tx_synced_txg = txg;
546550
tx->tx_syncing_txg = 0;
551+
DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
547552
cv_broadcast(&tx->tx_sync_done_cv);
548553

549554
/*
@@ -602,21 +607,22 @@ txg_quiesce_thread(dsl_pool_t *dp)
602607
*/
603608
dprintf("quiesce done, handing off txg %llu\n", txg);
604609
tx->tx_quiesced_txg = txg;
610+
DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
605611
cv_broadcast(&tx->tx_sync_more_cv);
606612
cv_broadcast(&tx->tx_quiesce_done_cv);
607613
}
608614
}
609615

610616
/*
611-
* Delay this thread by 'ticks' if we are still in the open transaction
612-
* group and there is already a waiting txg quiesing or quiesced. Abort
613-
* the delay if this txg stalls or enters the quiesing state.
617+
* Delay this thread by 'delay' nanoseconds (with the given timer 'resolution')
618+
* if we are still in the open transaction group and there is already a
619+
* waiting txg quiescing or quiesced. Abort the delay if this txg stalls or enters the quiescing state.
614620
*/
615621
void
616-
txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
622+
txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
617623
{
618624
tx_state_t *tx = &dp->dp_tx;
619-
clock_t timeout = ddi_get_lbolt() + ticks;
625+
hrtime_t start = gethrtime();
620626

621627
/* don't delay if this txg could transition to quiescing immediately */
622628
if (tx->tx_open_txg > txg ||
@@ -629,10 +635,11 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
629635
return;
630636
}
631637

632-
while (ddi_get_lbolt() < timeout &&
633-
tx->tx_syncing_txg < txg-1 && !txg_stalled(dp))
634-
(void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock,
635-
timeout);
638+
while (gethrtime() - start < delay &&
639+
tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
640+
(void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
641+
&tx->tx_sync_lock, delay, resolution, 0);
642+
}
636643

637644
DMU_TX_STAT_BUMP(dmu_tx_delay);
638645

0 commit comments

Comments
 (0)