Skip to content

Commit ec8501e

Browse files
Justin T. Gibbs authored and behlendorf committed
5313 Allow I/Os to be aggregated across ZIO priority classes
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Will Andrews <willa@SpectraLogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
References: https://www.illumos.org/issues/5313 illumos/illumos-gate@fe319232
Ported-by: DHE <git@dehacked.net>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #3280
1 parent 0bf8501 commit ec8501e

File tree

3 files changed

+51
-31
lines changed

3 files changed

+51
-31
lines changed

include/sys/vdev_impl.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -113,6 +113,8 @@ struct vdev_queue {
113113
vdev_t *vq_vdev;
114114
vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
115115
avl_tree_t vq_active_tree;
116+
avl_tree_t vq_read_offset_tree;
117+
avl_tree_t vq_write_offset_tree;
116118
uint64_t vq_last_offset;
117119
hrtime_t vq_io_complete_ts; /* time last i/o completed */
118120
hrtime_t vq_io_delta_ts;

include/sys/zio.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -427,6 +427,7 @@ struct zio {
427427
hrtime_t io_delta; /* vdev queue service delta */
428428
uint64_t io_delay; /* vdev disk service delta (ticks) */
429429
avl_node_t io_queue_node;
430+
avl_node_t io_offset_node;
430431

431432
/* Internal pipeline state */
432433
enum zio_flag io_flags;

module/zfs/vdev_queue.c

Lines changed: 48 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -190,6 +190,22 @@ vdev_queue_offset_compare(const void *x1, const void *x2)
190190
return (0);
191191
}
192192

193+
/*
 * Return a pointer to the queued-i/o AVL tree (vqc_queued_tree) that
 * belongs to priority class 'p' of vdev queue 'vq'.
 *
 * 'p' is used directly as an index into vq->vq_class[] and is not
 * range-checked here; the caller must pass a valid index for that array.
 */
static inline avl_tree_t *
194+
vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
195+
{
196+
return (&vq->vq_class[p].vqc_queued_tree);
197+
}
198+
199+
/*
 * Return a pointer to the per-i/o-type offset tree of vdev queue 'vq':
 * vq_read_offset_tree for ZIO_TYPE_READ, vq_write_offset_tree otherwise.
 *
 * Only ZIO_TYPE_READ and ZIO_TYPE_WRITE are expected (see the ASSERT).
 * Any other type that gets past the ASSERT is given the write tree by
 * the else branch.
 */
static inline avl_tree_t *
200+
vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
201+
{
202+
ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE);
203+
if (t == ZIO_TYPE_READ)
204+
return (&vq->vq_read_offset_tree);
205+
else
206+
return (&vq->vq_write_offset_tree);
207+
}
208+
193209
int
194210
vdev_queue_timestamp_compare(const void *x1, const void *x2)
195211
{
@@ -303,7 +319,7 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
303319

304320
/* find a queue that has not reached its minimum # outstanding i/os */
305321
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
306-
if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 &&
322+
if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
307323
vq->vq_class[p].vqc_active <
308324
vdev_queue_class_min_active(p))
309325
return (p);
@@ -314,7 +330,7 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
314330
* maximum # outstanding i/os.
315331
*/
316332
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
317-
if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 &&
333+
if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
318334
vq->vq_class[p].vqc_active <
319335
vdev_queue_class_max_active(spa, p))
320336
return (p);
@@ -335,20 +351,27 @@ vdev_queue_init(vdev_t *vd)
335351

336352
avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
337353
sizeof (zio_t), offsetof(struct zio, io_queue_node));
354+
avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
355+
vdev_queue_offset_compare, sizeof (zio_t),
356+
offsetof(struct zio, io_offset_node));
357+
avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
358+
vdev_queue_offset_compare, sizeof (zio_t),
359+
offsetof(struct zio, io_offset_node));
338360

339361
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
362+
int (*compfn) (const void *, const void *);
363+
340364
/*
341-
* The synchronous i/o queues are FIFO rather than LBA ordered.
342-
* This provides more consistent latency for these i/os, and
343-
* they tend to not be tightly clustered anyway so there is
344-
* little to no throughput loss.
365+
* The synchronous i/o queues are dispatched in FIFO rather
366+
* than LBA order. This provides more consistent latency for
367+
* these i/os.
345368
*/
346-
boolean_t fifo = (p == ZIO_PRIORITY_SYNC_READ ||
347-
p == ZIO_PRIORITY_SYNC_WRITE);
348-
avl_create(&vq->vq_class[p].vqc_queued_tree,
349-
fifo ? vdev_queue_timestamp_compare :
350-
vdev_queue_offset_compare,
351-
sizeof (zio_t), offsetof(struct zio, io_queue_node));
369+
if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
370+
compfn = vdev_queue_timestamp_compare;
371+
else
372+
compfn = vdev_queue_offset_compare;
373+
avl_create(vdev_queue_class_tree(vq, p), compfn,
374+
sizeof (zio_t), offsetof(struct zio, io_queue_node));
352375
}
353376
}
354377

@@ -359,8 +382,10 @@ vdev_queue_fini(vdev_t *vd)
359382
zio_priority_t p;
360383

361384
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
362-
avl_destroy(&vq->vq_class[p].vqc_queued_tree);
385+
avl_destroy(vdev_queue_class_tree(vq, p));
363386
avl_destroy(&vq->vq_active_tree);
387+
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
388+
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
364389

365390
mutex_destroy(&vq->vq_lock);
366391
}
@@ -372,7 +397,8 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
372397
spa_stats_history_t *ssh = &spa->spa_stats.io_history;
373398

374399
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
375-
avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
400+
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
401+
avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
376402

377403
if (ssh->kstat != NULL) {
378404
mutex_enter(&ssh->lock);
@@ -388,7 +414,8 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
388414
spa_stats_history_t *ssh = &spa->spa_stats.io_history;
389415

390416
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
391-
avl_remove(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
417+
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
418+
avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
392419

393420
if (ssh->kstat != NULL) {
394421
mutex_enter(&ssh->lock);
@@ -472,8 +499,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
472499
uint64_t maxgap = 0;
473500
uint64_t size;
474501
boolean_t stretch = B_FALSE;
475-
vdev_queue_class_t *vqc = &vq->vq_class[zio->io_priority];
476-
avl_tree_t *t = &vqc->vqc_queued_tree;
502+
avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
477503
enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
478504

479505
if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
@@ -486,15 +512,6 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
486512
zfs_vdev_aggregation_limit =
487513
MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE);
488514

489-
/*
490-
* The synchronous i/o queues are not sorted by LBA, so we can't
491-
* find adjacent i/os. These i/os tend to not be tightly clustered,
492-
* or too large to aggregate, so this has little impact on performance.
493-
*/
494-
if (zio->io_priority == ZIO_PRIORITY_SYNC_READ ||
495-
zio->io_priority == ZIO_PRIORITY_SYNC_WRITE)
496-
return (NULL);
497-
498515
first = last = zio;
499516

500517
if (zio->io_type == ZIO_TYPE_READ)
@@ -627,7 +644,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq)
627644
zio_t *zio, *aio;
628645
zio_priority_t p;
629646
avl_index_t idx;
630-
vdev_queue_class_t *vqc;
647+
avl_tree_t *tree;
631648

632649
again:
633650
ASSERT(MUTEX_HELD(&vq->vq_lock));
@@ -645,14 +662,14 @@ vdev_queue_io_to_issue(vdev_queue_t *vq)
645662
*
646663
* For FIFO queues (sync), issue the i/o with the lowest timestamp.
647664
*/
648-
vqc = &vq->vq_class[p];
665+
tree = vdev_queue_class_tree(vq, p);
649666
vq->vq_io_search.io_timestamp = 0;
650667
vq->vq_io_search.io_offset = vq->vq_last_offset + 1;
651-
VERIFY3P(avl_find(&vqc->vqc_queued_tree, &vq->vq_io_search,
668+
VERIFY3P(avl_find(tree, &vq->vq_io_search,
652669
&idx), ==, NULL);
653-
zio = avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER);
670+
zio = avl_nearest(tree, idx, AVL_AFTER);
654671
if (zio == NULL)
655-
zio = avl_first(&vqc->vqc_queued_tree);
672+
zio = avl_first(tree);
656673
ASSERT3U(zio->io_priority, ==, p);
657674

658675
aio = vdev_queue_aggregate(vq, zio);

0 commit comments

Comments
 (0)