Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve N-way mirror performance #1487

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions lib/libspl/include/sys/time.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@
#define NANOSEC 1000000000
#endif

/* Number of nanoseconds in one millisecond (only defined if not already). */
#ifndef NSEC_PER_MSEC
#define NSEC_PER_MSEC 1000000L
#endif

/* Number of nanoseconds in one microsecond (only defined if not already). */
#ifndef NSEC_PER_USEC
#define NSEC_PER_USEC 1000L
#endif
Expand Down
70 changes: 67 additions & 3 deletions module/zfs/vdev_mirror.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ typedef struct mirror_child {
vdev_t *mc_vd;
uint64_t mc_offset;
int mc_error;
int mc_pending;
uint8_t mc_tried;
uint8_t mc_skipped;
uint8_t mc_speculative;
Expand All @@ -54,7 +55,17 @@ typedef struct mirror_map {
mirror_child_t mm_child[1];
} mirror_map_t;

int vdev_mirror_shift = 21;
/*
 * When the children are equally busy, queue incoming requests to a single
 * child for N milliseconds.  This is done to maximize the likelihood that
 * the Linux elevator will be able to merge requests while it is plugged.
 *
 * For rotational disks the Linux elevator will plug for 10ms, which is
 * why zfs_vdev_mirror_switch_ms is set to 10ms by default.  For non-
 * rotational disks the elevator will not plug, but 10ms is still a small
 * enough value that the requests will get spread over all the children.
 *
 * NOTE(review): this value is used as part of a divisor when the preferred
 * child is selected in vdev_mirror_map_alloc(), and it is exported as a
 * writable module parameter; a value of 0 would make that divisor zero.
 * Confirm whether the value needs to be validated or clamped.
 */
int zfs_vdev_mirror_switch_ms = 10;

static void
vdev_mirror_map_free(zio_t *zio)
Expand All @@ -69,6 +80,19 @@ static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
zio_vsd_default_cksum_report
};

/*
 * Report how many entries are currently in the pending tree of the given
 * vdev's queue (presumably the I/Os already issued to the device -- confirm
 * against vdev_queue.c).
 *
 * The count is sampled while holding the queue's vq_lock; the lock is
 * dropped before returning, so the value is only a snapshot and may be
 * stale by the time the caller uses it.
 *
 *	vd	vdev whose queue is inspected
 *
 * Returns the node count of vq_pending_tree, converted to int.
 */
static int
vdev_mirror_pending(vdev_t *vd)
{
	int nr_pending;

	mutex_enter(&vd->vdev_queue.vq_lock);
	nr_pending = avl_numnodes(&vd->vdev_queue.vq_pending_tree);
	mutex_exit(&vd->vdev_queue.vq_lock);

	return (nr_pending);
}

static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
Expand Down Expand Up @@ -108,20 +132,55 @@ vdev_mirror_map_alloc(zio_t *zio)
mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
}
} else {
int lowest_pending = INT_MAX;
int lowest_nr = 0;

c = vd->vdev_children;

mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE);
mm->mm_children = c;
mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
vd->vdev_ops == &vdev_spare_ops);
mm->mm_preferred = mm->mm_replacing ? 0 :
(zio->io_offset >> vdev_mirror_shift) % c;
mm->mm_preferred = 0;
mm->mm_root = B_FALSE;

for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
mc->mc_vd = vd->vdev_child[c];
mc->mc_offset = zio->io_offset;

if (mm->mm_replacing)
continue;

if (!vdev_readable(mc->mc_vd)) {
mc->mc_error = ENXIO;
mc->mc_tried = 1;
mc->mc_skipped = 1;
mc->mc_pending = INT_MAX;
continue;
}

mc->mc_pending = vdev_mirror_pending(mc->mc_vd);
if (mc->mc_pending < lowest_pending) {
lowest_pending = mc->mc_pending;
lowest_nr = 1;
} else if (mc->mc_pending == lowest_pending) {
lowest_nr++;
}
}

d = gethrtime() / (NSEC_PER_MSEC * zfs_vdev_mirror_switch_ms);
d = (d % lowest_nr) + 1;

for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];

if (mm->mm_child[c].mc_pending == lowest_pending) {
if (--d == 0) {
mm->mm_preferred = c;
break;
}
}
}
}

Expand Down Expand Up @@ -492,3 +551,8 @@ vdev_ops_t vdev_spare_ops = {
VDEV_TYPE_SPARE, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};

/*
 * When built as a kernel module with the SPL available, expose
 * zfs_vdev_mirror_switch_ms as an integer module parameter with mode 0644.
 */
#if defined(_KERNEL) && defined(HAVE_SPL)
module_param(zfs_vdev_mirror_switch_ms, int, 0644);
MODULE_PARM_DESC(zfs_vdev_mirror_switch_ms, "Switch mirrors every N msecs");
#endif