Commit 6d97422

grwilson authored and behlendorf committed
Illumos #1051: zfs should handle imbalanced luns

Today zfs tries to allocate blocks evenly across all devices. This means
when devices are imbalanced zfs will use lots of CPU searching for space
on devices which tend to be pretty full. It should instead fail quickly
on the full LUNs and move on to devices which have more availability.

Reviewed by: Eric Schrock <Eric.Schrock@delphix.com>
Reviewed by: Matt Ahrens <Matt.Ahrens@delphix.com>
Reviewed by: Adam Leventhal <Adam.Leventhal@delphix.com>
Reviewed by: Albert Lee <trisk@nexenta.com>
Reviewed by: Gordon Ross <gwr@nexenta.com>
Approved by: Garrett D'Amore <garrett@nexenta.com>

References to Illumos issue and patch:
- https://www.illumos.org/issues/510
- illumos/illumos-gate@5ead3ed965

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #340
1 parent: bb939d1

8 files changed: +123, -28 lines
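The mechanics are spread across the eight diffs below, but the policy itself is small. A toy sketch of the fail-fast rotor, with invented names throughout (struct dev, alloc_block(), MAX_FAILURES are illustrative, not ZFS code): each device carries a per-txg failure count, and the allocator skips devices that have already blown their budget instead of searching them again.

#include <stdint.h>
#include <stdio.h>

#define	NDEVS		4
#define	MAX_FAILURES	8	/* per-txg failure budget per device */

struct dev {
	uint64_t avail;		/* space still available */
	uint64_t failures;	/* allocation failures this txg */
};

/*
 * Try each device in rotor order. Devices that have failed too
 * often this txg are skipped outright: the "fail fast" policy.
 */
static int
alloc_block(struct dev *devs, uint64_t size)
{
	for (int d = 0; d < NDEVS; d++) {
		if (devs[d].failures > MAX_FAILURES)
			continue;	/* don't re-search a full LUN */
		if (devs[d].avail >= size) {
			devs[d].avail -= size;
			return (d);
		}
		devs[d].failures++;	/* remember the miss */
	}
	return (-1);	/* caller falls back, e.g. to a gang block */
}

int
main(void)
{
	struct dev devs[NDEVS] = {
		{ 4ULL << 10, 0 },  { 512ULL << 30, 0 },
		{ 4ULL << 10, 0 },  { 512ULL << 30, 0 },
	};

	/* 128K won't fit on the nearly-full devices 0 and 2. */
	printf("allocated on dev %d\n", alloc_block(devs, 128ULL << 10));
	return (0);
}

In the real patch the budget is zfs_mg_alloc_failures, the counter is mg_alloc_failures, and the skip additionally requires that the allocation be allowed to fall back to a gang block (CAN_FASTGANG).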


cmd/ztest/ztest.c (2 additions, 0 deletions)

@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 /*
@@ -5300,6 +5301,7 @@ ztest_run(ztest_shared_t *zs)
 	 */
 	kernel_init(FREAD | FWRITE);
 	VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0);
+	spa->spa_debug = B_TRUE;
 	zs->zs_spa = spa;
 
 	spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;

include/sys/metaslab.h (3 additions, 0 deletions)

@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_H
@@ -47,6 +48,8 @@ extern void metaslab_sync_reassess(metaslab_group_t *mg);
 #define	METASLAB_HINTBP_FAVOR	0x0
 #define	METASLAB_HINTBP_AVOID	0x1
 #define	METASLAB_GANG_HEADER	0x2
+#define	METASLAB_GANG_CHILD	0x4
+#define	METASLAB_GANG_AVOID	0x8
 
 extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
     blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);
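The two new bits travel in the existing flags argument of metaslab_alloc(). A standalone check of how they interact with the CAN_FASTGANG() test added in module/zfs/metaslab.c below (the defines and the macro mirror the patch; the main() harness is ours):

#include <stdio.h>

#define	METASLAB_HINTBP_FAVOR	0x0
#define	METASLAB_HINTBP_AVOID	0x1
#define	METASLAB_GANG_HEADER	0x2
#define	METASLAB_GANG_CHILD	0x4
#define	METASLAB_GANG_AVOID	0x8

/* From the metaslab.c diff: fast ganging only when no gang bits are set. */
#define	CAN_FASTGANG(flags) \
	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)))

int
main(void)
{
	printf("%d\n", CAN_FASTGANG(0));			/* 1: plain write */
	printf("%d\n", CAN_FASTGANG(METASLAB_HINTBP_AVOID));	/* 1 */
	printf("%d\n", CAN_FASTGANG(METASLAB_GANG_CHILD));	/* 0 */
	printf("%d\n", CAN_FASTGANG(METASLAB_GANG_AVOID));	/* 0: dump dev */
	return (0);
}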

include/sys/metaslab_impl.h (2 additions, 0 deletions)

@@ -21,6 +21,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_IMPL_H
@@ -52,6 +53,7 @@ struct metaslab_group {
 	avl_tree_t		mg_metaslab_tree;
 	uint64_t		mg_aliquot;
 	uint64_t		mg_bonus_area;
+	uint64_t		mg_alloc_failures;
 	int64_t			mg_bias;
 	int64_t			mg_activation_count;
 	metaslab_class_t	*mg_class;
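The new mg_alloc_failures field is maintained without the group lock, as the metaslab.c diff below shows: atomic_inc_64() on each failed attempt, and metaslab_sync_reassess() subtracts a snapshot of the counter so every txg starts roughly clean. A minimal user-space sketch of that pattern, with C11 stdatomic standing in for the kernel's atomic_*_64() routines (the harness itself is ours):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint_fast64_t mg_alloc_failures;	/* the new counter */

/* Allocation path: count every miss (cf. atomic_inc_64 in the diff). */
static void
on_alloc_failure(void)
{
	atomic_fetch_add(&mg_alloc_failures, 1);
}

/*
 * Sync path: subtract the failures seen on entry, mirroring
 * atomic_add_64(&mg->mg_alloc_failures, -failures).
 */
static void
sync_reassess(void)
{
	uint64_t failures = atomic_load(&mg_alloc_failures);

	atomic_fetch_sub(&mg_alloc_failures, failures);
}

int
main(void)
{
	on_alloc_failure();
	on_alloc_failure();
	sync_reassess();
	printf("failures after reassess: %llu\n",
	    (unsigned long long)atomic_load(&mg_alloc_failures));	/* 0 */
	return (0);
}

Subtracting the snapshot instead of storing zero preserves any failures that race in between the read and the write-back.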

include/sys/spa.h (8 additions, 0 deletions)

@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_SPA_H
@@ -698,6 +699,13 @@ _NOTE(CONSTCOND) } while (0)
 #define	dprintf_bp(bp, fmt, ...)
 #endif
 
+extern boolean_t spa_debug_enabled(spa_t *spa);
+#define	spa_dbgmsg(spa, ...)			\
+{						\
+	if (spa_debug_enabled(spa))		\
+		zfs_dbgmsg(__VA_ARGS__);	\
+}
+
 extern int spa_mode_global;		/* mode, e.g. FREAD | FWRITE */
 
 #ifdef __cplusplus
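A self-contained sketch of the new macro's behavior; the stub spa_t, spa_debug_enabled(), and zfs_dbgmsg() below are stand-ins for the real kernel definitions, and only the macro body is taken from the patch:

#include <stdarg.h>
#include <stdio.h>

typedef int boolean_t;
typedef struct spa { boolean_t spa_debug; } spa_t;	/* stub */

static boolean_t
spa_debug_enabled(spa_t *spa)
{
	return (spa->spa_debug);
}

static void
zfs_dbgmsg(const char *fmt, ...)	/* stub message sink */
{
	va_list ap;

	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
	fputc('\n', stderr);
}

/* Macro body as in the patch: spa gates, zfs_dbgmsg() consumes. */
#define	spa_dbgmsg(spa, ...)			\
{						\
	if (spa_debug_enabled(spa))		\
		zfs_dbgmsg(__VA_ARGS__);	\
}

int
main(void)
{
	spa_t spa = { 1 };	/* as ztest now sets spa->spa_debug */

	spa_dbgmsg(&spa, "%s: txg %llu", "tank", 42ULL);
	return (0);
}

Because spa_debug is only set by ztest (see the ztest.c hunk above), the debug messages stay quiet on normal pools.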

include/sys/spa_impl.h (2 additions, 0 deletions)

@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_SPA_IMPL_H
@@ -196,6 +197,7 @@ struct spa {
 	kcondvar_t	spa_suspend_cv;		/* notification of resume */
 	uint8_t		spa_suspended;		/* pool is suspended */
 	uint8_t		spa_claiming;		/* pool is doing zil_claim() */
+	boolean_t	spa_debug;		/* debug enabled? */
 	boolean_t	spa_is_root;		/* pool is root */
 	int		spa_minref;		/* num refs when first opened */
 	int		spa_mode;		/* FREAD | FWRITE */

module/zfs/metaslab.c (78 additions, 27 deletions)

@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -30,11 +31,30 @@
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 
-#define WITH_NDF_BLOCK_ALLOCATOR
+#define WITH_DF_BLOCK_ALLOCATOR
+
+/*
+ * Allow allocations to switch to gang blocks quickly. We do this to
+ * avoid having to load lots of space_maps in a given txg. There are,
+ * however, some cases where we want to avoid "fast" ganging and instead
+ * we want to do an exhaustive search of all metaslabs on this device.
+ * Currently we don't allow any gang or dump device related allocations
+ * to "fast" gang.
+ */
+#define	CAN_FASTGANG(flags) \
	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)))
 
 uint64_t metaslab_aliquot = 512ULL << 10;
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
 
+/*
+ * This value defines the number of allowed allocation failures per vdev.
+ * If a device reaches this threshold in a given txg then we consider skipping
+ * allocations on that device.
+ */
+int zfs_mg_alloc_failures;
+
 /*
  * Metaslab debugging: when set, keeps all space maps in core to verify frees.
  */
@@ -865,7 +885,7 @@ metaslab_prefetch(metaslab_group_t *mg)
 }
 
 static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	space_map_t *sm = &msp->ms_map;
@@ -899,13 +919,6 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
 			mutex_exit(&mg->mg_lock);
 		}
 
-		/*
-		 * If we were able to load the map then make sure
-		 * that this map is still able to satisfy our request.
-		 */
-		if (msp->ms_weight < size)
-			return (ENOSPC);
-
 		metaslab_group_sort(msp->ms_group, msp,
 		    msp->ms_weight | activation_weight);
 	}
@@ -1123,6 +1136,7 @@ void
 metaslab_sync_reassess(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
+	int64_t failures = mg->mg_alloc_failures;
 	int m;
 
 	/*
@@ -1140,6 +1154,8 @@ metaslab_sync_reassess(metaslab_group_t *mg)
 		mutex_exit(&msp->ms_lock);
 	}
 
+	atomic_add_64(&mg->mg_alloc_failures, -failures);
+
 	/*
 	 * Prefetch the next potential metaslabs
 	 */
@@ -1164,9 +1180,10 @@ metaslab_distance(metaslab_t *msp, dva_t *dva)
 }
 
 static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
-    uint64_t min_distance, dva_t *dva, int d)
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
+    uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
 {
+	spa_t *spa = mg->mg_vd->vdev_spa;
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
 	avl_tree_t *t = &mg->mg_metaslab_tree;
@@ -1187,11 +1204,17 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 
 	mutex_enter(&mg->mg_lock);
 	for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
-		if (msp->ms_weight < size) {
+		if (msp->ms_weight < asize) {
+			spa_dbgmsg(spa, "%s: failed to meet weight "
+			    "requirement: vdev %llu, txg %llu, mg %p, "
+			    "msp %p, psize %llu, asize %llu, "
+			    "failures %llu, weight %llu",
+			    spa_name(spa), mg->mg_vd->vdev_id, txg,
+			    mg, msp, psize, asize,
+			    mg->mg_alloc_failures, msp->ms_weight);
 			mutex_exit(&mg->mg_lock);
 			return (-1ULL);
 		}
-
 		was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 		if (activation_weight == METASLAB_WEIGHT_PRIMARY)
 			break;
@@ -1210,6 +1233,25 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 	if (msp == NULL)
 		return (-1ULL);
 
+	/*
+	 * If we've already reached the allowable number of failed
+	 * allocation attempts on this metaslab group then we
+	 * consider skipping it. We skip it only if we're allowed
+	 * to "fast" gang, the physical size is larger than
+	 * a gang block, and we're attempting to allocate from
+	 * the primary metaslab.
+	 */
+	if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
+	    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
+	    activation_weight == METASLAB_WEIGHT_PRIMARY) {
+		spa_dbgmsg(spa, "%s: skipping metaslab group: "
+		    "vdev %llu, txg %llu, mg %p, psize %llu, "
+		    "asize %llu, failures %llu", spa_name(spa),
+		    mg->mg_vd->vdev_id, txg, mg, psize, asize,
+		    mg->mg_alloc_failures);
+		return (-1ULL);
+	}
+
 	mutex_enter(&msp->ms_lock);
 
 	/*
@@ -1218,7 +1260,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 	 * another thread may have changed the weight while we
 	 * were blocked on the metaslab lock.
 	 */
-	if (msp->ms_weight < size || (was_active &&
+	if (msp->ms_weight < asize || (was_active &&
 	    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
 	    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
 		mutex_exit(&msp->ms_lock);
@@ -1233,14 +1275,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 			continue;
 		}
 
-		if (metaslab_activate(msp, activation_weight, size) != 0) {
+		if (metaslab_activate(msp, activation_weight) != 0) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
-		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
+		if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
 			break;
 
+		atomic_inc_64(&mg->mg_alloc_failures);
+
 		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
 
 		mutex_exit(&msp->ms_lock);
@@ -1249,7 +1293,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 
-	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);
 
 	mutex_exit(&msp->ms_lock);
 
@@ -1376,7 +1420,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 	asize = vdev_psize_to_asize(vd, psize);
 	ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 
-	offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+	offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
+	    dva, d, flags);
 	if (offset != -1ULL) {
 		/*
 		 * If we've just selected this metaslab group,
@@ -1388,18 +1433,24 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 			vdev_stat_t *vs = &vd->vdev_stat;
 			int64_t vu, cu;
 
-			/*
-			 * Determine percent used in units of 0..1024.
-			 * (This is just to avoid floating point.)
-			 */
-			vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
-			cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
+			vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
+			cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
 
 			/*
-			 * Bias by at most +/- 25% of the aliquot.
+			 * Calculate how much more or less we should
+			 * try to allocate from this device during
+			 * this iteration around the rotor.
+			 * For example, if a device is 80% full
+			 * and the pool is 20% full then we should
+			 * reduce allocations by 60% on this device.
+			 *
+			 * mg_bias = (20 - 80) * 512K / 100 = -307K
+			 *
+			 * This reduces allocations by 307K for this
+			 * iteration.
 			 */
 			mg->mg_bias = ((cu - vu) *
-			    (int64_t)mg->mg_aliquot) / (1024 * 4);
+			    (int64_t)mg->mg_aliquot) / 100;
 		}
 
 		if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
@@ -1513,7 +1564,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 	mutex_enter(&msp->ms_lock);
 
 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
-		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
 
 	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
 		error = ENOENT;
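The worked example in the new comment can be checked directly. This sketch (ours) evaluates the new percent-based bias against the old 0..1024-unit formula it replaces, for the comment's case of an 80%-full device in a 20%-full pool:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int64_t aliquot = 512 * 1024;	/* metaslab_aliquot = 512K */
	int64_t vu = 80;		/* device is 80% full */
	int64_t cu = 20;		/* pool overall is 20% full */

	/* New formula: proportional bias, up to the whole aliquot. */
	int64_t bias_new = ((cu - vu) * aliquot) / 100;

	/*
	 * Old formula: percent-used in 0..1024 units, divided by
	 * 1024 * 4, which capped the bias at +/- 25% of the aliquot.
	 */
	int64_t vu_old = (80 << 10) / 100;	/* 819 */
	int64_t cu_old = (20 << 10) / 100;	/* 204 */
	int64_t bias_old = ((cu_old - vu_old) * aliquot) / (1024 * 4);

	printf("new bias: %lld bytes\n", (long long)bias_new);	/* -314572, ~ -307K */
	printf("old bias: %lld bytes\n", (long long)bias_old);	/* -78720, ~ -77K */
	return (0);
}

The same imbalance that the old code throttled by at most a quarter of the aliquot now sheds roughly 60% of it, which is what lets the rotor drain away from full LUNs quickly.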

module/zfs/spa_misc.c (7 additions, 0 deletions)

@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -1680,6 +1681,12 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
 	return (0);
 }
 
+boolean_t
+spa_debug_enabled(spa_t *spa)
+{
+	return (spa->spa_debug);
+}
+
 #if defined(_KERNEL) && defined(HAVE_SPL)
 /* Namespace manipulation */
 EXPORT_SYMBOL(spa_lookup);

module/zfs/zio.c (21 additions, 1 deletion)

@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -79,6 +80,7 @@ int zio_delay_max = ZIO_DELAY_MAX;
 #ifdef _KERNEL
 extern vmem_t *zio_alloc_arena;
 #endif
+extern int zfs_mg_alloc_failures;
 
 /*
  * An allocating zio is one that either currently has the DVA allocate
@@ -158,6 +160,12 @@ zio_init(void)
 		zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 	}
 
+	/*
+	 * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
+	 * to fail 3 times per txg or 8 failures, whichever is greater.
+	 */
+	zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
+
 	zio_inject_init();
 }
 
@@ -2151,6 +2159,7 @@ zio_dva_allocate(zio_t *zio)
 	metaslab_class_t *mc = spa_normal_class(spa);
 	blkptr_t *bp = zio->io_bp;
 	int error;
+	int flags = 0;
 
 	if (zio->io_gang_leader == NULL) {
 		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
@@ -2163,10 +2172,21 @@ zio_dva_allocate(zio_t *zio)
 	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
+	/*
+	 * The dump device does not support gang blocks so allocation on
+	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
+	 * the "fast" gang feature.
+	 */
+	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
+	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
+	    METASLAB_GANG_CHILD : 0;
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
-	    zio->io_prop.zp_copies, zio->io_txg, NULL, 0);
+	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
 
 	if (error) {
+		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
+		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
+		    error);
 		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
 			return (zio_write_gang_block(zio));
 		zio->io_error = error;
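To see where the zio_init() threshold lands in practice, the expression can be evaluated for a few machine sizes (max_ncpus comes from the kernel at run time; this harness is ours):

#include <stdio.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	/* zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8) */
	for (int ncpus = 1; ncpus <= 32; ncpus *= 2)
		printf("%2d cpus -> %2d allowed failures per mg\n",
		    ncpus, MAX(3 * ncpus / 2, 8));
	return (0);
}

The floor of 8 dominates up to 5 CPUs; beyond that the budget scales with CPU count, matching the one-write-taskq-thread-per-cpu sizing the comment refers to.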
