Skip to content

Commit ac72fac

Browse files
grwilson authored and behlendorf committed
Illumos #3954, #4080, #4081
3954 metaslabs continue to load even after hitting zfs_mg_alloc_failure limit 4080 zpool clear fails to clear pool 4081 need zfs_mg_noalloc_threshold Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Approved by: Richard Lowe <richlowe@richlowe.net> References: https://www.illumos.org/issues/3954 https://www.illumos.org/issues/4080 https://www.illumos.org/issues/4081 illumos/illumos-gate@22e3098 Ported-by: Richard Yao <ryao@gentoo.org> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #1775
1 parent a169a62 commit ac72fac

File tree

4 files changed

+117
-8
lines changed

4 files changed

+117
-8
lines changed

include/sys/metaslab_impl.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
*/
2525

2626
/*
27-
* Copyright (c) 2012 by Delphix. All rights reserved.
27+
* Copyright (c) 2013 by Delphix. All rights reserved.
2828
*/
2929

3030
#ifndef _SYS_METASLAB_IMPL_H
@@ -45,6 +45,7 @@ struct metaslab_class {
4545
metaslab_group_t *mc_rotor;
4646
space_map_ops_t *mc_ops;
4747
uint64_t mc_aliquot;
48+
uint64_t mc_alloc_groups; /* # of allocatable groups */
4849
uint64_t mc_alloc; /* total allocated space */
4950
uint64_t mc_deferred; /* total deferred frees */
5051
uint64_t mc_space; /* total space (alloc + free) */
@@ -58,6 +59,8 @@ struct metaslab_group {
5859
uint64_t mg_aliquot;
5960
uint64_t mg_bonus_area;
6061
uint64_t mg_alloc_failures;
62+
boolean_t mg_allocatable; /* can we allocate? */
63+
uint64_t mg_free_capacity; /* percentage free */
6164
int64_t mg_bias;
6265
int64_t mg_activation_count;
6366
metaslab_class_t *mg_class;

module/zfs/metaslab.c

Lines changed: 109 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,25 @@ int zfs_condense_pct = 200;
6060
/*
6161
* This value defines the number of allowed allocation failures per vdev.
6262
* If a device reaches this threshold in a given txg then we consider skipping
63-
* allocations on that device.
63+
* allocations on that device. The value of zfs_mg_alloc_failures is computed
64+
* in zio_init() unless it has been overridden in /etc/system.
6465
*/
65-
int zfs_mg_alloc_failures;
66+
int zfs_mg_alloc_failures = 0;
67+
68+
/*
69+
* The zfs_mg_noalloc_threshold defines which metaslab groups should
70+
* be eligible for allocation. The value is defined as a percentage of
71+
* free space. Metaslab groups that have more free space than
72+
* zfs_mg_noalloc_threshold are always eligible for allocations. Once
73+
* a metaslab group's free space is less than or equal to the
74+
* zfs_mg_noalloc_threshold the allocator will avoid allocating to that
75+
* group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
76+
* Once all groups in the pool reach zfs_mg_noalloc_threshold then all
77+
* groups are allowed to accept allocations. Gang blocks are always
78+
* eligible to allocate on any metaslab group. The default value of 0 means
79+
* no metaslab group will be excluded based on this criterion.
80+
*/
81+
int zfs_mg_noalloc_threshold = 0;
6682

6783
/*
6884
* Metaslab debugging: when set, keeps all space maps in core to verify frees.
@@ -223,6 +239,53 @@ metaslab_compare(const void *x1, const void *x2)
223239
return (0);
224240
}
225241

242+
/*
243+
* Update the allocatable flag and the metaslab group's capacity.
244+
* The allocatable flag is set to true if the free capacity is above
245+
* the zfs_mg_noalloc_threshold. If a metaslab group transitions
246+
* from allocatable to non-allocatable or vice versa then the metaslab
247+
* group's class is updated to reflect the transition.
248+
*/
249+
static void
250+
metaslab_group_alloc_update(metaslab_group_t *mg)
251+
{
252+
vdev_t *vd = mg->mg_vd;
253+
metaslab_class_t *mc = mg->mg_class;
254+
vdev_stat_t *vs = &vd->vdev_stat;
255+
boolean_t was_allocatable;
256+
257+
ASSERT(vd == vd->vdev_top);
258+
259+
mutex_enter(&mg->mg_lock);
260+
was_allocatable = mg->mg_allocatable;
261+
262+
mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
263+
(vs->vs_space + 1);
264+
265+
mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
266+
267+
/*
268+
* The mc_alloc_groups maintains a count of the number of
269+
* groups in this metaslab class that are still above the
270+
* zfs_mg_noalloc_threshold. This is used by the allocating
271+
* threads to determine if they should avoid allocations to
272+
* a given group. The allocator will avoid allocations to a group
273+
* if that group has reached or is below the zfs_mg_noalloc_threshold
274+
* and there are still other groups that are above the threshold.
275+
* When a group transitions from allocatable to non-allocatable or
276+
* vice versa we update the metaslab class to reflect that change.
277+
* When the mc_alloc_groups value drops to 0 that means that all
278+
* groups have reached the zfs_mg_noalloc_threshold making all groups
279+
* eligible for allocations. This effectively means that all devices
280+
* are balanced again.
281+
*/
282+
if (was_allocatable && !mg->mg_allocatable)
283+
mc->mc_alloc_groups--;
284+
else if (!was_allocatable && mg->mg_allocatable)
285+
mc->mc_alloc_groups++;
286+
mutex_exit(&mg->mg_lock);
287+
}
288+
226289
metaslab_group_t *
227290
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
228291
{
@@ -273,6 +336,7 @@ metaslab_group_activate(metaslab_group_t *mg)
273336
return;
274337

275338
mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
339+
metaslab_group_alloc_update(mg);
276340

277341
if ((mgprev = mc->mc_rotor) == NULL) {
278342
mg->mg_prev = mg;
@@ -357,6 +421,29 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
357421
mutex_exit(&mg->mg_lock);
358422
}
359423

424+
/*
425+
* Determine if a given metaslab group should skip allocations. A metaslab
426+
* group should avoid allocations if its free capacity is at or below the
427+
* zfs_mg_noalloc_threshold and there is at least one metaslab group
428+
* that can still handle allocations.
429+
*/
430+
static boolean_t
431+
metaslab_group_allocatable(metaslab_group_t *mg)
432+
{
433+
vdev_t *vd = mg->mg_vd;
434+
spa_t *spa = vd->vdev_spa;
435+
metaslab_class_t *mc = mg->mg_class;
436+
437+
/*
438+
* A metaslab group is considered allocatable if its free capacity
439+
* is greater than the set value of zfs_mg_noalloc_threshold, it's
440+
* associated with a slog, or there are no other metaslab groups
441+
* with free capacity greater than zfs_mg_noalloc_threshold.
442+
*/
443+
return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
444+
mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
445+
}
446+
360447
/*
361448
* ==========================================================================
362449
* Common allocator routines
@@ -1301,6 +1388,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
13011388
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
13021389
}
13031390

1391+
metaslab_group_alloc_update(mg);
1392+
13041393
/*
13051394
* If the map is loaded but no longer active, evict it as soon as all
13061395
* future allocations have synced. (If we unloaded it now and then
@@ -1430,6 +1519,8 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
14301519
if (msp == NULL)
14311520
return (-1ULL);
14321521

1522+
mutex_enter(&msp->ms_lock);
1523+
14331524
/*
14341525
* If we've already reached the allowable number of failed
14351526
* allocation attempts on this metaslab group then we
@@ -1446,11 +1537,10 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
14461537
"asize %llu, failures %llu", spa_name(spa),
14471538
mg->mg_vd->vdev_id, txg, mg, psize, asize,
14481539
mg->mg_alloc_failures);
1540+
mutex_exit(&msp->ms_lock);
14491541
return (-1ULL);
14501542
}
14511543

1452-
mutex_enter(&msp->ms_lock);
1453-
14541544
/*
14551545
* Ensure that the metaslab we have selected is still
14561546
* capable of handling our request. It's possible that
@@ -1615,6 +1705,21 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
16151705
} else {
16161706
allocatable = vdev_allocatable(vd);
16171707
}
1708+
1709+
/*
1710+
* Determine if the selected metaslab group is eligible
1711+
* for allocations. If we're ganging or have requested
1712+
* an allocation for the smallest gang block size
1713+
* then we don't want to avoid allocating to this
1714+
* metaslab group. If we're in this condition we should
1715+
* try to allocate from any device possible so that we
1716+
* don't inadvertently return ENOSPC and suspend the pool
1717+
* even though space is still available.
1718+
*/
1719+
if (allocatable && CAN_FASTGANG(flags) &&
1720+
psize > SPA_GANGBLOCKSIZE)
1721+
allocatable = metaslab_group_allocatable(mg);
1722+
16181723
if (!allocatable)
16191724
goto next;
16201725

module/zfs/zfs_ioctl.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5351,7 +5351,7 @@ zfs_ioctl_init(void)
53515351
zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
53525352

53535353
zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear,
5354-
zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED);
5354+
zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
53555355
zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen,
53565356
zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED);
53575357

module/zfs/zio.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,8 @@ zio_init(void)
227227
* The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
228228
* to fail 3 times per txg or 8 failures, whichever is greater.
229229
*/
230-
zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
230+
if (zfs_mg_alloc_failures == 0)
231+
zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
231232

232233
zio_inject_init();
233234

@@ -2518,7 +2519,7 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
25182519
if (error) {
25192520
error = metaslab_alloc(spa, spa_normal_class(spa), size,
25202521
new_bp, 1, txg, NULL,
2521-
METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
2522+
METASLAB_FASTWRITE);
25222523
}
25232524

25242525
if (error == 0) {

0 commit comments

Comments
 (0)