@@ -60,9 +60,25 @@ int zfs_condense_pct = 200;
 /*
  * This value defines the number of allowed allocation failures per vdev.
  * If a device reaches this threshold in a given txg then we consider skipping
- * allocations on that device.
+ * allocations on that device. The value of zfs_mg_alloc_failures is computed
+ * in zio_init() unless it has been overridden in /etc/system.
  */
-int zfs_mg_alloc_failures;
+int zfs_mg_alloc_failures = 0;
+
+/*
+ * The zfs_mg_noalloc_threshold defines which metaslab groups should
+ * be eligible for allocation. The value is defined as a percentage of
+ * free space. Metaslab groups that have more free space than
+ * zfs_mg_noalloc_threshold are always eligible for allocations. Once
+ * a metaslab group's free space is less than or equal to the
+ * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
+ * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
+ * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
+ * groups are allowed to accept allocations. Gang blocks are always
+ * eligible to allocate on any metaslab group. The default value of 0 means
+ * no metaslab group will be excluded based on this criterion.
+ */
+int zfs_mg_noalloc_threshold = 0;

 /*
  * Metaslab debugging: when set, keeps all space maps in core to verify frees.
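
Both tunables default to 0 in this change: zfs_mg_alloc_failures is given its real value in zio_init() unless it has been overridden in /etc/system (as the comment notes), and a zfs_mg_noalloc_threshold of 0 leaves the new eligibility check effectively disabled. Purely as an illustration (this commit adds no /etc/system entries itself), the usual illumos tunable syntax for pinning either value would be a line such as

    set zfs:zfs_mg_noalloc_threshold = 10

which would make groups at 10% free space or less unattractive to the allocator for as long as some other group in the normal class still sits above the threshold.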
@@ -223,6 +239,53 @@ metaslab_compare(const void *x1, const void *x2)
 	return (0);
 }

+/*
+ * Update the allocatable flag and the metaslab group's capacity.
+ * The allocatable flag is set to true if the group's free capacity
+ * is greater than the zfs_mg_noalloc_threshold. If a metaslab group
+ * transitions from allocatable to non-allocatable or vice versa then
+ * the metaslab group's class is updated to reflect the transition.
+ */
+static void
+metaslab_group_alloc_update(metaslab_group_t *mg)
+{
+	vdev_t *vd = mg->mg_vd;
+	metaslab_class_t *mc = mg->mg_class;
+	vdev_stat_t *vs = &vd->vdev_stat;
+	boolean_t was_allocatable;
+
+	ASSERT(vd == vd->vdev_top);
+
+	mutex_enter(&mg->mg_lock);
+	was_allocatable = mg->mg_allocatable;
+
+	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
+	    (vs->vs_space + 1);
+
+	mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
+
+	/*
+	 * The mc_alloc_groups maintains a count of the number of
+	 * groups in this metaslab class that are still above the
+	 * zfs_mg_noalloc_threshold. This is used by the allocating
+	 * threads to determine if they should avoid allocations to
+	 * a given group. The allocator will avoid allocations to a group
+	 * if that group has reached or is below the zfs_mg_noalloc_threshold
+	 * and there are still other groups that are above the threshold.
+	 * When a group transitions from allocatable to non-allocatable or
+	 * vice versa we update the metaslab class to reflect that change.
+	 * When the mc_alloc_groups value drops to 0 that means that all
+	 * groups have reached the zfs_mg_noalloc_threshold making all groups
+	 * eligible for allocations. This effectively means that all devices
+	 * are balanced again.
+	 */
+	if (was_allocatable && !mg->mg_allocatable)
+		mc->mc_alloc_groups--;
+	else if (!was_allocatable && mg->mg_allocatable)
+		mc->mc_alloc_groups++;
+	mutex_exit(&mg->mg_lock);
+}
+
 metaslab_group_t *
 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 {
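
As a side note on the arithmetic in metaslab_group_alloc_update(): the + 1 in the divisor keeps a zero-sized vdev from dividing by zero, and integer division rounds the free-space percentage down. A minimal, self-contained sketch of that computation follows; the vdev sizes and the threshold of 5 are hypothetical values chosen for illustration, not anything taken from the change.

    #include <stdio.h>
    #include <stdint.h>

    /* Hypothetical threshold; the patch itself defaults zfs_mg_noalloc_threshold to 0. */
    static const uint64_t noalloc_threshold = 5;

    /* Mirrors mg_free_capacity = ((vs_space - vs_alloc) * 100) / (vs_space + 1). */
    static uint64_t
    free_capacity(uint64_t vs_space, uint64_t vs_alloc)
    {
            return (((vs_space - vs_alloc) * 100) / (vs_space + 1));
    }

    int
    main(void)
    {
            /* 1000 GiB vdev with 960 GiB allocated: nominally 4% free, computes to 3. */
            uint64_t cap = free_capacity(1000ULL << 30, 960ULL << 30);

            printf("free capacity: %llu%% -> %s\n", (unsigned long long)cap,
                cap > noalloc_threshold ? "allocatable" : "skip while other groups remain");
            return (0);
    }

With the shipped default of 0, any group that still has measurable free space passes the comparison, matching the comment above that the default excludes nothing in practice.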
@@ -273,6 +336,7 @@ metaslab_group_activate(metaslab_group_t *mg)
 		return;

 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
+	metaslab_group_alloc_update(mg);

 	if ((mgprev = mc->mc_rotor) == NULL) {
 		mg->mg_prev = mg;
@@ -357,6 +421,29 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
 	mutex_exit(&mg->mg_lock);
 }

+/*
+ * Determine if a given metaslab group should skip allocations. A metaslab
+ * group should avoid allocations if its free capacity has dropped to or
+ * below the zfs_mg_noalloc_threshold and there is at least one other
+ * metaslab group that can still handle allocations.
+ */
+static boolean_t
+metaslab_group_allocatable(metaslab_group_t *mg)
+{
+	vdev_t *vd = mg->mg_vd;
+	spa_t *spa = vd->vdev_spa;
+	metaslab_class_t *mc = mg->mg_class;
+
+	/*
+	 * A metaslab group is considered allocatable if its free capacity
+	 * is greater than the set value of zfs_mg_noalloc_threshold, it's
+	 * associated with a slog, or there are no other metaslab groups
+	 * with free capacity greater than zfs_mg_noalloc_threshold.
+	 */
+	return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
+	    mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
+}
+
 /*
  * ==========================================================================
  * Common allocator routines
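
A brief design note on the escape clauses above: classes other than the normal class (for example a slog) are never throttled by this predicate, and treating mc_alloc_groups == 0 as allocatable means that once every normal-class group has fallen to or below the threshold the restriction lifts entirely, so a nearly full pool keeps filling its devices evenly instead of reporting ENOSPC while space is still available.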
@@ -1301,6 +1388,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
 	}

+	metaslab_group_alloc_update(mg);
+
 	/*
 	 * If the map is loaded but no longer active, evict it as soon as all
 	 * future allocations have synced. (If we unloaded it now and then
@@ -1430,6 +1519,8 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
 		if (msp == NULL)
 			return (-1ULL);

+		mutex_enter(&msp->ms_lock);
+
 		/*
 		 * If we've already reached the allowable number of failed
 		 * allocation attempts on this metaslab group then we
@@ -1446,11 +1537,10 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
 			    "asize %llu, failures %llu", spa_name(spa),
 			    mg->mg_vd->vdev_id, txg, mg, psize, asize,
 			    mg->mg_alloc_failures);
+			mutex_exit(&msp->ms_lock);
 			return (-1ULL);
 		}

-		mutex_enter(&msp->ms_lock);
-
 		/*
 		 * Ensure that the metaslab we have selected is still
 		 * capable of handling our request. It's possible that
@@ -1615,6 +1705,21 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 		} else {
 			allocatable = vdev_allocatable(vd);
 		}
+
+		/*
+		 * Determine if the selected metaslab group is eligible
+		 * for allocations. If we're ganging or have requested
+		 * an allocation for the smallest gang block size
+		 * then we don't want to avoid allocating to this
+		 * metaslab group. If we're in this condition we should
+		 * try to allocate from any device possible so that we
+		 * don't inadvertently return ENOSPC and suspend the pool
+		 * even though space is still available.
+		 */
+		if (allocatable && CAN_FASTGANG(flags) &&
+		    psize > SPA_GANGBLOCKSIZE)
+			allocatable = metaslab_group_allocatable(mg);
+
 		if (!allocatable)
 			goto next;

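
CAN_FASTGANG() and SPA_GANGBLOCKSIZE are defined outside this file and are not visible in this hunk. The net effect of the added condition is that the group-eligibility check applies only to ordinary allocations, ones that are not already ganging and that are larger than the smallest gang block size; gang headers and gang children keep the old behavior and may be placed on any device, which is what the comment means about not inadvertently returning ENOSPC and suspending the pool.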