@@ -20,6 +20,7 @@
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -30,11 +31,30 @@
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 
-#define WITH_NDF_BLOCK_ALLOCATOR
+#define WITH_DF_BLOCK_ALLOCATOR
+
+/*
+ * Allow allocations to switch to gang blocks quickly. We do this to
+ * avoid having to load lots of space_maps in a given txg. There are,
+ * however, some cases where we want to avoid "fast" ganging and instead
+ * we want to do an exhaustive search of all metaslabs on this device.
+ * Currently we don't allow any gang or dump device related allocations
+ * to "fast" gang.
+ */
+#define CAN_FASTGANG(flags) \
+	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
+	METASLAB_GANG_AVOID)))
 
 uint64_t metaslab_aliquot = 512ULL << 10;
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
 
+/*
+ * This value defines the number of allowed allocation failures per vdev.
+ * If a device reaches this threshold in a given txg then we consider skipping
+ * allocations on that device.
+ */
+int zfs_mg_alloc_failures;
+
 /*
  * Metaslab debugging: when set, keeps all space maps in core to verify frees.
  */
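
The new CAN_FASTGANG() macro above is a pure flag test: an allocation that is itself a gang header or gang child, or one explicitly marked to avoid ganging, must keep doing the exhaustive metaslab search instead of falling back to a gang block early. A minimal standalone sketch of the same test, using stand-in flag values rather than the real definitions from sys/metaslab.h:

    #include <stdio.h>

    /* Stand-in values for illustration; the real flags live in sys/metaslab.h. */
    #define METASLAB_GANG_HEADER    0x2
    #define METASLAB_GANG_CHILD     0x4
    #define METASLAB_GANG_AVOID     0x8

    #define CAN_FASTGANG(flags) \
        (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
        METASLAB_GANG_AVOID)))

    int
    main(void)
    {
        /* An ordinary write is allowed to fast-gang. */
        printf("plain write: %d\n", CAN_FASTGANG(0));
        /* Gang-related allocations are not. */
        printf("gang child:  %d\n", CAN_FASTGANG(METASLAB_GANG_CHILD));
        printf("gang avoid:  %d\n", CAN_FASTGANG(METASLAB_GANG_AVOID));
        return (0);
    }

This prints 1 for the plain write and 0 for the two gang cases.
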
@@ -865,7 +885,7 @@ metaslab_prefetch(metaslab_group_t *mg)
 }
 
 static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	space_map_t *sm = &msp->ms_map;
@@ -899,13 +919,6 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
 			mutex_exit(&mg->mg_lock);
 		}
 
-		/*
-		 * If we were able to load the map then make sure
-		 * that this map is still able to satisfy our request.
-		 */
-		if (msp->ms_weight < size)
-			return (ENOSPC);
-
 		metaslab_group_sort(msp->ms_group, msp,
 		    msp->ms_weight | activation_weight);
 	}
@@ -1123,6 +1136,7 @@ void
 metaslab_sync_reassess(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
+	int64_t failures = mg->mg_alloc_failures;
 	int m;
 
 	/*
@@ -1140,6 +1154,8 @@ metaslab_sync_reassess(metaslab_group_t *mg)
 		mutex_exit(&msp->ms_lock);
 	}
 
+	atomic_add_64(&mg->mg_alloc_failures, -failures);
+
 	/*
 	 * Prefetch the next potential metaslabs
 	 */
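
Note how the failure counter is cleared above: metaslab_sync_reassess() snapshots mg_alloc_failures into a local at entry and then subtracts that snapshot with atomic_add_64(), rather than storing zero. Failures recorded by concurrent allocators after the snapshot was taken therefore survive the reset. A small sketch of the same subtract-a-snapshot pattern, using C11 atomics in place of the kernel's atomic_add_64():

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        _Atomic int64_t failures = 0;

        atomic_fetch_add(&failures, 3);             /* failures seen this txg */
        int64_t snapshot = atomic_load(&failures);  /* taken at reassess entry */
        atomic_fetch_add(&failures, 1);             /* failure racing with the reset */
        atomic_fetch_add(&failures, -snapshot);     /* the reset */

        /* The racing failure is preserved: prints 1, not 0. */
        printf("%lld\n", (long long)atomic_load(&failures));
        return (0);
    }
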
@@ -1164,9 +1180,10 @@ metaslab_distance(metaslab_t *msp, dva_t *dva)
 }
 
 static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
-    uint64_t min_distance, dva_t *dva, int d)
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
+    uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
 {
+	spa_t *spa = mg->mg_vd->vdev_spa;
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
 	avl_tree_t *t = &mg->mg_metaslab_tree;
@@ -1187,11 +1204,17 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 
 	mutex_enter(&mg->mg_lock);
 	for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
-		if (msp->ms_weight < size) {
+		if (msp->ms_weight < asize) {
+			spa_dbgmsg(spa, "%s: failed to meet weight "
+			    "requirement: vdev %llu, txg %llu, mg %p, "
+			    "msp %p, psize %llu, asize %llu, "
+			    "failures %llu, weight %llu",
+			    spa_name(spa), mg->mg_vd->vdev_id, txg,
+			    mg, msp, psize, asize,
+			    mg->mg_alloc_failures, msp->ms_weight);
 			mutex_exit(&mg->mg_lock);
 			return (-1ULL);
 		}
-
 		was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 		if (activation_weight == METASLAB_WEIGHT_PRIMARY)
 			break;
@@ -1210,6 +1233,25 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 	if (msp == NULL)
 		return (-1ULL);
 
+	/*
+	 * If we've already reached the allowable number of failed
+	 * allocation attempts on this metaslab group then we
+	 * consider skipping it. We skip it only if we're allowed
+	 * to "fast" gang, the physical size is larger than
+	 * a gang block, and we're attempting to allocate from
+	 * the primary metaslab.
+	 */
+	if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
+	    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
+	    activation_weight == METASLAB_WEIGHT_PRIMARY) {
+		spa_dbgmsg(spa, "%s: skipping metaslab group: "
+		    "vdev %llu, txg %llu, mg %p, psize %llu, "
+		    "asize %llu, failures %llu", spa_name(spa),
+		    mg->mg_vd->vdev_id, txg, mg, psize, asize,
+		    mg->mg_alloc_failures);
+		return (-1ULL);
+	}
+
 	mutex_enter(&msp->ms_lock);
 
 	/*
@@ -1218,7 +1260,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 	 * another thread may have changed the weight while we
 	 * were blocked on the metaslab lock.
 	 */
-	if (msp->ms_weight < size || (was_active &&
+	if (msp->ms_weight < asize || (was_active &&
 	    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
 	    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
 		mutex_exit(&msp->ms_lock);
@@ -1233,14 +1275,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 			continue;
 		}
 
-		if (metaslab_activate(msp, activation_weight, size) != 0) {
+		if (metaslab_activate(msp, activation_weight) != 0) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
-		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
+		if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
 			break;
 
+		atomic_inc_64(&mg->mg_alloc_failures);
+
 		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
 
 		mutex_exit(&msp->ms_lock);
@@ -1249,7 +1293,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
 	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 
-	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);
 
 	mutex_exit(&msp->ms_lock);
 
@@ -1376,7 +1420,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 		asize = vdev_psize_to_asize(vd, psize);
 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 
-		offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
+		    dva, d, flags);
 		if (offset != -1ULL) {
 			/*
 			 * If we've just selected this metaslab group,
@@ -1388,18 +1433,24 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
 			vdev_stat_t *vs = &vd->vdev_stat;
 			int64_t vu, cu;
 
-			/*
-			 * Determine percent used in units of 0..1024.
-			 * (This is just to avoid floating point.)
-			 */
-			vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
-			cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
+			vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
+			cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
 
 			/*
-			 * Bias by at most +/- 25% of the aliquot.
+			 * Calculate how much more or less we should
+			 * try to allocate from this device during
+			 * this iteration around the rotor.
+			 * For example, if a device is 80% full
+			 * and the pool is 20% full then we should
+			 * reduce allocations by 60% on this device.
+			 *
+			 * mg_bias = (20 - 80) * 512K / 100 = -307K
+			 *
+			 * This reduces allocations by 307K for this
+			 * iteration.
 			 */
 			mg->mg_bias = ((cu - vu) *
-			    (int64_t)mg->mg_aliquot) / (1024 * 4);
+			    (int64_t)mg->mg_aliquot) / 100;
 		}
 
 		if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
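
The bias term above now works in whole percentage points rather than the old 0..1024 fixed-point scale, and it is no longer limited to +/- 25% of the aliquot. The -307K figure in the new comment is simply its example worked through with the default 512K metaslab_aliquot; a quick arithmetic check of that example (the 20/80 figures are the ones from the comment, not measured values):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        int64_t aliquot = 512LL << 10;  /* default metaslab_aliquot: 512K */
        int64_t cu = 20;                /* pool (class) percent used */
        int64_t vu = 80;                /* this vdev's percent used */

        /* Same formula as the patch: (cu - vu) * aliquot / 100 */
        int64_t bias = (cu - vu) * aliquot / 100;

        /* Prints -314572 bytes, i.e. roughly -307K, matching the comment. */
        printf("%lld bytes (%lld K)\n", (long long)bias,
            (long long)(bias / 1024));
        return (0);
    }
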
@@ -1513,7 +1564,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
 	mutex_enter(&msp->ms_lock);
 
 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
-		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
 
 	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
 		error = ENOENT;