Skip to content

Commit b02fe35

Browse files
Alex Reece authored and behlendorf committed
Illumos 4958 zdb trips assert on pools with ashift >= 0xe
4958 zdb trips assert on pools with ashift >= 0xe

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Max Grossman <max.grossman@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>

References:
  https://www.illumos.org/issues/4958
  illumos/illumos-gate@2a104a5

Porting notes:
  Keep the ZIO_FLAG_FASTWRITE define. This is for a feature present in
  Linux but not yet in *BSD.

Ported by: Turbo Fredriksson <turbo@bayour.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2697
1 parent adc90e9 commit b02fe35

File tree

8 files changed

+131
-46
lines changed

8 files changed

+131
-46
lines changed

cmd/ztest/ztest.c

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -859,7 +859,7 @@ static uint64_t
859859
ztest_get_ashift(void)
860860
{
861861
if (ztest_opts.zo_ashift == 0)
862-
return (SPA_MINBLOCKSHIFT + ztest_random(3));
862+
return (SPA_MINBLOCKSHIFT + ztest_random(5));
863863
return (ztest_opts.zo_ashift);
864864
}
865865

@@ -1021,11 +1021,28 @@ ztest_random_spa_version(uint64_t initial_version)
10211021
return (version);
10221022
}
10231023

1024+
/*
1025+
* Find the largest ashift used by any child of the pool's root vdev.
1026+
*/
1027+
static uint64_t
1028+
ztest_spa_get_ashift() {
1029+
uint64_t i;
1030+
uint64_t ashift = SPA_MINBLOCKSHIFT;
1031+
vdev_t *rvd = ztest_spa->spa_root_vdev;
1032+
1033+
for (i = 0; i < rvd->vdev_children; i++) {
1034+
ashift = MAX(ashift, rvd->vdev_child[i]->vdev_ashift);
1035+
}
1036+
return (ashift);
1037+
}
1038+
10241039
static int
10251040
ztest_random_blocksize(void)
10261041
{
1027-
return (1 << (SPA_MINBLOCKSHIFT +
1028-
ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)));
1042+
// Choose a block size >= the ashift.
1043+
uint64_t block_shift =
1044+
ztest_random(SPA_MAXBLOCKSHIFT - ztest_spa_get_ashift() + 1);
1045+
return (1 << (SPA_MINBLOCKSHIFT + block_shift));
10291046
}
10301047

10311048
static int
@@ -5963,17 +5980,31 @@ ztest_freeze(void)
59635980
*/
59645981
spa_freeze(spa);
59655982

5983+
/*
5984+
* Because it is hard to predict how much space a write will actually
5985+
* require beforehand, we leave ourselves some fudge space to write over
5986+
* capacity.
5987+
*/
5988+
uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2;
5989+
59665990
/*
59675991
* Run tests that generate log records but don't alter the pool config
59685992
* or depend on DSL sync tasks (snapshots, objset create/destroy, etc).
59695993
* We do a txg_wait_synced() after each iteration to force the txg
59705994
* to increase well beyond the last synced value in the uberblock.
59715995
* The ZIL should be OK with that.
5996+
*
5997+
* Run a random number of times less than zo_maxloops and ensure we do
5998+
* not run out of space on the pool.
59725999
*/
59736000
while (ztest_random(10) != 0 &&
5974-
numloops++ < ztest_opts.zo_maxloops) {
5975-
ztest_dmu_write_parallel(zd, 0);
5976-
ztest_dmu_object_alloc_free(zd, 0);
6001+
numloops++ < ztest_opts.zo_maxloops &&
6002+
metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) {
6003+
ztest_od_t od;
6004+
ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0);
6005+
VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE));
6006+
ztest_io(zd, od.od_object,
6007+
ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT);
59776008
txg_wait_synced(spa_get_dsl(spa), 0);
59786009
}
59796010

include/sys/vdev_impl.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
*/
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23-
* Copyright (c) 2013 by Delphix. All rights reserved.
23+
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
2424
*/
2525

2626
#ifndef _SYS_VDEV_IMPL_H
@@ -239,8 +239,11 @@ struct vdev {
239239
#define VDEV_PHYS_SIZE (112 << 10)
240240
#define VDEV_UBERBLOCK_RING (128 << 10)
241241

242+
/* The largest uberblock we support is 8k. */
243+
#define MAX_UBERBLOCK_SHIFT (13)
242244
#define VDEV_UBERBLOCK_SHIFT(vd) \
243-
MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT)
245+
MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \
246+
MAX_UBERBLOCK_SHIFT)
244247
#define VDEV_UBERBLOCK_COUNT(vd) \
245248
(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
246249
#define VDEV_UBERBLOCK_OFFSET(vd, n) \

include/sys/zio.h

Lines changed: 24 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -167,48 +167,49 @@ enum zio_flag {
167167
ZIO_FLAG_RESILVER = 1 << 3,
168168
ZIO_FLAG_SCRUB = 1 << 4,
169169
ZIO_FLAG_SCAN_THREAD = 1 << 5,
170+
ZIO_FLAG_PHYSICAL = 1 << 6,
170171

171172
#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)
172173

173174
/*
174175
* Flags inherited by ddt, gang, and vdev children.
175176
*/
176-
ZIO_FLAG_CANFAIL = 1 << 6, /* must be first for INHERIT */
177-
ZIO_FLAG_SPECULATIVE = 1 << 7,
178-
ZIO_FLAG_CONFIG_WRITER = 1 << 8,
179-
ZIO_FLAG_DONT_RETRY = 1 << 9,
180-
ZIO_FLAG_DONT_CACHE = 1 << 10,
181-
ZIO_FLAG_NODATA = 1 << 11,
182-
ZIO_FLAG_INDUCE_DAMAGE = 1 << 12,
177+
ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */
178+
ZIO_FLAG_SPECULATIVE = 1 << 8,
179+
ZIO_FLAG_CONFIG_WRITER = 1 << 9,
180+
ZIO_FLAG_DONT_RETRY = 1 << 10,
181+
ZIO_FLAG_DONT_CACHE = 1 << 11,
182+
ZIO_FLAG_NODATA = 1 << 12,
183+
ZIO_FLAG_INDUCE_DAMAGE = 1 << 13,
183184

184185
#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
185186
#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
186187

187188
/*
188189
* Flags inherited by vdev children.
189190
*/
190-
ZIO_FLAG_IO_RETRY = 1 << 13, /* must be first for INHERIT */
191-
ZIO_FLAG_PROBE = 1 << 14,
192-
ZIO_FLAG_TRYHARD = 1 << 15,
193-
ZIO_FLAG_OPTIONAL = 1 << 16,
191+
ZIO_FLAG_IO_RETRY = 1 << 14, /* must be first for INHERIT */
192+
ZIO_FLAG_PROBE = 1 << 15,
193+
ZIO_FLAG_TRYHARD = 1 << 16,
194+
ZIO_FLAG_OPTIONAL = 1 << 17,
194195

195196
#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
196197

197198
/*
198199
* Flags not inherited by any children.
199200
*/
200-
ZIO_FLAG_DONT_QUEUE = 1 << 17, /* must be first for INHERIT */
201-
ZIO_FLAG_DONT_PROPAGATE = 1 << 18,
202-
ZIO_FLAG_IO_BYPASS = 1 << 19,
203-
ZIO_FLAG_IO_REWRITE = 1 << 20,
204-
ZIO_FLAG_RAW = 1 << 21,
205-
ZIO_FLAG_GANG_CHILD = 1 << 22,
206-
ZIO_FLAG_DDT_CHILD = 1 << 23,
207-
ZIO_FLAG_GODFATHER = 1 << 24,
208-
ZIO_FLAG_NOPWRITE = 1 << 25,
209-
ZIO_FLAG_REEXECUTED = 1 << 26,
210-
ZIO_FLAG_DELEGATED = 1 << 27,
211-
ZIO_FLAG_FASTWRITE = 1 << 28
201+
ZIO_FLAG_DONT_QUEUE = 1 << 18, /* must be first for INHERIT */
202+
ZIO_FLAG_DONT_PROPAGATE = 1 << 19,
203+
ZIO_FLAG_IO_BYPASS = 1 << 20,
204+
ZIO_FLAG_IO_REWRITE = 1 << 21,
205+
ZIO_FLAG_RAW = 1 << 22,
206+
ZIO_FLAG_GANG_CHILD = 1 << 23,
207+
ZIO_FLAG_DDT_CHILD = 1 << 24,
208+
ZIO_FLAG_GODFATHER = 1 << 25,
209+
ZIO_FLAG_NOPWRITE = 1 << 26,
210+
ZIO_FLAG_REEXECUTED = 1 << 27,
211+
ZIO_FLAG_DELEGATED = 1 << 28,
212+
ZIO_FLAG_FASTWRITE = 1 << 29,
212213
};
213214

214215
#define ZIO_FLAG_MUSTSUCCEED 0

module/zfs/dsl_pool.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
*/
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23-
* Copyright (c) 2013 by Delphix. All rights reserved.
23+
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
2424
* Copyright (c) 2013 Steven Hartland. All rights reserved.
2525
*/
2626

module/zfs/metaslab.c

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,21 @@ uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
6464
*/
6565
int zfs_condense_pct = 200;
6666

67+
/*
68+
* Condensing a metaslab is not guaranteed to actually reduce the amount of
69+
* space used on disk. In particular, a space map uses data in increments of
70+
* MAX(1 << ashift, SPACE_MAP_INITIAL_BLOCKSIZE), so a metaslab might use the
71+
* same number of blocks after condensing. Since the goal of condensing is to
72+
* reduce the number of IOPs required to read the space map, we only want to
73+
* condense when we can be sure we will reduce the number of blocks used by the
74+
* space map. Unfortunately, we cannot precisely compute whether or not this is
75+
* the case in metaslab_should_condense since we are holding ms_lock. Instead,
76+
* we apply the following heuristic: do not condense a spacemap unless the
77+
* uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
78+
* blocks.
79+
*/
80+
int zfs_metaslab_condense_block_threshold = 4;
81+
6782
/*
6883
* The zfs_mg_noalloc_threshold defines which metaslab groups should
6984
* be eligible for allocation. The value is defined as a percentage of
@@ -1633,6 +1648,8 @@ metaslab_group_preload(metaslab_group_t *mg)
16331648
* times the size than the free space range tree representation
16341649
* (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
16351650
*
1651+
* 3. The on-disk size of the space map should actually decrease.
1652+
*
16361653
* Checking the first condition is tricky since we don't want to walk
16371654
* the entire AVL tree calculating the estimated on-disk size. Instead we
16381655
* use the size-ordered range tree in the metaslab and calculate the
@@ -1643,13 +1660,21 @@ metaslab_group_preload(metaslab_group_t *mg)
16431660
* To determine the second criterion we use a best-case estimate and assume
16441661
* each segment can be represented on-disk as a single 64-bit entry. We refer
16451662
* to this best-case estimate as the space map's minimal form.
1663+
*
1664+
* Unfortunately, we cannot compute the on-disk size of the space map in this
1665+
* context because we cannot accurately compute the effects of compression, etc.
1666+
* Instead, we apply the heuristic described in the block comment for
1667+
* zfs_metaslab_condense_block_threshold - we only condense if the space used
1668+
* is greater than a threshold number of blocks.
16461669
*/
16471670
static boolean_t
16481671
metaslab_should_condense(metaslab_t *msp)
16491672
{
16501673
space_map_t *sm = msp->ms_sm;
16511674
range_seg_t *rs;
1652-
uint64_t size, entries, segsz;
1675+
uint64_t size, entries, segsz, object_size, optimal_size, record_size;
1676+
dmu_object_info_t doi;
1677+
uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
16531678

16541679
ASSERT(MUTEX_HELD(&msp->ms_lock));
16551680
ASSERT(msp->ms_loaded);
@@ -1674,9 +1699,15 @@ metaslab_should_condense(metaslab_t *msp)
16741699
entries = size / (MIN(size, SM_RUN_MAX));
16751700
segsz = entries * sizeof (uint64_t);
16761701

1677-
return (segsz <= space_map_length(msp->ms_sm) &&
1678-
space_map_length(msp->ms_sm) >= (zfs_condense_pct *
1679-
sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100);
1702+
optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
1703+
object_size = space_map_length(msp->ms_sm);
1704+
1705+
dmu_object_info_from_db(sm->sm_dbuf, &doi);
1706+
record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
1707+
1708+
return (segsz <= object_size &&
1709+
object_size >= (optimal_size * zfs_condense_pct / 100) &&
1710+
object_size > zfs_metaslab_condense_block_threshold * record_size);
16801711
}
16811712

16821713
/*

module/zfs/spa_misc.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
*/
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23-
* Copyright (c) 2013 by Delphix. All rights reserved.
23+
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
2424
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
2525
*/
2626

module/zfs/zfs_debug.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
*/
2121
/*
2222
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23-
* Copyright (c) 2013 by Delphix. All rights reserved.
23+
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
2424
*/
2525

2626
#include <sys/zfs_context.h>
@@ -136,7 +136,10 @@ zfs_dbgmsg_fini(void)
136136
* echo ::zfs_dbgmsg | mdb -k
137137
*
138138
* Monitor these messages by running:
139-
* dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
139+
* dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
140+
*
141+
* When used with libzpool, monitor with:
142+
* dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}'
140143
*/
141144
void
142145
zfs_dbgmsg(const char *fmt, ...)

module/zfs/zio.c

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -889,8 +889,8 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
889889
ASSERT3U(offset + size, <=, vd->vdev_psize);
890890

891891
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
892-
ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
893-
ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
892+
ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
893+
NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
894894

895895
zio->io_prop.zp_checksum = checksum;
896896

@@ -910,8 +910,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
910910
ASSERT3U(offset + size, <=, vd->vdev_psize);
911911

912912
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
913-
ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
914-
ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
913+
ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
914+
NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
915915

916916
zio->io_prop.zp_checksum = checksum;
917917

@@ -2642,7 +2642,9 @@ zio_vdev_io_start(zio_t *zio)
26422642

26432643
align = 1ULL << vd->vdev_top->vdev_ashift;
26442644

2645-
if (P2PHASE(zio->io_size, align) != 0) {
2645+
if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
2646+
P2PHASE(zio->io_size, align) != 0) {
2647+
/* Transform logical writes to be a full physical block size. */
26462648
uint64_t asize = P2ROUNDUP(zio->io_size, align);
26472649
char *abuf = zio_buf_alloc(asize);
26482650
ASSERT(vd == vd->vdev_top);
@@ -2653,8 +2655,22 @@ zio_vdev_io_start(zio_t *zio)
26532655
zio_push_transform(zio, abuf, asize, asize, zio_subblock);
26542656
}
26552657

2656-
ASSERT(P2PHASE(zio->io_offset, align) == 0);
2657-
ASSERT(P2PHASE(zio->io_size, align) == 0);
2658+
/*
2659+
* If this is not a physical io, make sure that it is properly aligned
2660+
* before proceeding.
2661+
*/
2662+
if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
2663+
ASSERT0(P2PHASE(zio->io_offset, align));
2664+
ASSERT0(P2PHASE(zio->io_size, align));
2665+
} else {
2666+
/*
2667+
* For physical writes, we allow 512b aligned writes and assume
2668+
* the device will perform a read-modify-write as necessary.
2669+
*/
2670+
ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
2671+
ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
2672+
}
2673+
26582674
VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
26592675

26602676
/*

0 commit comments

Comments
 (0)