Review Feedback
* Fix extended iostat reporting.  We should consider bundling
  the trim and autotrim IOs together to keep the reporting
  clear.  While useful during development, I don't see a use
  case for keeping them split.

* Include aggregate TRIMs in request size histograms.  Currently
  TRIMs will never be aggregated, but that's not set in stone, so
  let's report them in the same way as all other IOs.  A module
  option was added to allow aggregation for testing (see the
  example after this list).

* Add missing NULL to IOS_LATENCY headers.

* Prevent print_iostat_labels() from underflowing when calculating
  the number of spaces to print.  With the previous unsigned
  arithmetic, a label wider than its column (e.g. an 8-character
  label in a 6-character column) wrapped to a huge space count.

* Reduce the number of threads allowed in the TRIM taskq.

* Updated man pages.

* Switched the functional/trim/* ZTS tests to file-based vdevs,
  which allows raidz3 to be tested.  Additionally, reduced the
  maximum file size and number of directories to speed up the
  tests.

* In vdev_autotrim_thread(), check vdev_autotrim_thread under
  the appropriate lock.

* Fix ztest ASSERT where a top-level vdev is replaced by its
  sole child.  In this case the autotrim thread must be stopped
  on the old top-level vdev and then restarted, if needed, on
  the new one.
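
As a quick check of the new reporting (illustrative; "tank" is a
placeholder pool name):

  # zpool iostat -r tank

The request size histograms now include ind and agg columns for
both trim and autotrim IOs, alongside the other IO types.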

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
behlendorf committed Feb 20, 2019
1 parent becdf2b commit d6b68ad
Showing 18 changed files with 147 additions and 79 deletions.
32 changes: 21 additions & 11 deletions cmd/zpool/zpool_main.c
@@ -195,7 +195,7 @@ enum iostat_type {
* of all the nvlists a flag requires. Also specifies the order in
* which data gets printed in zpool iostat.
*/
static const char *vsx_type_to_nvlist[IOS_COUNT][13] = {
static const char *vsx_type_to_nvlist[IOS_COUNT][15] = {
[IOS_L_HISTO] = {
ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
@@ -238,7 +238,9 @@ static const char *vsx_type_to_nvlist[IOS_COUNT][13] = {
ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO,
ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO,
ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO,
ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO,
ZPOOL_CONFIG_VDEV_IND_AUTOTRIM_HISTO,
ZPOOL_CONFIG_VDEV_AGG_AUTOTRIM_HISTO,
NULL},
};

@@ -3445,15 +3447,16 @@ static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] =
[IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2},
{NULL}},
[IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2},
{"asyncq_wait", 2}, {"scrub"}, {"trim"}, {"autotrim"}},
{"asyncq_wait", 2}, {"scrub", 1}, {"trim", 1}, {"atrim", 1},
{NULL}},
[IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2},
{"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2},
{"trim", 2}, {NULL}},
{"trimq_read", 2}, {"atrim_read", 2}, {NULL}},
[IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2},
{"asyncq_wait", 2}, {"trimq", 2}, {"autotrimq", 2}, {NULL}},
{"asyncq_wait", 2}, {NULL}},
[IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2},
{"async_read", 2}, {"async_write", 2}, {"scrub", 2},
{"trim", 2}, {NULL}},
{"trim", 2}, {"autotrim", 2}, {NULL}},
};

/* Shorthand - if "columns" field not set, default to 1 column */
@@ -3469,9 +3472,10 @@ static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] =
{"pend"}, {"activ"}, {"pend"}, {"activ"}, {NULL}},
[IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"},
{"write"}, {"read"}, {"write"}, {"scrub"}, {"trim"},
{"autotrim"}, {NULL}},
{"atrim"}, {NULL}},
[IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"},
{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"man"}, {"auto"}, {NULL}},
{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"},
{"ind"}, {"agg"}, {NULL}},
};

static const char *histo_to_title[] = {
@@ -3530,6 +3534,8 @@ default_column_width(iostat_cbdata_t *cb, enum iostat_type type)
[IOS_DEFAULT] = 15, /* 1PB capacity */
[IOS_LATENCY] = 10, /* 1B ns = 10sec */
[IOS_QUEUES] = 6, /* 1M queue entries */
[IOS_L_HISTO] = 10, /* 1B ns = 10sec */
[IOS_RQ_HISTO] = 6, /* 1M queue entries */
};

if (cb->cb_literal)
@@ -3552,7 +3558,7 @@ print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width,
const name_and_columns_t labels[][IOSTAT_MAX_LABELS])
{
int i, idx, s;
unsigned int text_start, rw_column_width, spaces_to_end;
int text_start, rw_column_width, spaces_to_end;
uint64_t flags = cb->cb_flags;
uint64_t f;
unsigned int column_width = force_column_width;
@@ -3576,8 +3582,10 @@ print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width,
rw_column_width = (column_width * columns) +
(2 * (columns - 1));

text_start = (int)((rw_column_width)/columns -
slen/columns);
text_start = (int)((rw_column_width) / columns -
slen / columns);
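/* Clamp to zero when the label is wider than its column */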
if (text_start < 0)
text_start = 0;

printf(" "); /* Two spaces between columns */

@@ -3589,9 +3597,11 @@ print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width,

/* Print space after label to end of column */
spaces_to_end = rw_column_width - text_start - slen;
if (spaces_to_end < 0)
spaces_to_end = 0;

for (s = 0; s < spaces_to_end; s++)
printf(" ");

}
}
}
6 changes: 4 additions & 2 deletions include/sys/fs/zfs.h
@@ -667,13 +667,15 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO "vdev_async_ind_r_histo"
#define ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO "vdev_async_ind_w_histo"
#define ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO "vdev_ind_scrub_histo"
#define ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO "vdev_ind_trim_histo"
#define ZPOOL_CONFIG_VDEV_IND_AUTOTRIM_HISTO "vdev_ind_autotrim_histo"
#define ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO "vdev_sync_agg_r_histo"
#define ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO "vdev_sync_agg_w_histo"
#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO "vdev_async_agg_r_histo"
#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo"
#define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo"
#define ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO "vdev_ind_trim_histo"
#define ZPOOL_CONFIG_VDEV_IND_AUTOTRIM_HISTO "vdev_ind_autotrim_histo"
#define ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO "vdev_agg_trim_histo"
#define ZPOOL_CONFIG_VDEV_AGG_AUTOTRIM_HISTO "vdev_agg_autotrim_histo"

/* Number of slow IOs */
#define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios"
13 changes: 13 additions & 0 deletions man/man5/zfs-module-parameters.5
@@ -2410,6 +2410,19 @@ Flush dirty data to disk at least every N seconds (maximum txg duration)
Default value: \fB5\fR.
.RE

.sp
.ne 2
.na
\fBzfs_vdev_aggregate_trim\fR (int)
.ad
.RS 12n
Allow TRIM I/Os to be aggregated. This is normally not helpful because
the extents to be trimmed will already have been aggregated by the
metaslab. This option is provided for debugging and performance analysis.
.sp
Default value: \fB0\fR.
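.sp
For example, to enable TRIM aggregation for a test run (illustrative;
the path assumes ZFS on Linux with the zfs module loaded):
.sp
# echo 1 > /sys/module/zfs/parameters/zfs_vdev_aggregate_trim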
.RE

.sp
.ne 2
.na
25 changes: 11 additions & 14 deletions man/man8/zpool.8
@@ -811,12 +811,11 @@ Prints out a message to the console and generates a system crash dump.
.It Sy autotrim Ns = Ns Sy on Ns | Ns Sy off
When set to
.Sy on
space which has been recently freed, and is is no longer allocated
by the pool, will be periodically trimmed. This allows thinly provisioned
devices to reclaim unused blocks. This feature is supported on file vdevs via
hole punching when supported by the underlying file system, and on block
device vdevs when the underlying driver supports BLKDISCARD. The default
setting for this property is
space which has been recently freed, and is no longer allocated by the pool,
will be periodically trimmed. This allows block device vdevs which support
BLKDISCARD, such as SSDs, or file vdevs on which the underlying file system
supports hole-punching, to reclaim unused blocks. The default setting for
this property is
.Sy off .
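.Pp
For example, to enable automatic TRIM on a pool (illustrative;
.Sy tank
is a placeholder pool name):
.Bd -literal
# zpool set autotrim=on tank
.Ed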
.Pp
Please note that automatic trimming of data blocks can put significant stress
@@ -1780,11 +1779,9 @@ flag.
.It Fl r
Print request size histograms for the leaf vdev's IO. This includes
histograms of individual IOs (ind) and aggregate IOs (agg). TRIM IOs are
not aggregated and are split in to manual (man) and automatic (auto).
TRIM requests which exceed 16M in size are counted as 16M requests. These
stats can be useful for observing how well IO aggregation is working. This
stats should not be confused with the block layer requests; it's possible
these IOs will be broken up or merged before being sent to the block device.
not aggregated and requests which exceed 16M in size are counted as 16M
requests. These stats can be useful for observing how well IO aggregation
is working.
.It Fl v
Verbose statistics. Reports usage statistics for individual vdevs within the
pool, in addition to the pool-wide statistics.
@@ -2157,9 +2154,9 @@ resilver will be added to the new one.
.Op Ar device Ns ...
.Xc
Initiates an immediate on-demand TRIM operation for all of the free space in
a pool. This operation informs the underlying storage devices of all of
blocks that the pool no longer considers to be allocated. This allows thinly
provisioned storage devices to reclaim this space.
a pool. This operation informs the underlying storage devices of all blocks
in the pool which are no longer allocated and allows thinly provisioned
devices to reclaim the space.
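.Pp
For example (illustrative;
.Sy tank
is a placeholder pool name, and the
.Fl t
option to
.Nm zpool Cm status
is assumed to report TRIM progress as part of this feature):
.Bd -literal
# zpool trim tank
# zpool status -t tank
.Ed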
.Pp
A manual on-demand TRIM operation can be initiated irrespective of the
.Sy autotrim
8 changes: 4 additions & 4 deletions module/zfs/metaslab.c
@@ -1617,7 +1617,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,

/*
* The ms_trim tree is a subset of ms_allocatable which is kept
* in-core as long as the autotrim property is set. It's purpose
* in-core as long as the autotrim property is set. Its purpose
* in-core as long as the autotrim property is set. Its purpose
* is to aggregate freed ranges to facilitate efficient trimming.
*/
ms->ms_trim = range_tree_create(NULL, NULL);
@@ -2742,12 +2742,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
metaslab_load_wait(msp);

/*
* When auto-trimming is enabled then free ranges which are added
* to ms_allocatable are also be added to ms_trim. This tree is
* When auto-trimming is enabled, free ranges which are added to
* ms_allocatable are also added to ms_trim. The ms_trim tree is
* periodically consumed by the vdev_autotrim_thread() which issues
* trims for all ranges and then vacates the tree. The ms_trim tree
* can be discarded at any time with the sole consequence of recent
* frees will not be trimmed.
* frees not being trimmed.
*/
if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
12 changes: 9 additions & 3 deletions module/zfs/spa.c
@@ -133,7 +133,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
* number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
* macros. Other operations process a large amount of data; the ZTI_BATCH
* macro causes us to create a taskq oriented for throughput. Some operations
* are so high frequency and short-lived that the taskq itself can become a a
* are so high frequency and short-lived that the taskq itself can become a
* point of lock contention. The ZTI_P(#, #) macro indicates that we need an
* additional degree of parallelism specified by the number of threads per-
* taskq and the number of taskqs; when dispatching an event in this case, the
@@ -151,7 +151,7 @@ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
{ ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
{ ZTI_N(8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */
{ ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */
};

static void spa_sync_version(void *arg, dmu_tx_t *tx);
@@ -6305,7 +6305,6 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
vdev_remove_parent(cvd);
}


/*
* We don't set tvd until now because the parent we just removed
* may have been the previous top-level vdev.
@@ -6330,6 +6329,13 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
vdev_expand(tvd, txg);
}

/*
* If the 'autotrim' property is set, and the previous top-level
* device was removed and replaced by its child, the autotrim thread
* needs to be restarted on the new top-level device.
*/
vdev_autotrim_restart(spa);

vdev_config_dirty(tvd);

/*
2 changes: 1 addition & 1 deletion module/zfs/spa_stats.c
@@ -905,7 +905,7 @@ static spa_iostats_t spa_iostats_template = {
#define SPA_IOSTATS_ADD(stat, val) \
atomic_add_64(&iostats->stat.value.ui64, (val));

extern void
void
spa_iostats_trim_add(spa_t *spa, zio_priority_t priority,
uint64_t extents_written, uint64_t bytes_written,
uint64_t extents_skipped, uint64_t bytes_skipped,
6 changes: 6 additions & 0 deletions module/zfs/vdev.c
@@ -1193,6 +1193,12 @@ vdev_remove_parent(vdev_t *cvd)
*/
if (!cvd->vdev_spa->spa_autoexpand)
cvd->vdev_asize = mvd->vdev_asize;

/*
* If the pool has autotrim enabled, the autotrim thread needs
* to be stopped on the top-level vdev before it can be freed.
*/
vdev_autotrim_stop_wait(mvd);
}
cvd->vdev_id = mvd->vdev_id;
vdev_add_child(pvd, cvd);
8 changes: 8 additions & 0 deletions module/zfs/vdev_label.c
@@ -373,6 +373,14 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB],
ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB]));

fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO,
vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM],
ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM]));

fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_AUTOTRIM_HISTO,
vsx->vsx_agg_histo[ZIO_PRIORITY_AUTOTRIM],
ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_AUTOTRIM]));

/* IO delays */
fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);

13 changes: 11 additions & 2 deletions module/zfs/vdev_queue.c
@@ -204,6 +204,12 @@ int zfs_vdev_queue_depth_pct = 300;
*/
int zfs_vdev_def_queue_depth = 32;

/*
* Allow TRIM I/Os to be aggregated. This should normally not be needed since
* TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted
* by the TRIM code in vdev_trim.c.
*/
int zfs_vdev_aggregate_trim = 0;

int
vdev_queue_offset_compare(const void *x1, const void *x2)
@@ -574,9 +580,9 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)

/*
* While TRIM commands could be aggregated based on offset this
* behavior is disable until it's determined to be beneficial.
* behavior is disabled until it's determined to be beneficial.
*/
if (zio->io_type == ZIO_TYPE_TRIM)
if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
return (NULL);

first = last = zio;
@@ -945,6 +951,9 @@ vdev_queue_last_offset(vdev_t *vd)
module_param(zfs_vdev_aggregation_limit, int, 0644);
MODULE_PARM_DESC(zfs_vdev_aggregation_limit, "Max vdev I/O aggregation size");

module_param(zfs_vdev_aggregate_trim, int, 0644);
MODULE_PARM_DESC(zfs_vdev_aggregate_trim, "Allow TRIM I/O to be aggregated");

module_param(zfs_vdev_read_gap_limit, int, 0644);
MODULE_PARM_DESC(zfs_vdev_read_gap_limit, "Aggregate read I/O over gap");

2 changes: 1 addition & 1 deletion module/zfs/vdev_raidz.c
@@ -37,7 +37,7 @@
#include <sys/vdev_raidz_impl.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h> /* vdev_xlate testing */
#include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
4 changes: 3 additions & 1 deletion module/zfs/vdev_trim.c
@@ -744,7 +744,7 @@ vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)

/*
* Each (manual) trim thread is responsible for trimming the unallocated
* space for each leaf vdev as described by its top-level ms->allocable.
* space for each leaf vdev as described by its top-level ms_allocatable.
*/
static void
vdev_trim_thread(void *arg)
@@ -1064,8 +1064,10 @@ vdev_autotrim_thread(void *arg)
abd_t *deadbeef = NULL;
int shift = 0;

mutex_enter(&vd->vdev_autotrim_lock);
ASSERT3P(vd->vdev_top, ==, vd);
ASSERT3P(vd->vdev_autotrim_thread, !=, NULL);
mutex_exit(&vd->vdev_autotrim_lock);
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

/*
4 changes: 2 additions & 2 deletions tests/zfs-tests/tests/functional/trim/autotrim_config.ksh
@@ -55,8 +55,8 @@ function cleanup

log_must rm -f $TRIM_VDEVS

log_must set_tunable64 zfs_trim_extent_bytes_min $trim_extent_bytes_min
log_must set_tunable64 zfs_trim_txg_batch $trim_txg_batch
log_must set_tunable64 zfs_trim_extent_bytes_min $trim_extent_bytes_min
log_must set_tunable64 zfs_trim_txg_batch $trim_txg_batch
log_must set_tunable64 zfs_vdev_min_ms_count $vdev_min_ms_count
}
log_onexit cleanup
