Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Illumos #3581 spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE]->tq_lock …
…contention

3581 spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE]->tq_lock is piping hot

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
Approved by: Richard Lowe <richlowe@richlowe.net>

References:
  illumos/illumos-gate@ec94d32
  https://illumos.org/issues/3581

Notes for Linux port:

Earlier commit 08d08eb reduced contention on this taskq lock by simply
reducing the number of z_fr_iss threads from 100 to one-per-CPU.  We
also optimized the taskq implementation in openzfs/spl@3c6ed54.
These changes significantly improved unlink performance to acceptable
levels.

This patch further reduces time spent spinning on this lock by
randomly dispatching the work items over multiple independent task
queues.  The Illumos ZFS developers stated that this lock contention
only arose after "3329 spa_sync() spends 10-20% of its time in
spa_free_sync_cb()" was landed.  It's not clear if 3329 affects the
Linux port or not.  I didn't see spa_free_sync_cb() show up in
oprofile sessions while unlinking large files, but I may just not
have used the right test case.

I tested unlinking a 1 TB of data with and without the patch and
didn't observe a meaningful difference in elapsed time.  However,
oprofile showed that the percent time spent in taskq_thread() was
reduced from about 16% to about 5%.  Aside from a possible slight
performance benefit this may be worth landing if only for the sake of
maintaining consistency with upstream.

Ported-by: Ned Bass <bass6@llnl.gov>
Closes #1327
  • Loading branch information
Adam Leventhal authored and behlendorf committed May 6, 2013
1 parent 55d85d5 commit 7ef5e54
Show file tree
Hide file tree
Showing 4 changed files with 170 additions and 79 deletions.
16 changes: 12 additions & 4 deletions include/sys/spa_impl.h
Expand Up @@ -80,16 +80,16 @@ typedef struct spa_config_dirent {
char *scd_path;
} spa_config_dirent_t;

enum zio_taskq_type {
typedef enum zio_taskq_type {
ZIO_TASKQ_ISSUE = 0,
ZIO_TASKQ_ISSUE_HIGH,
ZIO_TASKQ_INTERRUPT,
ZIO_TASKQ_INTERRUPT_HIGH,
ZIO_TASKQ_TYPES
};
} zio_taskq_type_t;

/*
* State machine for the zpool-pooname process. The states transitions
* State machine for the zpool-poolname process. The states transitions
* are done as follows:
*
* From To Routine
Expand All @@ -107,6 +107,11 @@ typedef enum spa_proc_state {
SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */
} spa_proc_state_t;

typedef struct spa_taskqs {
uint_t stqs_count;
taskq_t **stqs_taskq;
} spa_taskqs_t;

struct spa {
/*
* Fields protected by spa_namespace_lock.
Expand All @@ -125,7 +130,7 @@ struct spa {
uint8_t spa_sync_on; /* sync threads are running */
spa_load_state_t spa_load_state; /* current load operation */
uint64_t spa_import_flags; /* import specific flags */
taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
dsl_pool_t *spa_dsl_pool;
boolean_t spa_is_initializing; /* true while opening pool */
metaslab_class_t *spa_normal_class; /* normal data class */
Expand Down Expand Up @@ -243,6 +248,9 @@ struct spa {

extern char *spa_config_path;

extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent);

#ifdef __cplusplus
}
#endif
Expand Down
203 changes: 140 additions & 63 deletions module/zfs/spa.c
Expand Up @@ -78,41 +78,56 @@
#include "zfs_comutil.h"

typedef enum zti_modes {
zti_mode_fixed, /* value is # of threads (min 1) */
zti_mode_online_percent, /* value is % of online CPUs */
zti_mode_batch, /* cpu-intensive; value is ignored */
zti_mode_null, /* don't create a taskq */
zti_nmodes
ZTI_MODE_FIXED, /* value is # of threads (min 1) */
ZTI_MODE_ONLINE_PERCENT, /* value is % of online CPUs */
ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
ZTI_MODE_NULL, /* don't create a taskq */
ZTI_NMODES
} zti_modes_t;

#define ZTI_FIX(n) { zti_mode_fixed, (n) }
#define ZTI_PCT(n) { zti_mode_online_percent, (n) }
#define ZTI_BATCH { zti_mode_batch, 0 }
#define ZTI_NULL { zti_mode_null, 0 }
#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
#define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }

#define ZTI_ONE ZTI_FIX(1)
#define ZTI_N(n) ZTI_P(n, 1)
#define ZTI_ONE ZTI_N(1)

typedef struct zio_taskq_info {
enum zti_modes zti_mode;
zti_modes_t zti_mode;
uint_t zti_value;
uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
"iss", "iss_h", "int", "int_h"
};

/*
* Define the taskq threads for the following I/O types:
* NULL, READ, WRITE, FREE, CLAIM, and IOCTL
* This table defines the taskq settings for each ZFS I/O type. When
* initializing a pool, we use this table to create an appropriately sized
* taskq. Some operations are low volume and therefore have a small, static
* number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
* macros. Other operations process a large amount of data; the ZTI_BATCH
* macro causes us to create a taskq oriented for throughput. Some operations
* are so high frequency and short-lived that the taskq itself can become a a
* point of lock contention. The ZTI_P(#, #) macro indicates that we need an
* additional degree of parallelism specified by the number of threads per-
* taskq and the number of taskqs; when dispatching an event in this case, the
* particular taskq is chosen at random.
*
* The different taskq priorities are to handle the different contexts (issue
* and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
* need to be handled with minimum delay.
*/
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
{ ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL },
{ ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(16), ZTI_FIX(5) },
{ ZTI_FIX(8), ZTI_NULL, ZTI_ONE, ZTI_NULL },
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
{ ZTI_N(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, /* READ */
{ ZTI_BATCH, ZTI_N(5), ZTI_N(16), ZTI_N(5) }, /* WRITE */
{ ZTI_P(4, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
};

static dsl_syncfunc_t spa_sync_version;
Expand Down Expand Up @@ -794,48 +809,121 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
offsetof(spa_error_entry_t, se_avl));
}

static taskq_t *
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
uint_t value)
static void
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
uint_t flags = TASKQ_PREPOPULATE;
const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
enum zti_modes mode = ztip->zti_mode;
uint_t value = ztip->zti_value;
uint_t count = ztip->zti_count;
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
char name[32];
uint_t i, flags = 0;
boolean_t batch = B_FALSE;

switch (mode) {
case zti_mode_null:
return (NULL); /* no taskq needed */
if (mode == ZTI_MODE_NULL) {
tqs->stqs_count = 0;
tqs->stqs_taskq = NULL;
return;
}

case zti_mode_fixed:
ASSERT3U(value, >=, 1);
value = MAX(value, 1);
break;
ASSERT3U(count, >, 0);

case zti_mode_batch:
batch = B_TRUE;
flags |= TASKQ_THREADS_CPU_PCT;
value = zio_taskq_batch_pct;
break;
tqs->stqs_count = count;
tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

case zti_mode_online_percent:
flags |= TASKQ_THREADS_CPU_PCT;
break;
for (i = 0; i < count; i++) {
taskq_t *tq;

default:
panic("unrecognized mode for %s taskq (%u:%u) in "
"spa_activate()",
name, mode, value);
break;
switch (mode) {
case ZTI_MODE_FIXED:
ASSERT3U(value, >=, 1);
value = MAX(value, 1);
break;

case ZTI_MODE_BATCH:
batch = B_TRUE;
flags |= TASKQ_THREADS_CPU_PCT;
value = zio_taskq_batch_pct;
break;

case ZTI_MODE_ONLINE_PERCENT:
flags |= TASKQ_THREADS_CPU_PCT;
break;

default:
panic("unrecognized mode for %s_%s taskq (%u:%u) in "
"spa_activate()",
zio_type_name[t], zio_taskq_types[q], mode, value);
break;
}

if (count > 1) {
(void) snprintf(name, sizeof (name), "%s_%s_%u",
zio_type_name[t], zio_taskq_types[q], i);
} else {
(void) snprintf(name, sizeof (name), "%s_%s",
zio_type_name[t], zio_taskq_types[q]);
}

if (zio_taskq_sysdc && spa->spa_proc != &p0) {
if (batch)
flags |= TASKQ_DC_BATCH;

tq = taskq_create_sysdc(name, value, 50, INT_MAX,
spa->spa_proc, zio_taskq_basedc, flags);
} else {
tq = taskq_create_proc(name, value, maxclsyspri, 50,
INT_MAX, spa->spa_proc, flags);
}

tqs->stqs_taskq[i] = tq;
}
}

static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
uint_t i;

if (zio_taskq_sysdc && spa->spa_proc != &p0) {
if (batch)
flags |= TASKQ_DC_BATCH;
if (tqs->stqs_taskq == NULL) {
ASSERT3U(tqs->stqs_count, ==, 0);
return;
}

return (taskq_create_sysdc(name, value, 50, INT_MAX,
spa->spa_proc, zio_taskq_basedc, flags));
for (i = 0; i < tqs->stqs_count; i++) {
ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
taskq_destroy(tqs->stqs_taskq[i]);
}
return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
spa->spa_proc, flags));

kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
tqs->stqs_taskq = NULL;
}

/*
* Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
* Note that a type may have multiple discrete taskqs to avoid lock contention
* on the taskq itself. In that case we choose which taskq at random by using
* the low bits of gethrtime().
*/
void
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
{
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
taskq_t *tq;

ASSERT3P(tqs->stqs_taskq, !=, NULL);
ASSERT3U(tqs->stqs_count, !=, 0);

if (tqs->stqs_count == 1) {
tq = tqs->stqs_taskq[0];
} else {
tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
}

taskq_dispatch_ent(tq, func, arg, flags, ent);
}

static void
Expand All @@ -845,16 +933,7 @@ spa_create_zio_taskqs(spa_t *spa)

for (t = 0; t < ZIO_TYPES; t++) {
for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
enum zti_modes mode = ztip->zti_mode;
uint_t value = ztip->zti_value;
char name[32];

(void) snprintf(name, sizeof (name),
"%s_%s", zio_type_name[t], zio_taskq_types[q]);

spa->spa_zio_taskq[t][q] =
spa_taskq_create(spa, name, mode, value);
spa_taskqs_init(spa, t, q);
}
}
}
Expand Down Expand Up @@ -1017,9 +1096,7 @@ spa_deactivate(spa_t *spa)

for (t = 0; t < ZIO_TYPES; t++) {
for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
if (spa->spa_zio_taskq[t][q] != NULL)
taskq_destroy(spa->spa_zio_taskq[t][q]);
spa->spa_zio_taskq[t][q] = NULL;
spa_taskqs_fini(spa, t, q);
}
}

Expand Down
2 changes: 1 addition & 1 deletion module/zfs/vdev_file.c
Expand Up @@ -185,7 +185,7 @@ vdev_file_io_start(zio_t *zio)
return (ZIO_PIPELINE_CONTINUE);
}

taskq_dispatch_ent(spa->spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE],
spa_taskq_dispatch_ent(spa, ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE,
vdev_file_io_strategy, zio, 0, &zio->io_tqent);

return (ZIO_PIPELINE_STOP);
Expand Down
28 changes: 17 additions & 11 deletions module/zfs/zio.c
Expand Up @@ -1165,7 +1165,7 @@ zio_free_bp_init(zio_t *zio)
*/

static void
zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{
spa_t *spa = zio->io_spa;
zio_type_t t = zio->io_type;
Expand All @@ -1186,10 +1186,11 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
t = ZIO_TYPE_NULL;

/*
* If this is a high priority I/O, then use the high priority taskq.
* If this is a high priority I/O, then use the high priority taskq if
* available.
*/
if (zio->io_priority == ZIO_PRIORITY_NOW &&
spa->spa_zio_taskq[t][q + 1] != NULL)
spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
q++;

ASSERT3U(q, <, ZIO_TASKQ_TYPES);
Expand All @@ -1200,20 +1201,25 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
* to dispatch the zio to another taskq at the same time.
*/
ASSERT(taskq_empty_ent(&zio->io_tqent));
taskq_dispatch_ent(spa->spa_zio_taskq[t][q],
(task_func_t *)zio_execute, zio, flags, &zio->io_tqent);
spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
flags, &zio->io_tqent);
}

static boolean_t
zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
{
kthread_t *executor = zio->io_executor;
spa_t *spa = zio->io_spa;
zio_type_t t;

for (t = 0; t < ZIO_TYPES; t++)
if (taskq_member(spa->spa_zio_taskq[t][q], executor))
return (B_TRUE);
for (t = 0; t < ZIO_TYPES; t++) {
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
uint_t i;
for (i = 0; i < tqs->stqs_count; i++) {
if (taskq_member(tqs->stqs_taskq[i], executor))
return (B_TRUE);
}
}

return (B_FALSE);
}
Expand Down Expand Up @@ -3078,8 +3084,8 @@ zio_done(zio_t *zio)
* Hand it off to the otherwise-unused claim taskq.
*/
ASSERT(taskq_empty_ent(&zio->io_tqent));
taskq_dispatch_ent(
zio->io_spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
spa_taskq_dispatch_ent(zio->io_spa,
ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
(task_func_t *)zio_reexecute, zio, 0,
&zio->io_tqent);
}
Expand Down

0 comments on commit 7ef5e54

Please sign in to comment.