Skip to content

Commit 7ef5e54

Browse files
Adam Leventhal authored and behlendorf committed
Illumos #3581 spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE]->tq_lock contention
3581 spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE]->tq_lock is piping hot Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Christopher Siden <christopher.siden@delphix.com> Reviewed by: Gordon Ross <gordon.ross@nexenta.com> Approved by: Richard Lowe <richlowe@richlowe.net> References: illumos/illumos-gate@ec94d32 https://illumos.org/issues/3581 Notes for Linux port: Earlier commit 08d08eb reduced contention on this taskq lock by simply reducing the number of z_fr_iss threads from 100 to one-per-CPU. We also optimized the taskq implementation in openzfs/spl@3c6ed54. These changes significantly improved unlink performance to acceptable levels. This patch further reduces time spent spinning on this lock by randomly dispatching the work items over multiple independent task queues. The Illumos ZFS developers stated that this lock contention only arose after "3329 spa_sync() spends 10-20% of its time in spa_free_sync_cb()" was landed. It's not clear if 3329 affects the Linux port or not. I didn't see spa_free_sync_cb() show up in oprofile sessions while unlinking large files, but I may just not have used the right test case. I tested unlinking a 1 TB of data with and without the patch and didn't observe a meaningful difference in elapsed time. However, oprofile showed that the percent time spent in taskq_thread() was reduced from about 16% to about 5%. Aside from a possible slight performance benefit this may be worth landing if only for the sake of maintaining consistency with upstream. Ported-by: Ned Bass <bass6@llnl.gov> Closes #1327
1 parent 55d85d5 commit 7ef5e54

File tree

4 files changed

+170
-79
lines changed

4 files changed

+170
-79
lines changed

include/sys/spa_impl.h

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -80,16 +80,16 @@ typedef struct spa_config_dirent {
8080
char *scd_path;
8181
} spa_config_dirent_t;
8282

83-
enum zio_taskq_type {
83+
typedef enum zio_taskq_type {
8484
ZIO_TASKQ_ISSUE = 0,
8585
ZIO_TASKQ_ISSUE_HIGH,
8686
ZIO_TASKQ_INTERRUPT,
8787
ZIO_TASKQ_INTERRUPT_HIGH,
8888
ZIO_TASKQ_TYPES
89-
};
89+
} zio_taskq_type_t;
9090

9191
/*
92-
* State machine for the zpool-pooname process. The states transitions
92+
* State machine for the zpool-poolname process. The states transitions
9393
* are done as follows:
9494
*
9595
* From To Routine
@@ -107,6 +107,11 @@ typedef enum spa_proc_state {
107107
SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */
108108
} spa_proc_state_t;
109109

110+
typedef struct spa_taskqs {
111+
uint_t stqs_count;
112+
taskq_t **stqs_taskq;
113+
} spa_taskqs_t;
114+
110115
struct spa {
111116
/*
112117
* Fields protected by spa_namespace_lock.
@@ -125,7 +130,7 @@ struct spa {
125130
uint8_t spa_sync_on; /* sync threads are running */
126131
spa_load_state_t spa_load_state; /* current load operation */
127132
uint64_t spa_import_flags; /* import specific flags */
128-
taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
133+
spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
129134
dsl_pool_t *spa_dsl_pool;
130135
boolean_t spa_is_initializing; /* true while opening pool */
131136
metaslab_class_t *spa_normal_class; /* normal data class */
@@ -243,6 +248,9 @@ struct spa {
243248

244249
extern char *spa_config_path;
245250

251+
extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
252+
task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent);
253+
246254
#ifdef __cplusplus
247255
}
248256
#endif

module/zfs/spa.c

Lines changed: 140 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -78,41 +78,56 @@
7878
#include "zfs_comutil.h"
7979

8080
typedef enum zti_modes {
81-
zti_mode_fixed, /* value is # of threads (min 1) */
82-
zti_mode_online_percent, /* value is % of online CPUs */
83-
zti_mode_batch, /* cpu-intensive; value is ignored */
84-
zti_mode_null, /* don't create a taskq */
85-
zti_nmodes
81+
ZTI_MODE_FIXED, /* value is # of threads (min 1) */
82+
ZTI_MODE_ONLINE_PERCENT, /* value is % of online CPUs */
83+
ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
84+
ZTI_MODE_NULL, /* don't create a taskq */
85+
ZTI_NMODES
8686
} zti_modes_t;
8787

88-
#define ZTI_FIX(n) { zti_mode_fixed, (n) }
89-
#define ZTI_PCT(n) { zti_mode_online_percent, (n) }
90-
#define ZTI_BATCH { zti_mode_batch, 0 }
91-
#define ZTI_NULL { zti_mode_null, 0 }
88+
#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
89+
#define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
90+
#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
91+
#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
9292

93-
#define ZTI_ONE ZTI_FIX(1)
93+
#define ZTI_N(n) ZTI_P(n, 1)
94+
#define ZTI_ONE ZTI_N(1)
9495

9596
typedef struct zio_taskq_info {
96-
enum zti_modes zti_mode;
97+
zti_modes_t zti_mode;
9798
uint_t zti_value;
99+
uint_t zti_count;
98100
} zio_taskq_info_t;
99101

100102
static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
101103
"iss", "iss_h", "int", "int_h"
102104
};
103105

104106
/*
105-
* Define the taskq threads for the following I/O types:
106-
* NULL, READ, WRITE, FREE, CLAIM, and IOCTL
107+
* This table defines the taskq settings for each ZFS I/O type. When
108+
* initializing a pool, we use this table to create an appropriately sized
109+
* taskq. Some operations are low volume and therefore have a small, static
110+
* number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
111+
* macros. Other operations process a large amount of data; the ZTI_BATCH
112+
* macro causes us to create a taskq oriented for throughput. Some operations
113+
* are so high frequency and short-lived that the taskq itself can become a
114+
* point of lock contention. The ZTI_P(#, #) macro indicates that we need an
115+
* additional degree of parallelism specified by the number of threads per-
116+
* taskq and the number of taskqs; when dispatching an event in this case, the
117+
* particular taskq is chosen at random.
118+
*
119+
* The different taskq priorities are to handle the different contexts (issue
120+
* and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
121+
* need to be handled with minimum delay.
107122
*/
108123
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
109124
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
110-
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
111-
{ ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL },
112-
{ ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(16), ZTI_FIX(5) },
113-
{ ZTI_FIX(8), ZTI_NULL, ZTI_ONE, ZTI_NULL },
114-
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
115-
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
125+
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
126+
{ ZTI_N(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, /* READ */
127+
{ ZTI_BATCH, ZTI_N(5), ZTI_N(16), ZTI_N(5) }, /* WRITE */
128+
{ ZTI_P(4, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
129+
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
130+
{ ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
116131
};
117132

118133
static dsl_syncfunc_t spa_sync_version;
@@ -794,48 +809,121 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
794809
offsetof(spa_error_entry_t, se_avl));
795810
}
796811

797-
static taskq_t *
798-
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
799-
uint_t value)
812+
static void
813+
spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
800814
{
801-
uint_t flags = TASKQ_PREPOPULATE;
815+
const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
816+
enum zti_modes mode = ztip->zti_mode;
817+
uint_t value = ztip->zti_value;
818+
uint_t count = ztip->zti_count;
819+
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
820+
char name[32];
821+
uint_t i, flags = 0;
802822
boolean_t batch = B_FALSE;
803823

804-
switch (mode) {
805-
case zti_mode_null:
806-
return (NULL); /* no taskq needed */
824+
if (mode == ZTI_MODE_NULL) {
825+
tqs->stqs_count = 0;
826+
tqs->stqs_taskq = NULL;
827+
return;
828+
}
807829

808-
case zti_mode_fixed:
809-
ASSERT3U(value, >=, 1);
810-
value = MAX(value, 1);
811-
break;
830+
ASSERT3U(count, >, 0);
812831

813-
case zti_mode_batch:
814-
batch = B_TRUE;
815-
flags |= TASKQ_THREADS_CPU_PCT;
816-
value = zio_taskq_batch_pct;
817-
break;
832+
tqs->stqs_count = count;
833+
tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
818834

819-
case zti_mode_online_percent:
820-
flags |= TASKQ_THREADS_CPU_PCT;
821-
break;
835+
for (i = 0; i < count; i++) {
836+
taskq_t *tq;
822837

823-
default:
824-
panic("unrecognized mode for %s taskq (%u:%u) in "
825-
"spa_activate()",
826-
name, mode, value);
827-
break;
838+
switch (mode) {
839+
case ZTI_MODE_FIXED:
840+
ASSERT3U(value, >=, 1);
841+
value = MAX(value, 1);
842+
break;
843+
844+
case ZTI_MODE_BATCH:
845+
batch = B_TRUE;
846+
flags |= TASKQ_THREADS_CPU_PCT;
847+
value = zio_taskq_batch_pct;
848+
break;
849+
850+
case ZTI_MODE_ONLINE_PERCENT:
851+
flags |= TASKQ_THREADS_CPU_PCT;
852+
break;
853+
854+
default:
855+
panic("unrecognized mode for %s_%s taskq (%u:%u) in "
856+
"spa_activate()",
857+
zio_type_name[t], zio_taskq_types[q], mode, value);
858+
break;
859+
}
860+
861+
if (count > 1) {
862+
(void) snprintf(name, sizeof (name), "%s_%s_%u",
863+
zio_type_name[t], zio_taskq_types[q], i);
864+
} else {
865+
(void) snprintf(name, sizeof (name), "%s_%s",
866+
zio_type_name[t], zio_taskq_types[q]);
867+
}
868+
869+
if (zio_taskq_sysdc && spa->spa_proc != &p0) {
870+
if (batch)
871+
flags |= TASKQ_DC_BATCH;
872+
873+
tq = taskq_create_sysdc(name, value, 50, INT_MAX,
874+
spa->spa_proc, zio_taskq_basedc, flags);
875+
} else {
876+
tq = taskq_create_proc(name, value, maxclsyspri, 50,
877+
INT_MAX, spa->spa_proc, flags);
878+
}
879+
880+
tqs->stqs_taskq[i] = tq;
828881
}
882+
}
883+
884+
static void
885+
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
886+
{
887+
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
888+
uint_t i;
829889

830-
if (zio_taskq_sysdc && spa->spa_proc != &p0) {
831-
if (batch)
832-
flags |= TASKQ_DC_BATCH;
890+
if (tqs->stqs_taskq == NULL) {
891+
ASSERT3U(tqs->stqs_count, ==, 0);
892+
return;
893+
}
833894

834-
return (taskq_create_sysdc(name, value, 50, INT_MAX,
835-
spa->spa_proc, zio_taskq_basedc, flags));
895+
for (i = 0; i < tqs->stqs_count; i++) {
896+
ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
897+
taskq_destroy(tqs->stqs_taskq[i]);
836898
}
837-
return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
838-
spa->spa_proc, flags));
899+
900+
kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
901+
tqs->stqs_taskq = NULL;
902+
}
903+
904+
/*
905+
* Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
906+
* Note that a type may have multiple discrete taskqs to avoid lock contention
907+
* on the taskq itself. In that case we choose which taskq at random by using
908+
* the low bits of gethrtime().
909+
*/
910+
void
911+
spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
912+
task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
913+
{
914+
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
915+
taskq_t *tq;
916+
917+
ASSERT3P(tqs->stqs_taskq, !=, NULL);
918+
ASSERT3U(tqs->stqs_count, !=, 0);
919+
920+
if (tqs->stqs_count == 1) {
921+
tq = tqs->stqs_taskq[0];
922+
} else {
923+
tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
924+
}
925+
926+
taskq_dispatch_ent(tq, func, arg, flags, ent);
839927
}
840928

841929
static void
@@ -845,16 +933,7 @@ spa_create_zio_taskqs(spa_t *spa)
845933

846934
for (t = 0; t < ZIO_TYPES; t++) {
847935
for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
848-
const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
849-
enum zti_modes mode = ztip->zti_mode;
850-
uint_t value = ztip->zti_value;
851-
char name[32];
852-
853-
(void) snprintf(name, sizeof (name),
854-
"%s_%s", zio_type_name[t], zio_taskq_types[q]);
855-
856-
spa->spa_zio_taskq[t][q] =
857-
spa_taskq_create(spa, name, mode, value);
936+
spa_taskqs_init(spa, t, q);
858937
}
859938
}
860939
}
@@ -1017,9 +1096,7 @@ spa_deactivate(spa_t *spa)
10171096

10181097
for (t = 0; t < ZIO_TYPES; t++) {
10191098
for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
1020-
if (spa->spa_zio_taskq[t][q] != NULL)
1021-
taskq_destroy(spa->spa_zio_taskq[t][q]);
1022-
spa->spa_zio_taskq[t][q] = NULL;
1099+
spa_taskqs_fini(spa, t, q);
10231100
}
10241101
}
10251102

module/zfs/vdev_file.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ vdev_file_io_start(zio_t *zio)
185185
return (ZIO_PIPELINE_CONTINUE);
186186
}
187187

188-
taskq_dispatch_ent(spa->spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE],
188+
spa_taskq_dispatch_ent(spa, ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE,
189189
vdev_file_io_strategy, zio, 0, &zio->io_tqent);
190190

191191
return (ZIO_PIPELINE_STOP);

module/zfs/zio.c

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1165,7 +1165,7 @@ zio_free_bp_init(zio_t *zio)
11651165
*/
11661166

11671167
static void
1168-
zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
1168+
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
11691169
{
11701170
spa_t *spa = zio->io_spa;
11711171
zio_type_t t = zio->io_type;
@@ -1186,10 +1186,11 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
11861186
t = ZIO_TYPE_NULL;
11871187

11881188
/*
1189-
* If this is a high priority I/O, then use the high priority taskq.
1189+
* If this is a high priority I/O, then use the high priority taskq if
1190+
* available.
11901191
*/
11911192
if (zio->io_priority == ZIO_PRIORITY_NOW &&
1192-
spa->spa_zio_taskq[t][q + 1] != NULL)
1193+
spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
11931194
q++;
11941195

11951196
ASSERT3U(q, <, ZIO_TASKQ_TYPES);
@@ -1200,20 +1201,25 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
12001201
* to dispatch the zio to another taskq at the same time.
12011202
*/
12021203
ASSERT(taskq_empty_ent(&zio->io_tqent));
1203-
taskq_dispatch_ent(spa->spa_zio_taskq[t][q],
1204-
(task_func_t *)zio_execute, zio, flags, &zio->io_tqent);
1204+
spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
1205+
flags, &zio->io_tqent);
12051206
}
12061207

12071208
static boolean_t
1208-
zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
1209+
zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
12091210
{
12101211
kthread_t *executor = zio->io_executor;
12111212
spa_t *spa = zio->io_spa;
12121213
zio_type_t t;
12131214

1214-
for (t = 0; t < ZIO_TYPES; t++)
1215-
if (taskq_member(spa->spa_zio_taskq[t][q], executor))
1216-
return (B_TRUE);
1215+
for (t = 0; t < ZIO_TYPES; t++) {
1216+
spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1217+
uint_t i;
1218+
for (i = 0; i < tqs->stqs_count; i++) {
1219+
if (taskq_member(tqs->stqs_taskq[i], executor))
1220+
return (B_TRUE);
1221+
}
1222+
}
12171223

12181224
return (B_FALSE);
12191225
}
@@ -3078,8 +3084,8 @@ zio_done(zio_t *zio)
30783084
* Hand it off to the otherwise-unused claim taskq.
30793085
*/
30803086
ASSERT(taskq_empty_ent(&zio->io_tqent));
3081-
taskq_dispatch_ent(
3082-
zio->io_spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
3087+
spa_taskq_dispatch_ent(zio->io_spa,
3088+
ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
30833089
(task_func_t *)zio_reexecute, zio, 0,
30843090
&zio->io_tqent);
30853091
}

0 commit comments

Comments
 (0)