Skip to content
This repository
Browse code

Illumos #3581 spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE]->tq_lock …

…contention

3581 spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE]->tq_lock is piping hot

Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Gordon Ross <gordon.ross@nexenta.com>
Approved by: Richard Lowe <richlowe@richlowe.net>

References:
  illumos/illumos-gate@ec94d32
  https://illumos.org/issues/3581

Notes for Linux port:

Earlier commit 08d08eb reduced contention on this taskq lock by simply
reducing the number of z_fr_iss threads from 100 to one-per-CPU.  We
also optimized the taskq implementation in zfsonlinux/spl@3c6ed54.
These changes significantly improved unlink performance to acceptable
levels.

This patch further reduces time spent spinning on this lock by
randomly dispatching the work items over multiple independent task
queues.  The Illumos ZFS developers stated that this lock contention
only arose after "3329 spa_sync() spends 10-20% of its time in
spa_free_sync_cb()" was landed.  It's not clear if 3329 affects the
Linux port or not.  I didn't see spa_free_sync_cb() show up in
oprofile sessions while unlinking large files, but I may just not
have used the right test case.

I tested unlinking 1 TB of data with and without the patch and
didn't observe a meaningful difference in elapsed time.  However,
oprofile showed that the percent time spent in taskq_thread() was
reduced from about 16% to about 5%.  Aside from a possible slight
performance benefit, this may be worth landing if only for the sake of
maintaining consistency with upstream.

Ported-by: Ned Bass <bass6@llnl.gov>
Closes #1327
  • Loading branch information...
commit 7ef5e54e2e28884a04dc800657967b891239e933 1 parent 55d85d5
authored May 06, 2013 behlendorf committed May 06, 2013
16  include/sys/spa_impl.h
@@ -80,16 +80,16 @@ typedef struct spa_config_dirent {
80 80
 	char		*scd_path;
81 81
 } spa_config_dirent_t;
82 82
 
83  
-enum zio_taskq_type {
  83
+typedef enum zio_taskq_type {
84 84
 	ZIO_TASKQ_ISSUE = 0,
85 85
 	ZIO_TASKQ_ISSUE_HIGH,
86 86
 	ZIO_TASKQ_INTERRUPT,
87 87
 	ZIO_TASKQ_INTERRUPT_HIGH,
88 88
 	ZIO_TASKQ_TYPES
89  
-};
  89
+} zio_taskq_type_t;
90 90
 
91 91
 /*
92  
- * State machine for the zpool-pooname process.  The states transitions
  92
+ * State machine for the zpool-poolname process.  The state transitions
93 93
  * are done as follows:
94 94
  *
95 95
  *	From		   To			Routine
@@ -107,6 +107,11 @@ typedef enum spa_proc_state {
107 107
 	SPA_PROC_GONE		/* spa_thread() is exiting, spa_proc = &p0 */
108 108
 } spa_proc_state_t;
109 109
 
  110
+typedef struct spa_taskqs {
  111
+	uint_t stqs_count;
  112
+	taskq_t **stqs_taskq;
  113
+} spa_taskqs_t;
  114
+
110 115
 struct spa {
111 116
 	/*
112 117
 	 * Fields protected by spa_namespace_lock.
@@ -125,7 +130,7 @@ struct spa {
125 130
 	uint8_t		spa_sync_on;		/* sync threads are running */
126 131
 	spa_load_state_t spa_load_state;	/* current load operation */
127 132
 	uint64_t	spa_import_flags;	/* import specific flags */
128  
-	taskq_t		*spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
  133
+	spa_taskqs_t	spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
129 134
 	dsl_pool_t	*spa_dsl_pool;
130 135
 	boolean_t	spa_is_initializing;	/* true while opening pool */
131 136
 	metaslab_class_t *spa_normal_class;	/* normal data class */
@@ -243,6 +248,9 @@ struct spa {
243 248
 
244 249
 extern char *spa_config_path;
245 250
 
  251
+extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
  252
+    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent);
  253
+
246 254
 #ifdef	__cplusplus
247 255
 }
248 256
 #endif
203  module/zfs/spa.c
@@ -78,23 +78,25 @@
78 78
 #include "zfs_comutil.h"
79 79
 
80 80
 typedef enum zti_modes {
81  
-	zti_mode_fixed,			/* value is # of threads (min 1) */
82  
-	zti_mode_online_percent,	/* value is % of online CPUs */
83  
-	zti_mode_batch,			/* cpu-intensive; value is ignored */
84  
-	zti_mode_null,			/* don't create a taskq */
85  
-	zti_nmodes
  81
+	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
  82
+	ZTI_MODE_ONLINE_PERCENT,	/* value is % of online CPUs */
  83
+	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
  84
+	ZTI_MODE_NULL,			/* don't create a taskq */
  85
+	ZTI_NMODES
86 86
 } zti_modes_t;
87 87
 
88  
-#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
89  
-#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
90  
-#define	ZTI_BATCH	{ zti_mode_batch, 0 }
91  
-#define	ZTI_NULL	{ zti_mode_null, 0 }
  88
+#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
  89
+#define	ZTI_PCT(n)	{ ZTI_MODE_ONLINE_PERCENT, (n), 1 }
  90
+#define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
  91
+#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }
92 92
 
93  
-#define	ZTI_ONE		ZTI_FIX(1)
  93
+#define	ZTI_N(n)	ZTI_P(n, 1)
  94
+#define	ZTI_ONE		ZTI_N(1)
94 95
 
95 96
 typedef struct zio_taskq_info {
96  
-	enum zti_modes zti_mode;
  97
+	zti_modes_t zti_mode;
97 98
 	uint_t zti_value;
  99
+	uint_t zti_count;
98 100
 } zio_taskq_info_t;
99 101
 
100 102
 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
@@ -102,17 +104,30 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
102 104
 };
103 105
 
104 106
 /*
105  
- * Define the taskq threads for the following I/O types:
106  
- * 	NULL, READ, WRITE, FREE, CLAIM, and IOCTL
  107
+ * This table defines the taskq settings for each ZFS I/O type. When
  108
+ * initializing a pool, we use this table to create an appropriately sized
  109
+ * taskq. Some operations are low volume and therefore have a small, static
  110
+ * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
  111
+ * macros. Other operations process a large amount of data; the ZTI_BATCH
  112
+ * macro causes us to create a taskq oriented for throughput. Some operations
  113
+ * are so high frequency and short-lived that the taskq itself can become a
  114
+ * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
  115
+ * additional degree of parallelism specified by the number of threads per-
  116
+ * taskq and the number of taskqs; when dispatching an event in this case, the
  117
+ * particular taskq is chosen at random.
  118
+ *
  119
+ * The different taskq priorities are to handle the different contexts (issue
  120
+ * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
  121
+ * need to be handled with minimum delay.
107 122
  */
108 123
 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
109 124
 	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
110  
-	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
111  
-	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },
112  
-	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(16),	ZTI_FIX(5) },
113  
-	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
114  
-	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
115  
-	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
  125
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
  126
+	{ ZTI_N(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL }, /* READ */
  127
+	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(16),	ZTI_N(5) }, /* WRITE */
  128
+	{ ZTI_P(4, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
  129
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
  130
+	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
116 131
 };
117 132
 
118 133
 static dsl_syncfunc_t spa_sync_version;
@@ -794,48 +809,121 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
794 809
 	    offsetof(spa_error_entry_t, se_avl));
795 810
 }
796 811
 
797  
-static taskq_t *
798  
-spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
799  
-    uint_t value)
  812
+static void
  813
+spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
800 814
 {
801  
-	uint_t flags = TASKQ_PREPOPULATE;
  815
+	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
  816
+	enum zti_modes mode = ztip->zti_mode;
  817
+	uint_t value = ztip->zti_value;
  818
+	uint_t count = ztip->zti_count;
  819
+	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
  820
+	char name[32];
  821
+	uint_t i, flags = 0;
802 822
 	boolean_t batch = B_FALSE;
803 823
 
804  
-	switch (mode) {
805  
-	case zti_mode_null:
806  
-		return (NULL);		/* no taskq needed */
  824
+	if (mode == ZTI_MODE_NULL) {
  825
+		tqs->stqs_count = 0;
  826
+		tqs->stqs_taskq = NULL;
  827
+		return;
  828
+	}
807 829
 
808  
-	case zti_mode_fixed:
809  
-		ASSERT3U(value, >=, 1);
810  
-		value = MAX(value, 1);
811  
-		break;
  830
+	ASSERT3U(count, >, 0);
812 831
 
813  
-	case zti_mode_batch:
814  
-		batch = B_TRUE;
815  
-		flags |= TASKQ_THREADS_CPU_PCT;
816  
-		value = zio_taskq_batch_pct;
817  
-		break;
  832
+	tqs->stqs_count = count;
  833
+	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
818 834
 
819  
-	case zti_mode_online_percent:
820  
-		flags |= TASKQ_THREADS_CPU_PCT;
821  
-		break;
  835
+	for (i = 0; i < count; i++) {
  836
+		taskq_t *tq;
822 837
 
823  
-	default:
824  
-		panic("unrecognized mode for %s taskq (%u:%u) in "
825  
-		    "spa_activate()",
826  
-		    name, mode, value);
827  
-		break;
  838
+		switch (mode) {
  839
+		case ZTI_MODE_FIXED:
  840
+			ASSERT3U(value, >=, 1);
  841
+			value = MAX(value, 1);
  842
+			break;
  843
+
  844
+		case ZTI_MODE_BATCH:
  845
+			batch = B_TRUE;
  846
+			flags |= TASKQ_THREADS_CPU_PCT;
  847
+			value = zio_taskq_batch_pct;
  848
+			break;
  849
+
  850
+		case ZTI_MODE_ONLINE_PERCENT:
  851
+			flags |= TASKQ_THREADS_CPU_PCT;
  852
+			break;
  853
+
  854
+		default:
  855
+			panic("unrecognized mode for %s_%s taskq (%u:%u) in "
  856
+			    "spa_activate()",
  857
+			    zio_type_name[t], zio_taskq_types[q], mode, value);
  858
+			break;
  859
+		}
  860
+
  861
+		if (count > 1) {
  862
+			(void) snprintf(name, sizeof (name), "%s_%s_%u",
  863
+			    zio_type_name[t], zio_taskq_types[q], i);
  864
+		} else {
  865
+			(void) snprintf(name, sizeof (name), "%s_%s",
  866
+			    zio_type_name[t], zio_taskq_types[q]);
  867
+		}
  868
+
  869
+		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
  870
+			if (batch)
  871
+				flags |= TASKQ_DC_BATCH;
  872
+
  873
+			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
  874
+			    spa->spa_proc, zio_taskq_basedc, flags);
  875
+		} else {
  876
+			tq = taskq_create_proc(name, value, maxclsyspri, 50,
  877
+			    INT_MAX, spa->spa_proc, flags);
  878
+		}
  879
+
  880
+		tqs->stqs_taskq[i] = tq;
828 881
 	}
  882
+}
  883
+
  884
+static void
  885
+spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
  886
+{
  887
+	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
  888
+	uint_t i;
829 889
 
830  
-	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
831  
-		if (batch)
832  
-			flags |= TASKQ_DC_BATCH;
  890
+	if (tqs->stqs_taskq == NULL) {
  891
+		ASSERT3U(tqs->stqs_count, ==, 0);
  892
+		return;
  893
+	}
833 894
 
834  
-		return (taskq_create_sysdc(name, value, 50, INT_MAX,
835  
-		    spa->spa_proc, zio_taskq_basedc, flags));
  895
+	for (i = 0; i < tqs->stqs_count; i++) {
  896
+		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
  897
+		taskq_destroy(tqs->stqs_taskq[i]);
836 898
 	}
837  
-	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
838  
-	    spa->spa_proc, flags));
  899
+
  900
+	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
  901
+	tqs->stqs_taskq = NULL;
  902
+}
  903
+
  904
+/*
  905
+ * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
  906
+ * Note that a type may have multiple discrete taskqs to avoid lock contention
  907
+ * on the taskq itself. In that case we choose which taskq at random by using
  908
+ * the low bits of gethrtime().
  909
+ */
  910
+void
  911
+spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
  912
+    task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
  913
+{
  914
+	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
  915
+	taskq_t *tq;
  916
+
  917
+	ASSERT3P(tqs->stqs_taskq, !=, NULL);
  918
+	ASSERT3U(tqs->stqs_count, !=, 0);
  919
+
  920
+	if (tqs->stqs_count == 1) {
  921
+		tq = tqs->stqs_taskq[0];
  922
+	} else {
  923
+		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
  924
+	}
  925
+
  926
+	taskq_dispatch_ent(tq, func, arg, flags, ent);
839 927
 }
840 928
 
841 929
 static void
@@ -845,16 +933,7 @@ spa_create_zio_taskqs(spa_t *spa)
845 933
 
846 934
 	for (t = 0; t < ZIO_TYPES; t++) {
847 935
 		for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
848  
-			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
849  
-			enum zti_modes mode = ztip->zti_mode;
850  
-			uint_t value = ztip->zti_value;
851  
-			char name[32];
852  
-
853  
-			(void) snprintf(name, sizeof (name),
854  
-			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);
855  
-
856  
-			spa->spa_zio_taskq[t][q] =
857  
-			    spa_taskq_create(spa, name, mode, value);
  936
+			spa_taskqs_init(spa, t, q);
858 937
 		}
859 938
 	}
860 939
 }
@@ -1017,9 +1096,7 @@ spa_deactivate(spa_t *spa)
1017 1096
 
1018 1097
 	for (t = 0; t < ZIO_TYPES; t++) {
1019 1098
 		for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
1020  
-			if (spa->spa_zio_taskq[t][q] != NULL)
1021  
-				taskq_destroy(spa->spa_zio_taskq[t][q]);
1022  
-			spa->spa_zio_taskq[t][q] = NULL;
  1099
+			spa_taskqs_fini(spa, t, q);
1023 1100
 		}
1024 1101
 	}
1025 1102
 
2  module/zfs/vdev_file.c
@@ -185,7 +185,7 @@ vdev_file_io_start(zio_t *zio)
185 185
 		return (ZIO_PIPELINE_CONTINUE);
186 186
 	}
187 187
 
188  
-	taskq_dispatch_ent(spa->spa_zio_taskq[ZIO_TYPE_FREE][ZIO_TASKQ_ISSUE],
  188
+	spa_taskq_dispatch_ent(spa, ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE,
189 189
 	    vdev_file_io_strategy, zio, 0, &zio->io_tqent);
190 190
 
191 191
 	return (ZIO_PIPELINE_STOP);
28  module/zfs/zio.c
@@ -1165,7 +1165,7 @@ zio_free_bp_init(zio_t *zio)
1165 1165
  */
1166 1166
 
1167 1167
 static void
1168  
-zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
  1168
+zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
1169 1169
 {
1170 1170
 	spa_t *spa = zio->io_spa;
1171 1171
 	zio_type_t t = zio->io_type;
@@ -1186,10 +1186,11 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
1186 1186
 		t = ZIO_TYPE_NULL;
1187 1187
 
1188 1188
 	/*
1189  
-	 * If this is a high priority I/O, then use the high priority taskq.
  1189
+	 * If this is a high priority I/O, then use the high priority taskq if
  1190
+	 * available.
1190 1191
 	 */
1191 1192
 	if (zio->io_priority == ZIO_PRIORITY_NOW &&
1192  
-	    spa->spa_zio_taskq[t][q + 1] != NULL)
  1193
+	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
1193 1194
 		q++;
1194 1195
 
1195 1196
 	ASSERT3U(q, <, ZIO_TASKQ_TYPES);
@@ -1200,20 +1201,25 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
1200 1201
 	 * to dispatch the zio to another taskq at the same time.
1201 1202
 	 */
1202 1203
 	ASSERT(taskq_empty_ent(&zio->io_tqent));
1203  
-	taskq_dispatch_ent(spa->spa_zio_taskq[t][q],
1204  
-	    (task_func_t *)zio_execute, zio, flags, &zio->io_tqent);
  1204
+	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
  1205
+	    flags, &zio->io_tqent);
1205 1206
 }
1206 1207
 
1207 1208
 static boolean_t
1208  
-zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
  1209
+zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1209 1210
 {
1210 1211
 	kthread_t *executor = zio->io_executor;
1211 1212
 	spa_t *spa = zio->io_spa;
1212 1213
 	zio_type_t t;
1213 1214
 
1214  
-	for (t = 0; t < ZIO_TYPES; t++)
1215  
-		if (taskq_member(spa->spa_zio_taskq[t][q], executor))
1216  
-			return (B_TRUE);
  1215
+	for (t = 0; t < ZIO_TYPES; t++) {
  1216
+		spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
  1217
+		uint_t i;
  1218
+		for (i = 0; i < tqs->stqs_count; i++) {
  1219
+			if (taskq_member(tqs->stqs_taskq[i], executor))
  1220
+				return (B_TRUE);
  1221
+		}
  1222
+	}
1217 1223
 
1218 1224
 	return (B_FALSE);
1219 1225
 }
@@ -3078,8 +3084,8 @@ zio_done(zio_t *zio)
3078 3084
 			 * Hand it off to the otherwise-unused claim taskq.
3079 3085
 			 */
3080 3086
 			ASSERT(taskq_empty_ent(&zio->io_tqent));
3081  
-			taskq_dispatch_ent(
3082  
-			    zio->io_spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
  3087
+			spa_taskq_dispatch_ent(zio->io_spa,
  3088
+			    ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
3083 3089
 			    (task_func_t *)zio_reexecute, zio, 0,
3084 3090
 			    &zio->io_tqent);
3085 3091
 		}

0 notes on commit 7ef5e54

Please sign in to comment.
Something went wrong with that request. Please try again.