Skip to content

Commit cc92e9d

Browse files
grwilson authored and behlendorf committed
3246 ZFS I/O deadman thread
Reviewed by: Matt Ahrens <matthew.ahrens@delphix.com> Reviewed by: Eric Schrock <eric.schrock@delphix.com> Reviewed by: Christopher Siden <chris.siden@delphix.com> Approved by: Garrett D'Amore <garrett@damore.org> NOTES: This patch has been reworked from the original in the following ways to accommodate the Linux ZFS implementation: *) Usage of the cyclic interface was replaced by the delayed taskq interface. This avoids the need to implement new compatibility code and allows us to rely on the existing taskq implementation. *) An extern for zfs_txg_synctime_ms was added to sys/dsl_pool.h because declaring externs in source files as was done in the original patch is just plain wrong. *) Instead of panicking the system when the deadman triggers, a zevent describing the blocked vdev and the first pending I/O is posted. If the panic behavior is desired, Linux provides other generic methods to panic the system when threads are observed to hang. *) For reference, to delay zios by 30 seconds for testing you can use zinject as follows: 'zinject -d <vdev> -D30 <pool>' References: illumos/illumos-gate@283b846 https://www.illumos.org/issues/3246 Ported-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #1396
1 parent 57f5a20 commit cc92e9d

File tree

20 files changed

+275
-50
lines changed

20 files changed

+275
-50
lines changed

cmd/zinject/translate.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
*/
2121
/*
2222
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
23+
* Copyright (c) 2012 by Delphix. All rights reserved.
2324
*/
2425

2526
#include <libzfs.h>
@@ -476,6 +477,20 @@ translate_device(const char *pool, const char *device, err_type_t label_type,
476477
&record->zi_guid) == 0);
477478
}
478479

480+
/*
481+
* Device faults can take on three different forms:
482+
* 1). delayed or hanging I/O
483+
* 2). zfs label faults
484+
* 3). generic disk faults
485+
*/
486+
if (record->zi_timer != 0) {
487+
record->zi_cmd = ZINJECT_DELAY_IO;
488+
} else if (label_type != TYPE_INVAL) {
489+
record->zi_cmd = ZINJECT_LABEL_FAULT;
490+
} else {
491+
record->zi_cmd = ZINJECT_DEVICE_FAULT;
492+
}
493+
479494
switch (label_type) {
480495
default:
481496
break;

cmd/zinject/zinject.c

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
*/
2121
/*
2222
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23+
* Copyright (c) 2012 by Delphix. All rights reserved.
2324
*/
2425

2526
/*
@@ -235,8 +236,8 @@ usage(void)
235236
"\t\t'pad1', or 'pad2'.\n"
236237
"\t\t'errno' can be 'nxio' (the default), 'io', or 'dtl'.\n"
237238
"\n"
238-
"\tzinject -d device -A <degrade|fault> pool\n"
239-
"\t\tPerform a specific action on a particular device\n"
239+
"\tzinject -d device -A <degrade|fault> -D <delay secs> pool\n"
240+
"\t\tPerform a specific action on a particular device.\n"
240241
"\n"
241242
"\tzinject -I [-s <seconds> | -g <txgs>] pool\n"
242243
"\t\tCause the pool to stop writing blocks yet not\n"
@@ -589,7 +590,7 @@ main(int argc, char **argv)
589590
}
590591

591592
while ((c = getopt(argc, argv,
592-
":aA:b:d:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
593+
":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
593594
switch (c) {
594595
case 'a':
595596
flags |= ZINJECT_FLUSH_ARC;
@@ -615,6 +616,16 @@ main(int argc, char **argv)
615616
case 'd':
616617
device = optarg;
617618
break;
619+
case 'D':
620+
errno = 0;
621+
record.zi_timer = strtoull(optarg, &end, 10);
622+
if (errno != 0 || *end != '\0') {
623+
(void) fprintf(stderr, "invalid i/o delay "
624+
"value: '%s'\n", optarg);
625+
usage();
626+
return (1);
627+
}
628+
break;
618629
case 'e':
619630
if (strcasecmp(optarg, "io") == 0) {
620631
error = EIO;
@@ -679,6 +690,7 @@ main(int argc, char **argv)
679690
case 'p':
680691
(void) strlcpy(record.zi_func, optarg,
681692
sizeof (record.zi_func));
693+
record.zi_cmd = ZINJECT_PANIC;
682694
break;
683695
case 'q':
684696
quiet = 1;
@@ -762,13 +774,15 @@ main(int argc, char **argv)
762774
return (1);
763775
}
764776

777+
if (record.zi_duration != 0)
778+
record.zi_cmd = ZINJECT_IGNORED_WRITES;
779+
765780
if (cancel != NULL) {
766781
/*
767782
* '-c' is invalid with any other options.
768783
*/
769784
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
770-
level != 0 || record.zi_func[0] != '\0' ||
771-
record.zi_duration != 0) {
785+
level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
772786
(void) fprintf(stderr, "cancel (-c) incompatible with "
773787
"any other options\n");
774788
usage();
@@ -800,8 +814,7 @@ main(int argc, char **argv)
800814
* for doing injection, so handle it separately here.
801815
*/
802816
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
803-
level != 0 || record.zi_func[0] != '\0' ||
804-
record.zi_duration != 0) {
817+
level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
805818
(void) fprintf(stderr, "device (-d) incompatible with "
806819
"data error injection\n");
807820
usage();
@@ -835,7 +848,7 @@ main(int argc, char **argv)
835848

836849
} else if (raw != NULL) {
837850
if (range != NULL || type != TYPE_INVAL || level != 0 ||
838-
record.zi_func[0] != '\0' || record.zi_duration != 0) {
851+
record.zi_cmd != ZINJECT_UNINITIALIZED) {
839852
(void) fprintf(stderr, "raw (-b) format with "
840853
"any other options\n");
841854
usage();
@@ -858,13 +871,14 @@ main(int argc, char **argv)
858871
return (1);
859872
}
860873

874+
record.zi_cmd = ZINJECT_DATA_FAULT;
861875
if (translate_raw(raw, &record) != 0)
862876
return (1);
863877
if (!error)
864878
error = EIO;
865-
} else if (record.zi_func[0] != '\0') {
879+
} else if (record.zi_cmd == ZINJECT_PANIC) {
866880
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
867-
level != 0 || device != NULL || record.zi_duration != 0) {
881+
level != 0 || device != NULL) {
868882
(void) fprintf(stderr, "panic (-p) incompatible with "
869883
"other options\n");
870884
usage();
@@ -882,7 +896,7 @@ main(int argc, char **argv)
882896
if (argv[1] != NULL)
883897
record.zi_type = atoi(argv[1]);
884898
dataset[0] = '\0';
885-
} else if (record.zi_duration != 0) {
899+
} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
886900
if (nowrites == 0) {
887901
(void) fprintf(stderr, "-s or -g meaningless "
888902
"without -I (ignore writes)\n");
@@ -936,6 +950,7 @@ main(int argc, char **argv)
936950
return (1);
937951
}
938952

953+
record.zi_cmd = ZINJECT_DATA_FAULT;
939954
if (translate_record(type, argv[0], range, level, &record, pool,
940955
dataset) != 0)
941956
return (1);

include/sys/dsl_pool.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
extern "C" {
4242
#endif
4343

44+
extern int zfs_txg_synctime_ms;
45+
4446
struct objset;
4547
struct dsl_dir;
4648
struct dsl_dataset;

include/sys/fm/fs/zfs.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ extern "C" {
7373
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru"
7474
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE "vdev_state"
7575
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT "vdev_ashift"
76+
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS "vdev_complete_ts"
77+
#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS "vdev_delta_ts"
7678
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
7779
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
7880
#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path"
@@ -88,6 +90,9 @@ extern "C" {
8890
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE "zio_stage"
8991
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE "zio_pipeline"
9092
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay"
93+
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP "zio_timestamp"
94+
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DEADLINE "zio_deadline"
95+
#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA "zio_delta"
9196
#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state"
9297
#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected"
9398
#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual"

include/sys/spa.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,7 @@ extern int spa_offline_log(spa_t *spa);
570570

571571
/* Log claim callback */
572572
extern void spa_claim_notify(zio_t *zio);
573+
extern void spa_deadman(void *);
573574

574575
/* Accessor functions */
575576
extern boolean_t spa_shutting_down(spa_t *spa);
@@ -604,6 +605,7 @@ extern boolean_t spa_suspended(spa_t *spa);
604605
extern uint64_t spa_bootfs(spa_t *spa);
605606
extern uint64_t spa_delegation(spa_t *spa);
606607
extern objset_t *spa_meta_objset(spa_t *spa);
608+
extern uint64_t spa_deadman_synctime(spa_t *spa);
607609

608610
/* Miscellaneous support routines */
609611
extern void spa_activate_mos_feature(spa_t *spa, const char *feature);

include/sys/spa_impl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,10 @@ struct spa {
227227
uint64_t spa_feat_for_write_obj; /* required to write to pool */
228228
uint64_t spa_feat_for_read_obj; /* required to read from pool */
229229
uint64_t spa_feat_desc_obj; /* Feature descriptions */
230+
taskqid_t spa_deadman_tqid; /* Task id */
231+
uint64_t spa_deadman_calls; /* number of deadman calls */
232+
uint64_t spa_sync_starttime; /* starting time of spa_sync */
233+
uint64_t spa_deadman_synctime; /* deadman expiration timer */
230234
/*
231235
* spa_refcnt & spa_config_lock must be the last elements
232236
* because refcount_t changes size based on compilation options.

include/sys/vdev.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ extern void vdev_metaslab_fini(vdev_t *vd);
7878
extern void vdev_metaslab_set_size(vdev_t *);
7979
extern void vdev_expand(vdev_t *vd, uint64_t txg);
8080
extern void vdev_split(vdev_t *vd);
81+
extern void vdev_deadman(vdev_t *vd);
8182

8283

8384
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);

include/sys/vdev_impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,8 @@ struct vdev_queue {
105105
avl_tree_t vq_read_tree;
106106
avl_tree_t vq_write_tree;
107107
avl_tree_t vq_pending_tree;
108+
uint64_t vq_io_complete_ts;
109+
uint64_t vq_io_delta_ts;
108110
list_t vq_io_list;
109111
kmutex_t vq_lock;
110112
};

include/sys/zfs_context.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
/*
2626
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
2727
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
28+
* Copyright (c) 2012 by Delphix. All rights reserved.
2829
*/
2930

3031
#ifndef _SYS_ZFS_CONTEXT_H
@@ -400,13 +401,16 @@ extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
400401
#define taskq_create_sysdc(a, b, d, e, p, dc, f) \
401402
(taskq_create(a, b, maxclsyspri, d, e, f))
402403
extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
404+
extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t,
405+
clock_t);
403406
extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
404407
taskq_ent_t *);
405408
extern int taskq_empty_ent(taskq_ent_t *);
406409
extern void taskq_init_ent(taskq_ent_t *);
407410
extern void taskq_destroy(taskq_t *);
408411
extern void taskq_wait(taskq_t *);
409412
extern int taskq_member(taskq_t *, kthread_t *);
413+
extern int taskq_cancel_id(taskq_t *, taskqid_t);
410414
extern void system_taskq_init(void);
411415
extern void system_taskq_fini(void);
412416

@@ -523,6 +527,11 @@ extern vnode_t *rootdir;
523527

524528
extern void delay(clock_t ticks);
525529

530+
#define SEC_TO_TICK(sec) ((sec) * hz)
531+
#define MSEC_TO_TICK(msec) ((msec) / (MILLISEC / hz))
532+
#define USEC_TO_TICK(usec) ((usec) / (MICROSEC / hz))
533+
/*
 * Convert a duration expressed in nanoseconds into clock ticks.
 * The expansion references NANOSEC and hz, which must be in scope
 * wherever the macro is used.
 */
#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz))
534+
526535
#define gethrestime_sec() time(NULL)
527536
#define gethrestime(t) \
528537
do {\

include/sys/zfs_ioctl.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,8 @@ typedef struct zinject_record {
236236
uint32_t zi_iotype;
237237
int32_t zi_duration;
238238
uint64_t zi_timer;
239+
uint32_t zi_cmd;
240+
uint32_t zi_pad;
239241
} zinject_record_t;
240242

241243
#define ZINJECT_NULL 0x1
@@ -245,6 +247,16 @@ typedef struct zinject_record {
245247
#define ZEVENT_NONBLOCK 0x1
246248
#define ZEVENT_SIZE 1024
247249

250+
typedef enum zinject_type {
251+
ZINJECT_UNINITIALIZED,
252+
ZINJECT_DATA_FAULT,
253+
ZINJECT_DEVICE_FAULT,
254+
ZINJECT_LABEL_FAULT,
255+
ZINJECT_IGNORED_WRITES,
256+
ZINJECT_PANIC,
257+
ZINJECT_DELAY_IO,
258+
} zinject_type_t;
259+
248260
typedef struct zfs_share {
249261
uint64_t z_exportdata;
250262
uint64_t z_sharedata;

0 commit comments

Comments
 (0)