diff --git a/.github/workflows/docker_image.yml b/.github/workflows/docker_image.yml deleted file mode 100644 index 0ef8a8830e66..000000000000 --- a/.github/workflows/docker_image.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: build_image - -on: - push: - branches: - - 'truenas/zfs-2.2-release' - -jobs: - docker: - runs-on: ubuntu-latest - steps: - - name: Set up QEMU - uses: docker/setup-qemu-action@v1 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - - name: Login to DockerHub - uses: docker/login-action@v1 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Build and push - id: docker_build - uses: docker/build-push-action@v2 - with: - push: true - tags: ixsystems/zfs:latest - - name: Image digest - run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.mailmap b/.mailmap index 46ef016b93f8..32bdb5209613 100644 --- a/.mailmap +++ b/.mailmap @@ -30,6 +30,7 @@ Andreas Dilger Andrew Walker Benedikt Neuffer Chengfei Zhu +ChenHao Lu <18302010006@fudan.edu.cn> Chris Lindee Colm Buckley Crag Wang @@ -43,6 +44,7 @@ Glenn Washburn Gordan Bobic Gregory Bartholomew hedong zhang +Ilkka Sovanto InsanePrawn Jason Cohen Jason Harmening @@ -57,6 +59,7 @@ KernelOfTruth Liu Hua Liu Qing loli10K +Mart Frauenlob Matthias Blankertz Michael Gmelin Olivier Mazouffre @@ -73,6 +76,9 @@ WHR Yanping Gao Youzhong Yang +# Signed-off-by: overriding Author: +Yuxin Wang + # Commits from strange places, long ago Brian Behlendorf Brian Behlendorf @@ -102,12 +108,15 @@ Brandon Thetford buzzingwires <131118055+buzzingwires@users.noreply.github.com> Cedric Maunoury <38213715+cedricmaunoury@users.noreply.github.com> Charles Suh +Chris Peredun <126915832+chrisperedun@users.noreply.github.com> Dacian Reece-Stremtan <35844628+dacianstremtan@users.noreply.github.com> Damian Szuberski <30863496+szubersk@users.noreply.github.com> Daniel Hiepler <32984777+heeplr@users.noreply.github.com> Daniel Kobras Daniel Reichelt David Quigley +Dennis R. Friedrichsen <31087738+dennisfriedrichsen@users.noreply.github.com> +Dex Wood DHE Dmitri John Ledkov <19779+xnox@users.noreply.github.com> Dries Michiels <32487486+driesmp@users.noreply.github.com> @@ -128,6 +137,7 @@ Harry Mallon <1816667+hjmallon@users.noreply.github.com> Hiếu Lê Jake Howard James Cowgill +Jaron Kent-Dobias Jason King Jeff Dike <52420226+jdike@users.noreply.github.com> Jitendra Patidar <53164267+jsai20@users.noreply.github.com> @@ -137,7 +147,9 @@ John L. Hammond <35266395+jhammond-intel@users.noreply. John-Mark Gurney John Ramsden Jonathon Fernyhough <559369+jonathonf@users.noreply.github.com> +Jose Luis Duran Justin Hibbits +Kevin Greene <104801862+kxgreene@users.noreply.github.com> Kevin Jin <33590050+jxdking@users.noreply.github.com> Kevin P. Fleming Krzysztof Piecuch <3964215+pikrzysztof@users.noreply.github.com> @@ -148,9 +160,11 @@ Lorenz Hüdepohl Luís Henriques <73643340+lumigch@users.noreply.github.com> Marcin Skarbek Matt Fiddaman <81489167+matt-fidd@users.noreply.github.com> +Maxim Filimonov Max Zettlmeißl <6818198+maxz@users.noreply.github.com> Michael Niewöhner Michael Zhivich <33133421+mzhivich@users.noreply.github.com> +MigeljanImeri <78048439+MigeljanImeri@users.noreply.github.com> Mo Zhou <5723047+cdluminate@users.noreply.github.com> Nick Mattis omni <79493359+omnivagant@users.noreply.github.com> @@ -164,6 +178,7 @@ Ping Huang <101400146+hpingfs@users.noreply.github.com> Piotr P. Stefaniak Richard Allen <33836503+belperite@users.noreply.github.com> Rich Ercolani <214141+rincebrain@users.noreply.github.com> +Rick Macklem <64620010+rmacklem@users.noreply.github.com> Rob Wing <98866084+rob-wing@users.noreply.github.com> Roman Strashkin Ryan Hirasaki <4690732+RyanHir@users.noreply.github.com> @@ -174,6 +189,8 @@ Scott Colby Sean Eric Fagan Spencer Kinny <30333052+Spencer-Kinny@users.noreply.github.com> Srikanth N S <75025422+nssrikanth@users.noreply.github.com> +Stefan Lendl <1321542+stfl@users.noreply.github.com> +Thomas Bertschinger <101425190+bertschinger@users.noreply.github.com> Thomas Geppert Tim Crawford Tom Matthews @@ -181,6 +198,7 @@ Tony Perkins <62951051+tony-zfs@users.noreply.github.com> Torsten Wörtwein Tulsi Jain Václav Skála <33496485+vaclavskala@users.noreply.github.com> +Vaibhav Bhanawat <88050553+vaibhav-delphix@users.noreply.github.com> Violet Purcell <66446404+vimproved@users.noreply.github.com> Vipin Kumar Verma <75025470+vermavipinkumar@users.noreply.github.com> Wolfgang Bumiller diff --git a/AUTHORS b/AUTHORS index be1efb87b34c..d7d55f42d2e7 100644 --- a/AUTHORS +++ b/AUTHORS @@ -88,9 +88,11 @@ CONTRIBUTORS: Bassu Ben Allen Ben Cordero + Benda Xu Benedikt Neuffer Benjamin Albrecht Benjamin Gentil + Benjamin Sherman Ben McGough Ben Rubson Ben Wolsieffer @@ -111,6 +113,7 @@ CONTRIBUTORS: bzzz77 cable2999 Caleb James DeLisle + Cameron Harr Cao Xuewen Carlo Landmeter Carlos Alberto Lopez Perez @@ -120,12 +123,15 @@ CONTRIBUTORS: Chen Can Chengfei Zhu Chen Haiquan + ChenHao Lu <18302010006@fudan.edu.cn> Chip Parker Chris Burroughs + Chris Davidson Chris Dunlap Chris Dunlop Chris Lindee Chris McDonough + Chris Peredun Chris Siden Chris Siebenmann Christer Ekholm @@ -144,6 +150,7 @@ CONTRIBUTORS: Clint Armstrong Coleman Kane Colin Ian King + Colin Percival Colm Buckley Crag Wang Craig Loomis @@ -156,6 +163,7 @@ CONTRIBUTORS: Damiano Albani Damian Szuberski Damian Wojsław + Daniel Berlin Daniel Hiepler Daniel Hoffman Daniel Kobras @@ -176,8 +184,10 @@ CONTRIBUTORS: David Quigley Debabrata Banerjee D. Ebdrup + Dennis R. Friedrichsen Denys Rtveliashvili Derek Dai + Dex Wood DHE Didier Roche Dimitri John Ledkov @@ -235,9 +245,11 @@ CONTRIBUTORS: Gionatan Danti Giuseppe Di Natale Glenn Washburn + gofaster Gordan Bobic Gordon Bergling Gordon Ross + Gordon Tetlow Graham Christensen Graham Perrin Gregor Kopka @@ -265,6 +277,7 @@ CONTRIBUTORS: Igor Kozhukhov Igor Lvovsky ilbsmart + Ilkka Sovanto illiliti ilovezfs InsanePrawn @@ -280,9 +293,11 @@ CONTRIBUTORS: Jan Engelhardt Jan Kryl Jan Sanislo + Jaron Kent-Dobias Jason Cohen Jason Harmening Jason King + Jason Lee Jason Zaman Javen Wu Jean-Baptiste Lallement @@ -313,6 +328,7 @@ CONTRIBUTORS: Jonathon Fernyhough Jorgen Lundman Josef 'Jeff' Sipek + Jose Luis Duran Josh Soref Joshua M. Clulow José Luis Salvador Rufo @@ -336,8 +352,10 @@ CONTRIBUTORS: Kash Pande Kay Pedersen Keith M Wesolowski + Kent Ross KernelOfTruth Kevin Bowling + Kevin Greene Kevin Jin Kevin P. Fleming Kevin Tanguy @@ -389,6 +407,7 @@ CONTRIBUTORS: Mark Shellenbaum marku89 Mark Wright + Mart Frauenlob Martin Matuska Martin Rüegg Massimo Maggi @@ -405,6 +424,7 @@ CONTRIBUTORS: Matus Kral Mauricio Faria de Oliveira Max Grossman + Maxim Filimonov Maximilian Mehnert Max Zettlmeißl Md Islam @@ -417,6 +437,7 @@ CONTRIBUTORS: Michael Niewöhner Michael Zhivich Michal Vasilek + MigeljanImeri Mike Gerdts Mike Harsch Mike Leddy @@ -448,6 +469,7 @@ CONTRIBUTORS: Olaf Faaland Oleg Drokin Oleg Stepura + Olivier Certner Olivier Mazouffre omni Orivej Desh @@ -479,6 +501,7 @@ CONTRIBUTORS: Prasad Joshi privb0x23 P.SCH + Quartz Quentin Zdanis Rafael Kitover RageLtMan @@ -491,11 +514,15 @@ CONTRIBUTORS: Riccardo Schirone Richard Allen Richard Elling + Richard Kojedzinszky Richard Laager Richard Lowe Richard Sharpe Richard Yao Rich Ercolani + Rick Macklem + rilysh + Robert Evans Robert Novak Roberto Ricci Rob Norris @@ -509,7 +536,9 @@ CONTRIBUTORS: Ryan Lahfa Ryan Libby Ryan Moeller + Sam Atkinson Sam Hathaway + Sam James Sam Lunt Samuel VERSCHELDE Samuel Wycliffe @@ -530,6 +559,8 @@ CONTRIBUTORS: Shaan Nobee Shampavman Shaun Tancheff + Shawn Bayern + Shengqi Chen Shen Yan Simon Guest Simon Klinkert @@ -537,6 +568,7 @@ CONTRIBUTORS: Spencer Kinny Srikanth N S Stanislav Seletskiy + Stefan Lendl Steffen Müthing Stephen Blinick sterlingjensen @@ -557,6 +589,7 @@ CONTRIBUTORS: Teodor Spæren TerraTech Thijs Cramer + Thomas Bertschinger Thomas Geppert Thomas Lamprecht Till Maas @@ -586,6 +619,7 @@ CONTRIBUTORS: Turbo Fredriksson Tyler J. Stachecki Umer Saleem + Vaibhav Bhanawat Valmiky Arquissandas Val Packett Vince van Oosten @@ -614,6 +648,7 @@ CONTRIBUTORS: yuina822 YunQiang Su Yuri Pankov + Yuxin Wang Yuxuan Shui Zachary Bedell Zach Dykstra diff --git a/META b/META index d64414e32225..383fa37fd42a 100644 --- a/META +++ b/META @@ -1,10 +1,10 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.2.3 +Version: 2.2.4 Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.7 +Linux-Maximum: 6.8 Linux-Minimum: 3.10 diff --git a/cmd/arcstat.in b/cmd/arcstat.in index 8df1c62f7e86..c4f10a1d6d3b 100755 --- a/cmd/arcstat.in +++ b/cmd/arcstat.in @@ -157,6 +157,16 @@ cols = { "free": [5, 1024, "ARC free memory"], "avail": [5, 1024, "ARC available memory"], "waste": [5, 1024, "Wasted memory due to round up to pagesize"], + "ztotal": [6, 1000, "zfetch total prefetcher calls per second"], + "zhits": [5, 1000, "zfetch stream hits per second"], + "zahead": [6, 1000, "zfetch hits ahead of streams per second"], + "zpast": [5, 1000, "zfetch hits behind streams per second"], + "zmisses": [7, 1000, "zfetch stream misses per second"], + "zmax": [4, 1000, "zfetch limit reached per second"], + "zfuture": [7, 1000, "zfetch stream future per second"], + "zstride": [7, 1000, "zfetch stream strides per second"], + "zissued": [7, 1000, "zfetch prefetches issued per second"], + "zactive": [7, 1000, "zfetch prefetches active per second"], } v = {} @@ -164,6 +174,8 @@ hdr = ["time", "read", "ddread", "ddh%", "dmread", "dmh%", "pread", "ph%", "size", "c", "avail"] xhdr = ["time", "mfu", "mru", "mfug", "mrug", "unc", "eskip", "mtxmis", "dread", "pread", "read"] +zhdr = ["time", "ztotal", "zhits", "zahead", "zpast", "zmisses", "zmax", + "zfuture", "zstride", "zissued", "zactive"] sint = 1 # Default interval is 1 second count = 1 # Default count is 1 hdr_intr = 20 # Print header every 20 lines of output @@ -188,6 +200,8 @@ if sys.platform.startswith('freebsd'): k = [ctl for ctl in sysctl.filter('kstat.zfs.misc.arcstats') if ctl.type != sysctl.CTLTYPE_NODE] + k += [ctl for ctl in sysctl.filter('kstat.zfs.misc.zfetchstats') + if ctl.type != sysctl.CTLTYPE_NODE] if not k: sys.exit(1) @@ -199,19 +213,28 @@ if sys.platform.startswith('freebsd'): continue name, value = s.name, s.value - # Trims 'kstat.zfs.misc.arcstats' from the name - kstat[name[24:]] = int(value) + + if "arcstats" in name: + # Trims 'kstat.zfs.misc.arcstats' from the name + kstat[name[24:]] = int(value) + else: + kstat["zfetch_" + name[27:]] = int(value) elif sys.platform.startswith('linux'): def kstat_update(): global kstat - k = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')] + k1 = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')] - if not k: + k2 = ["zfetch_" + line.strip() for line in + open('/proc/spl/kstat/zfs/zfetchstats')] + + if k1 is None or k2 is None: sys.exit(1) - del k[0:2] + del k1[0:2] + del k2[0:2] + k = k1 + k2 kstat = {} for s in k: @@ -239,6 +262,7 @@ def usage(): sys.stderr.write("\t -v : List all possible field headers and definitions" "\n") sys.stderr.write("\t -x : Print extended stats\n") + sys.stderr.write("\t -z : Print zfetch stats\n") sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n") sys.stderr.write("\t -o : Redirect output to the specified file\n") sys.stderr.write("\t -s : Override default field separator with custom " @@ -357,6 +381,7 @@ def init(): global count global hdr global xhdr + global zhdr global opfile global sep global out @@ -368,15 +393,17 @@ def init(): xflag = False hflag = False vflag = False + zflag = False i = 1 try: opts, args = getopt.getopt( sys.argv[1:], - "axo:hvs:f:p", + "axzo:hvs:f:p", [ "all", "extended", + "zfetch", "outfile", "help", "verbose", @@ -410,13 +437,15 @@ def init(): i += 1 if opt in ('-p', '--parsable'): pretty_print = False + if opt in ('-z', '--zfetch'): + zflag = True i += 1 argv = sys.argv[i:] sint = int(argv[0]) if argv else sint count = int(argv[1]) if len(argv) > 1 else (0 if len(argv) > 0 else 1) - if hflag or (xflag and desired_cols): + if hflag or (xflag and zflag) or ((zflag or xflag) and desired_cols): usage() if vflag: @@ -425,6 +454,9 @@ def init(): if xflag: hdr = xhdr + if zflag: + hdr = zhdr + update_hdr_intr() # check if L2ARC exists @@ -569,6 +601,17 @@ def calculate(): v["el2mru"] = d["evict_l2_eligible_mru"] // sint v["el2inel"] = d["evict_l2_ineligible"] // sint v["mtxmis"] = d["mutex_miss"] // sint + v["ztotal"] = (d["zfetch_hits"] + d["zfetch_future"] + d["zfetch_stride"] + + d["zfetch_past"] + d["zfetch_misses"]) // sint + v["zhits"] = d["zfetch_hits"] // sint + v["zahead"] = (d["zfetch_future"] + d["zfetch_stride"]) // sint + v["zpast"] = d["zfetch_past"] // sint + v["zmisses"] = d["zfetch_misses"] // sint + v["zmax"] = d["zfetch_max_streams"] // sint + v["zfuture"] = d["zfetch_future"] // sint + v["zstride"] = d["zfetch_stride"] // sint + v["zissued"] = d["zfetch_io_issued"] // sint + v["zactive"] = d["zfetch_io_active"] // sint if l2exist: v["l2hits"] = d["l2_hits"] // sint diff --git a/cmd/zed/agents/fmd_api.c b/cmd/zed/agents/fmd_api.c index 4a6cfbf8c05c..fe43e2ab971e 100644 --- a/cmd/zed/agents/fmd_api.c +++ b/cmd/zed/agents/fmd_api.c @@ -22,6 +22,7 @@ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. * * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2023, Klara Inc. */ /* @@ -231,28 +232,6 @@ fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name) if (strcmp(name, "spare_on_remove") == 0) return (1); - if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0) - return (10); /* N = 10 events */ - - return (0); -} - -int64_t -fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name) -{ - (void) hdl; - - /* - * These can be looked up in mp->modinfo->fmdi_props - * For now we just hard code for phase 2. In the - * future, there can be a ZED based override. - */ - if (strcmp(name, "remove_timeout") == 0) - return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */ - - if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0) - return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */ - return (0); } @@ -535,20 +514,31 @@ fmd_serd_exists(fmd_hdl_t *hdl, const char *name) return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL); } -void -fmd_serd_reset(fmd_hdl_t *hdl, const char *name) +int +fmd_serd_active(fmd_hdl_t *hdl, const char *name) { fmd_module_t *mp = (fmd_module_t *)hdl; fmd_serd_eng_t *sgp; if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name); - return; + return (0); } + return (fmd_serd_eng_fired(sgp) || !fmd_serd_eng_empty(sgp)); +} - fmd_serd_eng_reset(sgp); +void +fmd_serd_reset(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_serd_eng_t *sgp; - fmd_hdl_debug(hdl, "serd_reset %s", name); + if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { + zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name); + } else { + fmd_serd_eng_reset(sgp); + fmd_hdl_debug(hdl, "serd_reset %s", name); + } } int @@ -556,16 +546,21 @@ fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep) { fmd_module_t *mp = (fmd_module_t *)hdl; fmd_serd_eng_t *sgp; - int err; if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'", name); return (0); } - err = fmd_serd_eng_record(sgp, ep->ev_hrt); + return (fmd_serd_eng_record(sgp, ep->ev_hrt)); +} + +void +fmd_serd_gc(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; - return (err); + fmd_serd_hash_apply(&mp->mod_serds, fmd_serd_eng_gc, NULL); } /* FMD Timers */ @@ -579,7 +574,7 @@ _timer_notify(union sigval sv) const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; struct itimerspec its; - fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid); + fmd_hdl_debug(hdl, "%s timer fired (%p)", mp->mod_name, ftp->ft_tid); /* disarm the timer */ memset(&its, 0, sizeof (struct itimerspec)); diff --git a/cmd/zed/agents/fmd_api.h b/cmd/zed/agents/fmd_api.h index b940d0d395ec..8471feecf33f 100644 --- a/cmd/zed/agents/fmd_api.h +++ b/cmd/zed/agents/fmd_api.h @@ -151,7 +151,6 @@ extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list); extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...); extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *); -extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *); #define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */ #define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */ @@ -195,10 +194,12 @@ extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *); extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t); extern void fmd_serd_destroy(fmd_hdl_t *, const char *); extern int fmd_serd_exists(fmd_hdl_t *, const char *); +extern int fmd_serd_active(fmd_hdl_t *, const char *); extern void fmd_serd_reset(fmd_hdl_t *, const char *); extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *); extern int fmd_serd_fired(fmd_hdl_t *, const char *); extern int fmd_serd_empty(fmd_hdl_t *, const char *); +extern void fmd_serd_gc(fmd_hdl_t *); extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t); extern void fmd_timer_remove(fmd_hdl_t *, id_t); diff --git a/cmd/zed/agents/fmd_serd.c b/cmd/zed/agents/fmd_serd.c index 0bb2c535f094..f942e62b3f48 100644 --- a/cmd/zed/agents/fmd_serd.c +++ b/cmd/zed/agents/fmd_serd.c @@ -310,8 +310,9 @@ fmd_serd_eng_reset(fmd_serd_eng_t *sgp) } void -fmd_serd_eng_gc(fmd_serd_eng_t *sgp) +fmd_serd_eng_gc(fmd_serd_eng_t *sgp, void *arg) { + (void) arg; fmd_serd_elem_t *sep, *nep; hrtime_t hrt; diff --git a/cmd/zed/agents/fmd_serd.h b/cmd/zed/agents/fmd_serd.h index 25b6888e61f2..80ff9a3b25b8 100644 --- a/cmd/zed/agents/fmd_serd.h +++ b/cmd/zed/agents/fmd_serd.h @@ -77,7 +77,7 @@ extern int fmd_serd_eng_fired(fmd_serd_eng_t *); extern int fmd_serd_eng_empty(fmd_serd_eng_t *); extern void fmd_serd_eng_reset(fmd_serd_eng_t *); -extern void fmd_serd_eng_gc(fmd_serd_eng_t *); +extern void fmd_serd_eng_gc(fmd_serd_eng_t *, void *); #ifdef __cplusplus } diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c index f6ba334a3ba3..e0ad00800add 100644 --- a/cmd/zed/agents/zfs_diagnosis.c +++ b/cmd/zed/agents/zfs_diagnosis.c @@ -23,6 +23,7 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2023, Klara Inc. */ #include @@ -47,11 +48,16 @@ #define DEFAULT_CHECKSUM_T 600 /* seconds */ #define DEFAULT_IO_N 10 /* events */ #define DEFAULT_IO_T 600 /* seconds */ +#define DEFAULT_SLOW_IO_N 10 /* events */ +#define DEFAULT_SLOW_IO_T 30 /* seconds */ + +#define CASE_GC_TIMEOUT_SECS 43200 /* 12 hours */ /* - * Our serd engines are named 'zfs___{checksum,io}'. This - * #define reserves enough space for two 64-bit hex values plus the length of - * the longest string. + * Our serd engines are named in the following format: + * 'zfs___{checksum,io,slow_io}' + * This #define reserves enough space for two 64-bit hex values plus the + * length of the longest string. */ #define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum")) @@ -68,6 +74,7 @@ typedef struct zfs_case_data { int zc_pool_state; char zc_serd_checksum[MAX_SERDLEN]; char zc_serd_io[MAX_SERDLEN]; + char zc_serd_slow_io[MAX_SERDLEN]; int zc_has_remove_timer; } zfs_case_data_t; @@ -114,7 +121,8 @@ zfs_de_stats_t zfs_stats = { { "resource_drops", FMD_TYPE_UINT64, "resource related ereports" } }; -static hrtime_t zfs_remove_timeout; +/* wait 15 seconds after a removal */ +static hrtime_t zfs_remove_timeout = SEC2NSEC(15); uu_list_pool_t *zfs_case_pool; uu_list_t *zfs_cases; @@ -124,6 +132,8 @@ uu_list_t *zfs_cases; #define ZFS_MAKE_EREPORT(type) \ FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type +static void zfs_purge_cases(fmd_hdl_t *hdl); + /* * Write out the persistent representation of an active case. */ @@ -170,6 +180,42 @@ zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp) return (zcp); } +/* + * count other unique slow-io cases in a pool + */ +static uint_t +zfs_other_slow_cases(fmd_hdl_t *hdl, const zfs_case_data_t *zfs_case) +{ + zfs_case_t *zcp; + uint_t cases = 0; + static hrtime_t next_check = 0; + + /* + * Note that plumbing in some external GC would require adding locking, + * since most of this module code is not thread safe and assumes there + * is only one thread running against the module. So we perform GC here + * inline periodically so that future delay induced faults will be + * possible once the issue causing multiple vdev delays is resolved. + */ + if (gethrestime_sec() > next_check) { + /* Periodically purge old SERD entries and stale cases */ + fmd_serd_gc(hdl); + zfs_purge_cases(hdl); + next_check = gethrestime_sec() + CASE_GC_TIMEOUT_SECS; + } + + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == zfs_case->zc_pool_guid && + zcp->zc_data.zc_vdev_guid != zfs_case->zc_vdev_guid && + zcp->zc_data.zc_serd_slow_io[0] != '\0' && + fmd_serd_active(hdl, zcp->zc_data.zc_serd_slow_io)) { + cases++; + } + } + return (cases); +} + /* * Iterate over any active cases. If any cases are associated with a pool or * vdev which is no longer present on the system, close the associated case. @@ -376,6 +422,14 @@ zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid, (long long unsigned int)vdev_guid, type); } +static void +zfs_case_retire(fmd_hdl_t *hdl, zfs_case_t *zcp) +{ + fmd_hdl_debug(hdl, "retiring case"); + + fmd_case_close(hdl, zcp->zc_case); +} + /* * Solve a given ZFS case. This first checks to make sure the diagnosis is * still valid, as well as cleaning up any pending timer associated with the @@ -632,9 +686,7 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) if (strcmp(class, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 || strcmp(class, - ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 || - strcmp(class, - ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) { + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0) { zfs_stats.resource_drops.fmds_value.ui64++; return; } @@ -702,6 +754,9 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) if (zcp->zc_data.zc_serd_checksum[0] != '\0') fmd_serd_reset(hdl, zcp->zc_data.zc_serd_checksum); + if (zcp->zc_data.zc_serd_slow_io[0] != '\0') + fmd_serd_reset(hdl, + zcp->zc_data.zc_serd_slow_io); } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) { uint64_t state = 0; @@ -730,7 +785,11 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) if (fmd_case_solved(hdl, zcp->zc_case)) return; - fmd_hdl_debug(hdl, "error event '%s'", class); + if (vdev_guid) + fmd_hdl_debug(hdl, "error event '%s', vdev %llu", class, + vdev_guid); + else + fmd_hdl_debug(hdl, "error event '%s'", class); /* * Determine if we should solve the case and generate a fault. We solve @@ -779,6 +838,8 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) || fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) || + fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { const char *failmode = NULL; boolean_t checkremove = B_FALSE; @@ -814,6 +875,51 @@ zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) } if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep)) checkremove = B_TRUE; + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY))) { + uint64_t slow_io_n, slow_io_t; + + /* + * Create a slow io SERD engine when the VDEV has the + * 'vdev_slow_io_n' and 'vdev_slow_io_n' properties. + */ + if (zcp->zc_data.zc_serd_slow_io[0] == '\0' && + nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N, + &slow_io_n) == 0 && + nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T, + &slow_io_t) == 0) { + zfs_serd_name(zcp->zc_data.zc_serd_slow_io, + pool_guid, vdev_guid, "slow_io"); + fmd_serd_create(hdl, + zcp->zc_data.zc_serd_slow_io, + slow_io_n, + SEC2NSEC(slow_io_t)); + zfs_case_serialize(zcp); + } + /* Pass event to SERD engine and see if this triggers */ + if (zcp->zc_data.zc_serd_slow_io[0] != '\0' && + fmd_serd_record(hdl, zcp->zc_data.zc_serd_slow_io, + ep)) { + /* + * Ignore a slow io diagnosis when other + * VDEVs in the pool show signs of being slow. + */ + if (zfs_other_slow_cases(hdl, &zcp->zc_data)) { + zfs_case_retire(hdl, zcp); + fmd_hdl_debug(hdl, "pool %llu has " + "multiple slow io cases -- skip " + "degrading vdev %llu", + (u_longlong_t) + zcp->zc_data.zc_pool_guid, + (u_longlong_t) + zcp->zc_data.zc_vdev_guid); + } else { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.vdev.slow_io"); + } + } } else if (fmd_nvl_class_match(hdl, nvl, ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) { /* @@ -924,6 +1030,8 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs) fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum); if (zcp->zc_data.zc_serd_io[0] != '\0') fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io); + if (zcp->zc_data.zc_serd_slow_io[0] != '\0') + fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_slow_io); if (zcp->zc_data.zc_has_remove_timer) fmd_timer_remove(hdl, zcp->zc_remove_timer); @@ -932,30 +1040,15 @@ zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs) fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); } -/* - * We use the fmd gc entry point to look for old cases that no longer apply. - * This allows us to keep our set of case data small in a long running system. - */ -static void -zfs_fm_gc(fmd_hdl_t *hdl) -{ - zfs_purge_cases(hdl); -} - static const fmd_hdl_ops_t fmd_ops = { zfs_fm_recv, /* fmdo_recv */ zfs_fm_timeout, /* fmdo_timeout */ zfs_fm_close, /* fmdo_close */ NULL, /* fmdo_stats */ - zfs_fm_gc, /* fmdo_gc */ + NULL, /* fmdo_gc */ }; static const fmd_prop_t fmd_props[] = { - { "checksum_N", FMD_TYPE_UINT32, "10" }, - { "checksum_T", FMD_TYPE_TIME, "10min" }, - { "io_N", FMD_TYPE_UINT32, "10" }, - { "io_T", FMD_TYPE_TIME, "10min" }, - { "remove_timeout", FMD_TYPE_TIME, "15sec" }, { NULL, 0, NULL } }; @@ -996,8 +1089,6 @@ _zfs_diagnosis_init(fmd_hdl_t *hdl) (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) / sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats); - - zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout"); } void diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c index a0e377a4a0c8..1ef5c631a438 100644 --- a/cmd/zed/agents/zfs_retire.c +++ b/cmd/zed/agents/zfs_retire.c @@ -523,6 +523,9 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, } else if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.checksum")) { degrade_device = B_TRUE; + } else if (fmd_nvl_class_match(hdl, fault, + "fault.fs.zfs.vdev.slow_io")) { + degrade_device = B_TRUE; } else if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.device")) { fault_device = B_FALSE; diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index f1262ed772de..a11b6d0b7fac 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -1083,6 +1083,22 @@ main(int argc, char **argv) libzfs_fini(g_zfs); return (1); } + + if (record.zi_nlanes) { + switch (io_type) { + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: + case ZIO_TYPES: + break; + default: + (void) fprintf(stderr, "I/O type for a delay " + "must be 'read' or 'write'\n"); + usage(); + libzfs_fini(g_zfs); + return (1); + } + } + if (!error) error = ENXIO; diff --git a/cmd/zpool/os/linux/zpool_vdev_os.c b/cmd/zpool/os/linux/zpool_vdev_os.c index 006a3a7d8e01..f194d28c55a9 100644 --- a/cmd/zpool/os/linux/zpool_vdev_os.c +++ b/cmd/zpool/os/linux/zpool_vdev_os.c @@ -438,7 +438,7 @@ static char *zpool_sysfs_gets(char *path) return (NULL); } - buf = calloc(sizeof (*buf), statbuf.st_size + 1); + buf = calloc(statbuf.st_size + 1, sizeof (*buf)); if (buf == NULL) { close(fd); return (NULL); @@ -458,7 +458,7 @@ static char *zpool_sysfs_gets(char *path) } /* Remove trailing newline */ - if (buf[count - 1] == '\n') + if (count > 0 && buf[count - 1] == '\n') buf[count - 1] = 0; close(fd); diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 69bf9649acf6..ed0b8d7a12d7 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2012 by Frederik Wessels. All rights reserved. * Copyright (c) 2012 by Cyril Plisko. All rights reserved. * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. @@ -131,6 +131,13 @@ static int zpool_do_help(int argc, char **argv); static zpool_compat_status_t zpool_do_load_compat( const char *, boolean_t *); +enum zpool_options { + ZPOOL_OPTION_POWER = 1024, + ZPOOL_OPTION_ALLOW_INUSE, + ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH, + ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH +}; + /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. @@ -347,7 +354,7 @@ get_usage(zpool_help_t idx) { switch (idx) { case HELP_ADD: - return (gettext("\tadd [-fgLnP] [-o property=value] " + return (gettext("\tadd [-afgLnP] [-o property=value] " " ...\n")); case HELP_ATTACH: return (gettext("\tattach [-fsw] [-o property=value] " @@ -413,7 +420,7 @@ get_usage(zpool_help_t idx) "[ ...]\n")); case HELP_STATUS: return (gettext("\tstatus [--power] [-c [script1,script2,...]] " - "[-igLpPstvxD] [-T d|u] [pool] ... \n" + "[-DegiLpPstvx] [-T d|u] [pool] ...\n" "\t [interval [count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" @@ -1009,8 +1016,9 @@ add_prop_list_default(const char *propname, const char *propval, } /* - * zpool add [-fgLnP] [-o property=value] ... + * zpool add [-afgLnP] [-o property=value] ... * + * -a Disable the ashift validation checks * -f Force addition of devices, even if they appear in use * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. @@ -1026,8 +1034,11 @@ add_prop_list_default(const char *propname, const char *propval, int zpool_do_add(int argc, char **argv) { - boolean_t force = B_FALSE; + boolean_t check_replication = B_TRUE; + boolean_t check_inuse = B_TRUE; boolean_t dryrun = B_FALSE; + boolean_t check_ashift = B_TRUE; + boolean_t force = B_FALSE; int name_flags = 0; int c; nvlist_t *nvroot; @@ -1038,8 +1049,18 @@ zpool_do_add(int argc, char **argv) nvlist_t *props = NULL; char *propval; + struct option long_options[] = { + {"allow-in-use", no_argument, NULL, ZPOOL_OPTION_ALLOW_INUSE}, + {"allow-replication-mismatch", no_argument, NULL, + ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH}, + {"allow-ashift-mismatch", no_argument, NULL, + ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH}, + {0, 0, 0, 0} + }; + /* check options */ - while ((c = getopt(argc, argv, "fgLno:P")) != -1) { + while ((c = getopt_long(argc, argv, "fgLno:P", long_options, NULL)) + != -1) { switch (c) { case 'f': force = B_TRUE; @@ -1069,6 +1090,15 @@ zpool_do_add(int argc, char **argv) case 'P': name_flags |= VDEV_NAME_PATH; break; + case ZPOOL_OPTION_ALLOW_INUSE: + check_inuse = B_FALSE; + break; + case ZPOOL_OPTION_ALLOW_REPLICATION_MISMATCH: + check_replication = B_FALSE; + break; + case ZPOOL_OPTION_ALLOW_ASHIFT_MISMATCH: + check_ashift = B_FALSE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -1089,6 +1119,19 @@ zpool_do_add(int argc, char **argv) usage(B_FALSE); } + if (force) { + if (!check_inuse || !check_replication || !check_ashift) { + (void) fprintf(stderr, gettext("'-f' option is not " + "allowed with '--allow-replication-mismatch', " + "'--allow-ashift-mismatch', or " + "'--allow-in-use'\n")); + usage(B_FALSE); + } + check_inuse = B_FALSE; + check_replication = B_FALSE; + check_ashift = B_FALSE; + } + poolname = argv[0]; argc--; @@ -1119,8 +1162,8 @@ zpool_do_add(int argc, char **argv) } /* pass off to make_root_vdev for processing */ - nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun, - argc, argv); + nvroot = make_root_vdev(zhp, props, !check_inuse, + check_replication, B_FALSE, dryrun, argc, argv); if (nvroot == NULL) { zpool_close(zhp); return (1); @@ -1224,7 +1267,7 @@ zpool_do_add(int argc, char **argv) ret = 0; } else { - ret = (zpool_add(zhp, nvroot) != 0); + ret = (zpool_add(zhp, nvroot, check_ashift) != 0); } nvlist_free(props); @@ -2246,7 +2289,6 @@ print_status_initialize(vdev_stat_t *vs, boolean_t verbose) !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; - struct tm zaction_ts; time_t t = vs->vs_initialize_action_time; int initialize_pct = 100; @@ -2256,8 +2298,8 @@ print_status_initialize(vdev_stat_t *vs, boolean_t verbose) 100 / (vs->vs_initialize_bytes_est + 1)); } - (void) localtime_r(&t, &zaction_ts); - (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); + (void) ctime_r(&t, tbuf); + tbuf[24] = 0; switch (vs->vs_initialize_state) { case VDEV_INITIALIZE_SUSPENDED: @@ -2297,7 +2339,6 @@ print_status_trim(vdev_stat_t *vs, boolean_t verbose) !vs->vs_scan_removing) { char zbuf[1024]; char tbuf[256]; - struct tm zaction_ts; time_t t = vs->vs_trim_action_time; int trim_pct = 100; @@ -2306,8 +2347,8 @@ print_status_trim(vdev_stat_t *vs, boolean_t verbose) 100 / (vs->vs_trim_bytes_est + 1)); } - (void) localtime_r(&t, &zaction_ts); - (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); + (void) ctime_r(&t, tbuf); + tbuf[24] = 0; switch (vs->vs_trim_state) { case VDEV_TRIM_SUSPENDED: @@ -2569,7 +2610,13 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, break; case VDEV_AUX_ERR_EXCEEDED: - (void) printf(gettext("too many errors")); + if (vs->vs_read_errors + vs->vs_write_errors + + vs->vs_checksum_errors == 0 && children == 0 && + vs->vs_slow_ios > 0) { + (void) printf(gettext("too many slow I/Os")); + } else { + (void) printf(gettext("too many errors")); + } break; case VDEV_AUX_IO_FAILURE: @@ -3396,10 +3443,10 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, ms_status = zpool_enable_datasets(zhp, mntopts, 0); if (ms_status == EZFS_SHAREFAILED) { (void) fprintf(stderr, gettext("Import was " - "successful, but unable to share some datasets")); + "successful, but unable to share some datasets\n")); } else if (ms_status == EZFS_MOUNTFAILED) { (void) fprintf(stderr, gettext("Import was " - "successful, but unable to mount some datasets")); + "successful, but unable to mount some datasets\n")); } } @@ -7064,7 +7111,6 @@ zpool_do_split(int argc, char **argv) return (ret); } -#define POWER_OPT 1024 /* * zpool online [--power] ... @@ -7082,7 +7128,7 @@ zpool_do_online(int argc, char **argv) int flags = 0; boolean_t is_power_on = B_FALSE; struct option long_options[] = { - {"power", no_argument, NULL, POWER_OPT}, + {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, {0, 0, 0, 0} }; @@ -7092,7 +7138,7 @@ zpool_do_online(int argc, char **argv) case 'e': flags |= ZFS_ONLINE_EXPAND; break; - case POWER_OPT: + case ZPOOL_OPTION_POWER: is_power_on = B_TRUE; break; case '?': @@ -7205,7 +7251,7 @@ zpool_do_offline(int argc, char **argv) boolean_t is_power_off = B_FALSE; struct option long_options[] = { - {"power", no_argument, NULL, POWER_OPT}, + {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, {0, 0, 0, 0} }; @@ -7218,7 +7264,7 @@ zpool_do_offline(int argc, char **argv) case 't': istmp = B_TRUE; break; - case POWER_OPT: + case ZPOOL_OPTION_POWER: is_power_off = B_TRUE; break; case '?': @@ -7318,7 +7364,7 @@ zpool_do_clear(int argc, char **argv) char *pool, *device; struct option long_options[] = { - {"power", no_argument, NULL, POWER_OPT}, + {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, {0, 0, 0, 0} }; @@ -7335,7 +7381,7 @@ zpool_do_clear(int argc, char **argv) case 'X': xtreme_rewind = B_TRUE; break; - case POWER_OPT: + case ZPOOL_OPTION_POWER: is_power_on = B_TRUE; break; case '?': @@ -8864,7 +8910,7 @@ status_callback(zpool_handle_t *zhp, void *data) printf_color(ANSI_BOLD, gettext("action: ")); printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices" " are connected, then reboot your system and\n\timport the " - "pool.\n")); + "pool or run 'zpool clear' to resume the pool.\n")); break; case ZPOOL_STATUS_IO_FAILURE_WAIT: @@ -9064,22 +9110,22 @@ status_callback(zpool_handle_t *zhp, void *data) } /* - * zpool status [-c [script1,script2,...]] [-igLpPstvx] [--power] [-T d|u] ... + * zpool status [-c [script1,script2,...]] [-DegiLpPstvx] [--power] [-T d|u] ... * [pool] [interval [count]] * * -c CMD For each vdev, run command CMD + * -D Display dedup status (undocumented) * -e Display only unhealthy vdevs - * -i Display vdev initialization status. * -g Display guid for individual vdev name. + * -i Display vdev initialization status. * -L Follow links when resolving vdev path name. * -p Display values in parsable (exact) format. * -P Display full path for vdev name. * -s Display slow IOs column. - * -v Display complete error logs - * -x Display only pools with potential problems - * -D Display dedup status (undocumented) * -t Display vdev TRIM status. * -T Display a timestamp in date(1) or Unix format + * -v Display complete error logs + * -x Display only pools with potential problems * --power Display vdev enclosure slot power status * * Describes the health status of all pools or some subset. @@ -9095,12 +9141,12 @@ zpool_do_status(int argc, char **argv) char *cmd = NULL; struct option long_options[] = { - {"power", no_argument, NULL, POWER_OPT}, + {"power", no_argument, NULL, ZPOOL_OPTION_POWER}, {0, 0, 0, 0} }; /* check options */ - while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options, + while ((c = getopt_long(argc, argv, "c:DegiLpPstT:vx", long_options, NULL)) != -1) { switch (c) { case 'c': @@ -9127,15 +9173,18 @@ zpool_do_status(int argc, char **argv) } cmd = optarg; break; + case 'D': + cb.cb_dedup_stats = B_TRUE; + break; case 'e': cb.cb_print_unhealthy = B_TRUE; break; - case 'i': - cb.cb_print_vdev_init = B_TRUE; - break; case 'g': cb.cb_name_flags |= VDEV_NAME_GUID; break; + case 'i': + cb.cb_print_vdev_init = B_TRUE; + break; case 'L': cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; break; @@ -9148,22 +9197,19 @@ zpool_do_status(int argc, char **argv) case 's': cb.cb_print_slow_ios = B_TRUE; break; - case 'v': - cb.cb_verbose = B_TRUE; - break; - case 'x': - cb.cb_explain = B_TRUE; - break; - case 'D': - cb.cb_dedup_stats = B_TRUE; - break; case 't': cb.cb_print_vdev_trim = B_TRUE; break; case 'T': get_timestamp_arg(*optarg); break; - case POWER_OPT: + case 'v': + cb.cb_verbose = B_TRUE; + break; + case 'x': + cb.cb_explain = B_TRUE; + break; + case ZPOOL_OPTION_POWER: cb.cb_print_power = B_TRUE; break; case '?': @@ -9202,7 +9248,6 @@ zpool_do_status(int argc, char **argv) if (cb.vcdl != NULL) free_vdev_cmd_data_list(cb.vcdl); - if (argc == 0 && cb.cb_count == 0) (void) fprintf(stderr, gettext("no pools available\n")); else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) @@ -10638,11 +10683,10 @@ zpool_do_get(int argc, char **argv) } } else { /* - * The first arg isn't a pool name, + * The first arg isn't the name of a valid pool. */ - fprintf(stderr, gettext("missing pool name.\n")); - fprintf(stderr, "\n"); - usage(B_FALSE); + fprintf(stderr, gettext("Cannot get properties of %s: " + "no such pool available.\n"), argv[0]); return (1); } diff --git a/cmd/ztest.c b/cmd/ztest.c index 8cfbdfe1c2e2..34744d12b592 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -3270,7 +3270,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) "log" : NULL, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); - error = spa_vdev_add(spa, nvroot); + error = spa_vdev_add(spa, nvroot, B_FALSE); fnvlist_free(nvroot); switch (error) { @@ -3332,7 +3332,7 @@ ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, class, ztest_opts.zo_raid_children, zs->zs_mirrors, 1); - error = spa_vdev_add(spa, nvroot); + error = spa_vdev_add(spa, nvroot, B_FALSE); fnvlist_free(nvroot); if (error == ENOSPC) @@ -3439,7 +3439,7 @@ ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) */ nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); - error = spa_vdev_add(spa, nvroot); + error = spa_vdev_add(spa, nvroot, B_FALSE); switch (error) { case 0: diff --git a/config/Substfiles.am b/config/Substfiles.am index 38e870b2f501..2459637abe6e 100644 --- a/config/Substfiles.am +++ b/config/Substfiles.am @@ -18,6 +18,7 @@ subst_sed_cmd = \ -e 's|@ASAN_ENABLED[@]|$(ASAN_ENABLED)|g' \ -e 's|@DEFAULT_INIT_NFS_SERVER[@]|$(DEFAULT_INIT_NFS_SERVER)|g' \ -e 's|@DEFAULT_INIT_SHELL[@]|$(DEFAULT_INIT_SHELL)|g' \ + -e 's|@IS_SYSV_RC[@]|$(IS_SYSV_RC)|g' \ -e 's|@LIBFETCH_DYNAMIC[@]|$(LIBFETCH_DYNAMIC)|g' \ -e 's|@LIBFETCH_SONAME[@]|$(LIBFETCH_SONAME)|g' \ -e 's|@PYTHON[@]|$(PYTHON)|g' \ @@ -43,4 +44,4 @@ SUBSTFILES = CLEANFILES += $(SUBSTFILES) dist_noinst_DATA += $(SUBSTFILES:=.in) -$(call SUBST,%,) +$(SUBSTFILES): $(call SUBST,%,) diff --git a/config/always-pyzfs.m4 b/config/always-pyzfs.m4 index 9b123b1b2db1..98c1cc230205 100644 --- a/config/always-pyzfs.m4 +++ b/config/always-pyzfs.m4 @@ -80,10 +80,11 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ [AC_MSG_ERROR("Python $PYTHON_VERSION unknown")] ) - AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION], [ - AS_IF([test "x$enable_pyzfs" = xyes], [ - AC_MSG_ERROR("Python $PYTHON_REQUIRED_VERSION development library is not installed") - ], [test "x$enable_pyzfs" != xno], [ + AS_IF([test "x$enable_pyzfs" = xyes], [ + AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION]) + ], [ + AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION], [true]) + AS_IF([test "x$ax_python_devel_found" = xno], [ enable_pyzfs=no ]) ]) diff --git a/config/ax_python_devel.m4 b/config/ax_python_devel.m4 index f6d4b01444d6..1f480db6d233 100644 --- a/config/ax_python_devel.m4 +++ b/config/ax_python_devel.m4 @@ -4,18 +4,13 @@ # # SYNOPSIS # -# AX_PYTHON_DEVEL([version], [action-if-not-found]) +# AX_PYTHON_DEVEL([version[,optional]]) # # DESCRIPTION # # Note: Defines as a precious variable "PYTHON_VERSION". Don't override it # in your configure.ac. # -# Note: this is a slightly modified version of the original AX_PYTHON_DEVEL -# macro which accepts an additional [action-if-not-found] argument. This -# allow to detect if Python development is available without aborting the -# configure phase with an hard error in case it is not. -# # This macro checks for Python and tries to get the include path to # 'Python.h'. It provides the $(PYTHON_CPPFLAGS) and $(PYTHON_LIBS) output # variables. It also exports $(PYTHON_EXTRA_LIBS) and @@ -28,6 +23,11 @@ # version number. Don't use "PYTHON_VERSION" for this: that environment # variable is declared as precious and thus reserved for the end-user. # +# By default this will fail if it does not detect a development version of +# python. If you want it to continue, set optional to true, like +# AX_PYTHON_DEVEL([], [true]). The ax_python_devel_found variable will be +# "no" if it fails. +# # This macro should work for all versions of Python >= 2.1.0. As an end # user, you can disable the check for the python version by setting the # PYTHON_NOVERSIONCHECK environment variable to something else than the @@ -45,7 +45,6 @@ # Copyright (c) 2009 Matteo Settenvini # Copyright (c) 2009 Horst Knorr # Copyright (c) 2013 Daniel Mullner -# Copyright (c) 2018 loli10K # # This program is free software: you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the @@ -73,10 +72,18 @@ # modified version of the Autoconf Macro, you may extend this special # exception to the GPL to apply to your modified version as well. -#serial 21 +#serial 36 AU_ALIAS([AC_PYTHON_DEVEL], [AX_PYTHON_DEVEL]) AC_DEFUN([AX_PYTHON_DEVEL],[ + # Get whether it's optional + if test -z "$2"; then + ax_python_devel_optional=false + else + ax_python_devel_optional=$2 + fi + ax_python_devel_found=yes + # # Allow the use of a (user set) custom python version # @@ -87,23 +94,26 @@ AC_DEFUN([AX_PYTHON_DEVEL],[ AC_PATH_PROG([PYTHON],[python[$PYTHON_VERSION]]) if test -z "$PYTHON"; then - m4_ifvaln([$2],[$2],[ - AC_MSG_ERROR([Cannot find python$PYTHON_VERSION in your system path]) - PYTHON_VERSION="" - ]) + AC_MSG_WARN([Cannot find python$PYTHON_VERSION in your system path]) + if ! $ax_python_devel_optional; then + AC_MSG_ERROR([Giving up, python development not available]) + fi + ax_python_devel_found=no + PYTHON_VERSION="" fi - # - # Check for a version of Python >= 2.1.0 - # - AC_MSG_CHECKING([for a version of Python >= '2.1.0']) - ac_supports_python_ver=`$PYTHON -c "import sys; \ + if test $ax_python_devel_found = yes; then + # + # Check for a version of Python >= 2.1.0 + # + AC_MSG_CHECKING([for a version of Python >= '2.1.0']) + ac_supports_python_ver=`$PYTHON -c "import sys; \ ver = sys.version.split ()[[0]]; \ print (ver >= '2.1.0')"` - if test "$ac_supports_python_ver" != "True"; then + if test "$ac_supports_python_ver" != "True"; then if test -z "$PYTHON_NOVERSIONCHECK"; then AC_MSG_RESULT([no]) - AC_MSG_FAILURE([ + AC_MSG_WARN([ This version of the AC@&t@_PYTHON_DEVEL macro doesn't work properly with versions of Python before 2.1.0. You may need to re-run configure, setting the @@ -112,20 +122,27 @@ PYTHON_EXTRA_LIBS and PYTHON_EXTRA_LDFLAGS by hand. Moreover, to disable this check, set PYTHON_NOVERSIONCHECK to something else than an empty string. ]) + if ! $ax_python_devel_optional; then + AC_MSG_FAILURE([Giving up]) + fi + ax_python_devel_found=no + PYTHON_VERSION="" else AC_MSG_RESULT([skip at user request]) fi - else + else AC_MSG_RESULT([yes]) + fi fi - # - # If the macro parameter ``version'' is set, honour it. - # A Python shim class, VPy, is used to implement correct version comparisons via - # string expressions, since e.g. a naive textual ">= 2.7.3" won't work for - # Python 2.7.10 (the ".1" being evaluated as less than ".3"). - # - if test -n "$1"; then + if test $ax_python_devel_found = yes; then + # + # If the macro parameter ``version'' is set, honour it. + # A Python shim class, VPy, is used to implement correct version comparisons via + # string expressions, since e.g. a naive textual ">= 2.7.3" won't work for + # Python 2.7.10 (the ".1" being evaluated as less than ".3"). + # + if test -n "$1"; then AC_MSG_CHECKING([for a version of Python $1]) cat << EOF > ax_python_devel_vpy.py class VPy: @@ -133,7 +150,7 @@ class VPy: return tuple(map(int, s.strip().replace("rc", ".").split("."))) def __init__(self): import sys - self.vpy = tuple(sys.version_info) + self.vpy = tuple(sys.version_info)[[:3]] def __eq__(self, s): return self.vpy == self.vtup(s) def __ne__(self, s): @@ -155,25 +172,69 @@ EOF AC_MSG_RESULT([yes]) else AC_MSG_RESULT([no]) - AC_MSG_ERROR([this package requires Python $1. + AC_MSG_WARN([this package requires Python $1. If you have it installed, but it isn't the default Python interpreter in your system path, please pass the PYTHON_VERSION variable to configure. See ``configure --help'' for reference. ]) + if ! $ax_python_devel_optional; then + AC_MSG_ERROR([Giving up]) + fi + ax_python_devel_found=no PYTHON_VERSION="" fi + fi fi - # - # Check for Python include path - # - # - AC_MSG_CHECKING([for Python include path]) - if test -z "$PYTHON_CPPFLAGS"; then - python_path=`$PYTHON -c "import sysconfig; \ - print (sysconfig.get_path('include'));"` - plat_python_path=`$PYTHON -c "import sysconfig; \ - print (sysconfig.get_path('platinclude'));"` + if test $ax_python_devel_found = yes; then + # + # Check if you have distutils, else fail + # + AC_MSG_CHECKING([for the sysconfig Python package]) + ac_sysconfig_result=`$PYTHON -c "import sysconfig" 2>&1` + if test $? -eq 0; then + AC_MSG_RESULT([yes]) + IMPORT_SYSCONFIG="import sysconfig" + else + AC_MSG_RESULT([no]) + + AC_MSG_CHECKING([for the distutils Python package]) + ac_sysconfig_result=`$PYTHON -c "from distutils import sysconfig" 2>&1` + if test $? -eq 0; then + AC_MSG_RESULT([yes]) + IMPORT_SYSCONFIG="from distutils import sysconfig" + else + AC_MSG_WARN([cannot import Python module "distutils". +Please check your Python installation. The error was: +$ac_sysconfig_result]) + if ! $ax_python_devel_optional; then + AC_MSG_ERROR([Giving up]) + fi + ax_python_devel_found=no + PYTHON_VERSION="" + fi + fi + fi + + if test $ax_python_devel_found = yes; then + # + # Check for Python include path + # + AC_MSG_CHECKING([for Python include path]) + if test -z "$PYTHON_CPPFLAGS"; then + if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then + # sysconfig module has different functions + python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_path ('include'));"` + plat_python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_path ('platinclude'));"` + else + # old distutils way + python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_python_inc ());"` + plat_python_path=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_python_inc (plat_specific=1));"` + fi if test -n "${python_path}"; then if test "${plat_python_path}" != "${python_path}"; then python_path="-I$python_path -I$plat_python_path" @@ -182,15 +243,15 @@ variable to configure. See ``configure --help'' for reference. fi fi PYTHON_CPPFLAGS=$python_path - fi - AC_MSG_RESULT([$PYTHON_CPPFLAGS]) - AC_SUBST([PYTHON_CPPFLAGS]) + fi + AC_MSG_RESULT([$PYTHON_CPPFLAGS]) + AC_SUBST([PYTHON_CPPFLAGS]) - # - # Check for Python library path - # - AC_MSG_CHECKING([for Python library path]) - if test -z "$PYTHON_LIBS"; then + # + # Check for Python library path + # + AC_MSG_CHECKING([for Python library path]) + if test -z "$PYTHON_LIBS"; then # (makes two attempts to ensure we've got a version number # from the interpreter) ac_python_version=`cat</dev/null || \ - $PYTHON -c "import sysconfig; \ - print (sysconfig.get_path('purelib'));"` - fi - AC_MSG_RESULT([$PYTHON_SITE_PKG]) - AC_SUBST([PYTHON_SITE_PKG]) + if test $ax_python_devel_found = yes; then + AC_MSG_RESULT([$PYTHON_LIBS]) + AC_SUBST([PYTHON_LIBS]) - # - # libraries which must be linked in when embedding - # - AC_MSG_CHECKING(python extra libraries) - if test -z "$PYTHON_EXTRA_LIBS"; then - PYTHON_EXTRA_LIBS=`$PYTHON -c "import sysconfig; \ + # + # Check for site packages + # + AC_MSG_CHECKING([for Python site-packages path]) + if test -z "$PYTHON_SITE_PKG"; then + if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then + PYTHON_SITE_PKG=`$PYTHON -c " +$IMPORT_SYSCONFIG; +if hasattr(sysconfig, 'get_default_scheme'): + scheme = sysconfig.get_default_scheme() +else: + scheme = sysconfig._get_default_scheme() +if scheme == 'posix_local': + # Debian's default scheme installs to /usr/local/ but we want to find headers in /usr/ + scheme = 'posix_prefix' +prefix = '$prefix' +if prefix == 'NONE': + prefix = '$ac_default_prefix' +sitedir = sysconfig.get_path('purelib', scheme, vars={'base': prefix}) +print(sitedir)"` + else + # distutils.sysconfig way + PYTHON_SITE_PKG=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_python_lib(0,0));"` + fi + fi + AC_MSG_RESULT([$PYTHON_SITE_PKG]) + AC_SUBST([PYTHON_SITE_PKG]) + + # + # Check for platform-specific site packages + # + AC_MSG_CHECKING([for Python platform specific site-packages path]) + if test -z "$PYTHON_PLATFORM_SITE_PKG"; then + if test "$IMPORT_SYSCONFIG" = "import sysconfig"; then + PYTHON_PLATFORM_SITE_PKG=`$PYTHON -c " +$IMPORT_SYSCONFIG; +if hasattr(sysconfig, 'get_default_scheme'): + scheme = sysconfig.get_default_scheme() +else: + scheme = sysconfig._get_default_scheme() +if scheme == 'posix_local': + # Debian's default scheme installs to /usr/local/ but we want to find headers in /usr/ + scheme = 'posix_prefix' +prefix = '$prefix' +if prefix == 'NONE': + prefix = '$ac_default_prefix' +sitedir = sysconfig.get_path('platlib', scheme, vars={'platbase': prefix}) +print(sitedir)"` + else + # distutils.sysconfig way + PYTHON_PLATFORM_SITE_PKG=`$PYTHON -c "$IMPORT_SYSCONFIG; \ + print (sysconfig.get_python_lib(1,0));"` + fi + fi + AC_MSG_RESULT([$PYTHON_PLATFORM_SITE_PKG]) + AC_SUBST([PYTHON_PLATFORM_SITE_PKG]) + + # + # libraries which must be linked in when embedding + # + AC_MSG_CHECKING(python extra libraries) + if test -z "$PYTHON_EXTRA_LIBS"; then + PYTHON_EXTRA_LIBS=`$PYTHON -c "$IMPORT_SYSCONFIG; \ conf = sysconfig.get_config_var; \ print (conf('LIBS') + ' ' + conf('SYSLIBS'))"` - fi - AC_MSG_RESULT([$PYTHON_EXTRA_LIBS]) - AC_SUBST(PYTHON_EXTRA_LIBS) + fi + AC_MSG_RESULT([$PYTHON_EXTRA_LIBS]) + AC_SUBST(PYTHON_EXTRA_LIBS) - # - # linking flags needed when embedding - # - AC_MSG_CHECKING(python extra linking flags) - if test -z "$PYTHON_EXTRA_LDFLAGS"; then - PYTHON_EXTRA_LDFLAGS=`$PYTHON -c "import sysconfig; \ + # + # linking flags needed when embedding + # + AC_MSG_CHECKING(python extra linking flags) + if test -z "$PYTHON_EXTRA_LDFLAGS"; then + PYTHON_EXTRA_LDFLAGS=`$PYTHON -c "$IMPORT_SYSCONFIG; \ conf = sysconfig.get_config_var; \ print (conf('LINKFORSHARED'))"` - fi - AC_MSG_RESULT([$PYTHON_EXTRA_LDFLAGS]) - AC_SUBST(PYTHON_EXTRA_LDFLAGS) + # Hack for macos, it sticks this in here. + PYTHON_EXTRA_LDFLAGS=`echo $PYTHON_EXTRA_LDFLAGS | sed 's/CoreFoundation.*$/CoreFoundation/'` + fi + AC_MSG_RESULT([$PYTHON_EXTRA_LDFLAGS]) + AC_SUBST(PYTHON_EXTRA_LDFLAGS) - # - # final check to see if everything compiles alright - # - AC_MSG_CHECKING([consistency of all components of python development environment]) - # save current global flags - ac_save_LIBS="$LIBS" - ac_save_LDFLAGS="$LDFLAGS" - ac_save_CPPFLAGS="$CPPFLAGS" - LIBS="$ac_save_LIBS $PYTHON_LIBS $PYTHON_EXTRA_LIBS $PYTHON_EXTRA_LIBS" - LDFLAGS="$ac_save_LDFLAGS $PYTHON_EXTRA_LDFLAGS" - CPPFLAGS="$ac_save_CPPFLAGS $PYTHON_CPPFLAGS" - AC_LANG_PUSH([C]) - AC_LINK_IFELSE([ + # + # final check to see if everything compiles alright + # + AC_MSG_CHECKING([consistency of all components of python development environment]) + # save current global flags + ac_save_LIBS="$LIBS" + ac_save_LDFLAGS="$LDFLAGS" + ac_save_CPPFLAGS="$CPPFLAGS" + LIBS="$ac_save_LIBS $PYTHON_LIBS $PYTHON_EXTRA_LIBS" + LDFLAGS="$ac_save_LDFLAGS $PYTHON_EXTRA_LDFLAGS" + CPPFLAGS="$ac_save_CPPFLAGS $PYTHON_CPPFLAGS" + AC_LANG_PUSH([C]) + AC_LINK_IFELSE([ AC_LANG_PROGRAM([[#include ]], [[Py_Initialize();]]) ],[pythonexists=yes],[pythonexists=no]) - AC_LANG_POP([C]) - # turn back to default flags - CPPFLAGS="$ac_save_CPPFLAGS" - LIBS="$ac_save_LIBS" - LDFLAGS="$ac_save_LDFLAGS" + AC_LANG_POP([C]) + # turn back to default flags + CPPFLAGS="$ac_save_CPPFLAGS" + LIBS="$ac_save_LIBS" + LDFLAGS="$ac_save_LDFLAGS" - AC_MSG_RESULT([$pythonexists]) + AC_MSG_RESULT([$pythonexists]) - if test ! "x$pythonexists" = "xyes"; then - m4_ifvaln([$2],[$2],[ - AC_MSG_FAILURE([ + if test ! "x$pythonexists" = "xyes"; then + AC_MSG_WARN([ Could not link test program to Python. Maybe the main Python library has been installed in some non-standard library path. If so, pass it to configure, via the LIBS environment variable. @@ -340,9 +453,13 @@ EOD` You probably have to install the development version of the Python package for your distribution. The exact name of this package varies among them. ============================================================================ - ]) - PYTHON_VERSION="" - ]) + ]) + if ! $ax_python_devel_optional; then + AC_MSG_ERROR([Giving up]) + fi + ax_python_devel_found=no + PYTHON_VERSION="" + fi fi # diff --git a/config/kernel-blk-queue.m4 b/config/kernel-blk-queue.m4 index bb5903b313eb..15dbe1c7dff0 100644 --- a/config/kernel-blk-queue.m4 +++ b/config/kernel-blk-queue.m4 @@ -377,6 +377,14 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_MQ], [ (void) blk_mq_alloc_tag_set(&tag_set); return BLK_STS_OK; ], []) + ZFS_LINUX_TEST_SRC([blk_mq_rq_hctx], [ + #include + #include + ], [ + struct request rq = {0}; + struct blk_mq_hw_ctx *hctx = NULL; + rq.mq_hctx = hctx; + ], []) ]) AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [ @@ -384,6 +392,13 @@ AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [ ZFS_LINUX_TEST_RESULT([blk_mq], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLK_MQ, 1, [block multiqueue is available]) + AC_MSG_CHECKING([whether block multiqueue hardware context is cached in struct request]) + ZFS_LINUX_TEST_RESULT([blk_mq_rq_hctx], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_MQ_RQ_HCTX, 1, [block multiqueue hardware context is cached in struct request]) + ], [ + AC_MSG_RESULT(no) + ]) ], [ AC_MSG_RESULT(no) ]) diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index dae7bef9ce0d..b6ce1e1cf083 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -54,6 +54,26 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH], [ ]) ]) +dnl # +dnl # 6.9.x API change +dnl # bdev_file_open_by_path() replaced bdev_open_by_path(), +dnl # and returns struct file* +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BDEV_FILE_OPEN_BY_PATH], [ + ZFS_LINUX_TEST_SRC([bdev_file_open_by_path], [ + #include + #include + ], [ + struct file *file __attribute__ ((unused)) = NULL; + const char *path = "path"; + fmode_t mode = 0; + void *holder = NULL; + struct blk_holder_ops h; + + file = bdev_file_open_by_path(path, mode, holder, &h); + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 3 args]) ZFS_LINUX_TEST_RESULT([blkdev_get_by_path], [ @@ -73,7 +93,16 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ [bdev_open_by_path() exists]) AC_MSG_RESULT(yes) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether bdev_file_open_by_path() exists]) + ZFS_LINUX_TEST_RESULT([bdev_file_open_by_path], [ + AC_DEFINE(HAVE_BDEV_FILE_OPEN_BY_PATH, 1, + [bdev_file_open_by_path() exists]) + AC_MSG_RESULT(yes) + ], [ + AC_MSG_RESULT(no) + ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + ]) ]) ]) ]) @@ -149,10 +178,19 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE], [ ]) ]) +dnl # +dnl # 6.9.x API change +dnl # +dnl # bdev_release() now private, but because bdev_file_open_by_path() returns +dnl # struct file*, we can just use fput(). So the blkdev_put test no longer +dnl # fails if not found. +dnl # + AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ AC_MSG_CHECKING([whether blkdev_put() exists]) ZFS_LINUX_TEST_RESULT([blkdev_put], [ AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_PUT, 1, [blkdev_put() exists]) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether blkdev_put() accepts void* as arg 2]) @@ -168,7 +206,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ AC_DEFINE(HAVE_BDEV_RELEASE, 1, [bdev_release() exists]) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_put()]) + AC_MSG_RESULT(no) ]) ]) ]) @@ -697,6 +735,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH + ZFS_AC_KERNEL_SRC_BDEV_FILE_OPEN_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_PUT ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE diff --git a/config/kernel-filemap.m4 b/config/kernel-filemap.m4 index 745928168f92..0b7da828d299 100644 --- a/config/kernel-filemap.m4 +++ b/config/kernel-filemap.m4 @@ -4,6 +4,7 @@ dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_FILEMAP], [ ZFS_LINUX_TEST_SRC([filemap_range_has_page], [ #include + #include ],[ struct address_space *mapping = NULL; loff_t lstart = 0; diff --git a/config/kernel-make-request-fn.m4 b/config/kernel-make-request-fn.m4 index 4d20dd45c4a1..9813ad2fb3f3 100644 --- a/config/kernel-make-request-fn.m4 +++ b/config/kernel-make-request-fn.m4 @@ -50,6 +50,14 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [ disk = blk_alloc_disk(NUMA_NO_NODE); ]) + ZFS_LINUX_TEST_SRC([blk_alloc_disk_2arg], [ + #include + ],[ + struct queue_limits *lim = NULL; + struct gendisk *disk __attribute__ ((unused)); + disk = blk_alloc_disk(lim, NUMA_NO_NODE); + ]) + ZFS_LINUX_TEST_SRC([blk_cleanup_disk], [ #include ],[ @@ -96,6 +104,31 @@ AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [ ], [ AC_MSG_RESULT(no) ]) + + dnl # + dnl # Linux 6.9 API Change: + dnl # blk_alloc_queue() takes a nullable queue_limits arg. + dnl # + AC_MSG_CHECKING([whether blk_alloc_disk() exists and takes 2 args]) + ZFS_LINUX_TEST_RESULT([blk_alloc_disk_2arg], [ + AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_BLK_ALLOC_DISK_2ARG], 1, [blk_alloc_disk() exists and takes 2 args]) + + dnl # + dnl # 5.20 API change, + dnl # Removed blk_cleanup_disk(), put_disk() should be used. + dnl # + AC_MSG_CHECKING([whether blk_cleanup_disk() exists]) + ZFS_LINUX_TEST_RESULT([blk_cleanup_disk], [ + AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_BLK_CLEANUP_DISK], 1, + [blk_cleanup_disk() exists]) + ], [ + AC_MSG_RESULT(no) + ]) + ], [ + AC_MSG_RESULT(no) + ]) ],[ AC_MSG_RESULT(no) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 5f36569fe25b..bb5a85d815d1 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -578,13 +578,15 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([default shell]) case "$VENDOR" in - gentoo) DEFAULT_INIT_SHELL="/sbin/openrc-run";; - alpine) DEFAULT_INIT_SHELL="/sbin/openrc-run";; - *) DEFAULT_INIT_SHELL="/bin/sh" ;; + gentoo|alpine) DEFAULT_INIT_SHELL=/sbin/openrc-run + IS_SYSV_RC=false ;; + *) DEFAULT_INIT_SHELL=/bin/sh + IS_SYSV_RC=true ;; esac AC_MSG_RESULT([$DEFAULT_INIT_SHELL]) AC_SUBST(DEFAULT_INIT_SHELL) + AC_SUBST(IS_SYSV_RC) AC_MSG_CHECKING([default nfs server init script]) AS_IF([test "$VENDOR" = "debian"], diff --git a/contrib/debian/control b/contrib/debian/control index 5b7874d11401..1f1a98a5ba99 100644 --- a/contrib/debian/control +++ b/contrib/debian/control @@ -10,6 +10,7 @@ Build-Depends: debhelper-compat (= 12), libelf-dev, libpam0g-dev, libssl-dev | libssl1.0-dev, + libtirpc-dev, libtool, libudev-dev, lsb-release, @@ -194,7 +195,7 @@ Depends: dkms (>> 2.1.1.2-5), file, libc6-dev | libc-dev, lsb-release, - python3-distutils | libpython3-stdlib (<< 3.6.4), + python3 (>> 3.12) | python3-distutils | libpython3-stdlib (<< 3.6.4), ${misc:Depends}, ${perl:Depends} Recommends: openzfs-zfs-zed, openzfs-zfsutils (>= ${source:Version}), ${linux:Recommends} diff --git a/etc/init.d/README.md b/etc/init.d/README.md index 2de05042ce63..da780fdc1222 100644 --- a/etc/init.d/README.md +++ b/etc/init.d/README.md @@ -7,11 +7,7 @@ DESCRIPTION They have been tested successfully on: - * Debian GNU/Linux Wheezy - * Debian GNU/Linux Jessie - * Ubuntu Trusty - * CentOS 6.0 - * CentOS 6.6 + * Debian GNU/Linux Bookworm * Gentoo SUPPORT diff --git a/etc/init.d/zfs-import.in b/etc/init.d/zfs-import.in index a9a0604f81ac..ff169eb96d86 100755 --- a/etc/init.d/zfs-import.in +++ b/etc/init.d/zfs-import.in @@ -307,7 +307,7 @@ do_start() # ---------------------------------------------------- -if [ ! -e /sbin/openrc-run ] +if @IS_SYSV_RC@ then case "$1" in start) diff --git a/etc/init.d/zfs-load-key.in b/etc/init.d/zfs-load-key.in index 53c7766b793a..27dfeeb0bcc5 100755 --- a/etc/init.d/zfs-load-key.in +++ b/etc/init.d/zfs-load-key.in @@ -104,7 +104,7 @@ do_stop() # ---------------------------------------------------- -if [ ! -e /sbin/openrc-run ] +if @IS_SYSV_RC@ then case "$1" in start) diff --git a/etc/init.d/zfs-mount.in b/etc/init.d/zfs-mount.in index a0825f19fcdd..6a3ca5f86908 100755 --- a/etc/init.d/zfs-mount.in +++ b/etc/init.d/zfs-mount.in @@ -114,7 +114,7 @@ do_stop() # ---------------------------------------------------- -if [ ! -e /sbin/openrc-run ] +if @IS_SYSV_RC@ then case "$1" in start) diff --git a/etc/init.d/zfs-share.in b/etc/init.d/zfs-share.in index 88978071cbf6..06c59c620b75 100755 --- a/etc/init.d/zfs-share.in +++ b/etc/init.d/zfs-share.in @@ -57,7 +57,8 @@ do_stop() # ---------------------------------------------------- -if [ ! -e /sbin/openrc-run ]; then +if @IS_SYSV_RC@ +then case "$1" in start) do_start diff --git a/etc/init.d/zfs-zed.in b/etc/init.d/zfs-zed.in index e9cf8867403c..3d40600cea5d 100755 --- a/etc/init.d/zfs-zed.in +++ b/etc/init.d/zfs-zed.in @@ -93,7 +93,8 @@ do_reload() # ---------------------------------------------------- -if [ ! -e /sbin/openrc-run ]; then +if @IS_SYSV_RC@ +then case "$1" in start) do_start diff --git a/include/libzfs.h b/include/libzfs.h index 770c5e1f201c..2823b8845827 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2022 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright Joyent, Inc. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2016, Intel Corporation. @@ -157,6 +157,8 @@ typedef enum zfs_error { EZFS_CKSUM, /* insufficient replicas */ EZFS_RESUME_EXISTS, /* Resume on existing dataset without force */ EZFS_SHAREFAILED, /* filesystem share failed */ + EZFS_RAIDZ_EXPAND_IN_PROGRESS, /* a raidz is currently expanding */ + EZFS_ASHIFT_MISMATCH, /* can't add vdevs with different ashifts */ EZFS_UNKNOWN } zfs_error_t; @@ -260,7 +262,7 @@ _LIBZFS_H boolean_t zpool_skip_pool(const char *); _LIBZFS_H int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, nvlist_t *, nvlist_t *); _LIBZFS_H int zpool_destroy(zpool_handle_t *, const char *); -_LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *); +_LIBZFS_H int zpool_add(zpool_handle_t *, nvlist_t *, boolean_t check_ashift); typedef struct splitflags { /* do not split, but return the config that would be split off */ diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am index 9819e534b7f6..d4103c2f062a 100644 --- a/include/os/freebsd/Makefile.am +++ b/include/os/freebsd/Makefile.am @@ -4,8 +4,6 @@ noinst_HEADERS = \ \ %D%/spl/acl/acl_common.h \ \ - %D%/spl/rpc/xdr.h \ - \ %D%/spl/sys/ia32/asm_linkage.h \ \ %D%/spl/sys/acl.h \ @@ -80,7 +78,9 @@ noinst_HEADERS = \ %D%/spl/sys/zmod.h \ %D%/spl/sys/zone.h \ \ + %D%/zfs/sys/arc_os.h \ %D%/zfs/sys/freebsd_crypto.h \ + %D%/zfs/sys/freebsd_event.h \ %D%/zfs/sys/vdev_os.h \ %D%/zfs/sys/zfs_bootenv_os.h \ %D%/zfs/sys/zfs_context_os.h \ diff --git a/include/os/freebsd/spl/rpc/xdr.h b/include/os/freebsd/spl/rpc/xdr.h deleted file mode 100644 index c98466e9d16a..000000000000 --- a/include/os/freebsd/spl/rpc/xdr.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Sun RPC is a product of Sun Microsystems, Inc. and is provided for - * unrestricted use provided that this legend is included on all tape - * media and as a part of the software program in whole or part. Users - * may copy or modify Sun RPC without charge, but are not authorized - * to license or distribute it to anyone else except as part of a product or - * program developed by the user. - * - * SUN RPC IS PROVIDED AS IS WITH NO WARRANTIES OF ANY KIND INCLUDING THE - * WARRANTIES OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE, OR ARISING FROM A COURSE OF DEALING, USAGE OR TRADE PRACTICE. - * - * Sun RPC is provided with no support and without any obligation on the - * part of Sun Microsystems, Inc. to assist in its use, correction, - * modification or enhancement. - * - * SUN MICROSYSTEMS, INC. SHALL HAVE NO LIABILITY WITH RESPECT TO THE - * INFRINGEMENT OF COPYRIGHTS, TRADE SECRETS OR ANY PATENTS BY SUN RPC - * OR ANY PART THEREOF. - * - * In no event will Sun Microsystems, Inc. be liable for any lost revenue - * or profits or other special, indirect and consequential damages, even if - * Sun has been advised of the possibility of such damages. - * - * Sun Microsystems, Inc. - * 2550 Garcia Avenue - * Mountain View, California 94043 - */ - -#ifndef _OPENSOLARIS_RPC_XDR_H_ -#define _OPENSOLARIS_RPC_XDR_H_ - -#include -#include_next - -#if !defined(_KERNEL) && !defined(_STANDALONE) - -#include - -/* - * Taken from sys/xdr/xdr_mem.c. - * - * FreeBSD's userland XDR doesn't implement control method (only the kernel), - * but OpenSolaris nvpair still depend on it, so we have to implement it here. - */ -static __inline bool_t -xdrmem_control(XDR *xdrs, int request, void *info) -{ - xdr_bytesrec *xptr; - - switch (request) { - case XDR_GET_BYTES_AVAIL: - xptr = (xdr_bytesrec *)info; - xptr->xc_is_last_record = TRUE; - xptr->xc_num_avail = xdrs->x_handy; - return (TRUE); - default: - assert(!"unexpected request"); - } - return (FALSE); -} - -#undef XDR_CONTROL -#define XDR_CONTROL(xdrs, req, op) \ - (((xdrs)->x_ops->x_control == NULL) ? \ - xdrmem_control((xdrs), (req), (op)) : \ - (*(xdrs)->x_ops->x_control)(xdrs, req, op)) - -#endif /* !_KERNEL && !_STANDALONE */ - -#endif /* !_OPENSOLARIS_RPC_XDR_H_ */ diff --git a/include/os/freebsd/spl/sys/debug.h b/include/os/freebsd/spl/sys/debug.h index c788b54d5ac4..31755ad8c2a4 100644 --- a/include/os/freebsd/spl/sys/debug.h +++ b/include/os/freebsd/spl/sys/debug.h @@ -91,8 +91,8 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ "failed (%d " #OP " %d)\n", \ - (boolean_t)(_verify3_left), \ - (boolean_t)(_verify3_right)); \ + (boolean_t)_verify3_left, \ + (boolean_t)_verify3_right); \ } while (0) #define VERIFY3S(LEFT, OP, RIGHT) do { \ @@ -102,8 +102,8 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ "failed (%lld " #OP " %lld)\n", \ - (long long) (_verify3_left), \ - (long long) (_verify3_right)); \ + (long long)_verify3_left, \ + (long long)_verify3_right); \ } while (0) #define VERIFY3U(LEFT, OP, RIGHT) do { \ @@ -113,8 +113,8 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ "failed (%llu " #OP " %llu)\n", \ - (unsigned long long) (_verify3_left), \ - (unsigned long long) (_verify3_right)); \ + (unsigned long long)_verify3_left, \ + (unsigned long long)_verify3_right); \ } while (0) #define VERIFY3P(LEFT, OP, RIGHT) do { \ @@ -123,19 +123,27 @@ spl_assert(const char *buf, const char *file, const char *func, int line) if (unlikely(!(_verify3_left OP _verify3_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ - "failed (%px " #OP " %px)\n", \ - (void *) (_verify3_left), \ - (void *) (_verify3_right)); \ + "failed (%p " #OP " %p)\n", \ + (void *)_verify3_left, \ + (void *)_verify3_right); \ } while (0) #define VERIFY0(RIGHT) do { \ - const int64_t _verify3_left = (int64_t)(0); \ - const int64_t _verify3_right = (int64_t)(RIGHT); \ - if (unlikely(!(_verify3_left == _verify3_right))) \ + const int64_t _verify0_right = (int64_t)(RIGHT); \ + if (unlikely(!(0 == _verify0_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY0(0 == " #RIGHT ") " \ + "VERIFY0(" #RIGHT ") " \ "failed (0 == %lld)\n", \ - (long long) (_verify3_right)); \ + (long long)_verify0_right); \ + } while (0) + +#define VERIFY0P(RIGHT) do { \ + const uintptr_t _verify0_right = (uintptr_t)(RIGHT); \ + if (unlikely(!(0 == _verify0_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY0P(" #RIGHT ") " \ + "failed (NULL == %p)\n", \ + (void *)_verify0_right); \ } while (0) #define VERIFY0P(RIGHT) do { \ diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am index 51c27132b4ef..332569efe361 100644 --- a/include/os/linux/Makefile.am +++ b/include/os/linux/Makefile.am @@ -47,6 +47,7 @@ kernel_sys_HEADERS = \ kernel_spl_rpcdir = $(kerneldir)/spl/rpc kernel_spl_rpc_HEADERS = \ + %D%/spl/rpc/types.h \ %D%/spl/rpc/xdr.h kernel_spl_sysdir = $(kerneldir)/spl/sys diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index f111e648ccf7..b0f398354e4f 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -563,9 +563,11 @@ static inline boolean_t bdev_discard_supported(struct block_device *bdev) { #if defined(HAVE_BDEV_MAX_DISCARD_SECTORS) - return (!!bdev_max_discard_sectors(bdev)); + return (bdev_max_discard_sectors(bdev) > 0 && + bdev_discard_granularity(bdev) > 0); #elif defined(HAVE_BLK_QUEUE_DISCARD) - return (!!blk_queue_discard(bdev_get_queue(bdev))); + return (blk_queue_discard(bdev_get_queue(bdev)) > 0 && + bdev_get_queue(bdev)->limits.discard_granularity > 0); #else #error "Unsupported kernel" #endif diff --git a/include/os/linux/spl/rpc/types.h b/include/os/linux/spl/rpc/types.h new file mode 100644 index 000000000000..5bbb4f2dec46 --- /dev/null +++ b/include/os/linux/spl/rpc/types.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2008 Sun Microsystems, Inc. + * Written by Ricardo Correia + * + * This file is part of the SPL, Solaris Porting Layer. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_RPC_TYPES_H +#define _SPL_RPC_TYPES_H + +#include + +/* Just enough to support rpc/xdr.h */ + +typedef int bool_t; + +#endif /* SPL_RPC_TYPES_H */ diff --git a/include/os/linux/spl/rpc/xdr.h b/include/os/linux/spl/rpc/xdr.h index 05aed7cb81ce..606566113e1c 100644 --- a/include/os/linux/spl/rpc/xdr.h +++ b/include/os/linux/spl/rpc/xdr.h @@ -24,8 +24,6 @@ #include #include -typedef int bool_t; - /* * XDR enums and types. */ diff --git a/include/os/linux/spl/sys/debug.h b/include/os/linux/spl/sys/debug.h index 82338d212edd..9743108253de 100644 --- a/include/os/linux/spl/sys/debug.h +++ b/include/os/linux/spl/sys/debug.h @@ -95,8 +95,8 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ "failed (%d " #OP " %d)\n", \ - (boolean_t)(_verify3_left), \ - (boolean_t)(_verify3_right)); \ + (boolean_t)_verify3_left, \ + (boolean_t)_verify3_right); \ } while (0) #define VERIFY3S(LEFT, OP, RIGHT) do { \ @@ -106,8 +106,8 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ "failed (%lld " #OP " %lld)\n", \ - (long long)(_verify3_left), \ - (long long)(_verify3_right)); \ + (long long)_verify3_left, \ + (long long)_verify3_right); \ } while (0) #define VERIFY3U(LEFT, OP, RIGHT) do { \ @@ -117,8 +117,8 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ "failed (%llu " #OP " %llu)\n", \ - (unsigned long long)(_verify3_left), \ - (unsigned long long)(_verify3_right)); \ + (unsigned long long)_verify3_left, \ + (unsigned long long)_verify3_right); \ } while (0) #define VERIFY3P(LEFT, OP, RIGHT) do { \ @@ -128,18 +128,26 @@ spl_assert(const char *buf, const char *file, const char *func, int line) spl_panic(__FILE__, __FUNCTION__, __LINE__, \ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ "failed (%px " #OP " %px)\n", \ - (void *) (_verify3_left), \ - (void *) (_verify3_right)); \ + (void *)_verify3_left, \ + (void *)_verify3_right); \ } while (0) #define VERIFY0(RIGHT) do { \ - const int64_t _verify3_left = (int64_t)(0); \ - const int64_t _verify3_right = (int64_t)(RIGHT); \ - if (unlikely(!(_verify3_left == _verify3_right))) \ + const int64_t _verify0_right = (int64_t)(RIGHT); \ + if (unlikely(!(0 == _verify0_right))) \ spl_panic(__FILE__, __FUNCTION__, __LINE__, \ - "VERIFY0(0 == " #RIGHT ") " \ + "VERIFY0(" #RIGHT ") " \ "failed (0 == %lld)\n", \ - (long long) (_verify3_right)); \ + (long long)_verify0_right); \ + } while (0) + +#define VERIFY0P(RIGHT) do { \ + const uintptr_t _verify0_right = (uintptr_t)(RIGHT); \ + if (unlikely(!(0 == _verify0_right))) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY0P(" #RIGHT ") " \ + "failed (NULL == %px)\n", \ + (void *)_verify0_right); \ } while (0) #define VERIFY0P(RIGHT) do { \ diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h index 6c1b4377a98a..6c0dbbefda7f 100644 --- a/include/os/linux/spl/sys/taskq.h +++ b/include/os/linux/spl/sys/taskq.h @@ -104,7 +104,7 @@ typedef struct taskq { /* list node for the cpu hotplug callback */ struct hlist_node tq_hp_cb_node; boolean_t tq_hp_support; - unsigned long lastshouldstop; /* when to purge dynamic */ + unsigned long lastspawnstop; /* when to purge dynamic */ } taskq_t; typedef struct taskq_ent { diff --git a/include/os/linux/zfs/sys/trace_zil.h b/include/os/linux/zfs/sys/trace_zil.h index afa1a274e43c..ae1caa3ac473 100644 --- a/include/os/linux/zfs/sys/trace_zil.h +++ b/include/os/linux/zfs/sys/trace_zil.h @@ -51,7 +51,9 @@ __field(uint64_t, zl_parse_lr_seq) \ __field(uint64_t, zl_parse_blk_count) \ __field(uint64_t, zl_parse_lr_count) \ - __field(uint64_t, zl_cur_used) \ + __field(uint64_t, zl_cur_size) \ + __field(uint64_t, zl_cur_left) \ + __field(uint64_t, zl_cur_max) \ __field(clock_t, zl_replay_time) \ __field(uint64_t, zl_replay_blks) @@ -72,7 +74,9 @@ __entry->zl_parse_lr_seq = zilog->zl_parse_lr_seq; \ __entry->zl_parse_blk_count = zilog->zl_parse_blk_count;\ __entry->zl_parse_lr_count = zilog->zl_parse_lr_count; \ - __entry->zl_cur_used = zilog->zl_cur_used; \ + __entry->zl_cur_size = zilog->zl_cur_size; \ + __entry->zl_cur_left = zilog->zl_cur_left; \ + __entry->zl_cur_max = zilog->zl_cur_max; \ __entry->zl_replay_time = zilog->zl_replay_time; \ __entry->zl_replay_blks = zilog->zl_replay_blks; @@ -82,7 +86,8 @@ "replay %u stop_sync %u logbias %u sync %u " \ "parse_error %u parse_blk_seq %llu parse_lr_seq %llu " \ "parse_blk_count %llu parse_lr_count %llu " \ - "cur_used %llu replay_time %lu replay_blks %llu }" + "cur_size %llu cur_left %llu cur_max %llu replay_time %lu " \ + "replay_blks %llu }" #define ZILOG_TP_PRINTK_ARGS \ __entry->zl_lr_seq, __entry->zl_commit_lr_seq, \ @@ -92,7 +97,8 @@ __entry->zl_stop_sync, __entry->zl_logbias, __entry->zl_sync, \ __entry->zl_parse_error, __entry->zl_parse_blk_seq, \ __entry->zl_parse_lr_seq, __entry->zl_parse_blk_count, \ - __entry->zl_parse_lr_count, __entry->zl_cur_used, \ + __entry->zl_parse_lr_count, __entry->zl_cur_size, \ + __entry->zl_cur_left, __entry->zl_cur_max, \ __entry->zl_replay_time, __entry->zl_replay_blks #define ITX_TP_STRUCT_ENTRY \ diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 06b4dc27dfea..26b329b53f05 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -739,8 +739,6 @@ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); void *dmu_buf_get_user(dmu_buf_t *db); objset_t *dmu_buf_get_objset(dmu_buf_t *db); -dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); -void dmu_buf_dnode_exit(dmu_buf_t *db); /* Block until any in-progress dmu buf user evictions complete. */ void dmu_buf_user_evict_wait(void); @@ -889,6 +887,9 @@ extern uint_t zfs_max_recordsize; */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); +void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, + uint64_t len, enum zio_priority pri); +void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index 9f6e0fdd601b..a9123e862af7 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -132,6 +132,7 @@ struct objset { zfs_logbias_op_t os_logbias; zfs_cache_type_t os_primary_cache; zfs_cache_type_t os_secondary_cache; + zfs_prefetch_type_t os_prefetch; zfs_sync_type_t os_sync; zfs_redundant_metadata_type_t os_redundant_metadata; uint64_t os_recordsize; diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index fb9e8649221e..c746600cd2d5 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -82,6 +82,8 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T "vdev_cksum_t" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N "vdev_io_n" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T "vdev_io_t" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N "vdev_slow_io_n" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T "vdev_slow_io_t" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index d8b6ff3fbd4e..1a924cfb3874 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -191,6 +191,7 @@ typedef enum { ZFS_PROP_REDACTED, ZFS_PROP_REDACT_SNAPS, ZFS_PROP_SNAPSHOTS_CHANGED, + ZFS_PROP_PREFETCH, ZFS_PROP_VOLTHREADING, ZFS_NUM_PROPS } zfs_prop_t; @@ -364,6 +365,9 @@ typedef enum { VDEV_PROP_CHECKSUM_T, VDEV_PROP_IO_N, VDEV_PROP_IO_T, + VDEV_PROP_RAIDZ_EXPANDING, + VDEV_PROP_SLOW_IO_N, + VDEV_PROP_SLOW_IO_T, VDEV_NUM_PROPS } vdev_prop_t; @@ -544,6 +548,12 @@ typedef enum zfs_key_location { ZFS_KEYLOCATION_LOCATIONS } zfs_keylocation_t; +typedef enum { + ZFS_PREFETCH_NONE = 0, + ZFS_PREFETCH_METADATA = 1, + ZFS_PREFETCH_ALL = 2 +} zfs_prefetch_type_t; + #define DEFAULT_PBKDF2_ITERATIONS 350000 #define MIN_PBKDF2_ITERATIONS 100000 @@ -1570,6 +1580,8 @@ typedef enum { ZFS_ERR_NOT_USER_NAMESPACE, ZFS_ERR_RESUME_EXISTS, ZFS_ERR_CRYPTO_NOTSUP, + ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, + ZFS_ERR_ASHIFT_MISMATCH, } zfs_errno_t; /* diff --git a/include/sys/spa.h b/include/sys/spa.h index 87ddbd90e170..6611141b9569 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2021 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -769,7 +769,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 -#define SPA_ASYNC_PROBE 0x04 +#define SPA_ASYNC_FAULT_VDEV 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 @@ -784,7 +784,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_DETACH_SPARE 0x4000 /* device manipulation */ -extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); +extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t ashift_check); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, int rebuild); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, @@ -966,6 +966,10 @@ extern int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t max_txg); extern int spa_import_progress_set_state(uint64_t pool_guid, spa_load_state_t spa_load_state); +extern void spa_import_progress_set_notes(spa_t *spa, + const char *fmt, ...) __printflike(2, 3); +extern void spa_import_progress_set_notes_nolog(spa_t *spa, + const char *fmt, ...) __printflike(2, 3); /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, const void *tag, @@ -1109,6 +1113,8 @@ extern uint32_t spa_get_hostid(spa_t *spa); extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); extern boolean_t spa_livelist_delete_check(spa_t *spa); +extern boolean_t spa_mmp_remote_host_activity(spa_t *spa); + extern spa_mode_t spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h index 03bcfa8f4dd1..13fce9c29e2d 100644 --- a/include/sys/uberblock_impl.h +++ b/include/sys/uberblock_impl.h @@ -50,20 +50,20 @@ extern "C" { #define MMP_SEQ_VALID_BIT 0x02 #define MMP_FAIL_INT_VALID_BIT 0x04 -#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \ - ubp->ub_mmp_magic == MMP_MAGIC) -#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_VALID(ubp) ((ubp)->ub_magic == UBERBLOCK_MAGIC && \ + (ubp)->ub_mmp_magic == MMP_MAGIC) +#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_INTERVAL_VALID_BIT)) -#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_SEQ_VALID_BIT)) -#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ +#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && ((ubp)->ub_mmp_config & \ MMP_FAIL_INT_VALID_BIT)) -#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \ +#define MMP_INTERVAL(ubp) (((ubp)->ub_mmp_config & 0x00000000FFFFFF00) \ >> 8) -#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \ +#define MMP_SEQ(ubp) (((ubp)->ub_mmp_config & 0x0000FFFF00000000) \ >> 32) -#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \ +#define MMP_FAIL_INT(ubp) (((ubp)->ub_mmp_config & 0xFFFF000000000000) \ >> 48) #define MMP_INTERVAL_SET(write) \ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 3f2312c23438..8c6ab316fa18 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2023, Klara Inc. */ #ifndef _SYS_VDEV_IMPL_H @@ -273,7 +274,7 @@ struct vdev { txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ - boolean_t vdev_probe_wanted; /* async probe wanted? */ + boolean_t vdev_fault_wanted; /* async faulted wanted? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ @@ -453,12 +454,14 @@ struct vdev { zfs_ratelimit_t vdev_checksum_rl; /* - * Checksum and IO thresholds for tuning ZED + * Vdev properties for tuning ZED or zfsd */ uint64_t vdev_checksum_n; uint64_t vdev_checksum_t; uint64_t vdev_io_n; uint64_t vdev_io_t; + uint64_t vdev_slow_io_n; + uint64_t vdev_slow_io_t; }; #define VDEV_PAD_SIZE (8 << 10) diff --git a/include/sys/zap.h b/include/sys/zap.h index 308a7c7284d7..96ddcc324b65 100644 --- a/include/sys/zap.h +++ b/include/sys/zap.h @@ -253,6 +253,9 @@ int zap_add_by_dnode(dnode_t *dn, const char *key, int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); /* * Set the attribute with the given name to the given value. If an @@ -267,6 +270,9 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* * Get the length (in integers) and the integer size of the specified @@ -292,6 +298,8 @@ int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx); int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx); +int zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, dmu_tx_t *tx); /* * Returns (in *count) the number of attributes in the specified zap diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index 74853f5faceb..2959aa9b2ca4 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -145,6 +145,7 @@ typedef struct zap { dmu_buf_user_t zap_dbu; objset_t *zap_objset; uint64_t zap_object; + dnode_t *zap_dnode; struct dmu_buf *zap_dbuf; krwlock_t zap_rwlock; boolean_t zap_ismicro; diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h index ebc67c2bf465..e54456d3472b 100644 --- a/include/sys/zap_leaf.h +++ b/include/sys/zap_leaf.h @@ -47,7 +47,7 @@ struct zap_stats; * entries - header space (2*chunksize) */ #define ZAP_LEAF_NUMCHUNKS_BS(bs) \ - (((1<<(bs)) - 2*ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \ + (((1U << (bs)) - 2 * ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \ ZAP_LEAF_CHUNKSIZE - 2) #define ZAP_LEAF_NUMCHUNKS(l) (ZAP_LEAF_NUMCHUNKS_BS(((l)->l_bs))) @@ -80,7 +80,7 @@ struct zap_stats; * chunks per entry (3). */ #define ZAP_LEAF_HASH_SHIFT_BS(bs) ((bs) - 5) -#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1 << ZAP_LEAF_HASH_SHIFT_BS(bs)) +#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1U << ZAP_LEAF_HASH_SHIFT_BS(bs)) #define ZAP_LEAF_HASH_SHIFT(l) (ZAP_LEAF_HASH_SHIFT_BS(((l)->l_bs))) #define ZAP_LEAF_HASH_NUMENTRIES(l) (ZAP_LEAF_HASH_NUMENTRIES_BS(((l)->l_bs))) @@ -132,7 +132,7 @@ typedef struct zap_leaf_phys { * with the ZAP_LEAF_CHUNK() macro. */ - uint16_t l_hash[1]; + uint16_t l_hash[]; } zap_leaf_phys_t; typedef union zap_leaf_chunk { @@ -163,7 +163,7 @@ typedef struct zap_leaf { dmu_buf_user_t l_dbu; krwlock_t l_rwlock; uint64_t l_blkid; /* 1<ul_debug || robust) { - uu_list_walk_t my_walk; + uu_list_walk_t *my_walk; void *e; - list_walk_init(&my_walk, lp, flags); + my_walk = uu_zalloc(sizeof (*my_walk)); + if (my_walk == NULL) + return (-1); + + list_walk_init(my_walk, lp, flags); while (status == UU_WALK_NEXT && - (e = uu_list_walk_next(&my_walk)) != NULL) + (e = uu_list_walk_next(my_walk)) != NULL) status = (*func)(e, private); - list_walk_fini(&my_walk); + list_walk_fini(my_walk); + + uu_free(my_walk); } else { if (!reverse) { for (np = lp->ul_null_node.uln_next; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index e40285cbb5ef..54f128f07cf3 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -1112,14 +1112,11 @@ - + - - - - + @@ -1870,8 +1867,9 @@ - - + + + @@ -2879,6 +2877,9 @@ + + + @@ -2891,6 +2892,10 @@ + + + + @@ -3011,6 +3016,24 @@ + + + + + + + + + + + + + + + + + + @@ -3026,6 +3049,7 @@ + @@ -3261,9 +3285,13 @@ + + + + @@ -3296,6 +3324,7 @@ + @@ -3400,6 +3429,10 @@ + + + + @@ -3483,8 +3516,9 @@ - + + @@ -3841,12 +3875,18 @@ + + + + + + @@ -4279,9 +4319,13 @@ - - - + + + + + + + @@ -4305,9 +4349,13 @@ - - - + + + + + + + @@ -5672,7 +5720,10 @@ - + + + + @@ -6243,6 +6294,7 @@ + @@ -6694,7 +6746,7 @@ - + @@ -6702,7 +6754,7 @@ - + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 7cdfab982e81..1a4996a54e8b 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -22,7 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2018 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. @@ -1724,7 +1724,7 @@ zpool_discard_checkpoint(zpool_handle_t *zhp) * necessary verification to ensure that the vdev specification is well-formed. */ int -zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) +zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot, boolean_t check_ashift) { zfs_cmd_t zc = {"\0"}; int ret; @@ -1756,6 +1756,7 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) zcmd_write_conf_nvlist(hdl, &zc, nvroot); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_flags = check_ashift; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) { switch (errno) { @@ -1899,7 +1900,8 @@ zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); if (localtime_r((time_t *)&rewindto, &t) != NULL && - strftime(timestr, 128, "%c", &t) != 0) { + ctime_r((time_t *)&rewindto, timestr) != NULL) { + timestr[24] = 0; if (dryrun) { (void) printf(dgettext(TEXT_DOMAIN, "Would be able to return %s " @@ -1961,7 +1963,8 @@ zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, "Recovery is possible, but will result in some data loss.\n")); if (localtime_r((time_t *)&rewindto, &t) != NULL && - strftime(timestr, 128, "%c", &t) != 0) { + ctime_r((time_t *)&rewindto, timestr) != NULL) { + timestr[24] = 0; (void) printf(dgettext(TEXT_DOMAIN, "\tReturning the pool to its state as of %s\n" "\tshould correct the problem. "), @@ -5227,6 +5230,8 @@ zpool_get_vdev_prop_value(nvlist_t *nvprop, vdev_prop_t prop, char *prop_name, case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: case VDEV_PROP_IO_T: + case VDEV_PROP_SLOW_IO_N: + case VDEV_PROP_SLOW_IO_T: if (intval == UINT64_MAX) { (void) strlcpy(buf, "-", len); } else { diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index e9bc78aa8d39..143aecb9459f 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -1053,6 +1053,7 @@ send_progress_thread(void *arg) } } pthread_cleanup_pop(B_TRUE); + return (NULL); } static boolean_t diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index fdd1975fa677..60e9262f6b71 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2020 Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2017 Datto Inc. * Copyright (c) 2020 The FreeBSD Foundation @@ -317,6 +317,9 @@ libzfs_error_description(libzfs_handle_t *hdl) case EZFS_RESUME_EXISTS: return (dgettext(TEXT_DOMAIN, "Resuming recv on existing " "dataset without force")); + case EZFS_ASHIFT_MISMATCH: + return (dgettext(TEXT_DOMAIN, "adding devices with " + "different physical sector sizes is not allowed")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: @@ -763,6 +766,9 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case ZFS_ERR_IOC_ARG_BADTYPE: zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); break; + case ZFS_ERR_ASHIFT_MISMATCH: + zfs_verror(hdl, EZFS_ASHIFT_MISMATCH, fmt, ap); + break; default: zfs_error_aux(hdl, "%s", strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); @@ -1699,7 +1705,9 @@ zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop, (prop == VDEV_PROP_CHECKSUM_N || prop == VDEV_PROP_CHECKSUM_T || prop == VDEV_PROP_IO_N || - prop == VDEV_PROP_IO_T)) { + prop == VDEV_PROP_IO_T || + prop == VDEV_PROP_SLOW_IO_N || + prop == VDEV_PROP_SLOW_IO_T)) { *ivalp = UINT64_MAX; } diff --git a/lib/libzfs/os/linux/libzfs_pool_os.c b/lib/libzfs/os/linux/libzfs_pool_os.c index 23985cb409ce..088ba4ea2335 100644 --- a/lib/libzfs/os/linux/libzfs_pool_os.c +++ b/lib/libzfs/os/linux/libzfs_pool_os.c @@ -279,6 +279,16 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name) vtoc->efi_parts[0].p_start = start_block; vtoc->efi_parts[0].p_size = slice_size; + if (vtoc->efi_parts[0].p_size * vtoc->efi_lbasize < SPA_MINDEVSIZE) { + (void) close(fd); + efi_free(vtoc); + + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "label '%s': partition would be less than the minimum " + "device size (64M)"), path); + return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); + } + /* * Why we use V_USR: V_BACKUP confuses users, and is considered * disposable by some EFI utilities (since EFI doesn't have a backup diff --git a/man/Makefile.am b/man/Makefile.am index 45156571eec3..43bb014ddd32 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -62,7 +62,6 @@ dist_man_MANS = \ %D%/man8/zfs-userspace.8 \ %D%/man8/zfs-wait.8 \ %D%/man8/zfs_ids_to_path.8 \ - %D%/man8/zfs_prepare_disk.8 \ %D%/man8/zgenhostid.8 \ %D%/man8/zinject.8 \ %D%/man8/zpool.8 \ @@ -115,7 +114,8 @@ endif nodist_man_MANS = \ %D%/man8/zed.8 \ - %D%/man8/zfs-mount-generator.8 + %D%/man8/zfs-mount-generator.8 \ + %D%/man8/zfs_prepare_disk.8 dist_noinst_DATA += $(dist_noinst_man_MANS) $(dist_man_MANS) diff --git a/man/man4/spl.4 b/man/man4/spl.4 index 414a92394858..5cc12764e18c 100644 --- a/man/man4/spl.4 +++ b/man/man4/spl.4 @@ -186,18 +186,8 @@ reading it could cause a lock-up if the list grow too large without limiting the output. "(truncated)" will be shown if the list is larger than the limit. . -.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 10000 Pq uint -(Linux-only) -How long a taskq has to have had no work before we tear it down. -Previously, we would tear down a dynamic taskq worker as soon -as we noticed it had no work, but it was observed that this led -to a lot of churn in tearing down things we then immediately -spawned anew. -In practice, it seems any nonzero value will remove the vast -majority of this churn, while the nontrivially larger value -was chosen to help filter out the little remaining churn on -a mostly idle system. -Setting this value to -.Sy 0 -will revert to the previous behavior. +.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 5000 Pq uint +Minimum idle threads exit interval for dynamic taskqs. +Smaller values allow idle threads exit more often and potentially be +respawned again on demand, causing more churn. .El diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 13a57900271e..e822d88a9981 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -245,12 +245,25 @@ For blocks that could be forced to be a gang block (due to .Sy metaslab_force_ganging ) , force this many of them to be gang blocks. . -.It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int +.It Sy brt_zap_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int +Controls prefetching BRT records for blocks which are going to be cloned. +. +.It Sy brt_zap_default_bs Ns = Ns Sy 12 Po 4 KiB Pc Pq int +Default BRT ZAP data block size as a power of 2. Note that changing this after +creating a BRT on the pool will not affect existing BRTs, only newly created +ones. +. +.It Sy brt_zap_default_ibs Ns = Ns Sy 12 Po 4 KiB Pc Pq int +Default BRT ZAP indirect block size as a power of 2. Note that changing this +after creating a BRT on the pool will not affect existing BRTs, only newly +created ones. +. +.It Sy ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP data block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. . -.It Sy zfs_ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int +.It Sy ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP indirect block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. @@ -2371,6 +2384,13 @@ The number of requests which can be handled concurrently is controlled by is ignored when running on a kernel that supports block multiqueue .Pq Li blk-mq . . +.It Sy zvol_num_taskqs Ns = Ns Sy 0 Pq uint +Number of zvol taskqs. +If +.Sy 0 +(the default) then scaling is done internally to prefer 6 threads per taskq. +This only applies on Linux. +. .It Sy zvol_threads Ns = Ns Sy 0 Pq uint The number of system wide threads to use for processing zvol block IOs. If diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7 index 6eebfa0060de..5ec37df179de 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -44,7 +44,7 @@ section, below. Every vdev has a set of properties that export statistics about the vdev as well as control various behaviors. Properties are not inherited from top-level vdevs, with the exception of -checksum_n, checksum_t, io_n, and io_t. +checksum_n, checksum_t, io_n, io_t, slow_io_n, and slow_io_t. .Pp The values of numeric properties can be specified using human-readable suffixes .Po for example, @@ -117,7 +117,7 @@ If this device is currently being removed from the pool .Pp The following native properties can be used to change the behavior of a vdev. .Bl -tag -width "allocating" -.It Sy checksum_n , checksum_t , io_n , io_t +.It Sy checksum_n , checksum_t , io_n , io_t , slow_io_n , slow_io_t Tune the fault management daemon by specifying checksum/io thresholds of errors in seconds, respectively. These properties can be set on leaf and top-level vdevs. @@ -127,7 +127,13 @@ If the property is only set on the top-level vdev, this value will be used. The value of these properties do not persist across vdev replacement. For this reason, it is advisable to set the property on the top-level vdev - not on the leaf vdev itself. -The default values are 10 errors in 600 seconds. +The default values for +.Sy OpenZFS on Linux +are 10 errors in 600 seconds. +For +.Sy OpenZFS on FreeBSD +defaults see +.Xr zfsd 8 . .It Sy comment A text comment up to 8192 characters long .It Sy bootsize diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7 index 069be5e3837a..3cd225e57b3a 100644 --- a/man/man7/zfsprops.7 +++ b/man/man7/zfsprops.7 @@ -1620,6 +1620,23 @@ If this property is set to then only metadata is cached. The default value is .Sy all . +.It Sy prefetch Ns = Ns Sy all Ns | Ns Sy none Ns | Ns Sy metadata +Controls what speculative prefetch does. +If this property is set to +.Sy all , +then both user data and metadata are prefetched. +If this property is set to +.Sy none , +then neither user data nor metadata are prefetched. +If this property is set to +.Sy metadata , +then only metadata are prefetched. +The default value is +.Sy all . +.Pp +Please note that the module parameter zfs_disable_prefetch=1 can +be used to totally disable speculative prefetch, bypassing anything +this property does. .It Sy setuid Ns = Ns Sy on Ns | Ns Sy off Controls whether the setuid bit is respected for the file system. The default value is diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7 index 98f3ee7cd660..18dfca6dc8ac 100644 --- a/man/man7/zpoolconcepts.7 +++ b/man/man7/zpoolconcepts.7 @@ -260,8 +260,8 @@ sufficient replicas exist to continue functioning. The underlying conditions are as follows: .Bl -bullet -compact .It -The number of checksum errors exceeds acceptable levels and the device is -degraded as an indication that something may be wrong. +The number of checksum errors or slow I/Os exceeds acceptable levels and the +device is degraded as an indication that something may be wrong. ZFS continues to use the device as necessary. .It The number of I/O errors exceeds acceptable levels. diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index 4f0bbae81212..b692f12130a8 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -69,6 +69,7 @@ Force a vdev into the DEGRADED or FAULTED state. .Nm zinject .Fl d Ar vdev .Fl D Ar latency : Ns Ar lanes +.Op Fl T Ar read|write .Ar pool .Xc Add an artificial delay to I/O requests on a particular diff --git a/man/man8/zpool-add.8 b/man/man8/zpool-add.8 index 8ccdcccc7b06..60b35f1a511a 100644 --- a/man/man8/zpool-add.8 +++ b/man/man8/zpool-add.8 @@ -24,8 +24,9 @@ .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" Copyright (c) 2024 by Delphix. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd March 8, 2024 .Dt ZPOOL-ADD 8 .Os . @@ -36,6 +37,7 @@ .Nm zpool .Cm add .Op Fl fgLnP +.Op Fl -allow-in-use -allow-replication-mismatch -allow-ashift-mismatch .Oo Fl o Ar property Ns = Ns Ar value Oc .Ar pool vdev Ns … . @@ -56,7 +58,8 @@ subcommand. .It Fl f Forces use of .Ar vdev Ns s , -even if they appear in use or specify a conflicting replication level. +even if they appear in use, have conflicting ashift values, or specify +a conflicting replication level. Not all devices can be overridden in this manner. .It Fl g Display @@ -91,6 +94,17 @@ See the manual page for a list of valid properties that can be set. The only property supported at the moment is .Sy ashift . +.It Fl -allow-ashift-mismatch +Disable the ashift validation which allows mismatched ashift values in the +pool. +Adding top-level +.Ar vdev Ns s +with different sector sizes will prohibit future device removal operations, see +.Xr zpool-remove 8 . +.It Fl -allow-in-use +Allow vdevs to be added even if they might be in use in another pool. +.It Fl -allow-replication-mismatch +Allow vdevs with conflicting replication levels to be added to the pool. .El . .Sh EXAMPLES diff --git a/man/man8/zpool-clear.8 b/man/man8/zpool-clear.8 index c61ecae483ac..3e448be87fc2 100644 --- a/man/man8/zpool-clear.8 +++ b/man/man8/zpool-clear.8 @@ -50,9 +50,10 @@ If the pool was suspended it will be brought back online provided the devices can be accessed. Pools with .Sy multihost -enabled which have been suspended cannot be resumed. -While the pool was suspended, it may have been imported on -another host, and resuming I/O could result in pool damage. +enabled which have been suspended cannot be resumed when there is evidence +that the pool was imported by another host. +The same checks performed during an import will be applied before the clear +proceeds. .Bl -tag -width Ds .It Fl -power Power on the devices's slot in the storage enclosure and wait for the device diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 24ad6e643cae..bbe7a45aa0c6 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -36,7 +36,7 @@ .Sh SYNOPSIS .Nm zpool .Cm status -.Op Fl DeigLpPstvx +.Op Fl DegiLpPstvx .Op Fl T Sy u Ns | Ns Sy d .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … .Oo Ar pool Oc Ns … @@ -69,14 +69,20 @@ See the option of .Nm zpool Cm iostat for complete details. +.It Fl D +Display a histogram of deduplication statistics, showing the allocated +.Pq physically present on disk +and referenced +.Pq logically referenced in the pool +block counts and sizes by reference count. .It Fl e Only show unhealthy vdevs (not-ONLINE or with errors). -.It Fl i -Display vdev initialization status. .It Fl g Display vdev GUIDs instead of the normal device names These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. +.It Fl i +Display vdev initialization status. .It Fl L Display real paths for vdevs resolving all symbolic links. This can be used to look up the current block device name regardless of the @@ -90,12 +96,6 @@ the path. This can be used in conjunction with the .Fl L flag. -.It Fl D -Display a histogram of deduplication statistics, showing the allocated -.Pq physically present on disk -and referenced -.Pq logically referenced in the pool -block counts and sizes by reference count. .It Fl s Display the number of leaf vdev slow I/O operations. This is the number of I/O operations that didn't complete in diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S index dc2719d142db..e66bb4bc7f26 100644 --- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S @@ -32,6 +32,14 @@ */ #if defined(__aarch64__) + +/* make gcc <= 9 happy */ +#if LD_VERSION >= 233010000 +#define CFI_NEGATE_RA_STATE .cfi_negate_ra_state +#else +#define CFI_NEGATE_RA_STATE +#endif + .text .section .note.gnu.property,"a",@note .p2align 3 @@ -51,7 +59,7 @@ zfs_blake3_compress_in_place_sse2: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE sub sp, sp, #96 stp x29, x30, [sp, #64] add x29, sp, #64 @@ -555,7 +563,7 @@ compress_pre: zfs_blake3_compress_xof_sse2: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE sub sp, sp, #96 stp x29, x30, [sp, #64] add x29, sp, #64 @@ -608,7 +616,7 @@ zfs_blake3_compress_xof_sse2: zfs_blake3_hash_many_sse2: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE stp d15, d14, [sp, #-160]! stp d13, d12, [sp, #16] stp d11, d10, [sp, #32] diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S index c4c2dfc5bcde..b9fb28dfcf03 100644 --- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S @@ -32,6 +32,14 @@ */ #if defined(__aarch64__) + +/* make gcc <= 9 happy */ +#if LD_VERSION >= 233010000 +#define CFI_NEGATE_RA_STATE .cfi_negate_ra_state +#else +#define CFI_NEGATE_RA_STATE +#endif + .text .section .note.gnu.property,"a",@note .p2align 3 @@ -51,7 +59,7 @@ zfs_blake3_compress_in_place_sse41: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE sub sp, sp, #96 stp x29, x30, [sp, #64] add x29, sp, #64 @@ -565,7 +573,7 @@ compress_pre: zfs_blake3_compress_xof_sse41: .cfi_startproc hint #25 - .cfi_negate_ra_state + CFI_NEGATE_RA_STATE sub sp, sp, #96 stp x29, x30, [sp, #64] add x29, sp, #64 diff --git a/module/icp/asm-aarch64/sha2/sha256-armv8.S b/module/icp/asm-aarch64/sha2/sha256-armv8.S index 7ae486e4e229..4dcdd3b65d0b 100644 --- a/module/icp/asm-aarch64/sha2/sha256-armv8.S +++ b/module/icp/asm-aarch64/sha2/sha256-armv8.S @@ -21,6 +21,16 @@ #if defined(__aarch64__) + .section .note.gnu.property,"a",@note + .p2align 3 + .word 4 + .word 16 + .word 5 + .asciz "GNU" + .word 3221225472 + .word 4 + .word 3 + .word 0 .text .align 6 diff --git a/module/icp/asm-aarch64/sha2/sha512-armv8.S b/module/icp/asm-aarch64/sha2/sha512-armv8.S index 9c61eeee4d7b..f6c8f7742912 100644 --- a/module/icp/asm-aarch64/sha2/sha512-armv8.S +++ b/module/icp/asm-aarch64/sha2/sha512-armv8.S @@ -21,6 +21,16 @@ #if defined(__aarch64__) + .section .note.gnu.property,"a",@note + .p2align 3 + .word 4 + .word 16 + .word 5 + .asciz "GNU" + .word 3221225472 + .word 4 + .word 3 + .word 0 .text .align 6 diff --git a/module/nvpair/nvpair.c b/module/nvpair/nvpair.c index d9449e47e87a..887f7d32df4a 100644 --- a/module/nvpair/nvpair.c +++ b/module/nvpair/nvpair.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 05f28033be6a..1ba25bce6196 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -1869,10 +1869,8 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, ASSERT3S(outcount, <=, bufsize); - /* Prefetch znode */ if (prefetch) - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); /* * Move to the next entry, fill in the previous offset. diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index b6edac434dea..7c418f26fc14 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -1273,7 +1273,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) ASSERT(MUTEX_HELD(&zv->zv_state_lock)); /* Move to a new hashtable entry. */ - zv->zv_hash = zvol_name_hash(zv->zv_name); + zv->zv_hash = zvol_name_hash(newname); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index d18f935b167c..0e44aa1fcd38 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -36,12 +36,12 @@ static int spl_taskq_thread_bind = 0; module_param(spl_taskq_thread_bind, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); -static uint_t spl_taskq_thread_timeout_ms = 10000; +static uint_t spl_taskq_thread_timeout_ms = 5000; /* BEGIN CSTYLED */ module_param(spl_taskq_thread_timeout_ms, uint, 0644); /* END CSTYLED */ MODULE_PARM_DESC(spl_taskq_thread_timeout_ms, - "Time to require a dynamic thread be idle before it gets cleaned up"); + "Minimum idle threads exit interval for dynamic taskqs"); static int spl_taskq_thread_dynamic = 1; module_param(spl_taskq_thread_dynamic, int, 0444); @@ -594,8 +594,7 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) ASSERT(tq->tq_nactive <= tq->tq_nthreads); if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { /* Dynamic taskq may be able to spawn another thread */ - if (!(tq->tq_flags & TASKQ_DYNAMIC) || - taskq_thread_spawn(tq) == 0) + if (taskq_thread_spawn(tq) == 0) goto out; } @@ -629,11 +628,11 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) spin_unlock(&t->tqent_lock); wake_up(&tq->tq_work_waitq); -out: + /* Spawn additional taskq threads if required. */ if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); - +out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); return (rc); } @@ -676,10 +675,11 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); spin_unlock(&t->tqent_lock); -out: + /* Spawn additional taskq threads if required. */ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); +out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); return (rc); } @@ -704,9 +704,8 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { /* Dynamic taskq may be able to spawn another thread */ - if (!(tq->tq_flags & TASKQ_DYNAMIC) || - taskq_thread_spawn(tq) == 0) - goto out2; + if (taskq_thread_spawn(tq) == 0) + goto out; flags |= TQ_FRONT; } @@ -742,11 +741,11 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, spin_unlock(&t->tqent_lock); wake_up(&tq->tq_work_waitq); -out: + /* Spawn additional taskq threads if required. */ if (tq->tq_nactive == tq->tq_nthreads) (void) taskq_thread_spawn(tq); -out2: +out: spin_unlock_irqrestore(&tq->tq_lock, irqflags); } EXPORT_SYMBOL(taskq_dispatch_ent); @@ -825,6 +824,7 @@ taskq_thread_spawn(taskq_t *tq) if (!(tq->tq_flags & TASKQ_DYNAMIC)) return (0); + tq->lastspawnstop = jiffies; if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) && (tq->tq_flags & TASKQ_ACTIVE)) { spawning = (++tq->tq_nspawn); @@ -836,9 +836,9 @@ taskq_thread_spawn(taskq_t *tq) } /* - * Threads in a dynamic taskq should only exit once it has been completely - * drained and no other threads are actively servicing tasks. This prevents - * threads from being created and destroyed more than is required. + * Threads in a dynamic taskq may exit once there is no more work to do. + * To prevent threads from being created and destroyed too often limit + * the exit rate to one per spl_taskq_thread_timeout_ms. * * The first thread is the thread list is treated as the primary thread. * There is nothing special about the primary thread but in order to avoid @@ -847,44 +847,22 @@ taskq_thread_spawn(taskq_t *tq) static int taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt) { - if (!(tq->tq_flags & TASKQ_DYNAMIC)) + ASSERT(!taskq_next_ent(tq)); + if (!(tq->tq_flags & TASKQ_DYNAMIC) || !spl_taskq_thread_dynamic) return (0); - + if (!(tq->tq_flags & TASKQ_ACTIVE)) + return (1); if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t, tqt_thread_list) == tqt) return (0); - - int no_work = - ((tq->tq_nspawn == 0) && /* No threads are being spawned */ - (tq->tq_nactive == 0) && /* No threads are handling tasks */ - (tq->tq_nthreads > 1) && /* More than 1 thread is running */ - (!taskq_next_ent(tq)) && /* There are no pending tasks */ - (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */ - - /* - * If we would have said stop before, let's instead wait a bit, maybe - * we'll see more work come our way soon... - */ - if (no_work) { - /* if it's 0, we want the old behavior. */ - /* if the taskq is being torn down, we also want to go away. */ - if (spl_taskq_thread_timeout_ms == 0 || - !(tq->tq_flags & TASKQ_ACTIVE)) - return (1); - unsigned long lasttime = tq->lastshouldstop; - if (lasttime > 0) { - if (time_after(jiffies, lasttime + - msecs_to_jiffies(spl_taskq_thread_timeout_ms))) - return (1); - else - return (0); - } else { - tq->lastshouldstop = jiffies; - } - } else { - tq->lastshouldstop = 0; - } - return (0); + ASSERT3U(tq->tq_nthreads, >, 1); + if (tq->tq_nspawn != 0) + return (0); + if (time_before(jiffies, tq->lastspawnstop + + msecs_to_jiffies(spl_taskq_thread_timeout_ms))) + return (0); + tq->lastspawnstop = jiffies; + return (1); } static int @@ -935,10 +913,8 @@ taskq_thread(void *args) if (list_empty(&tq->tq_pend_list) && list_empty(&tq->tq_prio_list)) { - if (taskq_thread_should_stop(tq, tqt)) { - wake_up_all(&tq->tq_wait_waitq); + if (taskq_thread_should_stop(tq, tqt)) break; - } add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); spin_unlock_irqrestore(&tq->tq_lock, flags); @@ -1013,9 +989,6 @@ taskq_thread(void *args) tqt->tqt_id = TASKQID_INVALID; tqt->tqt_flags = 0; wake_up_all(&tq->tq_wait_waitq); - } else { - if (taskq_thread_should_stop(tq, tqt)) - break; } set_current_state(TASK_INTERRUPTIBLE); @@ -1122,7 +1095,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri, tq->tq_flags = (flags | TASKQ_ACTIVE); tq->tq_next_id = TASKQID_INITIAL; tq->tq_lowest_id = TASKQID_INITIAL; - tq->lastshouldstop = 0; + tq->lastspawnstop = jiffies; INIT_LIST_HEAD(&tq->tq_free_list); INIT_LIST_HEAD(&tq->tq_pend_list); INIT_LIST_HEAD(&tq->tq_prio_list); diff --git a/module/os/linux/spl/spl-xdr.c b/module/os/linux/spl/spl-xdr.c index 6b77524181db..e1773da5d173 100644 --- a/module/os/linux/spl/spl-xdr.c +++ b/module/os/linux/spl/spl-xdr.c @@ -25,6 +25,7 @@ #include #include #include +#include #include /* diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 071bcdf54dd2..943e534ef5b0 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -45,15 +45,25 @@ /* * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying * block_device. Since it carries the block_device inside, its convenient to - * just use the handle as a proxy. For pre-6.8, we just emulate this with - * a cast, since we don't need any of the other fields inside the handle. + * just use the handle as a proxy. + * + * Linux 6.9.x uses a file for the same purpose. + * + * For pre-6.8, we just emulate this with a cast, since we don't need any of + * the other fields inside the handle. */ -#ifdef HAVE_BDEV_OPEN_BY_PATH +#if defined(HAVE_BDEV_OPEN_BY_PATH) typedef struct bdev_handle zfs_bdev_handle_t; #define BDH_BDEV(bdh) ((bdh)->bdev) #define BDH_IS_ERR(bdh) (IS_ERR(bdh)) #define BDH_PTR_ERR(bdh) (PTR_ERR(bdh)) #define BDH_ERR_PTR(err) (ERR_PTR(err)) +#elif defined(HAVE_BDEV_FILE_OPEN_BY_PATH) +typedef struct file zfs_bdev_handle_t; +#define BDH_BDEV(bdh) (file_bdev(bdh)) +#define BDH_IS_ERR(bdh) (IS_ERR(bdh)) +#define BDH_PTR_ERR(bdh) (PTR_ERR(bdh)) +#define BDH_ERR_PTR(err) (ERR_PTR(err)) #else typedef void zfs_bdev_handle_t; #define BDH_BDEV(bdh) ((struct block_device *)bdh) @@ -97,38 +107,41 @@ static uint_t zfs_vdev_open_timeout_ms = 1000; static unsigned int zfs_vdev_failfast_mask = 1; +/* + * Convert SPA mode flags into bdev open mode flags. + */ #ifdef HAVE_BLK_MODE_T -static blk_mode_t +typedef blk_mode_t vdev_bdev_mode_t; +#define VDEV_BDEV_MODE_READ BLK_OPEN_READ +#define VDEV_BDEV_MODE_WRITE BLK_OPEN_WRITE +#define VDEV_BDEV_MODE_EXCL BLK_OPEN_EXCL +#define VDEV_BDEV_MODE_MASK (BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL) #else -static fmode_t +typedef fmode_t vdev_bdev_mode_t; +#define VDEV_BDEV_MODE_READ FMODE_READ +#define VDEV_BDEV_MODE_WRITE FMODE_WRITE +#define VDEV_BDEV_MODE_EXCL FMODE_EXCL +#define VDEV_BDEV_MODE_MASK (FMODE_READ|FMODE_WRITE|FMODE_EXCL) #endif -vdev_bdev_mode(spa_mode_t spa_mode, boolean_t exclusive) -{ -#ifdef HAVE_BLK_MODE_T - blk_mode_t mode = 0; - - if (spa_mode & SPA_MODE_READ) - mode |= BLK_OPEN_READ; - if (spa_mode & SPA_MODE_WRITE) - mode |= BLK_OPEN_WRITE; +static vdev_bdev_mode_t +vdev_bdev_mode(spa_mode_t smode) +{ + ASSERT3U(smode, !=, SPA_MODE_UNINIT); + ASSERT0(smode & ~(SPA_MODE_READ|SPA_MODE_WRITE)); - if (exclusive) - mode |= BLK_OPEN_EXCL; -#else - fmode_t mode = 0; + vdev_bdev_mode_t bmode = VDEV_BDEV_MODE_EXCL; - if (spa_mode & SPA_MODE_READ) - mode |= FMODE_READ; + if (smode & SPA_MODE_READ) + bmode |= VDEV_BDEV_MODE_READ; - if (spa_mode & SPA_MODE_WRITE) - mode |= FMODE_WRITE; + if (smode & SPA_MODE_WRITE) + bmode |= VDEV_BDEV_MODE_WRITE; - if (exclusive) - mode |= FMODE_EXCL; -#endif + ASSERT(bmode & VDEV_BDEV_MODE_MASK); + ASSERT0(bmode & ~VDEV_BDEV_MODE_MASK); - return (mode); + return (bmode); } /* @@ -235,30 +248,32 @@ vdev_disk_kobj_evt_post(vdev_t *v) } static zfs_bdev_handle_t * -vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder) +vdev_blkdev_get_by_path(const char *path, spa_mode_t smode, void *holder) { -#if defined(HAVE_BDEV_OPEN_BY_PATH) - return (bdev_open_by_path(path, - vdev_bdev_mode(mode, B_TRUE), holder, NULL)); + vdev_bdev_mode_t bmode = vdev_bdev_mode(smode); + +#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) + return (bdev_file_open_by_path(path, bmode, holder, NULL)); +#elif defined(HAVE_BDEV_OPEN_BY_PATH) + return (bdev_open_by_path(path, bmode, holder, NULL)); #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) - return (blkdev_get_by_path(path, - vdev_bdev_mode(mode, B_TRUE), holder, NULL)); + return (blkdev_get_by_path(path, bmode, holder, NULL)); #else - return (blkdev_get_by_path(path, - vdev_bdev_mode(mode, B_TRUE), holder)); + return (blkdev_get_by_path(path, bmode, holder)); #endif } static void -vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t mode, void *holder) +vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t smode, void *holder) { #if defined(HAVE_BDEV_RELEASE) return (bdev_release(bdh)); #elif defined(HAVE_BLKDEV_PUT_HOLDER) return (blkdev_put(BDH_BDEV(bdh), holder)); +#elif defined(HAVE_BLKDEV_PUT) + return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(smode))); #else - return (blkdev_put(BDH_BDEV(bdh), - vdev_bdev_mode(mode, B_TRUE))); + fput(bdh); #endif } @@ -267,11 +282,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { zfs_bdev_handle_t *bdh; -#ifdef HAVE_BLK_MODE_T - blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); -#else - fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); -#endif + spa_mode_t smode = spa_mode(v->vdev_spa); hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); vdev_disk_t *vd; @@ -322,16 +333,16 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, reread_part = B_TRUE; } - vdev_blkdev_put(bdh, mode, zfs_vdev_holder); + vdev_blkdev_put(bdh, smode, zfs_vdev_holder); } if (reread_part) { - bdh = vdev_blkdev_get_by_path(disk_name, mode, + bdh = vdev_blkdev_get_by_path(disk_name, smode, zfs_vdev_holder); if (!BDH_IS_ERR(bdh)) { int error = vdev_bdev_reread_part(BDH_BDEV(bdh)); - vdev_blkdev_put(bdh, mode, zfs_vdev_holder); + vdev_blkdev_put(bdh, smode, zfs_vdev_holder); if (error == 0) { timeout = MSEC2NSEC( zfs_vdev_open_timeout_ms * 2); @@ -376,7 +387,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, hrtime_t start = gethrtime(); bdh = BDH_ERR_PTR(-ENXIO); while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) { - bdh = vdev_blkdev_get_by_path(v->vdev_path, mode, + bdh = vdev_blkdev_get_by_path(v->vdev_path, smode, zfs_vdev_holder); if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) { /* @@ -775,6 +786,12 @@ vbio_submit(vbio_t *vbio, abd_t *abd, uint64_t size) vbio->vbio_bio->bi_end_io = vbio_completion; vbio->vbio_bio->bi_private = vbio; + /* + * Once submitted, vbio_bio now owns vbio (through bi_private) and we + * can't touch it again. The bio may complete and vbio_completion() be + * called and free the vbio before this task is run again, so we must + * consider it invalid from this point. + */ vdev_submit_bio(vbio->vbio_bio); blk_finish_plug(&plug); @@ -859,7 +876,7 @@ vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) * Note if we're taking less than a full block, so we can check it * above on the next call. */ - s->end = len & s->bmask; + s->end = (off+len) & s->bmask; /* All blocks after the first must start on a block size boundary. */ if (s->npages != 0 && (off & s->bmask) != 0) diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index d2ae1bc01007..6dab671f7f66 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1642,11 +1642,8 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) if (done) break; - /* Prefetch znode */ - if (prefetch) { - dmu_prefetch(os, objnum, 0, 0, 0, - ZIO_PRIORITY_SYNC_READ); - } + if (prefetch) + dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); /* * Move to the next entry, fill in the previous offset. diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index bde3be2a7978..39d484d41862 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,12 @@ static unsigned int zvol_request_sync = 0; static unsigned int zvol_prefetch_bytes = (128 * 1024); static unsigned long zvol_max_discard_blocks = 16384; +/* + * Switch taskq at multiple of 512 MB offset. This can be set to a lower value + * to utilize more threads for small files but may affect prefetch hits. + */ +#define ZVOL_TASKQ_OFFSET_SHIFT 29 + #ifndef HAVE_BLKDEV_GET_ERESTARTSYS static unsigned int zvol_open_timeout_ms = 1000; #endif @@ -76,6 +83,8 @@ static boolean_t zvol_use_blk_mq = B_FALSE; static unsigned int zvol_blk_mq_blocks_per_thread = 8; #endif +static unsigned int zvol_num_taskqs = 0; + #ifndef BLKDEV_DEFAULT_RQ /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ @@ -114,7 +123,11 @@ struct zvol_state_os { boolean_t use_blk_mq; }; -static taskq_t *zvol_taskq; +typedef struct zv_taskq { + uint_t tqs_cnt; + taskq_t **tqs_taskq; +} zv_taskq_t; +static zv_taskq_t zvol_taskqs; static struct ida zvol_ida; typedef struct zv_request_stack { @@ -532,6 +545,22 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, } zv_request_task_t *task; + zv_taskq_t *ztqs = &zvol_taskqs; + uint_t blk_mq_hw_queue = 0; + uint_t tq_idx; + uint_t taskq_hash; +#ifdef HAVE_BLK_MQ + if (rq) +#ifdef HAVE_BLK_MQ_RQ_HCTX + blk_mq_hw_queue = rq->mq_hctx->queue_num; +#else + blk_mq_hw_queue = + rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num; +#endif +#endif + taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, + blk_mq_hw_queue, 0); + tq_idx = taskq_hash % ztqs->tqs_cnt; if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { @@ -601,7 +630,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_discard(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_discard_task, task, 0, &task->ent); } } else { @@ -609,7 +638,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_write(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_write_task, task, 0, &task->ent); } } @@ -631,7 +660,7 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_read(&zvr); } else { task = zv_request_task_create(zvr); - taskq_dispatch_ent(zvol_taskq, + taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_read_task, task, 0, &task->ent); } } @@ -1053,6 +1082,16 @@ zvol_alloc_non_blk_mq(struct zvol_state_os *zso) if (zso->zvo_disk == NULL) return (1); + zso->zvo_disk->minors = ZVOL_MINORS; + zso->zvo_queue = zso->zvo_disk->queue; +#elif defined(HAVE_BLK_ALLOC_DISK_2ARG) + struct gendisk *disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(disk)) { + zso->zvo_disk = NULL; + return (1); + } + + zso->zvo_disk = disk; zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; #else @@ -1103,6 +1142,17 @@ zvol_alloc_blk_mq(zvol_state_t *zv) } zso->zvo_queue = zso->zvo_disk->queue; zso->zvo_disk->minors = ZVOL_MINORS; +#elif defined(HAVE_BLK_ALLOC_DISK_2ARG) + struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, NULL, zv); + if (IS_ERR(disk)) { + zso->zvo_disk = NULL; + blk_mq_free_tag_set(&zso->tag_set); + return (1); + } + + zso->zvo_disk = disk; + zso->zvo_queue = zso->zvo_disk->queue; + zso->zvo_disk->minors = ZVOL_MINORS; #else zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { @@ -1256,7 +1306,7 @@ zvol_os_free(zvol_state_t *zv) del_gendisk(zv->zv_zso->zvo_disk); #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ - defined(HAVE_BLK_ALLOC_DISK) + (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) #if defined(HAVE_BLK_CLEANUP_DISK) blk_cleanup_disk(zv->zv_zso->zvo_disk); #else @@ -1314,6 +1364,13 @@ zvol_os_create_minor(const char *name) if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; + if (MINOR(minor) != minor) { + /* too many partitions can cause an overflow */ + zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", + name, minor, MINOR(minor)); + ida_simple_remove(&zvol_ida, idx); + return (SET_ERROR(EINVAL)); + } zv = zvol_find_by_name_hash(name, hash, RW_NONE); if (zv) { @@ -1514,7 +1571,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); /* move to new hashtable entry */ - zv->zv_hash = zvol_name_hash(zv->zv_name); + zv->zv_hash = zvol_name_hash(newname); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); @@ -1570,8 +1627,40 @@ zvol_init(void) zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); } + /* + * Use atleast 32 zvol_threads but for many core system, + * prefer 6 threads per taskq, but no more taskqs + * than threads in them on large systems. + * + * taskq total + * cpus taskqs threads threads + * ------- ------- ------- ------- + * 1 1 32 32 + * 2 1 32 32 + * 4 1 32 32 + * 8 2 16 32 + * 16 3 11 33 + * 32 5 7 35 + * 64 8 8 64 + * 128 11 12 132 + * 256 16 16 256 + */ + zv_taskq_t *ztqs = &zvol_taskqs; + uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs); + if (num_tqs == 0) { + num_tqs = 1 + num_online_cpus() / 6; + while (num_tqs * num_tqs > zvol_actual_threads) + num_tqs--; + } + uint_t per_tq_thread = zvol_actual_threads / num_tqs; + if (per_tq_thread * num_tqs < zvol_actual_threads) + per_tq_thread++; + ztqs->tqs_cnt = num_tqs; + ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP); error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (error); } @@ -1591,11 +1680,22 @@ zvol_init(void) 1024); } #endif - zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri, - zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); - if (zvol_taskq == NULL) { - unregister_blkdev(zvol_major, ZVOL_DRIVER); - return (-ENOMEM); + for (uint_t i = 0; i < num_tqs; i++) { + char name[32]; + (void) snprintf(name, sizeof (name), "%s_tq-%u", + ZVOL_DRIVER, i); + ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread, + maxclsyspri, per_tq_thread, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + if (ztqs->tqs_taskq[i] == NULL) { + for (int j = i - 1; j >= 0; j--) + taskq_destroy(ztqs->tqs_taskq[j]); + unregister_blkdev(zvol_major, ZVOL_DRIVER); + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * + sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; + return (-ENOMEM); + } } zvol_init_impl(); @@ -1606,9 +1706,22 @@ zvol_init(void) void zvol_fini(void) { + zv_taskq_t *ztqs = &zvol_taskqs; zvol_fini_impl(); unregister_blkdev(zvol_major, ZVOL_DRIVER); - taskq_destroy(zvol_taskq); + + if (ztqs->tqs_taskq == NULL) { + ASSERT3U(ztqs->tqs_cnt, ==, 0); + } else { + for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { + ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); + taskq_destroy(ztqs->tqs_taskq[i]); + } + kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * + sizeof (taskq_t *)); + ztqs->tqs_taskq = NULL; + } + ida_destroy(&zvol_ida); } @@ -1629,6 +1742,9 @@ MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); +module_param(zvol_num_taskqs, uint, 0444); +MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs"); + module_param(zvol_prefetch_bytes, uint, 0644); MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index f63c2ad2f61f..31425802f97f 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -345,6 +345,13 @@ zfs_prop_init(void) { NULL } }; + static const zprop_index_t prefetch_table[] = { + { "none", ZFS_PREFETCH_NONE }, + { "metadata", ZFS_PREFETCH_METADATA }, + { "all", ZFS_PREFETCH_ALL }, + { NULL } + }; + static const zprop_index_t sync_table[] = { { "standard", ZFS_SYNC_STANDARD }, { "always", ZFS_SYNC_ALWAYS }, @@ -447,6 +454,10 @@ zfs_prop_init(void) ZFS_CACHE_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "all | none | metadata", "SECONDARYCACHE", cache_table, sfeatures); + zprop_register_index(ZFS_PROP_PREFETCH, "prefetch", + ZFS_PREFETCH_ALL, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, + "none | metadata | all", "PREFETCH", prefetch_table, sfeatures); zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "latency | throughput", "LOGBIAS", logbias_table, sfeatures); diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index c4aca04a96bd..ff70c0e3c35b 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -431,6 +431,12 @@ vdev_prop_init(void) zprop_register_number(VDEV_PROP_IO_T, "io_t", UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_VDEV, "", "IO_T", B_FALSE, sfeatures); + zprop_register_number(VDEV_PROP_SLOW_IO_N, "slow_io_n", UINT64_MAX, + PROP_DEFAULT, ZFS_TYPE_VDEV, "", "SLOW_IO_N", B_FALSE, + sfeatures); + zprop_register_number(VDEV_PROP_SLOW_IO_T, "slow_io_t", UINT64_MAX, + PROP_DEFAULT, ZFS_TYPE_VDEV, "", "SLOW_IO_T", B_FALSE, + sfeatures); /* default index (boolean) properties */ zprop_register_index(VDEV_PROP_REMOVING, "removing", 0, diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 225ddaca1e54..bf8fdf6ea4b4 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -248,7 +248,7 @@ static kmem_cache_t *brt_pending_entry_cache; /* * Enable/disable prefetching of BRT entries that we are going to modify. */ -int zfs_brt_prefetch = 1; +static int brt_zap_prefetch = 1; #ifdef ZFS_DEBUG #define BRT_DEBUG(...) do { \ @@ -260,8 +260,8 @@ int zfs_brt_prefetch = 1; #define BRT_DEBUG(...) do { } while (0) #endif -int brt_zap_leaf_blockshift = 12; -int brt_zap_indirect_blockshift = 12; +static int brt_zap_default_bs = 12; +static int brt_zap_default_ibs = 12; static kstat_t *brt_ksp; @@ -458,8 +458,7 @@ brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0, ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, - brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE, - 0, tx); + brt_zap_default_bs, brt_zap_default_ibs, DMU_OT_NONE, 0, tx); VERIFY(brtvd->bv_mos_entries != 0); BRT_DEBUG("MOS entries created, object=%llu", (u_longlong_t)brtvd->bv_mos_entries); @@ -901,7 +900,6 @@ static int brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) { uint64_t mos_entries; - uint64_t one, physsize; int error; ASSERT(RW_LOCK_HELD(&brt->brt_lock)); @@ -919,21 +917,8 @@ brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) brt_unlock(brt); - error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, - BRT_KEY_WORDS, &one, &physsize); - if (error == 0) { - ASSERT3U(one, ==, 1); - ASSERT3U(physsize, ==, sizeof (bre->bre_refcount)); - - error = zap_lookup_uint64(brt->brt_mos, mos_entries, - &bre->bre_offset, BRT_KEY_WORDS, 1, - sizeof (bre->bre_refcount), &bre->bre_refcount); - BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu " - "count=%llu error=%d", (u_longlong_t)mos_entries, - (u_longlong_t)brtvd->bv_vdevid, - (u_longlong_t)bre->bre_offset, - error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error); - } + error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, + BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount); brt_wlock(brt); @@ -955,52 +940,10 @@ brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) if (mos_entries == 0) return; - BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu", - (u_longlong_t)mos_entries, (u_longlong_t)vdevid, - (u_longlong_t)bre->bre_offset); (void) zap_prefetch_uint64(brt->brt_mos, mos_entries, (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS); } -static int -brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) -{ - int error; - - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - ASSERT(bre->bre_refcount > 0); - - error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries, - (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1, - sizeof (bre->bre_refcount), &bre->bre_refcount, tx); - BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu " - "error=%d", (u_longlong_t)brtvd->bv_mos_entries, - (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, - (u_longlong_t)bre->bre_refcount, error); - - return (error); -} - -static int -brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) -{ - int error; - - ASSERT(RW_LOCK_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - ASSERT0(bre->bre_refcount); - - error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries, - (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx); - BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu " - "error=%d", (u_longlong_t)brtvd->bv_mos_entries, - (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, - (u_longlong_t)bre->bre_refcount, error); - - return (error); -} - /* * Return TRUE if we _can_ have BRT entry for this bp. It might be false * positive, but gives us quick answer if we should look into BRT, which @@ -1405,7 +1348,7 @@ brt_prefetch(brt_t *brt, const blkptr_t *bp) ASSERT(bp != NULL); - if (!zfs_brt_prefetch) + if (!brt_zap_prefetch) return; brt_entry_fill(bp, &bre, &vdevid); @@ -1420,13 +1363,14 @@ brt_pending_entry_compare(const void *x1, const void *x2) const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp; int cmp; - cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2)); + cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), + DVA_GET_VDEV(&bp2->blk_dva[0])); if (cmp == 0) { - cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), - DVA_GET_VDEV(&bp2->blk_dva[0])); - if (cmp == 0) { - cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), - DVA_GET_OFFSET(&bp2->blk_dva[0])); + cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), + DVA_GET_OFFSET(&bp2->blk_dva[0])); + if (unlikely(cmp == 0)) { + cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), + BP_PHYSICAL_BIRTH(bp2)); } } @@ -1471,10 +1415,10 @@ brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) kmem_cache_free(brt_pending_entry_cache, newbpe); } else { ASSERT(bpe == NULL); - } - /* Prefetch BRT entry, as we will need it in the syncing context. */ - brt_prefetch(brt, bp); + /* Prefetch BRT entry for the syncing context. */ + brt_prefetch(brt, bp); + } } void @@ -1514,26 +1458,23 @@ brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) void brt_pending_apply(spa_t *spa, uint64_t txg) { - brt_t *brt; + brt_t *brt = spa->spa_brt; brt_pending_entry_t *bpe; avl_tree_t *pending_tree; - kmutex_t *pending_lock; void *c; ASSERT3U(txg, !=, 0); - brt = spa->spa_brt; + /* + * We are in syncing context, so no other brt_pending_tree accesses + * are possible for the TXG. Don't need to acquire brt_pending_lock. + */ pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; - pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; - - mutex_enter(pending_lock); c = NULL; while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) { boolean_t added_to_ddt; - mutex_exit(pending_lock); - for (int i = 0; i < bpe->bpe_count; i++) { /* * If the block has DEDUP bit set, it means that it @@ -1551,31 +1492,20 @@ brt_pending_apply(spa_t *spa, uint64_t txg) } kmem_cache_free(brt_pending_entry_cache, bpe); - mutex_enter(pending_lock); } - - mutex_exit(pending_lock); } static void -brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) +brt_sync_entry(dnode_t *dn, brt_entry_t *bre, dmu_tx_t *tx) { - - ASSERT(RW_WRITE_HELD(&brt->brt_lock)); - ASSERT(brtvd->bv_mos_entries != 0); - if (bre->bre_refcount == 0) { - int error; - - error = brt_entry_remove(brt, brtvd, bre, tx); - ASSERT(error == 0 || error == ENOENT); - /* - * If error == ENOENT then zfs_clone_range() was done from a - * removed (but opened) file (open(), unlink()). - */ - ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT); + int error = zap_remove_uint64_by_dnode(dn, &bre->bre_offset, + BRT_KEY_WORDS, tx); + VERIFY(error == 0 || error == ENOENT); } else { - VERIFY0(brt_entry_update(brt, brtvd, bre, tx)); + VERIFY0(zap_update_uint64_by_dnode(dn, &bre->bre_offset, + BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), + &bre->bre_refcount, tx)); } } @@ -1584,6 +1514,7 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx) { brt_vdev_t *brtvd; brt_entry_t *bre; + dnode_t *dn; uint64_t vdevid; void *c; @@ -1607,14 +1538,19 @@ brt_sync_table(brt_t *brt, dmu_tx_t *tx) if (brtvd->bv_mos_brtvdev == 0) brt_vdev_create(brt, brtvd, tx); + VERIFY0(dnode_hold(brt->brt_mos, brtvd->bv_mos_entries, + FTAG, &dn)); + c = NULL; while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { - brt_sync_entry(brt, brtvd, bre, tx); + brt_sync_entry(dn, bre, tx); brt_entry_free(bre); ASSERT(brt->brt_nentries > 0); brt->brt_nentries--; } + dnode_rele(dn, FTAG); + brt_vdev_sync(brt, brtvd, tx); if (brtvd->bv_totalcount == 0) @@ -1729,9 +1665,10 @@ brt_unload(spa_t *spa) } /* BEGIN CSTYLED */ -ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW, - "Enable prefetching of BRT entries"); -#ifdef ZFS_BRT_DEBUG -ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug"); -#endif +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_prefetch, INT, ZMOD_RW, + "Enable prefetching of BRT ZAP entries"); +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_bs, UINT, ZMOD_RW, + "BRT ZAP leaf blockshift"); +ZFS_MODULE_PARAM(zfs_brt, , brt_zap_default_ibs, UINT, ZMOD_RW, + "BRT ZAP indirect blockshift"); /* END CSTYLED */ diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index ef48cbf984b3..8bd9dd9a88c5 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1548,7 +1548,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; - blkptr_t bp, *bpp; + blkptr_t bp, *bpp = NULL; ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -1562,29 +1562,28 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, goto early_unlock; } - if (db->db_state == DB_UNCACHED) { - if (db->db_blkptr == NULL) { - bpp = NULL; - } else { - bp = *db->db_blkptr; + /* + * If we have a pending block clone, we don't want to read the + * underlying block, but the content of the block being cloned, + * pointed by the dirty record, so we have the most recent data. + * If there is no dirty record, then we hit a race in a sync + * process when the dirty record is already removed, while the + * dbuf is not yet destroyed. Such case is equivalent to uncached. + */ + if (db->db_state == DB_NOFILL) { + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); + if (dr != NULL) { + if (!dr->dt.dl.dr_brtwrite) { + err = EIO; + goto early_unlock; + } + bp = dr->dt.dl.dr_overridden_by; bpp = &bp; } - } else { - dbuf_dirty_record_t *dr; - - ASSERT3S(db->db_state, ==, DB_NOFILL); + } - /* - * Block cloning: If we have a pending block clone, - * we don't want to read the underlying block, but the content - * of the block being cloned, so we have the most recent data. - */ - dr = list_head(&db->db_dirty_records); - if (dr == NULL || !dr->dt.dl.dr_brtwrite) { - err = EIO; - goto early_unlock; - } - bp = dr->dt.dl.dr_overridden_by; + if (bpp == NULL && db->db_blkptr != NULL) { + bp = *db->db_blkptr; bpp = &bp; } @@ -2617,26 +2616,24 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) ASSERT(!zfs_refcount_is_zero(&db->db_holds)); /* - * Quick check for dirtiness. For already dirty blocks, this - * reduces runtime of this function by >90%, and overall performance - * by 50% for some workloads (e.g. file deletion with indirect blocks - * cached). + * Quick check for dirtiness to improve performance for some workloads + * (e.g. file deletion with indirect blocks cached). */ mutex_enter(&db->db_mtx); - if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) { - dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); /* - * It's possible that it is already dirty but not cached, + * It's possible that the dbuf is already dirty but not cached, * because there are some calls to dbuf_dirty() that don't * go through dmu_buf_will_dirty(). */ + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); if (dr != NULL) { - if (dr->dt.dl.dr_brtwrite) { + if (db->db_level == 0 && + dr->dt.dl.dr_brtwrite) { /* * Block cloning: If we are dirtying a cloned - * block, we cannot simply redirty it, because - * this dr has no data associated with it. + * level 0 block, we cannot simply redirty it, + * because this dr has no associated data. * We will go through a full undirtying below, * before dirtying it again. */ @@ -2704,7 +2701,7 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) mutex_enter(&db->db_mtx); DBUF_VERIFY(db); VERIFY(!dbuf_undirty(db, tx)); - ASSERT3P(dbuf_find_dirty_eq(db, tx->tx_txg), ==, NULL); + ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg)); if (db->db_buf != NULL) { arc_buf_destroy(db->db_buf, db); db->db_buf = NULL; @@ -4129,21 +4126,6 @@ dmu_buf_get_objset(dmu_buf_t *db) return (dbi->db_objset); } -dnode_t * -dmu_buf_dnode_enter(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_ENTER(dbi); - return (DB_DNODE(dbi)); -} - -void -dmu_buf_dnode_exit(dmu_buf_t *db) -{ - dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; - DB_DNODE_EXIT(dbi); -} - static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { @@ -4566,11 +4548,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT) dbuf_prepare_encrypted_dnode_leaf(dr); - if (db->db_state != DB_NOFILL && + if (*datap != NULL && *datap == db->db_buf && dn->dn_object != DMU_META_DNODE_OBJECT && zfs_refcount_count(&db->db_holds) > 1 && - dr->dt.dl.dr_override_state != DR_OVERRIDDEN && - *datap == db->db_buf) { + dr->dt.dl.dr_override_state != DR_OVERRIDDEN) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), @@ -4855,11 +4836,9 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - if (db->db_state != DB_NOFILL) { - if (dr->dt.dl.dr_data != NULL && - dr->dt.dl.dr_data != db->db_buf) { - arc_buf_destroy(dr->dt.dl.dr_data, db); - } + if (dr->dt.dl.dr_data != NULL && + dr->dt.dl.dr_data != db->db_buf) { + arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { ASSERT(list_head(&dr->dt.di.dr_children) == NULL); @@ -5058,21 +5037,18 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) os = dn->dn_objset; - if (db->db_state != DB_NOFILL) { - if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { - /* - * Private object buffers are released here rather - * than in dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. - */ - if (BP_IS_HOLE(db->db_blkptr)) { - arc_buf_thaw(data); - } else { - dbuf_release_bp(db); - } - dbuf_remap(dn, db, tx); - } + if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { + /* + * Private object buffers are released here rather than in + * dbuf_dirty() since they are only modified in the syncing + * context and we don't want the overhead of making multiple + * copies of the data. + */ + if (BP_IS_HOLE(db->db_blkptr)) + arc_buf_thaw(data); + else + dbuf_release_bp(db); + dbuf_remap(dn, db, tx); } if (parent != dn->dn_dbuf) { @@ -5108,7 +5084,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; - wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; + wp_flag |= (data == NULL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); @@ -5140,7 +5116,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite); mutex_exit(&db->db_mtx); - } else if (db->db_state == DB_NOFILL) { + } else if (data == NULL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(pio, os->os_spa, txg, diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 08bb185a3ee5..d8d5cfdbd230 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -697,72 +697,99 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag) } /* - * Issue prefetch i/os for the given blocks. If level is greater than 0, the + * Issue prefetch I/Os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing - * the data starting at offset, and continuing to offset + len. + * the data starting at offset, and continuing to offset + len. If the range + * it too long, prefetch the first dmu_prefetch_max bytes as requested, while + * for the rest only a higher level, also fitting within dmu_prefetch_max. It + * should primarily help random reads, since for long sequential reads there is + * a speculative prefetcher. * * Note that if the indirect blocks above the blocks being prefetched are not - * in cache, they will be asynchronously read in. + * in cache, they will be asynchronously read in. Dnode read by dnode_hold() + * is currently synchronous. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; - uint64_t blkid; - int nblks, err; - - if (len == 0) { /* they're interested in the bonus buffer */ - dn = DMU_META_DNODE(os); - - if (object == 0 || object >= DN_MAX_OBJECT) - return; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - blkid = dbuf_whichblock(dn, level, - object * sizeof (dnode_phys_t)); - dbuf_prefetch(dn, level, blkid, pri, 0); - rw_exit(&dn->dn_struct_rwlock); + if (dmu_prefetch_max == 0 || len == 0) { + dmu_prefetch_dnode(os, object, pri); return; } - /* - * See comment before the definition of dmu_prefetch_max. - */ - len = MIN(len, dmu_prefetch_max); - - /* - * XXX - Note, if the dnode for the requested object is not - * already cached, we will do a *synchronous* read in the - * dnode_hold() call. The same is true for any indirects. - */ - err = dnode_hold(os, object, FTAG, &dn); - if (err != 0) + if (dnode_hold(os, object, FTAG, &dn) != 0) return; + dmu_prefetch_by_dnode(dn, level, offset, len, pri); + + dnode_rele(dn, FTAG); +} + +void +dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, + uint64_t len, zio_priority_t pri) +{ + int64_t level2 = level; + uint64_t start, end, start2, end2; + /* - * offset + len - 1 is the last byte we want to prefetch for, and offset - * is the first. Then dbuf_whichblk(dn, level, off + len - 1) is the - * last block we want to prefetch, and dbuf_whichblock(dn, level, - * offset) is the first. Then the number we need to prefetch is the - * last - first + 1. + * Depending on len we may do two prefetches: blocks [start, end) at + * level, and following blocks [start2, end2) at higher level2. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); - if (level > 0 || dn->dn_datablkshift != 0) { - nblks = dbuf_whichblock(dn, level, offset + len - 1) - - dbuf_whichblock(dn, level, offset) + 1; + if (dn->dn_datablkshift != 0) { + /* + * The object has multiple blocks. Calculate the full range + * of blocks [start, end2) and then split it into two parts, + * so that the first [start, end) fits into dmu_prefetch_max. + */ + start = dbuf_whichblock(dn, level, offset); + end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1; + uint8_t ibs = dn->dn_indblkshift; + uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs; + uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs; + start2 = end = MIN(end2, start + limit); + + /* + * Find level2 where [start2, end2) fits into dmu_prefetch_max. + */ + uint8_t ibps = ibs - SPA_BLKPTRSHIFT; + limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs; + do { + level2++; + start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps; + end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps; + } while (end2 - start2 > limit); } else { - nblks = (offset < dn->dn_datablksz); + /* There is only one block. Prefetch it or nothing. */ + start = start2 = end2 = 0; + end = start + (level == 0 && offset < dn->dn_datablksz); } - if (nblks != 0) { - blkid = dbuf_whichblock(dn, level, offset); - for (int i = 0; i < nblks; i++) - dbuf_prefetch(dn, level, blkid + i, pri, 0); - } + for (uint64_t i = start; i < end; i++) + dbuf_prefetch(dn, level, i, pri, 0); + for (uint64_t i = start2; i < end2; i++) + dbuf_prefetch(dn, level2, i, pri, 0); rw_exit(&dn->dn_struct_rwlock); +} - dnode_rele(dn, FTAG); +/* + * Issue prefetch I/Os for the given object's dnode. + */ +void +dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri) +{ + if (object == 0 || object >= DN_MAX_OBJECT) + return; + + dnode_t *dn = DMU_META_DNODE(os); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + uint64_t blkid = dbuf_whichblock(dn, 0, object * sizeof (dnode_phys_t)); + dbuf_prefetch(dn, 0, blkid, pri, 0); + rw_exit(&dn->dn_struct_rwlock); } /* @@ -2240,11 +2267,13 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, if (bp == NULL) { /* - * The block was created in this transaction group, - * so it has no BP yet. + * The file size was increased, but the block was never + * written, otherwise we would either have the block + * pointer or the dirty record and would not get here. + * It is effectively a hole, so report it as such. */ - error = SET_ERROR(EAGAIN); - goto out; + BP_ZERO(&bps[i]); + continue; } /* * Make sure we clone only data blocks. @@ -2336,18 +2365,16 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; dl->dr_overridden_by = *bp; - dl->dr_brtwrite = B_TRUE; - dl->dr_override_state = DR_OVERRIDDEN; - if (BP_IS_HOLE(bp)) { - dl->dr_overridden_by.blk_birth = 0; - dl->dr_overridden_by.blk_phys_birth = 0; - } else { - dl->dr_overridden_by.blk_birth = dr->dr_txg; + if (!BP_IS_HOLE(bp) || bp->blk_birth != 0) { if (!BP_IS_EMBEDDED(bp)) { - dl->dr_overridden_by.blk_phys_birth = - BP_PHYSICAL_BIRTH(bp); + BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg, + BP_PHYSICAL_BIRTH(bp)); + } else { + dl->dr_overridden_by.blk_birth = dr->dr_txg; } } + dl->dr_brtwrite = B_TRUE; + dl->dr_override_state = DR_OVERRIDDEN; mutex_exit(&db->db_mtx); @@ -2546,6 +2573,8 @@ EXPORT_SYMBOL(dmu_bonus_hold_by_dnode); EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); EXPORT_SYMBOL(dmu_buf_rele_array); EXPORT_SYMBOL(dmu_prefetch); +EXPORT_SYMBOL(dmu_prefetch_by_dnode); +EXPORT_SYMBOL(dmu_prefetch_dnode); EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index f8bd7422a5df..ef966d6703c9 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -263,6 +263,19 @@ secondary_cache_changed_cb(void *arg, uint64_t newval) os->os_secondary_cache = newval; } +static void +prefetch_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance should have been done by now. + */ + ASSERT(newval == ZFS_PREFETCH_ALL || newval == ZFS_PREFETCH_NONE || + newval == ZFS_PREFETCH_METADATA); + os->os_prefetch = newval; +} + static void sync_changed_cb(void *arg, uint64_t newval) { @@ -562,6 +575,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_PREFETCH), + prefetch_changed_cb, os); + } if (!ds->ds_is_snapshot) { if (err == 0) { err = dsl_prop_register(ds, @@ -635,6 +653,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; os->os_dnodesize = DNODE_MIN_SIZE; + os->os_prefetch = ZFS_PREFETCH_ALL; } if (ds == NULL || !ds->ds_is_snapshot) diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 2cf10909738b..9f1c25f866f7 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -2353,7 +2353,6 @@ receive_process_write_record(struct receive_writer_arg *rwa, if (rwa->heal) { blkptr_t *bp; dmu_buf_t *dbp; - dnode_t *dn; int flags = DB_RF_CANFAIL; if (rwa->raw) @@ -2385,19 +2384,15 @@ receive_process_write_record(struct receive_writer_arg *rwa, dmu_buf_rele(dbp, FTAG); return (err); } - dn = dmu_buf_dnode_enter(dbp); /* Make sure the on-disk block and recv record sizes match */ - if (drrw->drr_logical_size != - dn->dn_datablkszsec << SPA_MINBLOCKSHIFT) { + if (drrw->drr_logical_size != dbp->db_size) { err = ENOTSUP; - dmu_buf_dnode_exit(dbp); dmu_buf_rele(dbp, FTAG); return (err); } /* Get the block pointer for the corrupted block */ bp = dmu_buf_get_blkptr(dbp); err = do_corrective_recv(rwa, drrw, rrd, bp); - dmu_buf_dnode_exit(dbp); dmu_buf_rele(dbp, FTAG); return (err); } diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index e26195176036..ed50f1889b59 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -418,8 +418,8 @@ dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks) zs->zs_ranges[f].start = zs->zs_ranges[l].start; zs->zs_ranges[f].end = zs->zs_ranges[l].end; } - zs->zs_ranges[ZFETCH_RANGES - 1].start = 0; - zs->zs_ranges[ZFETCH_RANGES - 1].end = 0; + zs->zs_ranges[f].start = 0; + zs->zs_ranges[f].end = 0; } } else if (i < ZFETCH_RANGES) { /* Got no intersecting ranges, insert new one. */ @@ -471,9 +471,14 @@ dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, { zstream_t *zs; spa_t *spa = zf->zf_dnode->dn_objset->os_spa; + zfs_prefetch_type_t os_prefetch = zf->zf_dnode->dn_objset->os_prefetch; - if (zfs_prefetch_disable) + if (zfs_prefetch_disable || os_prefetch == ZFS_PREFETCH_NONE) return (NULL); + + if (os_prefetch == ZFS_PREFETCH_METADATA) + fetch_data = B_FALSE; + /* * If we haven't yet loaded the indirect vdevs' mappings, we * can only read from blocks that we carefully ensure are on diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index ac30a370813f..e6c8d4be13b4 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -173,8 +173,8 @@ dsl_deadlist_load_tree(dsl_deadlist_t *dl) * in parallel. Then open them all in a second pass. */ dle->dle_bpobj.bpo_object = za.za_first_integer; - dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object, + ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_tree, dle); } @@ -235,8 +235,8 @@ dsl_deadlist_load_cache(dsl_deadlist_t *dl) * in parallel. Then open them all in a second pass. */ dlce->dlce_bpobj = za.za_first_integer; - dmu_prefetch(dl->dl_os, dlce->dlce_bpobj, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj, + ZIO_PRIORITY_SYNC_READ); avl_add(&dl->dl_cache, dlce); } VERIFY3U(error, ==, ENOENT); diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index 25eea0752941..8144d8965085 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -663,12 +663,13 @@ mmp_thread(void *arg) (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) { zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu " "mmp_last_write %llu mmp_interval %llu " - "mmp_fail_intervals %llu mmp_fail_ns %llu", + "mmp_fail_intervals %llu mmp_fail_ns %llu txg %llu", spa_name(spa), (u_longlong_t)gethrtime(), (u_longlong_t)mmp->mmp_last_write, (u_longlong_t)mmp_interval, (u_longlong_t)mmp_fail_intervals, - (u_longlong_t)mmp_fail_ns); + (u_longlong_t)mmp_fail_ns, + (u_longlong_t)spa->spa_uberblock.ub_txg); cmn_err(CE_WARN, "MMP writes to pool '%s' have not " "succeeded in over %llu ms; suspending pool. " "Hrtime %llu", diff --git a/module/zfs/spa.c b/module/zfs/spa.c index d7fe96cde6a4..251dd8a4d1c7 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. @@ -3324,6 +3324,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) spa->spa_load_state = state; (void) spa_import_progress_set_state(spa_guid(spa), spa_load_state(spa)); + spa_import_progress_set_notes(spa, "spa_load()"); gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, type, &ereport); @@ -3541,18 +3542,23 @@ spa_activity_check_duration(spa_t *spa, uberblock_t *ub) } /* - * Perform the import activity check. If the user canceled the import or - * we detected activity then fail. + * Remote host activity check. + * + * error results: + * 0 - no activity detected + * EREMOTEIO - remote activity detected + * EINTR - user canceled the operation */ static int -spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) +spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, + boolean_t importing) { uint64_t txg = ub->ub_txg; uint64_t timestamp = ub->ub_timestamp; uint64_t mmp_config = ub->ub_mmp_config; uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; uint64_t import_delay; - hrtime_t import_expire; + hrtime_t import_expire, now; nvlist_t *mmp_label = NULL; vdev_t *rvd = spa->spa_root_vdev; kcondvar_t cv; @@ -3590,9 +3596,23 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) import_expire = gethrtime() + import_delay; - while (gethrtime() < import_expire) { - (void) spa_import_progress_set_mmp_check(spa_guid(spa), - NSEC2SEC(import_expire - gethrtime())); + if (importing) { + spa_import_progress_set_notes(spa, "Checking MMP activity, " + "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); + } + + int iterations = 0; + while ((now = gethrtime()) < import_expire) { + if (importing && iterations++ % 30 == 0) { + spa_import_progress_set_notes(spa, "Checking MMP " + "activity, %llu ms remaining", + (u_longlong_t)NSEC2MSEC(import_expire - now)); + } + + if (importing) { + (void) spa_import_progress_set_mmp_check(spa_guid(spa), + NSEC2SEC(import_expire - gethrtime())); + } vdev_uberblock_load(rvd, ub, &mmp_label); @@ -3674,6 +3694,61 @@ spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) return (error); } +/* + * Called from zfs_ioc_clear for a pool that was suspended + * after failing mmp write checks. + */ +boolean_t +spa_mmp_remote_host_activity(spa_t *spa) +{ + ASSERT(spa_multihost(spa) && spa_suspended(spa)); + + nvlist_t *best_label; + uberblock_t best_ub; + + /* + * Locate the best uberblock on disk + */ + vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label); + if (best_label) { + /* + * confirm that the best hostid matches our hostid + */ + if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) && + spa_get_hostid(spa) != + fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) { + nvlist_free(best_label); + return (B_TRUE); + } + nvlist_free(best_label); + } else { + return (B_TRUE); + } + + if (!MMP_VALID(&best_ub) || + !MMP_FAIL_INT_VALID(&best_ub) || + MMP_FAIL_INT(&best_ub) == 0) { + return (B_TRUE); + } + + if (best_ub.ub_txg != spa->spa_uberblock.ub_txg || + best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) { + zfs_dbgmsg("txg mismatch detected during pool clear " + "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu", + (u_longlong_t)spa->spa_uberblock.ub_txg, + (u_longlong_t)best_ub.ub_txg, + (u_longlong_t)spa->spa_uberblock.ub_timestamp, + (u_longlong_t)best_ub.ub_timestamp); + return (B_TRUE); + } + + /* + * Perform an activity check looking for any remote writer + */ + return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config, + B_FALSE) != 0); +} + static int spa_verify_host(spa_t *spa, nvlist_t *mos_config) { @@ -3994,7 +4069,8 @@ spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); } - int error = spa_activity_check(spa, ub, spa->spa_config); + int error = + spa_activity_check(spa, ub, spa->spa_config, B_TRUE); if (error) { nvlist_free(label); return (error); @@ -5204,6 +5280,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) /* * Retrieve the checkpoint txg if the pool has a checkpoint. */ + spa_import_progress_set_notes(spa, "Loading checkpoint txg"); error = spa_ld_read_checkpoint_txg(spa); if (error != 0) return (error); @@ -5216,6 +5293,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * initiated. Otherwise we could be reading from indirect vdevs before * we have loaded their mappings. */ + spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); error = spa_ld_open_indirect_vdev_metadata(spa); if (error != 0) return (error); @@ -5224,6 +5302,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * Retrieve the full list of active features from the MOS and check if * they are all supported. */ + spa_import_progress_set_notes(spa, "Checking feature flags"); error = spa_ld_check_features(spa, &missing_feat_write); if (error != 0) return (error); @@ -5232,6 +5311,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * Load several special directories from the MOS needed by the dsl_pool * layer. */ + spa_import_progress_set_notes(spa, "Loading special MOS directories"); error = spa_ld_load_special_directories(spa); if (error != 0) return (error); @@ -5239,6 +5319,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) /* * Retrieve pool properties from the MOS. */ + spa_import_progress_set_notes(spa, "Loading properties"); error = spa_ld_get_props(spa); if (error != 0) return (error); @@ -5247,6 +5328,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * Retrieve the list of auxiliary devices - cache devices and spares - * and open them. */ + spa_import_progress_set_notes(spa, "Loading AUX vdevs"); error = spa_ld_open_aux_vdevs(spa, type); if (error != 0) return (error); @@ -5255,14 +5337,17 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * Load the metadata for all vdevs. Also check if unopenable devices * should be autoreplaced. */ + spa_import_progress_set_notes(spa, "Loading vdev metadata"); error = spa_ld_load_vdev_metadata(spa); if (error != 0) return (error); + spa_import_progress_set_notes(spa, "Loading dedup tables"); error = spa_ld_load_dedup_tables(spa); if (error != 0) return (error); + spa_import_progress_set_notes(spa, "Loading BRT"); error = spa_ld_load_brt(spa); if (error != 0) return (error); @@ -5271,6 +5356,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * Verify the logs now to make sure we don't have any unexpected errors * when we claim log blocks later. */ + spa_import_progress_set_notes(spa, "Verifying Log Devices"); error = spa_ld_verify_logs(spa, type, ereport); if (error != 0) return (error); @@ -5292,6 +5378,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * state. When performing an extreme rewind, we verify the whole pool, * which can take a very long time. */ + spa_import_progress_set_notes(spa, "Verifying pool data"); error = spa_ld_verify_pool_data(spa); if (error != 0) return (error); @@ -5301,6 +5388,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * we write anything to the pool because we'd need to update the space * accounting using the deflated sizes. */ + spa_import_progress_set_notes(spa, "Calculating deflated space"); spa_update_dspace(spa); /* @@ -5308,6 +5396,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * pool. If we are importing the pool in read-write mode, a few * additional steps must be performed to finish the import. */ + spa_import_progress_set_notes(spa, "Starting import"); if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { uint64_t config_cache_txg = spa->spa_config_txg; @@ -5324,6 +5413,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); } + spa_import_progress_set_notes(spa, "Claiming ZIL blocks"); /* * Traverse the ZIL and claim all blocks. */ @@ -5343,6 +5433,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * will have been set for us by ZIL traversal operations * performed above. */ + spa_import_progress_set_notes(spa, "Syncing ZIL claims"); txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* @@ -5350,6 +5441,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * next sync, we would update the config stored in vdev labels * and the cachefile (by default /etc/zfs/zpool.cache). */ + spa_import_progress_set_notes(spa, "Updating configs"); spa_ld_check_for_config_update(spa, config_cache_txg, update_config_cache); @@ -5358,6 +5450,7 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * Then check all DTLs to see if anything needs resilvering. * The resilver will be deferred if a rebuild was started. */ + spa_import_progress_set_notes(spa, "Starting resilvers"); if (vdev_rebuild_active(spa->spa_root_vdev)) { vdev_rebuild_restart(spa); } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && @@ -5371,6 +5464,8 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) */ spa_history_log_version(spa, "open", NULL); + spa_import_progress_set_notes(spa, + "Restarting device removals"); spa_restart_removal(spa); spa_spawn_aux_threads(spa); @@ -5383,19 +5478,26 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) * auxiliary threads above (from which the livelist * deletion zthr is part of). */ + spa_import_progress_set_notes(spa, + "Cleaning up inconsistent objsets"); (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); /* * Clean up any stale temporary dataset userrefs. */ + spa_import_progress_set_notes(spa, + "Cleaning up temporary userrefs"); dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_import_progress_set_notes(spa, "Restarting initialize"); vdev_initialize_restart(spa->spa_root_vdev); + spa_import_progress_set_notes(spa, "Restarting TRIM"); vdev_trim_restart(spa->spa_root_vdev); vdev_autotrim_restart(spa); spa_config_exit(spa, SCL_CONFIG, FTAG); + spa_import_progress_set_notes(spa, "Finished importing"); } spa_import_progress_remove(spa_guid(spa)); @@ -6978,7 +7080,7 @@ spa_draid_feature_incr(void *arg, dmu_tx_t *tx) * Add a device to a storage pool. */ int -spa_vdev_add(spa_t *spa, nvlist_t *nvroot) +spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) { uint64_t txg, ndraid = 0; int error; @@ -7069,6 +7171,16 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } } + if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) { + for (int c = 0; c < vd->vdev_children; c++) { + tvd = vd->vdev_child[c]; + if (tvd->vdev_ashift != spa->spa_max_ashift) { + return (spa_vdev_exit(spa, vd, txg, + ZFS_ERR_ASHIFT_MISMATCH)); + } + } + } + for (int c = 0; c < vd->vdev_children; c++) { tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); @@ -8542,15 +8654,16 @@ spa_async_remove(spa_t *spa, vdev_t *vd) } static void -spa_async_probe(spa_t *spa, vdev_t *vd) +spa_async_fault_vdev(spa_t *spa, vdev_t *vd) { - if (vd->vdev_probe_wanted) { - vd->vdev_probe_wanted = B_FALSE; - vdev_reopen(vd); /* vdev_open() does the actual probe */ + if (vd->vdev_fault_wanted) { + vd->vdev_fault_wanted = B_FALSE; + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); } for (int c = 0; c < vd->vdev_children; c++) - spa_async_probe(spa, vd->vdev_child[c]); + spa_async_fault_vdev(spa, vd->vdev_child[c]); } static void @@ -8638,11 +8751,11 @@ spa_async_thread(void *arg) } /* - * See if any devices need to be probed. + * See if any devices need to be marked faulted. */ - if (tasks & SPA_ASYNC_PROBE) { + if (tasks & SPA_ASYNC_FAULT_VDEV) { spa_vdev_state_enter(spa, SCL_NONE); - spa_async_probe(spa, spa->spa_root_vdev); + spa_async_fault_vdev(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index 2878e68c6e4b..873089a53e34 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -1147,12 +1147,13 @@ spa_ld_log_sm_data(spa_t *spa) /* Prefetch log spacemaps dnodes. */ for (sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { - dmu_prefetch(spa_meta_objset(spa), sls->sls_sm_obj, - 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(spa_meta_objset(spa), sls->sls_sm_obj, + ZIO_PRIORITY_SYNC_READ); } uint_t pn = 0; uint64_t ps = 0; + uint64_t nsm = 0; psls = sls = avl_first(&spa->spa_sm_logs_by_txg); while (sls != NULL) { /* Prefetch log spacemaps up to 16 TXGs or MBs ahead. */ @@ -1185,6 +1186,10 @@ spa_ld_log_sm_data(spa_t *spa) summary_add_data(spa, sls->sls_txg, sls->sls_mscount, 0, sls->sls_nblocks); + spa_import_progress_set_notes_nolog(spa, + "Read %llu of %lu log space maps", (u_longlong_t)nsm, + avl_numnodes(&spa->spa_sm_logs_by_txg)); + struct spa_ld_log_sm_arg vla = { .slls_spa = spa, .slls_txg = sls->sls_txg @@ -1200,6 +1205,7 @@ spa_ld_log_sm_data(spa_t *spa) pn--; ps -= space_map_length(sls->sls_sm); + nsm++; space_map_close(sls->sls_sm); sls->sls_sm = NULL; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls); @@ -1210,11 +1216,11 @@ spa_ld_log_sm_data(spa_t *spa) hrtime_t read_logs_endtime = gethrtime(); spa_load_note(spa, - "read %llu log space maps (%llu total blocks - blksz = %llu bytes) " - "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg), + "Read %lu log space maps (%llu total blocks - blksz = %llu bytes) " + "in %lld ms", avl_numnodes(&spa->spa_sm_logs_by_txg), (u_longlong_t)spa_log_sm_nblocks(spa), (u_longlong_t)zfs_log_sm_blksz, - (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000)); + (longlong_t)NSEC2MSEC(read_logs_endtime - read_logs_starttime)); out: if (error != 0) { diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 24f038ad7f4b..649fe2f634b5 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -417,6 +417,8 @@ spa_load_note(spa_t *spa, const char *fmt, ...) zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, spa->spa_trust_config ? "trusted" : "untrusted", buf); + + spa_import_progress_set_notes_nolog(spa, "%s", buf); } /* @@ -2172,6 +2174,7 @@ typedef struct spa_import_progress { uint64_t pool_guid; /* unique id for updates */ char *pool_name; spa_load_state_t spa_load_state; + char *spa_load_notes; uint64_t mmp_sec_remaining; /* MMP activity check */ uint64_t spa_load_max_txg; /* rewind txg */ procfs_list_node_t smh_node; @@ -2182,9 +2185,9 @@ spa_history_list_t *spa_import_progress_list = NULL; static int spa_import_progress_show_header(struct seq_file *f) { - seq_printf(f, "%-20s %-14s %-14s %-12s %s\n", "pool_guid", + seq_printf(f, "%-20s %-14s %-14s %-12s %-16s %s\n", "pool_guid", "load_state", "multihost_secs", "max_txg", - "pool_name"); + "pool_name", "notes"); return (0); } @@ -2193,11 +2196,12 @@ spa_import_progress_show(struct seq_file *f, void *data) { spa_import_progress_t *sip = (spa_import_progress_t *)data; - seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %s\n", + seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %-16s %s\n", (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state, (u_longlong_t)sip->mmp_sec_remaining, (u_longlong_t)sip->spa_load_max_txg, - (sip->pool_name ? sip->pool_name : "-")); + (sip->pool_name ? sip->pool_name : "-"), + (sip->spa_load_notes ? sip->spa_load_notes : "-")); return (0); } @@ -2211,6 +2215,8 @@ spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size) sip = list_remove_head(&shl->procfs_list.pl_list); if (sip->pool_name) spa_strfree(sip->pool_name); + if (sip->spa_load_notes) + kmem_strfree(sip->spa_load_notes); kmem_free(sip, sizeof (spa_import_progress_t)); shl->size--; } @@ -2266,6 +2272,10 @@ spa_import_progress_set_state(uint64_t pool_guid, sip = list_prev(&shl->procfs_list.pl_list, sip)) { if (sip->pool_guid == pool_guid) { sip->spa_load_state = load_state; + if (sip->spa_load_notes != NULL) { + kmem_strfree(sip->spa_load_notes); + sip->spa_load_notes = NULL; + } error = 0; break; } @@ -2275,6 +2285,59 @@ spa_import_progress_set_state(uint64_t pool_guid, return (error); } +static void +spa_import_progress_set_notes_impl(spa_t *spa, boolean_t log_dbgmsg, + const char *fmt, va_list adx) +{ + spa_history_list_t *shl = spa_import_progress_list; + spa_import_progress_t *sip; + uint64_t pool_guid = spa_guid(spa); + + if (shl->size == 0) + return; + + char *notes = kmem_vasprintf(fmt, adx); + + mutex_enter(&shl->procfs_list.pl_lock); + for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; + sip = list_prev(&shl->procfs_list.pl_list, sip)) { + if (sip->pool_guid == pool_guid) { + if (sip->spa_load_notes != NULL) { + kmem_strfree(sip->spa_load_notes); + sip->spa_load_notes = NULL; + } + sip->spa_load_notes = notes; + if (log_dbgmsg) + zfs_dbgmsg("'%s' %s", sip->pool_name, notes); + notes = NULL; + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); + if (notes != NULL) + kmem_strfree(notes); +} + +void +spa_import_progress_set_notes(spa_t *spa, const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + spa_import_progress_set_notes_impl(spa, B_TRUE, fmt, adx); + va_end(adx); +} + +void +spa_import_progress_set_notes_nolog(spa_t *spa, const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + spa_import_progress_set_notes_impl(spa, B_FALSE, fmt, adx); + va_end(adx); +} + int spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg) { @@ -2343,6 +2406,7 @@ spa_import_progress_add(spa_t *spa) poolname = spa_name(spa); sip->pool_name = spa_strdup(poolname); sip->spa_load_state = spa_load_state(spa); + sip->spa_load_notes = NULL; mutex_enter(&shl->procfs_list.pl_lock); procfs_list_add(&shl->procfs_list, sip); @@ -2362,6 +2426,8 @@ spa_import_progress_remove(uint64_t pool_guid) if (sip->pool_guid == pool_guid) { if (sip->pool_name) spa_strfree(sip->pool_name); + if (sip->spa_load_notes) + spa_strfree(sip->spa_load_notes); list_remove(&shl->procfs_list.pl_list, sip); shl->size--; kmem_free(sip, sizeof (spa_import_progress_t)); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index a67c043446f5..5ce6be69be14 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -550,6 +550,15 @@ txg_sync_thread(void *arg) timer = (delta > timeout ? 0 : timeout - delta); } + /* + * When we're suspended, nothing should be changing and for + * MMP we don't want to bump anything that would make it + * harder to detect if another host is changing it when + * resuming after a MMP suspend. + */ + if (spa_suspended(spa)) + continue; + /* * Wait until the quiesce thread hands off a txg to us, * prompting it to do so if necessary. diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index e1ca1aecc900..981da4e986c4 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -676,6 +676,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T); vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N); vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T); + vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N); + vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); @@ -1659,6 +1661,7 @@ vdev_metaslab_fini(vdev_t *vd) typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; + boolean_t vps_zio_done_probe; int vps_flags; } vdev_probe_stats_t; @@ -1702,6 +1705,17 @@ vdev_probe_done(zio_t *zio) (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); + + /* + * If this probe was initiated from zio pipeline, then + * change the state in a spa_async_request. Probes that + * were initiated from a vdev_open can change the state + * as part of the open call. + */ + if (vps->vps_zio_done_probe) { + vd->vdev_fault_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_FAULT_VDEV); + } } mutex_enter(&vd->vdev_probe_lock); @@ -1752,6 +1766,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; + vps->vps_zio_done_probe = (zio != NULL); if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* @@ -1778,15 +1793,6 @@ vdev_probe(vdev_t *vd, zio_t *zio) vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); - - /* - * We can't change the vdev state in this context, so we - * kick off an async task to do it on our behalf. - */ - if (zio != NULL) { - vd->vdev_probe_wanted = B_TRUE; - spa_async_request(spa, SPA_ASYNC_PROBE); - } } if (zio != NULL) @@ -3730,6 +3736,18 @@ vdev_load(vdev_t *vd) if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N, + &vd->vdev_slow_io_n); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); + + error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T, + &vd->vdev_slow_io_t); + if (error && error != ENOENT) + vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " + "failed [error=%d]", (u_longlong_t)zapobj, error); } /* @@ -5934,6 +5952,20 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) } vd->vdev_io_t = intval; break; + case VDEV_PROP_SLOW_IO_N: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_slow_io_n = intval; + break; + case VDEV_PROP_SLOW_IO_T: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->vdev_slow_io_t = intval; + break; default: /* Most processing is done in vdev_props_set_sync */ break; @@ -6269,6 +6301,8 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: case VDEV_PROP_IO_T: + case VDEV_PROP_SLOW_IO_N: + case VDEV_PROP_SLOW_IO_T: err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 737d8b33e188..5c0e750c4614 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1982,6 +1982,7 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) /* * If this isn't a resync due to I/O errors, * and nothing changed in this transaction group, + * and multihost protection isn't enabled, * and the vdev configuration hasn't changed, * then there's nothing to do. */ @@ -1989,7 +1990,8 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, txg, spa->spa_mmp.mmp_delay); - if (!changed && list_is_empty(&spa->spa_config_dirty_list)) + if (!changed && list_is_empty(&spa->spa_config_dirty_list) && + !spa_multihost(spa)) return (0); } diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 3445fa9d35d5..34ef57a3548d 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1283,8 +1283,9 @@ vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, static void vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { - int n, i, c, t, tt; - int nmissing_rows; + int i, c, t, tt; + unsigned int n; + unsigned int nmissing_rows; int missing_rows[VDEV_RAIDZ_MAXPARITY]; int parity_map[VDEV_RAIDZ_MAXPARITY]; uint8_t *p, *pp; diff --git a/module/zfs/zap.c b/module/zfs/zap.c index dde05d7005c2..da86defb445c 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -133,7 +133,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) * set up block 1 - the first leaf */ dmu_buf_t *db; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, 1<zt_numblks * 2); tbl->zt_nextblk = newblk; ASSERT0(tbl->zt_blks_copied); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, tbl->zt_blk << bs, tbl->zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); } @@ -193,21 +193,21 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, uint64_t b = tbl->zt_blks_copied; dmu_buf_t *db_old; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); if (err != 0) return (err); /* first half of entries in old[b] go to new[2*b+0] */ dmu_buf_t *db_new; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, @@ -255,7 +255,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, uint64_t off = idx & ((1<<(bs-3))-1); dmu_buf_t *db; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); @@ -267,7 +267,7 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, uint64_t off2 = idx2 & ((1<<(bs-3))-1); dmu_buf_t *db2; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, DMU_READ_NO_PREFETCH); if (err != 0) { @@ -296,16 +296,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) uint64_t blk = idx >> (bs-3); uint64_t off = idx & ((1<<(bs-3))-1); - /* - * Note: this is equivalent to dmu_buf_hold(), but we use - * _dnode_enter / _by_dnode because it's faster because we don't - * have to hold the dnode. - */ - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); dmu_buf_t *db; - int err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); *valp = ((uint64_t *)db->db_data)[off]; @@ -319,11 +312,9 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) */ blk = (idx*2) >> (bs-3); - dn = dmu_buf_dnode_enter(zap->zap_dbuf); - err = dmu_buf_hold_by_dnode(dn, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (tbl->zt_nextblk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err == 0) dmu_buf_rele(db, FTAG); } @@ -368,7 +359,7 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) uint64_t newblk = zap_allocate_blocks(zap, 1); dmu_buf_t *db_new; - int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, DMU_READ_NO_PREFETCH); if (err != 0) @@ -433,7 +424,7 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx) l->l_blkid = zap_allocate_blocks(zap, 1); l->l_dbuf = NULL; - VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + VERIFY0(dmu_buf_hold_by_dnode(zap->zap_dnode, l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, DMU_READ_NO_PREFETCH)); dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); @@ -533,10 +524,8 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, return (SET_ERROR(ENOENT)); int bs = FZAP_BLOCK_SHIFT(zap); - dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); - int err = dmu_buf_hold_by_dnode(dn, + int err = dmu_buf_hold_by_dnode(zap->zap_dnode, blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); - dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); @@ -985,7 +974,7 @@ fzap_prefetch(zap_name_t *zn) if (zap_idx_to_blk(zap, idx, &blk) != 0) return; int bs = FZAP_BLOCK_SHIFT(zap); - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, blk << bs, 1 << bs, ZIO_PRIORITY_SYNC_READ); } @@ -1228,7 +1217,7 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) */ if (zc->zc_hash == 0 && zap_iterate_prefetch && zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { - dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, 0, zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), ZIO_PRIORITY_ASYNC_READ); } @@ -1356,7 +1345,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); } else { - dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + dmu_prefetch_by_dnode(zap->zap_dnode, 0, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); @@ -1366,7 +1355,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) dmu_buf_t *db; int err; - err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + err = dmu_buf_hold_by_dnode(zap->zap_dnode, (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c index e6afb1c58c95..032aca92695e 100644 --- a/module/zfs/zap_leaf.c +++ b/module/zfs/zap_leaf.c @@ -41,7 +41,8 @@ #include #include -static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); +static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, + uint16_t entry); #define CHAIN_END 0xffff /* end of the chunk chain */ @@ -52,16 +53,6 @@ static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); #define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)]) -static void -zap_memset(void *a, int c, size_t n) -{ - char *cp = a; - char *cpend = cp + n; - - while (cp < cpend) - *cp++ = c; -} - static void stv(int len, void *addr, uint64_t value) { @@ -79,7 +70,7 @@ stv(int len, void *addr, uint64_t value) *(uint64_t *)addr = value; return; default: - cmn_err(CE_PANIC, "bad int len %d", len); + PANIC("bad int len %d", len); } } @@ -96,13 +87,13 @@ ldv(int len, const void *addr) case 8: return (*(uint64_t *)addr); default: - cmn_err(CE_PANIC, "bad int len %d", len); + PANIC("bad int len %d", len); } return (0xFEEDFACEDEADBEEFULL); } void -zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) +zap_leaf_byteswap(zap_leaf_phys_t *buf, size_t size) { zap_leaf_t l; dmu_buf_t l_dbuf; @@ -119,10 +110,10 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len); buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist); - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) + for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++) buf->l_hash[i] = BSWAP_16(buf->l_hash[i]); - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) { zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i); struct zap_leaf_entry *le; @@ -160,11 +151,11 @@ void zap_leaf_init(zap_leaf_t *l, boolean_t sort) { l->l_bs = highbit64(l->l_dbuf->db_size) - 1; - zap_memset(&zap_leaf_phys(l)->l_hdr, 0, + memset(&zap_leaf_phys(l)->l_hdr, 0, sizeof (struct zap_leaf_header)); - zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE; ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1; } @@ -185,7 +176,7 @@ zap_leaf_chunk_alloc(zap_leaf_t *l) { ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); - int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; + uint_t chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); @@ -223,28 +214,29 @@ zap_leaf_array_create(zap_leaf_t *l, const char *buf, { uint16_t chunk_head; uint16_t *chunkp = &chunk_head; - int byten = 0; + int byten = integer_size; uint64_t value = 0; int shift = (integer_size - 1) * 8; int len = num_integers; ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN); + if (len > 0) + value = ldv(integer_size, buf); while (len > 0) { uint16_t chunk = zap_leaf_chunk_alloc(l); struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; la->la_type = ZAP_CHUNK_ARRAY; for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { - if (byten == 0) - value = ldv(integer_size, buf); la->la_array[i] = value >> shift; value <<= 8; - if (++byten == integer_size) { - byten = 0; - buf += integer_size; + if (--byten == 0) { if (--len == 0) break; + byten = integer_size; + buf += integer_size; + value = ldv(integer_size, buf); } } @@ -264,7 +256,7 @@ zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) *chunkp = CHAIN_END; while (chunk != CHAIN_END) { - int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; + uint_t nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, ZAP_CHUNK_ARRAY); zap_leaf_chunk_free(l, chunk); @@ -333,7 +325,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, static boolean_t zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, - int chunk, int array_numints) + uint_t chunk, int array_numints) { int bseen = 0; @@ -562,7 +554,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, uint64_t valuelen = integer_size * num_integers; - int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * + uint_t numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) return (SET_ERROR(E2BIG)); @@ -624,7 +616,7 @@ zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, /* link it into the hash chain */ /* XXX if we did the search above, we could just use that */ - uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk); + uint16_t *chunkp = zap_leaf_rehash_entry(l, le, chunk); zap_leaf_phys(l)->l_hdr.lh_nentries++; @@ -687,9 +679,8 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, */ static uint16_t * -zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) +zap_leaf_rehash_entry(zap_leaf_t *l, struct zap_leaf_entry *le, uint16_t entry) { - struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); struct zap_leaf_entry *le2; uint16_t *chunkp; @@ -722,7 +713,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) &ZAP_LEAF_CHUNK(nl, nchunk).l_array; struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int nextchunk = la->la_next; + uint_t nextchunk = la->la_next; ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); @@ -739,7 +730,7 @@ zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) } static void -zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) +zap_leaf_transfer_entry(zap_leaf_t *l, uint_t entry, zap_leaf_t *nl) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); @@ -748,7 +739,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk); *nle = *le; /* structure assignment */ - (void) zap_leaf_rehash_entry(nl, chunk); + (void) zap_leaf_rehash_entry(nl, nle, chunk); nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); nle->le_value_chunk = @@ -766,7 +757,7 @@ zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) { - int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; + uint_t bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* set new prefix and prefix_len */ zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1; @@ -777,7 +768,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) zap_leaf_phys(l)->l_hdr.lh_prefix_len; /* break existing hash chains */ - zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END, + memset(zap_leaf_phys(l)->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l)); if (sort) @@ -792,7 +783,7 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) * but this accesses memory more sequentially, and when we're * called, the block is usually pretty full. */ - for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { + for (uint_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i); if (le->le_type != ZAP_CHUNK_ENTRY) continue; @@ -800,14 +791,14 @@ zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort) if (le->le_hash & (1ULL << bit)) zap_leaf_transfer_entry(l, i, nl); else - (void) zap_leaf_rehash_entry(l, i); + (void) zap_leaf_rehash_entry(l, le, i); } } void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) { - int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + uint_t n = zap_f_phys(zap)->zap_ptrtbl.zt_shift - zap_leaf_phys(l)->l_hdr.lh_prefix_len; n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_leafs_with_2n_pointers[n]++; @@ -823,9 +814,9 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_blocks_n_tenths_full[n]++; - for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { - int nentries = 0; - int chunk = zap_leaf_phys(l)->l_hash[i]; + for (uint_t i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) { + uint_t nentries = 0; + uint_t chunk = zap_leaf_phys(l)->l_hash[i]; while (chunk != CHAIN_END) { struct zap_leaf_entry *le = diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 085d9cd8b4b6..d806988af96d 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -415,7 +415,7 @@ mze_destroy(zap_t *zap) } static zap_t * -mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) +mzap_open(dmu_buf_t *db) { zap_t *winner; uint64_t *zap_hdr = (uint64_t *)db->db_data; @@ -427,8 +427,8 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL); rw_enter(&zap->zap_rwlock, RW_WRITER); - zap->zap_objset = os; - zap->zap_object = obj; + zap->zap_objset = dmu_buf_get_objset(db); + zap->zap_object = db->db_object; zap->zap_dbuf = db; if (zap_block_type != ZBT_MICRO) { @@ -518,7 +518,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) * have the specified tag. */ static int -zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, +zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { ASSERT0(db->db_offset); @@ -528,13 +528,13 @@ zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, *zapp = NULL; - dmu_object_info_from_db(db, &doi); + dmu_object_info_from_dnode(dn, &doi); if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) return (SET_ERROR(EINVAL)); zap_t *zap = dmu_buf_get_user(db); if (zap == NULL) { - zap = mzap_open(os, obj, db); + zap = mzap_open(db); if (zap == NULL) { /* * mzap_open() didn't like what it saw on-disk. @@ -563,6 +563,7 @@ zap_lockdir_impl(dmu_buf_t *db, const void *tag, dmu_tx_t *tx, } zap->zap_objset = os; + zap->zap_dnode = dn; if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); @@ -598,23 +599,16 @@ zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, zap_t **zapp) { dmu_buf_t *db; + int err; - int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); - if (err != 0) { + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) return (err); - } -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); - } -#endif - - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) { + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) dmu_buf_rele(db, tag); - } + else + VERIFY(dnode_add_ref(dn, tag)); return (err); } @@ -623,21 +617,23 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag, zap_t **zapp) { + dnode_t *dn; dmu_buf_t *db; + int err; - int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); + err = dnode_hold(os, obj, tag, &dn); if (err != 0) return (err); -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(db, &doi); - ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); + err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + dnode_rele(dn, tag); + return (err); } -#endif - err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); - if (err != 0) + err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) { dmu_buf_rele(db, tag); + dnode_rele(dn, tag); + } return (err); } @@ -645,6 +641,7 @@ void zap_unlockdir(zap_t *zap, const void *tag) { rw_exit(&zap->zap_rwlock); + dnode_rele(zap->zap_dnode, tag); dmu_buf_rele(zap->zap_dbuf, tag); } @@ -730,7 +727,8 @@ mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx) if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. */ - VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, + VERIFY(dnode_add_ref(dn, FTAG)); + VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER, B_FALSE, B_FALSE, &zap)); VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); zap_unlockdir(zap, FTAG); @@ -1325,6 +1323,26 @@ zap_add_by_dnode(dnode_t *dn, const char *key, return (err); } +static int +zap_add_uint64_impl(zap_t *zap, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_add(zn, integer_size, num_integers, val, tag, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap, tag); + return (err); +} + int zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, @@ -1336,16 +1354,26 @@ zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); - zap = zn->zn_zap; /* fzap_add() may change zap */ - zap_name_free(zn); - if (zap != NULL) /* may be NULL if fzap_add() failed */ - zap_unlockdir(zap, FTAG); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_add_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1396,27 +1424,56 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, return (err); } -int -zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, - int key_numints, - int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +static int +zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, + const void *tag) { - zap_t *zap; + int err; - int err = - zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); - if (err != 0) - return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { - zap_unlockdir(zap, FTAG); + zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } - err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); + err = fzap_update(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ - zap_unlockdir(zap, FTAG); + zap_unlockdir(zap, tag); + return (err); +} + +int +zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, const void *val, + dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_update_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_update_uint64_impl(zap, key, key_numints, + integer_size, num_integers, val, tx, FTAG); + /* zap_update_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1481,6 +1538,23 @@ zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) return (err); } +static int +zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints, + dmu_tx_t *tx, const void *tag) +{ + int err; + + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_remove(zn, tx); + zap_name_free(zn); + zap_unlockdir(zap, tag); + return (err); +} + int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx) @@ -1491,14 +1565,23 @@ zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); - zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); - if (zn == NULL) { - zap_unlockdir(zap, FTAG); - return (SET_ERROR(ENOTSUP)); - } - err = fzap_remove(zn, tx); - zap_name_free(zn); - zap_unlockdir(zap, FTAG); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints, + dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG); + /* zap_remove_uint64_impl() calls zap_unlockdir() */ return (err); } @@ -1704,14 +1787,17 @@ EXPORT_SYMBOL(zap_prefetch_uint64); EXPORT_SYMBOL(zap_add); EXPORT_SYMBOL(zap_add_by_dnode); EXPORT_SYMBOL(zap_add_uint64); +EXPORT_SYMBOL(zap_add_uint64_by_dnode); EXPORT_SYMBOL(zap_update); EXPORT_SYMBOL(zap_update_uint64); +EXPORT_SYMBOL(zap_update_uint64_by_dnode); EXPORT_SYMBOL(zap_length); EXPORT_SYMBOL(zap_length_uint64); EXPORT_SYMBOL(zap_remove); EXPORT_SYMBOL(zap_remove_by_dnode); EXPORT_SYMBOL(zap_remove_norm); EXPORT_SYMBOL(zap_remove_uint64); +EXPORT_SYMBOL(zap_remove_uint64_by_dnode); EXPORT_SYMBOL(zap_count); EXPORT_SYMBOL(zap_value_search); EXPORT_SYMBOL(zap_join); diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index c4eb74e873db..481af2ba826b 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -222,6 +222,12 @@ vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop) case VDEV_PROP_IO_T: propval = vd->vdev_io_t; break; + case VDEV_PROP_SLOW_IO_N: + propval = vd->vdev_slow_io_n; + break; + case VDEV_PROP_SLOW_IO_T: + propval = vd->vdev_slow_io_t; + break; default: propval = propdef; break; @@ -741,6 +747,26 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, NULL); } + if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { + uint64_t slow_io_n, slow_io_t; + + slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N); + if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N, + DATA_TYPE_UINT64, + slow_io_n, + NULL); + + slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T); + if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T, + DATA_TYPE_UINT64, + slow_io_t, + NULL); + } + mutex_exit(&spa->spa_errlist_lock); *ereport_out = ereport; diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 01e16e102fc4..eee6ad62ff30 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -27,7 +27,7 @@ * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -1887,7 +1887,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); if (error == 0) { - error = spa_vdev_add(spa, config); + error = spa_vdev_add(spa, config, zc->zc_flags); nvlist_free(config); } spa_close(spa, FTAG); @@ -5816,10 +5816,13 @@ zfs_ioc_clear(zfs_cmd_t *zc) /* * If multihost is enabled, resuming I/O is unsafe as another - * host may have imported the pool. + * host may have imported the pool. Check for remote activity. */ - if (spa_multihost(spa) && spa_suspended(spa)) - return (SET_ERROR(EINVAL)); + if (spa_multihost(spa) && spa_suspended(spa) && + spa_mmp_remote_host_activity(spa)) { + spa_close(spa, FTAG); + return (SET_ERROR(EREMOTEIO)); + } spa_vdev_state_enter(spa, SCL_NONE); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index d0b4016237b9..046e6d64c1a9 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2503,8 +2503,10 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); - cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O " - "failure and has been suspended.\n", spa_name(spa)); + if (reason != ZIO_SUSPEND_MMP) { + cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable " + "I/O failure and has been suspended.\n", spa_name(spa)); + } (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, NULL, 0); diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index 3598351c499d..66ad72fb88e9 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -605,6 +605,12 @@ zio_handle_io_delay(zio_t *zio) if (vd->vdev_guid != handler->zi_record.zi_guid) continue; + /* also match on I/O type (e.g., -T read) */ + if (handler->zi_record.zi_iotype != ZIO_TYPES && + handler->zi_record.zi_iotype != zio->io_type) { + continue; + } + /* * Defensive; should never happen as the array allocation * occurs prior to inserting this handler on the list. diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 35b624de7db2..d2f718bb3e29 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -965,7 +965,7 @@ zvol_prefetch_minors_impl(void *arg) job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (job->error == 0) { - dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ); dmu_objset_disown(os, B_TRUE, FTAG); } } diff --git a/rpm/redhat/zfs-kmod.spec.in b/rpm/redhat/zfs-kmod.spec.in index 9c836786baea..876c198c64de 100644 --- a/rpm/redhat/zfs-kmod.spec.in +++ b/rpm/redhat/zfs-kmod.spec.in @@ -17,7 +17,7 @@ BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) # by generating a preamble text file which kmodtool can append to the spec file. %(/bin/echo -e "\ Requires: @PACKAGE@ = %{version}\n\ -Conflicts: @PACKAGE@-dkms) +Conflicts: @PACKAGE@-dkms" > %{_sourcedir}/kmod-preamble) # LDFLAGS are not sanitized by arch/*/Makefile for these architectures. %ifarch ppc ppc64 ppc64le aarch64 diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index c4afde554da5..3e1a3aeb6cbe 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -372,7 +372,8 @@ tags = ['functional', 'cli_root', 'zpool'] tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', - 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output'] + 'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output', + 'zpool_add--allow-ashift-mismatch'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_attach] @@ -458,7 +459,8 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'import_devices_missing', 'import_log_missing', 'import_paths_changed', 'import_rewind_config_changed', - 'import_rewind_device_replaced'] + 'import_rewind_device_replaced', + 'zpool_import_status'] tags = ['functional', 'cli_root', 'zpool_import'] timeout = 1200 diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 6a4cd3fe691c..92ce09ec6fcb 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -104,7 +104,8 @@ tags = ['functional', 'devices'] [tests/functional/events:Linux] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill', - 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config'] + 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config', + 'zed_slow_io', 'zed_slow_io_many_vdevs'] tags = ['functional', 'events'] [tests/functional/fadvise:Linux] @@ -145,7 +146,7 @@ tags = ['functional', 'mmap'] tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', 'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import', 'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history', - 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid'] + 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid', 'mmp_write_slow_disk'] tags = ['functional', 'mmp'] [tests/functional/mount:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index f587e265f15e..cc66d762f3c2 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -989,6 +989,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_add/add_prop_ashift.ksh \ functional/cli_root/zpool_add/cleanup.ksh \ functional/cli_root/zpool_add/setup.ksh \ + functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh \ functional/cli_root/zpool_add/zpool_add_001_pos.ksh \ functional/cli_root/zpool_add/zpool_add_002_pos.ksh \ functional/cli_root/zpool_add/zpool_add_003_pos.ksh \ @@ -1447,6 +1448,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/events/zed_fd_spill.ksh \ functional/events/zed_io_config.ksh \ functional/events/zed_rc_filter.ksh \ + functional/events/zed_slow_io.ksh \ + functional/events/zed_slow_io_many_vdevs.ksh \ functional/exec/cleanup.ksh \ functional/exec/exec_001_pos.ksh \ functional/exec/exec_002_neg.ksh \ @@ -1585,6 +1588,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/mmp/mmp_on_zdb.ksh \ functional/mmp/mmp_reset_interval.ksh \ functional/mmp/mmp_write_distribution.ksh \ + functional/mmp/mmp_write_slow_disk.ksh \ functional/mmp/mmp_write_uberblocks.ksh \ functional/mmp/multihost_history.ksh \ functional/mmp/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib index 3b8eaea5bb54..84b92b4dcdc9 100644 --- a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib +++ b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib @@ -97,20 +97,19 @@ function verify_pool_prop_eq function verify_pool_props { - typeset -r dsize=$1 - typeset -r ratio=$2 + typeset -r oused=$1 + typeset -r osaved=$2 + typeset dsize=$3 + typeset ratio=$4 if [[ $dsize -eq 0 ]]; then - verify_pool_prop_eq bcloneused 0 - verify_pool_prop_eq bclonesaved 0 - verify_pool_prop_eq bcloneratio 1.00 - else - if [[ $ratio -eq 1 ]]; then - verify_pool_prop_eq bcloneused 0 - else - verify_pool_prop_eq bcloneused $dsize - fi - verify_pool_prop_eq bclonesaved $((dsize*(ratio-1))) + ratio=1 + elif [[ $ratio -eq 1 ]]; then + dsize=0 + fi + verify_pool_prop_eq bcloneused $(($oused+$dsize)) + verify_pool_prop_eq bclonesaved $(($osaved+dsize*(ratio-1))) + if [[ $oused -eq 0 ]]; then verify_pool_prop_eq bcloneratio "${ratio}.00" fi } @@ -124,16 +123,22 @@ function bclone_test typeset -r srcdir=$4 typeset -r dstdir=$5 typeset dsize + typeset oused + typeset osaved typeset -r original="${srcdir}/original" typeset -r clone="${dstdir}/clone" log_note "Testing file copy with datatype $datatype, file size $filesize, embedded $embedded" + # Save current block cloning stats for later use. + sync_pool $TESTPOOL + oused=$(get_pool_prop bcloneused $TESTPOOL) + osaved=$(get_pool_prop bclonesaved $TESTPOOL) + # Create a test file with known content. case $datatype in random|text) - sync_pool $TESTPOOL if [[ $datatype = "random" ]]; then dd if=/dev/urandom of=$original bs=$filesize count=1 2>/dev/null else @@ -146,13 +151,13 @@ function bclone_test sync_pool $TESTPOOL # It is hard to predict block sizes that will be used, # so just do one clone and take it from bcloneused. - filesize=$(zpool get -Hp -o value bcloneused $TESTPOOL) + dsize=$(get_pool_prop bcloneused $TESTPOOL) + dsize=$(($dsize-$oused)) if [[ $embedded = "false" ]]; then - log_must test $filesize -gt 0 + log_must test $dsize -gt 0 fi rm -f "${clone}-tmp" sync_pool $TESTPOOL - dsize=$filesize ;; hole) log_must truncate_test -s $filesize -f $original @@ -217,7 +222,7 @@ function bclone_test test_file_integrity $original_checksum "${clone}4" $filesize test_file_integrity $original_checksum "${clone}5" $filesize - verify_pool_props $dsize 7 + verify_pool_props $oused $osaved $dsize 7 # Clear cache and test after fresh import. log_must zpool export $TESTPOOL @@ -240,7 +245,7 @@ function bclone_test sync_pool $TESTPOOL - verify_pool_props $dsize 11 + verify_pool_props $oused $osaved $dsize 11 log_must zpool export $TESTPOOL log_must zpool import $TESTPOOL @@ -268,7 +273,7 @@ function bclone_test test_file_integrity $original_checksum "${clone}8" $filesize test_file_integrity $original_checksum "${clone}9" $filesize - verify_pool_props $dsize 6 + verify_pool_props $oused $osaved $dsize 6 rm -f "${clone}0" "${clone}2" "${clone}4" "${clone}8" "${clone}9" @@ -276,11 +281,11 @@ function bclone_test test_file_integrity $original_checksum "${clone}6" $filesize - verify_pool_props $dsize 1 + verify_pool_props $oused $osaved $dsize 1 rm -f "${clone}6" sync_pool $TESTPOOL - verify_pool_props $dsize 1 + verify_pool_props $oused $osaved $dsize 1 } diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib b/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib index ddfbfc999c4e..aeb8efe91715 100644 --- a/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib +++ b/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib @@ -66,7 +66,7 @@ function bclone_corner_cases_init export SECOND_HALF_ORIG0_CHECKSUM=$(second_half_checksum $ORIG0) export SECOND_HALF_ORIG1_CHECKSUM=$(second_half_checksum $ORIG1) export SECOND_HALF_ORIG2_CHECKSUM=$(second_half_checksum $ORIG2) - export ZEROS_CHECKSUM=$(dd if=/dev/zero bs=$HALFRECORDSIZE count=1 | sha256digest) + export ZEROS_CHECKSUM=$(dd if=/dev/zero bs=$HALFRECORDSIZE count=1 2>/dev/null | sha256digest) export FIRST_HALF_CHECKSUM="" export SECOND_HALF_CHECKSUM="" } @@ -210,6 +210,8 @@ function bclone_corner_cases_test typeset -r dstdir=$2 typeset limit=$3 typeset -i count=0 + typeset oused + typeset osaved if [[ $srcdir != "count" ]]; then if [[ -n "$limit" ]]; then @@ -217,6 +219,11 @@ function bclone_corner_cases_test limit=$(random_int_between 1 $total_count $((limit*2)) | sort -nu | head -n $limit | xargs) fi bclone_corner_cases_init $srcdir $dstdir + + # Save current block cloning stats for later use. + sync_pool $TESTPOOL + oused=$(get_pool_prop bcloneused $TESTPOOL) + osaved=$(get_pool_prop bclonesaved $TESTPOOL) fi # @@ -285,21 +292,24 @@ function bclone_corner_cases_test overwrite_clone "$second_overwrite" if checksum_compare $read_after; then - log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after" + log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after" else - log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after" + log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after" fi log_must zpool export $TESTPOOL log_must zpool import $TESTPOOL if checksum_compare "yes"; then - log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after / read_next_txg" + log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after / read_next_txg" else - log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after / read_next_txg" + log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / second_overwrite: $second_overwrite / read_after: $read_after / read_next_txg" fi rm -f "$CLONE" + sync_pool $TESTPOOL + verify_pool_prop_eq bcloneused $oused + verify_pool_prop_eq bclonesaved $osaved done done done diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh index 7ecaf849e44b..51871934dd22 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh @@ -22,7 +22,7 @@ # # Copyright 2017, loli10K. All rights reserved. -# Copyright (c) 2020 by Delphix. All rights reserved. +# Copyright (c) 2020, 2024 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -60,12 +60,23 @@ log_must mkfile $SIZE $disk2 logical_ashift=$(get_tunable VDEV_FILE_LOGICAL_ASHIFT) orig_ashift=$(get_tunable VDEV_FILE_PHYSICAL_ASHIFT) max_auto_ashift=$(get_tunable VDEV_MAX_AUTO_ASHIFT) +opt="" typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do + # + # Need to add the --allow-ashift-mismatch option to disable the + # ashift mismatch checks in zpool add. + # + if [[ $ashift -eq $orig_ashift ]]; then + opt="" + else + opt="--allow-ashift-mismatch" + fi + log_must zpool create $TESTPOOL $disk1 - log_must zpool add -o ashift=$ashift $TESTPOOL $disk2 + log_must zpool add $opt -o ashift=$ashift $TESTPOOL $disk2 log_must verify_ashift $disk2 $ashift # clean things for the next run @@ -78,7 +89,7 @@ do # log_must zpool create $TESTPOOL $disk1 log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT $ashift - log_must zpool add $TESTPOOL $disk2 + log_must zpool add $opt $TESTPOOL $disk2 exp=$(( (ashift <= max_auto_ashift) ? ashift : logical_ashift )) log_must verify_ashift $disk2 $exp diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh index 228f62232aae..6a3283d0618f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh @@ -22,7 +22,7 @@ # # Copyright 2017, loli10K. All rights reserved. -# Copyright (c) 2020 by Delphix. All rights reserved. +# Copyright (c) 2020, 2024 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib @@ -68,8 +68,13 @@ log_must set_tunable32 VDEV_FILE_PHYSICAL_ASHIFT 16 typeset ashifts=("9" "10" "11" "12" "13" "14" "15" "16") for ashift in ${ashifts[@]} do + if [ $ashift -eq $orig_ashift ];then + opt="" + else + opt="--allow-ashift-mismatch" + fi log_must zpool create -o ashift=$ashift $TESTPOOL $disk1 - log_must zpool add $TESTPOOL $disk2 + log_must zpool add $opt $TESTPOOL $disk2 log_must verify_ashift $disk2 $ashift # clean things for the next run @@ -82,8 +87,13 @@ for ashift in ${ashifts[@]} do for cmdval in ${ashifts[@]} do + if [ $ashift -eq $cmdval ];then + opt="" + else + opt="--allow-ashift-mismatch" + fi log_must zpool create -o ashift=$ashift $TESTPOOL $disk1 - log_must zpool add -o ashift=$cmdval $TESTPOOL $disk2 + log_must zpool add $opt -o ashift=$cmdval $TESTPOOL $disk2 log_must verify_ashift $disk2 $cmdval # clean things for the next run diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh new file mode 100755 index 000000000000..e69de29bb2d1 diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh index c5c06f76340b..afee34a33469 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh @@ -65,4 +65,15 @@ log_mustnot vdevs_in_pool $TESTPOOL $DISK2 log_must zpool add -f $TESTPOOL $DISK2 log_must vdevs_in_pool $TESTPOOL $DISK2 +log_must zpool destroy $TESTPOOL + +create_pool $TESTPOOL mirror $DISK0 $DISK1 +log_must poolexists $TESTPOOL + +log_mustnot zpool add $TESTPOOL $DISK2 +log_mustnot vdevs_in_pool $TESTPOOL $DISK2 + +log_must zpool add --allow-replication-mismatch $TESTPOOL $DISK2 +log_must vdevs_in_pool $TESTPOOL $DISK2 + log_pass "'zpool add -f ...' executes successfully." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh index 646edc1a4557..cecda56ab125 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh @@ -70,7 +70,7 @@ if is_freebsd; then recursive=$(get_tunable VOL_RECURSIVE) log_must set_tunable64 VOL_RECURSIVE 1 fi -log_must zpool add $TESTPOOL $ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL +log_must zpool add --allow-ashift-mismatch $TESTPOOL $ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL log_must vdevs_in_pool "$TESTPOOL" "$ZVOL_DEVDIR/$TESTPOOL1/$TESTVOL" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh index 4990ef9d29b0..0e9d9f5f030f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh @@ -75,7 +75,9 @@ log_must poolexists $TESTPOOL1 unset NOINUSE_CHECK log_mustnot zpool add -f $TESTPOOL $DISK1 +log_mustnot zpool add --allow-in-use $TESTPOOL $DISK1 log_mustnot zpool add -f $TESTPOOL $mnttab_dev +log_mustnot zpool add --allow-in-use $TESTPOOL $mnttab_dev if is_linux; then log_mustnot zpool add $TESTPOOL $vfstab_dev else diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh index d7f3a900e8fd..a13a27160e76 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh @@ -64,7 +64,9 @@ log_mustnot zpool add -f $TESTPOOL $DISK0 for type in "" "mirror" "raidz" "draid" "spare" "log" "dedup" "special" "cache" do log_mustnot zpool add -f $TESTPOOL $type $DISK0 $DISK1 + log_mustnot zpool add --allow-in-use $TESTPOOL $type $DISK0 $DISK1 log_mustnot zpool add -f $TESTPOOL $type $DISK1 $DISK1 + log_mustnot zpool add --allow-in-use $TESTPOOL $type $DISK1 $DISK1 done log_pass "'zpool add' get fail as expected if vdevs are the same or vdev is " \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh index b8b25db1b9f9..22860e9caf1d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh @@ -138,7 +138,7 @@ function zpool_create_forced_add while ((j < ${#add_args[@]})); do log_must zpool create $TESTPOOL1 ${create_args[$i]} log_mustnot zpool add $TESTPOOL1 ${add_args[$j]} - log_must zpool add -f $TESTPOOL1 ${add_args[$j]} + log_must zpool add --allow-replication-mismatch $TESTPOOL1 ${add_args[$j]} log_must zpool destroy -f $TESTPOOL1 ((j += 1)) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg index 71a64d4fae7a..c3b9efd6464a 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg @@ -70,4 +70,6 @@ typeset -a properties=( checksum_t io_n io_t + slow_io_n + slow_io_t ) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh new file mode 100755 index 000000000000..c96961bf6419 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_status.ksh @@ -0,0 +1,132 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2023 Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.cfg + +# +# DESCRIPTION: +# During a pool import, the 'import_progress' kstat contains details +# on the import progress. +# +# STRATEGY: +# 1. Create test pool with several devices +# 2. Generate some ZIL records and spacemap logs +# 3. Export the pool +# 4. Import the pool in the background and monitor the kstat content +# 5. Check the zfs debug messages for import progress +# + +verify_runnable "global" + +function cleanup +{ + log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 0 + log_must set_tunable64 METASLAB_DEBUG_LOAD 0 + + destroy_pool $TESTPOOL1 +} + +log_assert "During a pool import, the 'import_progress' kstat contains " \ + "notes on the progress" + +log_onexit cleanup + +log_must zpool create $TESTPOOL1 $VDEV0 $VDEV1 $VDEV2 +typeset guid=$(zpool get -H -o value guid $TESTPOOL1) + +log_must zfs create -o recordsize=8k $TESTPOOL1/fs +# +# This dd command works around an issue where ZIL records aren't created +# after freezing the pool unless a ZIL header already exists. Create a file +# synchronously to force ZFS to write one out. +# +log_must dd if=/dev/zero of=/$TESTPOOL1/fs/sync conv=fsync bs=1 count=1 + +# +# Overwrite some blocks to populate spacemap logs +# +log_must dd if=/dev/urandom of=/$TESTPOOL1/fs/00 bs=1M count=200 +sync_all_pools +log_must dd if=/dev/urandom of=/$TESTPOOL1/fs/00 bs=1M count=200 +sync_all_pools + +# +# Freeze the pool to retain intent log records +# +log_must zpool freeze $TESTPOOL1 + +# fill_fs [destdir] [dirnum] [filenum] [bytes] [num_writes] [data] +log_must fill_fs /$TESTPOOL1/fs 1 2000 100 1024 R + +log_must zpool list -v $TESTPOOL1 + +# +# Unmount filesystem and export the pool +# +# At this stage the zfs intent log contains +# a set of records to replay. +# +log_must zfs unmount /$TESTPOOL1/fs + +log_must set_tunable64 KEEP_LOG_SPACEMAPS_AT_EXPORT 1 +log_must zpool export $TESTPOOL1 + +log_must set_tunable64 METASLAB_DEBUG_LOAD 1 +log_note "Starting zpool import in background at" $(date +'%H:%M:%S') +zpool import -d $DEVICE_DIR -f $guid & +pid=$! + +# +# capture progress until import is finished +# +log_note waiting for pid $pid to exit +kstat import_progress +while [[ -d /proc/"$pid" ]]; do + line=$(kstat import_progress | grep -v pool_guid) + if [[ -n $line ]]; then + echo $line + fi + if [[ -f /$TESTPOOL1/fs/00 ]]; then + break; + fi + sleep 0.0001 +done +log_note "zpool import completed at" $(date +'%H:%M:%S') + +entries=$(kstat dbgmsg | grep "spa_import_progress_set_notes_impl(): 'testpool1'" | wc -l) +log_note "found $entries progress notes in dbgmsg" +log_must test $entries -gt 20 + +log_must zpool status $TESTPOOL1 + +log_pass "During a pool import, the 'import_progress' kstat contains " \ + "notes on the progress" diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh index 60817449ab03..4db968ffae05 100755 --- a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh @@ -76,7 +76,7 @@ log_onexit cleanup SRC_FILE=src.data DST_FILE=dst.data -SRC_SIZE=$(($RANDOM % 2048)) +SRC_SIZE=$((1024 + $RANDOM % 1024)) # A smaller recordsize is used merely to speed up the test. RECORDSIZE=4096 @@ -120,7 +120,7 @@ for mode in "never" "auto" "always"; do # Overwrite a random range of an existing file and immediately copy it. sync_pool $TESTPOOL log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \ - seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc + seek=$(($RANDOM % $SRC_SIZE)) count=$((1 + $RANDOM % 16)) conv=notrunc if [[ "$mode" == "always" ]]; then log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE log_must ls -l $CP_TESTDIR @@ -152,7 +152,7 @@ for mode in "never" "auto" "always"; do # Overwrite a random range of an existing file and immediately copy it. log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \ - seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc + seek=$(($RANDOM % $SRC_SIZE)) count=$((1 + $RANDOM % 16)) conv=notrunc log_must cp --reflink=$mode $SRC_FILE $DST_FILE verify_copy $SRC_FILE $DST_FILE log_must rm -f $SRC_FILE $DST_FILE diff --git a/tests/zfs-tests/tests/functional/events/cleanup.ksh b/tests/zfs-tests/tests/functional/events/cleanup.ksh index ef6e098cf42a..669b8ae99456 100755 --- a/tests/zfs-tests/tests/functional/events/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/events/cleanup.ksh @@ -26,8 +26,10 @@ . $STF_SUITE/include/libtest.shlib +zed_stop + zed_cleanup all-debug.sh all-syslog.sh all-dumpfds -zed_stop +zed_events_drain default_cleanup diff --git a/tests/zfs-tests/tests/functional/events/zed_slow_io.ksh b/tests/zfs-tests/tests/functional/events/zed_slow_io.ksh new file mode 100755 index 000000000000..d9fabb2c3bc9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/zed_slow_io.ksh @@ -0,0 +1,205 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +# DESCRIPTION: +# Verify that vdev properties, slow_io_n and slow_io_t, work with ZED. +# +# STRATEGY: +# 1. Create a pool with single vdev +# 2. Set slow_io_n/slow_io_t to non-default values +# 3. Inject slow io errors +# 4. Verify that ZED degrades vdev +# + +. $STF_SUITE/include/libtest.shlib + +TESTDIR="$TEST_BASE_DIR/zed_slow_io" +VDEV="$TEST_BASE_DIR/vdevfile.$$" +TESTPOOL="slow_io_pool" +FILEPATH="$TESTDIR/slow_io.testfile" + +OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS) +OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND) + +verify_runnable "both" + +function do_setup +{ + log_must truncate -s 1G $VDEV + default_setup_noexit $VDEV + zed_events_drain + log_must zfs set compression=off $TESTPOOL + log_must zfs set primarycache=none $TESTPOOL + log_must zfs set prefetch=none $TESTPOOL + log_must zfs set recordsize=512 $TESTPOOL + for i in {1..10}; do + dd if=/dev/urandom of=${FILEPATH}$i bs=512 count=1 2>/dev/null + done + zpool sync +} + +# intermediate cleanup +function do_clean +{ + log_must zinject -c all + log_must zpool destroy $TESTPOOL + log_must rm -f $VDEV +} + +# final cleanup +function cleanup +{ + log_must zinject -c all + + # if pool still exists then something failed so log additional info + if poolexists $TESTPOOL ; then + log_note "$(zpool status -s $TESTPOOL)" + echo "=================== zed log search ===================" + grep "Diagnosis Engine" $ZEDLET_DIR/zed.log + destroy_pool $TESTPOOL + fi + log_must zed_stop + + log_must rm -f $VDEV + + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS +} + +function start_slow_io +{ + zpool sync + log_must set_tunable64 ZIO_SLOW_IO_MS 10 + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000 + + log_must zinject -d $VDEV -D10:1 -T read $TESTPOOL + zpool sync +} + +function stop_slow_io +{ + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS + + log_must zinject -c all +} + +# Test default ZED settings: +# inject 10 events over 2.5 seconds, should not degrade. +function default_degrade +{ + do_setup + + start_slow_io + for i in {1..10}; do + dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null + sleep 0.25 + done + stop_slow_io + log_note "$(zpool status -s $TESTPOOL)" + + # give slow ZED a chance to process the delay events + sleep 18 + log_note "$(zpool status -s $TESTPOOL)" + + degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l) + log_note $degrades vdev degrades in ZED log + [ $degrades -eq "0" ] || \ + log_fail "expecting no degrade events, found $degrades" + + do_clean +} + +# change slow_io_n, slow_io_t to 5 events in 60 seconds +# fire more than 5 events, should degrade +function slow_io_degrade +{ + do_setup + + zpool set slow_io_n=5 $TESTPOOL $VDEV + zpool set slow_io_t=60 $TESTPOOL $VDEV + + start_slow_io + for i in {1..16}; do + dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null + sleep 0.5 + done + stop_slow_io + zpool sync + + # + # wait up to 60 seconds for kernel to produce at least 5 delay events + # + typeset -i i=0 + typeset -i events=0 + while [[ $i -lt 60 ]]; do + events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l) + [[ $events -ge "5" ]] && break + i=$((i+1)) + sleep 1 + done + log_note "$events delay events found" + + if [[ $events -ge "5" ]]; then + log_must wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 10 + fi + + do_clean +} + +# change slow_io_n, slow_io_t to 10 events in 1 second +# inject events spaced 0.5 seconds apart, should not degrade +function slow_io_no_degrade +{ + do_setup + + zpool set slow_io_n=10 $TESTPOOL $VDEV + zpool set slow_io_t=1 $TESTPOOL $VDEV + + start_slow_io + for i in {1..16}; do + dd if=${FILEPATH}$i of=/dev/null count=1 bs=512 2>/dev/null + sleep 0.5 + done + stop_slow_io + zpool sync + + log_mustnot wait_vdev_state $TESTPOOL $VDEV "DEGRADED" 45 + + do_clean +} + +log_assert "Test ZED slow io configurability" +log_onexit cleanup + +log_must zed_events_drain +log_must zed_start + +default_degrade +slow_io_degrade +slow_io_no_degrade + +log_pass "Test ZED slow io configurability" diff --git a/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh b/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh new file mode 100755 index 000000000000..3357ae2e3510 --- /dev/null +++ b/tests/zfs-tests/tests/functional/events/zed_slow_io_many_vdevs.ksh @@ -0,0 +1,177 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# + +# DESCRIPTION: +# Verify that delay events from multiple vdevs doesnt degrade +# +# STRATEGY: +# 1. Create a pool with a 3 disk raidz vdev +# 2. Inject slow io errors +# 3. Verify that ZED detects slow I/Os but doesn't degrade any vdevs +# + +. $STF_SUITE/include/libtest.shlib + +TESTDIR="$TEST_BASE_DIR/zed_slow_io" +VDEV1="$TEST_BASE_DIR/vdevfile1.$$" +VDEV2="$TEST_BASE_DIR/vdevfile2.$$" +VDEV3="$TEST_BASE_DIR/vdevfile3.$$" +VDEV4="$TEST_BASE_DIR/vdevfile4.$$" +VDEVS="$VDEV1 $VDEV2 $VDEV3 $VDEV4" +TESTPOOL="slow_io_pool" +FILEPATH="$TESTDIR/slow_io.testfile" + +OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS) +OLD_SLOW_IO_EVENTS=$(get_tunable SLOW_IO_EVENTS_PER_SECOND) + +verify_runnable "both" + +function cleanup +{ + log_must zinject -c all + + # if pool still exists then something failed so log additional info + if poolexists $TESTPOOL ; then + log_note "$(zpool status -s $TESTPOOL)" + echo "=================== zed log search ===================" + grep "Diagnosis Engine" $ZEDLET_DIR/zed.log + destroy_pool $TESTPOOL + fi + log_must zed_stop + + log_must rm -f $VDEVS + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS +} + +function start_slow_io +{ + for vdev in $VDEVS + do + log_must zpool set slow_io_n=4 $TESTPOOL $vdev + log_must zpool set slow_io_t=60 $TESTPOOL $vdev + done + zpool sync + + log_must set_tunable64 ZIO_SLOW_IO_MS 10 + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND 1000 + + for vdev in $VDEVS + do + log_must zinject -d $vdev -D10:1 $TESTPOOL + done + zpool sync +} + +function stop_slow_io +{ + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + log_must set_tunable64 SLOW_IO_EVENTS_PER_SECOND $OLD_SLOW_IO_EVENTS + + log_must zinject -c all +} + +function multiple_slow_vdevs_test +{ + log_must truncate -s 1G $VDEVS + default_raidz_setup_noexit $VDEVS + + log_must zpool events -c + log_must zfs set compression=off $TESTPOOL + log_must zfs set primarycache=none $TESTPOOL + log_must zfs set recordsize=4K $TESTPOOL + + log_must dd if=/dev/urandom of=$FILEPATH bs=1M count=20 + zpool sync + + # + # Read the file with slow io injected on the disks + # This will cause multiple errors on each disk to trip ZED SERD + # + # pool: slow_io_pool + # state: ONLINE + # config: + # + # NAME STATE READ WRITE CKSUM SLOW + # slow_io_pool ONLINE 0 0 0 - + # raidz1-0 ONLINE 0 0 0 - + # /var/tmp/vdevfile1.499278 ONLINE 0 0 0 113 + # /var/tmp/vdevfile2.499278 ONLINE 0 0 0 109 + # /var/tmp/vdevfile3.499278 ONLINE 0 0 0 96 + # /var/tmp/vdevfile4.499278 ONLINE 0 0 0 109 + # + start_slow_io + dd if=$FILEPATH of=/dev/null bs=1M count=20 2>/dev/null + stop_slow_io + + # count events available for processing + typeset -i i=0 + typeset -i events=0 + while [[ $i -lt 60 ]]; do + events=$(zpool events | grep "ereport\.fs\.zfs.delay" | wc -l) + [[ $events -ge "50" ]] && break + i=$((i+1)) + sleep 1 + done + log_note "$events delay events found" + if [[ $events -lt "50" ]]; then + log_note "bailing: not enough events to complete the test" + destroy_pool $TESTPOOL + return + fi + + # + # give slow ZED a chance to process the delay events + # + typeset -i i=0 + typeset -i skips=0 + while [[ $i -lt 75 ]]; do + skips=$(grep "retiring case" \ + $ZEDLET_DIR/zed.log | wc -l) + [[ $skips -gt "0" ]] && break + i=$((i+1)) + sleep 1 + done + + log_note $skips degrade skips in ZED log after $i seconds + [ $skips -gt "0" ] || log_fail "expecting to see skips" + + degrades=$(grep "zpool_vdev_degrade" $ZEDLET_DIR/zed.log | wc -l) + log_note $degrades vdev degrades in ZED log + [ $degrades -eq "0" ] || \ + log_fail "expecting no degrade events, found $degrades" + + destroy_pool $TESTPOOL +} + +log_assert "Test ZED slow io across multiple vdevs" +log_onexit cleanup + +log_must zed_events_drain +log_must zed_start +multiple_slow_vdevs_test + +log_pass "Test ZED slow io across multiple vdevs" diff --git a/tests/zfs-tests/tests/functional/fault/cleanup.ksh b/tests/zfs-tests/tests/functional/fault/cleanup.ksh index 654343c0cf00..2959236b59a3 100755 --- a/tests/zfs-tests/tests/functional/fault/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/fault/cleanup.ksh @@ -32,5 +32,6 @@ cleanup_devices $DISKS zed_stop zed_cleanup resilver_finish-start-scrub.sh +zed_events_drain log_pass diff --git a/tests/zfs-tests/tests/functional/fault/setup.ksh b/tests/zfs-tests/tests/functional/fault/setup.ksh index 62f1c8ab56cb..61b9206ec1a6 100755 --- a/tests/zfs-tests/tests/functional/fault/setup.ksh +++ b/tests/zfs-tests/tests/functional/fault/setup.ksh @@ -28,6 +28,7 @@ verify_runnable "global" +zed_events_drain zed_setup resilver_finish-start-scrub.sh zed_start diff --git a/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh new file mode 100755 index 000000000000..8b118684aa7f --- /dev/null +++ b/tests/zfs-tests/tests/functional/mmp/mmp_write_slow_disk.ksh @@ -0,0 +1,97 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024, Klara Inc +# + +# DESCRIPTION: +# Verify that long VDEV probes do not cause MMP checks to suspend pool +# Note: without PR-15839 fix, this test will suspend the pool. +# +# A device that is returning unexpected errors will trigger a vdev_probe. +# When the device additionally has slow response times, the probe can hold +# the spa config lock as a writer for a long period of time such that the +# mmp uberblock updates stall when trying to acquire the spa config lock. +# +# STRATEGY: +# 1. Create a pool with multiple leaf vdevs +# 2. Enable multihost and multihost_history +# 3. Delay for MMP writes to occur +# 4. Verify that a long VDEV probe didn't cause MMP check to suspend pool +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/mmp/mmp.cfg +. $STF_SUITE/tests/functional/mmp/mmp.kshlib + +verify_runnable "both" + +function cleanup +{ + log_must zinject -c all + + if [[ $(zpool list -H -o health $MMP_POOL) == "SUSPENDED" ]]; then + log_must zpool clear $MMP_POOL + zpool get state $MMP_POOL $MMP_DIR/file.3 + zpool events | grep ".fs.zfs." | grep -v "history_event" + fi + + poolexists $MMP_POOL && destroy_pool $MMP_POOL + log_must rm -r $MMP_DIR + log_must mmp_clear_hostid +} + +log_assert "A long VDEV probe doesn't cause a MMP check suspend" +log_onexit cleanup + +MMP_HISTORY_URL=/proc/spl/kstat/zfs/$MMP_POOL/multihost + +# Create a multiple drive pool +log_must zpool events -c +log_must mkdir -p $MMP_DIR +log_must truncate -s 128M $MMP_DIR/file.{0,1,2,3,4,5} +log_must zpool create -f $MMP_POOL \ + mirror $MMP_DIR/file.{0,1,2} \ + mirror $MMP_DIR/file.{3,4,5} + +# Enable MMP +log_must mmp_set_hostid $HOSTID1 +log_must zpool set multihost=on $MMP_POOL +clear_mmp_history + +# Inject vdev write error along with a delay +log_must zinject -f 33 -e io -L pad2 -T write -d $MMP_DIR/file.3 $MMP_POOL +log_must zinject -f 50 -e io -L uber -T write -d $MMP_DIR/file.3 $MMP_POOL +log_must zinject -D 2000:4 -T write -d $MMP_DIR/file.3 $MMP_POOL + +log_must dd if=/dev/urandom of=/$MMP_POOL/data bs=1M count=5 +sleep 10 +sync_pool $MMP_POOL + +# Confirm mmp writes to the non-slow disks have taken place +for x in {0,1,2,4}; do + write_count=$(grep -c file.${x} $MMP_HISTORY_URL) + [[ $write_count -gt 0 ]] || log_fail "expecting mmp writes" +done + +# Expect that the pool was not suspended +log_must check_state $MMP_POOL "" "ONLINE" +health=$(zpool list -H -o health $MMP_POOL) +log_note "$MMP_POOL health is $health" +[[ "$health" == "SUSPENDED" ]] && log_fail "$MMP_POOL $health unexpected" + +log_pass "A long VDEV probe doesn't cause a MMP check suspend" diff --git a/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh index 37ef84b72377..e6ad25f23f93 100755 --- a/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh +++ b/tests/zfs-tests/tests/functional/user_namespace/user_namespace_004.ksh @@ -44,8 +44,6 @@ user_ns_cleanup() { log_must zfs destroy -r "$TESTPOOL/userns" } -log_onexit user_ns_cleanup - log_assert "Check zfs zone command handling of non-namespace files" # Pass if user namespaces are not supported. @@ -54,6 +52,8 @@ if [ "$?" -ne "0" ]; then log_unsupported "Failed to create user namespace" fi +log_onexit user_ns_cleanup + # Create the baseline datasets. log_must zfs create -o zoned=on "$TESTPOOL/userns" diff --git a/udev/zvol_id.c b/udev/zvol_id.c index 5960b978787a..609349594767 100644 --- a/udev/zvol_id.c +++ b/udev/zvol_id.c @@ -51,7 +51,7 @@ const char *__asan_default_options(void) { int main(int argc, const char *const *argv) { - if (argc != 2) { + if (argc != 2 || strncmp(argv[1], "/dev/zd", 7) != 0) { fprintf(stderr, "usage: %s /dev/zdX\n", argv[0]); return (1); } @@ -72,9 +72,10 @@ main(int argc, const char *const *argv) return (1); } - unsigned int dev_part = minor(sb.st_rdev) % ZVOL_MINORS; - if (dev_part != 0) - sprintf(zvol_name + strlen(zvol_name), "-part%u", dev_part); + const char *dev_part = strrchr(dev_name, 'p'); + if (dev_part != NULL) { + sprintf(zvol_name + strlen(zvol_name), "-part%s", dev_part + 1); + } for (size_t i = 0; i < strlen(zvol_name); ++i) if (isblank(zvol_name[i]))