diff --git a/cmd/arcstat/arcstat.py b/cmd/arcstat/arcstat.py index b516cf285fdc..bbf43100aa4e 100755 --- a/cmd/arcstat/arcstat.py +++ b/cmd/arcstat/arcstat.py @@ -82,7 +82,6 @@ "mrug": [4, 1000, "MRU Ghost List hits per second"], "eskip": [5, 1000, "evict_skip per second"], "mtxmis": [6, 1000, "mutex_miss per second"], - "rmis": [4, 1000, "recycle_miss per second"], "dread": [5, 1000, "Demand accesses per second"], "pread": [5, 1000, "Prefetch accesses per second"], "l2hits": [6, 1000, "L2ARC hits per second"], @@ -406,7 +405,6 @@ def calculate(): v["mrug"] = d["mru_ghost_hits"] / sint v["mfug"] = d["mfu_ghost_hits"] / sint v["eskip"] = d["evict_skip"] / sint - v["rmis"] = d["recycle_miss"] / sint v["mtxmis"] = d["mutex_miss"] / sint if l2exist: diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index b0d7170b92b2..ab722e94338f 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1169,7 +1169,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, print_indirect(bp, zb, dnp); if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; @@ -1183,7 +1183,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, ASSERT(buf->b_data); /* recursively visit blocks below this */ - cbp = buf->b_data; + cbp = ABD_TO_BUF(buf->b_data); for (i = 0; i < epb; i++, cbp++) { zbookmark_phys_t czb; @@ -1355,7 +1355,7 @@ dump_bptree(objset_t *os, uint64_t obj, char *name) return; VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); - bt = db->db_data; + bt = ABD_TO_BUF(db->db_data); zdb_nicenum(bt->bt_bytes, bytes); (void) printf("\n %s: %llu datasets, %s\n", name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); @@ -1805,7 +1805,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) if (error) fatal("dmu_bonus_hold(%llu) failed, errno %u", object, error); - bonus = db->db_data; + bonus = ABD_TO_BUF(db->db_data); bsize = db->db_size; dn = DB_DNODE((dmu_buf_impl_t *)db); } @@ -2028,7 +2028,7 @@ dump_config(spa_t *spa) spa->spa_config_object, FTAG, &db); if (error == 0) { - nvsize = *(uint64_t *)db->db_data; + nvsize = *(uint64_t *)ABD_TO_BUF(db->db_data); dmu_buf_rele(db, FTAG); (void) printf("\nMOS Configuration:\n"); @@ -2315,7 +2315,7 @@ zdb_blkptr_done(zio_t *zio) zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -2378,7 +2378,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (!BP_IS_EMBEDDED(bp) && (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); - void *data = zio_data_buf_alloc(size); + abd_t *data = abd_alloc_linear(size); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. */ @@ -3141,6 +3141,7 @@ zdb_read_block(char *thing, spa_t *spa) zio_t *zio; vdev_t *vd; void *pbuf, *lbuf, *buf; + abd_t *pbuf_abd; char *s, *p, *dup, *vdev, *flagstr; int i, error; @@ -3212,6 +3213,7 @@ zdb_read_block(char *thing, spa_t *spa) lsize = size; pbuf = umem_alloc_aligned(SPA_MAXBLOCKSIZE, 512, UMEM_NOFAIL); + pbuf_abd = abd_get_from_buf(pbuf, SPA_MAXBLOCKSIZE); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); BP_ZERO(bp); @@ -3239,15 +3241,15 @@ zdb_read_block(char *thing, spa_t *spa) /* * Treat this as a normal block read. 
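The zdb_read_block() changes above show the borrowed-ABD pattern this series applies throughout the commands: an existing raw buffer is wrapped in an ABD for the duration of the I/O, and only the wrapper is released afterwards; the underlying memory is still freed by its original owner. A minimal sketch of that lifecycle, using only the abd_get_from_buf()/abd_put() pair introduced later in this patch (error handling omitted):

	void *pbuf = umem_alloc_aligned(SPA_MAXBLOCKSIZE, 512, UMEM_NOFAIL);
	abd_t *pbuf_abd = abd_get_from_buf(pbuf, SPA_MAXBLOCKSIZE);

	/* issue zio_read()/zio_vdev_child_io() against pbuf_abd, not pbuf */

	abd_put(pbuf_abd);			/* releases only the wrapper */
	umem_free(pbuf, SPA_MAXBLOCKSIZE);	/* owner still frees the memory */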
*/ - zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL, + zio_nowait(zio_read(zio, spa, bp, pbuf_abd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); } else { /* * Treat this as a vdev child I/O. */ - zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize, - ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, + zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf_abd, + psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL)); @@ -3321,6 +3323,7 @@ zdb_read_block(char *thing, spa_t *spa) zdb_dump_block(thing, buf, size, flags); out: + abd_put(pbuf_abd); umem_free(pbuf, SPA_MAXBLOCKSIZE); umem_free(lbuf, SPA_MAXBLOCKSIZE); free(dup); diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index b85ef7ddd97e..02e772c0c8e4 100644 --- a/cmd/zdb/zdb_il.c +++ b/cmd/zdb/zdb_il.c @@ -125,6 +125,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; char buf[SPA_MAXBLOCKSIZE]; + abd_t *abd; int verbose = MAX(dump_opt['d'], dump_opt['i']); int error; @@ -158,9 +159,11 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); + abd = abd_get_from_buf(buf, BP_GET_LSIZE(bp)); error = zio_wait(zio_read(NULL, zilog->zl_spa, - bp, buf, BP_GET_LSIZE(bp), NULL, NULL, + bp, abd, BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); + abd_put(abd); if (error) return; data = buf; diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 0602a7ec54bf..fdc42f4433ed 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -1305,19 +1305,23 @@ ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) } static void -ztest_pattern_set(void *buf, uint64_t size, uint64_t value) +ztest_pattern_set(abd_t *abd, uint64_t size, uint64_t value) { + void *buf = abd_borrow_buf(abd, size); uint64_t *ip = buf; uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); while (ip < ip_end) *ip++ = value; + + abd_return_buf_copy(abd, buf, size); } #ifndef NDEBUG static boolean_t -ztest_pattern_match(void *buf, uint64_t size, uint64_t value) +ztest_pattern_match(abd_t *abd, uint64_t size, uint64_t value) { + void *buf = abd_borrow_buf_copy(abd, size); uint64_t *ip = buf; uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); uint64_t diff = 0; @@ -1325,6 +1329,7 @@ ztest_pattern_match(void *buf, uint64_t size, uint64_t value) while (ip < ip_end) diff |= (value - *ip++); + abd_return_buf(abd, buf, size); return (diff == 0); } #endif @@ -1364,7 +1369,8 @@ ztest_bt_bonus(dmu_buf_t *db) dmu_object_info_from_db(db, &doi); ASSERT3U(doi.doi_bonus_size, <=, db->db_size); ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); - bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); + bt = (void *)((char *)ABD_TO_BUF(db->db_data) + doi.doi_bonus_size - + sizeof (*bt)); return (bt); } @@ -1726,7 +1732,7 @@ ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap) if (abuf == NULL) { dmu_write(os, lr->lr_foid, offset, length, data, tx); } else { - bcopy(data, abuf->b_data, length); + abd_copy_from_buf(abuf->b_data, data, length); dmu_assign_arcbuf(db, offset, abuf, tx); } @@ -4051,7 +4057,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) * assign an arcbuf to a dbuf. 
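The new chunksize guard in ztest_dmu_read_write_zcopy() just below deserves a note: the i == 5 case exercises dmu_assign_arcbuf() with two half-chunk loaner buffers per chunk, which is only well-formed when chunksize / 2 is itself a valid block size. Presumably that is why the i != 5 test now also catches chunksize < SPA_MINBLOCKSIZE * 2 and falls back to a single whole-chunk buffer. In sketch form (the else branch is abbreviated from the surrounding hunks):

	if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
		/* one loaner arcbuf covering the whole chunk */
		bigbuf_arcbufs[j] = dmu_request_arcbuf(bonus_db, chunksize);
	} else {
		/* two half-chunk loaners; needs chunksize / 2 >= SPA_MINBLOCKSIZE */
		bigbuf_arcbufs[2 * j] =
		    dmu_request_arcbuf(bonus_db, chunksize / 2);
		bigbuf_arcbufs[2 * j + 1] =
		    dmu_request_arcbuf(bonus_db, chunksize / 2);
	}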
*/ for (j = 0; j < s; j++) { - if (i != 5) { + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { bigbuf_arcbufs[j] = dmu_request_arcbuf(bonus_db, chunksize); } else { @@ -4075,7 +4081,8 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); for (j = 0; j < s; j++) { - if (i != 5) { + if (i != 5 || + chunksize < (SPA_MINBLOCKSIZE * 2)) { dmu_return_arcbuf(bigbuf_arcbufs[j]); } else { dmu_return_arcbuf( @@ -4120,24 +4127,27 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) } for (off = bigoff, j = 0; j < s; j++, off += chunksize) { dmu_buf_t *dbt; - if (i != 5) { - bcopy((caddr_t)bigbuf + (off - bigoff), - bigbuf_arcbufs[j]->b_data, chunksize); + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { + abd_copy_from_buf(bigbuf_arcbufs[j]->b_data, + (caddr_t)bigbuf + (off - bigoff), + chunksize); } else { - bcopy((caddr_t)bigbuf + (off - bigoff), + abd_copy_from_buf( bigbuf_arcbufs[2 * j]->b_data, + (caddr_t)bigbuf + (off - bigoff), chunksize / 2); - bcopy((caddr_t)bigbuf + (off - bigoff) + - chunksize / 2, + + abd_copy_from_buf( bigbuf_arcbufs[2 * j + 1]->b_data, - chunksize / 2); + (caddr_t)bigbuf + (off - bigoff) + + chunksize / 2, chunksize / 2); } if (i == 1) { VERIFY(dmu_buf_hold(os, bigobj, off, FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); } - if (i != 5) { + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { dmu_assign_arcbuf(bonus_db, off, bigbuf_arcbufs[j], tx); } else { @@ -5181,7 +5191,7 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) enum zio_checksum checksum = spa_dedup_checksum(spa); dmu_buf_t *db; dmu_tx_t *tx; - void *buf; + abd_t *buf; blkptr_t blk; int copies = 2 * ZIO_DEDUPDITTO_MIN; int i; @@ -5262,14 +5272,14 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) * Damage the block. Dedup-ditto will save us when we read it later. */ psize = BP_GET_PSIZE(&blk); - buf = zio_buf_alloc(psize); + buf = abd_alloc_linear(psize); ztest_pattern_set(buf, psize, ~pattern); (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); - zio_buf_free(buf, psize); + abd_free(buf, psize); (void) rw_unlock(&ztest_name_lock); umem_free(od, sizeof (ztest_od_t)); diff --git a/config/kernel-kmap-atomic-args.m4 b/config/kernel-kmap-atomic-args.m4 new file mode 100644 index 000000000000..f7228907af49 --- /dev/null +++ b/config/kernel-kmap-atomic-args.m4 @@ -0,0 +1,24 @@ +dnl # +dnl # 2.6.37 API change +dnl # kmap_atomic changed from assigning hard-coded named slot to using +dnl # push/pop based dynamical allocation. 
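For context on the probe: kernels before 2.6.37 required kmap_atomic() callers to name a fixed slot (KM_USER0, KM_USER1, ...), while newer kernels manage the slots internally and take only the page. The zfs_kmap_atomic() wrapper in the new include/linux/kmap_compat.h below keeps the two-argument form and simply drops the slot argument when this test detects the one-argument kernel API; since the slot is discarded during preprocessing, the KM_USERx constants need not exist on newer kernels. A sketch of a caller (the helper and its arguments are illustrative only):

	#include <linux/kmap_compat.h>

	static void
	copy_from_page(struct page *pp, void *dst, size_t len)
	{
		/* expands to kmap_atomic(pp) or kmap_atomic(pp, KM_USER0) */
		void *src = zfs_kmap_atomic(pp, KM_USER0);

		memcpy(dst, src, len);
		zfs_kunmap_atomic(src, KM_USER0);
	}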
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS], [ + AC_MSG_CHECKING([whether kmap_atomic wants 1 args]) + ZFS_LINUX_TRY_COMPILE([ + #include + + void test_kmap_atomic(void) + { + struct page page; + kmap_atomic(&page); + } + ],[ + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_1ARG_KMAP_ATOMIC, 1, + [kmap_atomic wants 1 args]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index ce10707e4954..ec25b64b110a 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -96,6 +96,7 @@ AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ ZFS_AC_KERNEL_5ARG_SGET ZFS_AC_KERNEL_LSEEK_EXECUTE ZFS_AC_KERNEL_VFS_ITERATE + ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS AS_IF([test "$LINUX_OBJ" != "$LINUX"], [ KERNELMAKE_PARAMS="$KERNELMAKE_PARAMS O=$LINUX_OBJ" diff --git a/configure.ac b/configure.ac index 80493d06e333..e0829205afbf 100644 --- a/configure.ac +++ b/configure.ac @@ -30,7 +30,8 @@ * CDDL HEADER END */ -AC_INIT +AC_INIT(m4_esyscmd(grep Name META | cut -d ':' -f 2 | tr -d ' \n'), + m4_esyscmd(grep Version META | cut -d ':' -f 2 | tr -d ' \n')) AC_LANG(C) ZFS_AC_META AC_CONFIG_AUX_DIR([config]) @@ -38,7 +39,7 @@ AC_CONFIG_MACRO_DIR([config]) AC_CANONICAL_SYSTEM AM_MAINTAINER_MODE m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) -AM_INIT_AUTOMAKE([$ZFS_META_NAME], [$ZFS_META_VERSION]) +AM_INIT_AUTOMAKE AC_CONFIG_HEADERS([zfs_config.h], [ (mv zfs_config.h zfs_config.h.tmp && awk -f ${ac_srcdir}/config/config.awk zfs_config.h.tmp >zfs_config.h && diff --git a/include/linux/Makefile.am b/include/linux/Makefile.am index d00b1c8ad798..595d1db01128 100644 --- a/include/linux/Makefile.am +++ b/include/linux/Makefile.am @@ -5,7 +5,8 @@ KERNEL_H = \ $(top_srcdir)/include/linux/xattr_compat.h \ $(top_srcdir)/include/linux/vfs_compat.h \ $(top_srcdir)/include/linux/blkdev_compat.h \ - $(top_srcdir)/include/linux/utsname_compat.h + $(top_srcdir)/include/linux/utsname_compat.h \ + $(top_srcdir)/include/linux/kmap_compat.h USER_H = diff --git a/include/linux/kmap_compat.h b/include/linux/kmap_compat.h new file mode 100644 index 000000000000..f581fb20965a --- /dev/null +++ b/include/linux/kmap_compat.h @@ -0,0 +1,40 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. 
+ */ + +#ifndef _ZFS_KMAP_H +#define _ZFS_KMAP_H + +#include + +#ifdef HAVE_1ARG_KMAP_ATOMIC +/* 2.6.37 API change */ +#define zfs_kmap_atomic(page, km_type) kmap_atomic(page) +#define zfs_kunmap_atomic(addr, km_type) kunmap_atomic(addr) +#else +#define zfs_kmap_atomic(page, km_type) kmap_atomic(page, km_type) +#define zfs_kunmap_atomic(addr, km_type) kunmap_atomic(addr, km_type) +#endif + +#endif /* _ZFS_KMAP_H */ diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 5211e656456d..11e965d43657 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -3,6 +3,7 @@ SUBDIRS = fm fs COMMON_H = \ $(top_srcdir)/include/sys/arc.h \ $(top_srcdir)/include/sys/arc_impl.h \ + $(top_srcdir)/include/sys/abd.h \ $(top_srcdir)/include/sys/avl.h \ $(top_srcdir)/include/sys/avl_impl.h \ $(top_srcdir)/include/sys/blkptr.h \ @@ -33,6 +34,7 @@ COMMON_H = \ $(top_srcdir)/include/sys/efi_partition.h \ $(top_srcdir)/include/sys/metaslab.h \ $(top_srcdir)/include/sys/metaslab_impl.h \ + $(top_srcdir)/include/sys/multilist.h \ $(top_srcdir)/include/sys/nvpair.h \ $(top_srcdir)/include/sys/nvpair_impl.h \ $(top_srcdir)/include/sys/range_tree.h \ diff --git a/include/sys/abd.h b/include/sys/abd.h new file mode 100644 index 000000000000..d6d1dcc91ce2 --- /dev/null +++ b/include/sys/abd.h @@ -0,0 +1,235 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + */ + +/* + * ABD - ARC buffer data + * ABD is an abstract data structure for ARC. There are two types of ABD: + * linear for metadata and scatter for data. + * Their type is determined by the lowest bit of abd_t pointer. 
+ * The public API will automatically determine the type + */ + +#ifndef _ABD_H +#define _ABD_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define ARC_BUF_DATA_MAGIC 0xa7cb0fda + +#ifndef _KERNEL + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT 12 +#endif + +#ifdef ZFS_DEBUG +#define DEBUG_ABD +#define PAGE_SHIFT 12 +#endif + +#endif /* !_KERNEL */ + +typedef struct arc_buf_data { +#ifdef DEBUG_ABD + char pad[4096]; /* debug, coredumps when accessed */ +#endif + uint32_t abd_magic; /* ARC_BUF_DATA_MAGIC */ + uint32_t abd_flags; + size_t abd_size; /* buffer size, excluding offset */ + size_t abd_offset; /* offset in the first segment */ + int abd_nents; /* num of sgl entries */ + union { + struct scatterlist *abd_sgl; + void *abd_buf; + }; + uint64_t __abd_sgl[0]; +} abd_t; + +#define ABD_F_SCATTER (0x0) +#define ABD_F_LINEAR (0x1) +#define ABD_F_OWNER (0x2) + +/* + * Convert an linear ABD to normal buffer + */ +#define ABD_TO_BUF(abd) \ +( \ +{ \ + ASSERT((abd)->abd_magic == ARC_BUF_DATA_MAGIC); \ + ASSERT_ABD_LINEAR(abd); \ + abd->abd_buf; \ +} \ +) + +#define ABD_IS_SCATTER(abd) (!((abd)->abd_flags & ABD_F_LINEAR)) +#define ABD_IS_LINEAR(abd) (!ABD_IS_SCATTER(abd)) +#define ASSERT_ABD_SCATTER(abd) ASSERT(ABD_IS_SCATTER(abd)) +#define ASSERT_ABD_LINEAR(abd) ASSERT(ABD_IS_LINEAR(abd)) + +/* + * Allocations and deallocations + */ +abd_t *abd_alloc_scatter(size_t); +abd_t *abd_alloc_linear(size_t); +void abd_free(abd_t *, size_t); +abd_t *abd_get_offset(abd_t *, size_t); +abd_t *abd_get_from_buf(void *, size_t); +void abd_put(abd_t *); + +/* + * ABD operations + */ +void abd_iterate_rfunc(abd_t *, size_t, + int (*)(const void *, uint64_t, void *), void *); +void abd_iterate_wfunc(abd_t *, size_t, + int (*)(void *, uint64_t, void *), void *); +void abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, + int (*)(void *, void *, uint64_t, uint64_t, void *), void *); +void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); +void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); +void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); +int abd_cmp(abd_t *, abd_t *, size_t); +int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); +void abd_zero_off(abd_t *, size_t, size_t); +#ifdef _KERNEL +int abd_copy_to_user_off(void __user *, abd_t *, size_t, size_t); +int abd_copy_from_user_off(abd_t *, const void __user *, size_t, size_t); +int abd_uiomove_off(abd_t *, size_t, enum uio_rw, uio_t *, size_t); +int abd_uiocopy_off(abd_t *, size_t, enum uio_rw, uio_t *, size_t *, + size_t); +unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int, + size_t); +unsigned long abd_bio_nr_pages_off(abd_t *, unsigned int, size_t); + +#define abd_bio_map_off(bio, abd, size, off) \ +( \ +{ \ + unsigned int ___ret; \ + if (ABD_IS_LINEAR(abd)) \ + ___ret = bio_map(bio, ABD_TO_BUF(abd) + (off), size); \ + else \ + ___ret = abd_scatter_bio_map_off(bio, abd, size, off); \ + ___ret; \ +} \ +) +#endif /* _KERNEL */ + +/* + * Borrow a linear buffer for an ABD + * Will allocate if ABD is scatter + */ +#define abd_borrow_buf(a, n) \ +( \ +{ \ + void *___b; \ + if (ABD_IS_LINEAR(a)) { \ + ___b = ABD_TO_BUF(a); \ + } else { \ + ___b = zio_buf_alloc(n); \ + } \ + ___b; \ +} \ +) + +/* + * Borrow a linear buffer for an ABD + * Will allocate and copy if ABD is scatter + */ +#define abd_borrow_buf_copy(a, n) \ +( \ +{ \ + void *___b = abd_borrow_buf(a, n); \ + if (!ABD_IS_LINEAR(a)) \ + abd_copy_to_buf(___b, a, n); \ + ___b; \ +} \ +) 
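The borrow macros above, together with the return macros just below, give ABD consumers a temporary flat view of any buffer: a linear ABD is handed out directly, while a scatter ABD is serviced from a zio_buf_alloc() bounce buffer. Whether data is copied in or copied back depends on which variants are paired, as the new ztest_pattern_set()/ztest_pattern_match() earlier in this patch demonstrate. A short usage sketch, assuming only the API declared in this header (fill_pattern() and check_pattern() are hypothetical helpers):

	abd_t *abd = abd_alloc_scatter(size);
	void *buf;

	/* write-only access: nothing copied in, copied back on return */
	buf = abd_borrow_buf(abd, size);
	fill_pattern(buf, size);
	abd_return_buf_copy(abd, buf, size);

	/* read-only access: copied in, nothing copied back */
	buf = abd_borrow_buf_copy(abd, size);
	(void) check_pattern(buf, size);
	abd_return_buf(abd, buf, size);

	abd_free(abd, size);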
+ +/* + * Return the borrowed linear buffer + */ +#define abd_return_buf(a, b, n) \ +do { \ + if (ABD_IS_LINEAR(a)) \ + ASSERT((b) == ABD_TO_BUF(a)); \ + else \ + zio_buf_free(b, n); \ +} while (0) + +/* + * Copy back to ABD and return the borrowed linear buffer + */ +#define abd_return_buf_copy(a, b, n) \ +do { \ + if (!ABD_IS_LINEAR(a)) \ + abd_copy_from_buf(a, b, n); \ + abd_return_buf(a, b, n); \ +} while (0) + +/* + * Wrappers for zero off functions + */ +#define abd_copy(dabd, sabd, size) \ + abd_copy_off(dabd, sabd, size, 0, 0) + +#define abd_copy_from_buf(abd, buf, size) \ + abd_copy_from_buf_off(abd, buf, size, 0) + +#define abd_copy_to_buf(buf, abd, size) \ + abd_copy_to_buf_off(buf, abd, size, 0) + +#define abd_cmp_buf(abd, buf, size) \ + abd_cmp_buf_off(abd, buf, size, 0) + +#define abd_zero(abd, size) \ + abd_zero_off(abd, size, 0) + +#ifdef _KERNEL +#define abd_copy_to_user(buf, abd, size) \ + abd_copy_to_user_off(buf, abd, size, 0) + +#define abd_copy_from_user(abd, buf, size) \ + abd_copy_from_user_off(abd, buf, size, 0) + +#define abd_uiomove(abd, n, rw, uio) \ + abd_uiomove_off(abd, n, rw, uio, 0) + +#define abd_uiocopy(abd, n, rw, uio, c) \ + abd_uiocopy_off(abd, n, rw, uio, c, 0) +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_H */ diff --git a/include/sys/arc.h b/include/sys/arc.h index 215c75b6dfa3..5f1a164f39a2 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -28,6 +28,7 @@ #define _SYS_ARC_H #include +#include #ifdef __cplusplus extern "C" { @@ -38,6 +39,12 @@ extern "C" { #include #include +/* + * Used by arc_flush() to inform arc_evict_state() that it should evict + * all available buffers from the arc state being passed in. + */ +#define ARC_EVICT_ALL -1ULL + typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef struct arc_prune arc_prune_t; @@ -57,11 +64,60 @@ struct arc_prune { refcount_t p_refcnt; }; +typedef enum arc_flags +{ + /* + * Public flags that can be passed into the ARC by external consumers. + */ + ARC_FLAG_NONE = 1 << 0, /* No flags set */ + ARC_FLAG_WAIT = 1 << 1, /* perform sync I/O */ + ARC_FLAG_NOWAIT = 1 << 2, /* perform async I/O */ + ARC_FLAG_PREFETCH = 1 << 3, /* I/O is a prefetch */ + ARC_FLAG_CACHED = 1 << 4, /* I/O was in cache */ + ARC_FLAG_L2CACHE = 1 << 5, /* cache in L2ARC */ + ARC_FLAG_L2COMPRESS = 1 << 6, /* compress in L2ARC */ + + /* + * Private ARC flags. These flags are private ARC only flags that + * will show up in b_flags in the arc_hdr_buf_t. These flags should + * only be set by ARC code. + */ + ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ + ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */ + ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */ + ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */ + ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */ + ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */ + ARC_FLAG_L2_WRITING = 1 << 13, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 14, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 15, /* head of write list */ + /* indicates that the buffer contains metadata (otherwise, data) */ + ARC_FLAG_BUFC_METADATA = 1 << 16, + + /* Flags specifying whether optional hdr struct fields are defined */ + ARC_FLAG_HAS_L1HDR = 1 << 17, + ARC_FLAG_HAS_L2HDR = 1 << 18, + + /* + * The arc buffer's compression mode is stored in the top 7 bits of the + * flags field, so these dummy flags are included so that MDB can + * interpret the enum properly. 
+ */ + ARC_FLAG_COMPRESS_0 = 1 << 24, + ARC_FLAG_COMPRESS_1 = 1 << 25, + ARC_FLAG_COMPRESS_2 = 1 << 26, + ARC_FLAG_COMPRESS_3 = 1 << 27, + ARC_FLAG_COMPRESS_4 = 1 << 28, + ARC_FLAG_COMPRESS_5 = 1 << 29, + ARC_FLAG_COMPRESS_6 = 1 << 30 + +} arc_flags_t; + struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; kmutex_t b_evict_lock; - void *b_data; + abd_t *b_data; arc_evict_func_t *b_efunc; void *b_private; }; @@ -71,15 +127,6 @@ typedef enum arc_buf_contents { ARC_BUFC_METADATA, /* buffer contains metadata */ ARC_BUFC_NUMTYPES } arc_buf_contents_t; -/* - * These are the flags we pass into calls to the arc - */ -#define ARC_WAIT (1 << 1) /* perform I/O synchronously */ -#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */ -#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */ -#define ARC_CACHED (1 << 4) /* I/O was already in cache */ -#define ARC_L2CACHE (1 << 5) /* cache in L2ARC */ -#define ARC_L2COMPRESS (1 << 6) /* compress in L2ARC */ /* * The following breakdows of arc_size exist for kstat only. @@ -146,7 +193,7 @@ int arc_referenced(arc_buf_t *buf); int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, zio_priority_t priority, int flags, - uint32_t *arc_flags, const zbookmark_phys_t *zb); + arc_flags_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, @@ -160,7 +207,7 @@ void arc_freed(spa_t *spa, const blkptr_t *bp); void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); boolean_t arc_clear_callback(arc_buf_t *buf); -void arc_flush(spa_t *spa); +void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); int arc_tempreserve_space(uint64_t reserve, uint64_t txg); diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index e7068ea188e3..176ddd7d162b 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -67,15 +67,25 @@ extern "C" { */ typedef struct arc_state { - list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ - uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ - uint64_t arcs_size; /* total amount of data in this state */ - kmutex_t arcs_mtx; + /* + * list of evictable buffers + */ + multilist_t arcs_list[ARC_BUFC_NUMTYPES]; + /* + * total amount of evictable data in this state + */ + uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; + /* + * total amount of data in this state; this includes: evictable, + * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. 
+ */ + uint64_t arcs_size; + /* + * supports the "dbufs" kstat + */ arc_state_type_t arcs_state; } arc_state_t; -typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; - typedef struct arc_callback arc_callback_t; struct arc_callback { @@ -96,31 +106,49 @@ struct arc_write_callback { arc_buf_t *awcb_buf; }; -struct arc_buf_hdr { - /* protected by hash lock */ - dva_t b_dva; - uint64_t b_birth; - uint64_t b_cksum0; - +/* + * ARC buffers are separated into multiple structs as a memory saving measure: + * - Common fields struct, always defined, and embedded within it: + * - L2-only fields, always allocated but undefined when not in L2ARC + * - L1-only fields, only allocated when in L1ARC + * + * Buffer in L1 Buffer only in L2 + * +------------------------+ +------------------------+ + * | arc_buf_hdr_t | | arc_buf_hdr_t | + * | | | | + * | | | | + * | | | | + * +------------------------+ +------------------------+ + * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | + * | (undefined if L1-only) | | | + * +------------------------+ +------------------------+ + * | l1arc_buf_hdr_t | + * | | + * | | + * | | + * | | + * +------------------------+ + * + * Because it's possible for the L2ARC to become extremely large, we can wind + * up eating a lot of memory in L2ARC buffer headers, so the size of a header + * is minimized by only allocating the fields necessary for an L1-cached buffer + * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and + * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple + * words in pointers. arc_hdr_realloc() is used to switch a header between + * these two allocation states. + */ +typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; - zio_cksum_t *b_freeze_cksum; - arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; - uint32_t b_flags; uint32_t b_datacnt; - - arc_callback_t *b_acb; + /* for waiting on writes to complete */ kcondvar_t b_cv; - /* immutable */ - arc_buf_contents_t b_type; - uint64_t b_size; - uint64_t b_spa; /* protected by arc state mutex */ arc_state_t *b_state; - list_node_t b_arc_node; + multilist_node_t b_arc_node; /* updated atomically */ clock_t b_arc_access; @@ -133,9 +161,10 @@ struct arc_buf_hdr { /* self protecting */ refcount_t b_refcnt; - l2arc_buf_hdr_t *b_l2hdr; - list_node_t b_l2node; -}; + arc_callback_t *b_acb; + /* temporary buffer holder for in-flight compressed data */ + abd_t *b_tmp_cdata; +} l1arc_buf_hdr_t; typedef struct l2arc_dev { vdev_t *l2ad_vdev; /* vdev */ @@ -146,15 +175,46 @@ typedef struct l2arc_dev { uint64_t l2ad_evict; /* last addr eviction reached */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ - list_t *l2ad_buflist; /* buffer list */ + kmutex_t l2ad_mtx; /* lock for buffer list */ + list_t l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ } l2arc_dev_t; -typedef struct l2arc_write_callback { - l2arc_dev_t *l2wcb_dev; /* device info */ - arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ -} l2arc_write_callback_t; +typedef struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + uint64_t b_daddr; /* disk address, offset byte */ + /* real alloc'd buffer size depending on b_compress applied */ + uint32_t b_hits; + int32_t b_asize; + + list_node_t b_l2node; +} l2arc_buf_hdr_t; + +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + /* + * Even though this checksum is only set/verified when a buffer is in + * the L1 cache, it 
needs to be in the set of common fields because it + * must be preserved from the time before a buffer is written out to + * L2ARC until after it is read back in. + */ + zio_cksum_t *b_freeze_cksum; + + arc_buf_hdr_t *b_hash_next; + arc_flags_t b_flags; + + /* immutable */ + int32_t b_size; + uint64_t b_spa; + /* L2ARC fields. Undefined when not in L2ARC. */ + l2arc_buf_hdr_t b_l2hdr; + /* L1ARC fields. Undefined when in l2arc_only state */ + l1arc_buf_hdr_t b_l1hdr; +}; #ifdef __cplusplus } #endif diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 3befcb84427c..77e1d4d5eb87 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -108,7 +108,7 @@ struct ddt_entry { ddt_key_t dde_key; ddt_phys_t dde_phys[DDT_PHYS_TYPES]; zio_t *dde_lead_zio[DDT_PHYS_TYPES]; - void *dde_repair_data; + abd_t *dde_repair_data; enum ddt_type dde_type; enum ddt_class dde_class; uint8_t dde_loading; @@ -216,8 +216,6 @@ extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len); extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); extern void ddt_enter(ddt_t *ddt); extern void ddt_exit(ddt_t *ddt); -extern void ddt_init(void); -extern void ddt_fini(void); extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index c9c687b5aa62..3996b07cbaea 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -46,6 +46,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -285,7 +286,7 @@ typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ uint64_t db_offset; /* byte offset in this object */ uint64_t db_size; /* size of buffer in bytes */ - void *db_data; /* data in buffer */ + abd_t *db_data; /* data in buffer */ } dmu_buf_t; typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); diff --git a/include/sys/multilist.h b/include/sys/multilist.h new file mode 100644 index 000000000000..5ebb7fe1a1d5 --- /dev/null +++ b/include/sys/multilist.h @@ -0,0 +1,106 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + */ + +#ifndef _SYS_MULTILIST_H +#define _SYS_MULTILIST_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef list_node_t multilist_node_t; +typedef struct multilist multilist_t; +typedef struct multilist_sublist multilist_sublist_t; +typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *); + +struct multilist_sublist { + /* + * The mutex used internally to implement thread safe insertions + * and removals to this individual sublist. It can also be locked + * by a consumer using multilist_sublist_{lock,unlock}, which is + * useful if a consumer needs to traverse the list in a thread + * safe manner. + */ + kmutex_t mls_lock; + /* + * The actual list object containing all objects in this sublist. + */ + list_t mls_list; + /* + * Pad to cache line (64 bytes), in an effort to try and prevent + * cache line contention. 
+ */ + uint8_t mls_pad[24]; +}; + +struct multilist { + /* + * This is used to get to the multilist_node_t structure given + * the void *object contained on the list. + */ + size_t ml_offset; + /* + * The number of sublists used internally by this multilist. + */ + uint64_t ml_num_sublists; + /* + * The array of pointers to the actual sublists. + */ + multilist_sublist_t *ml_sublists; + /* + * Pointer to function which determines the sublist to use + * when inserting and removing objects from this multilist. + * Please see the comment above multilist_create for details. + */ + multilist_sublist_index_func_t *ml_index_func; +}; + +void multilist_destroy(multilist_t *); +void multilist_create(multilist_t *, size_t, size_t, unsigned int, + multilist_sublist_index_func_t *); + +void multilist_insert(multilist_t *, void *); +void multilist_remove(multilist_t *, void *); +int multilist_is_empty(multilist_t *); + +unsigned int multilist_get_num_sublists(multilist_t *); +unsigned int multilist_get_random_index(multilist_t *); + +multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int); +void multilist_sublist_unlock(multilist_sublist_t *); + +void multilist_sublist_insert_head(multilist_sublist_t *, void *); +void multilist_sublist_insert_tail(multilist_sublist_t *, void *); +void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); +void multilist_sublist_remove(multilist_sublist_t *, void *); + +void *multilist_sublist_head(multilist_sublist_t *); +void *multilist_sublist_tail(multilist_sublist_t *); +void *multilist_sublist_next(multilist_sublist_t *, void *); +void *multilist_sublist_prev(multilist_sublist_t *, void *); + +void multilist_link_init(multilist_node_t *); +int multilist_link_active(multilist_node_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MULTILIST_H */ diff --git a/include/sys/sa_impl.h b/include/sys/sa_impl.h index fcbd8eb34e91..9dbe8f105d91 100644 --- a/include/sys/sa_impl.h +++ b/include/sys/sa_impl.h @@ -221,8 +221,7 @@ struct sa_handle { (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill) #define SA_GET_HDR(hdl, type) \ - ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \ - type))->db.db_data)) + ((sa_hdr_phys_t *)ABD_TO_BUF((SA_GET_DB(hdl, type))->db.db_data)) #define SA_IDX_TAB_GET(hdl, type) \ (type == SA_BONUS ? 
hdl->sa_bonus_tab : hdl->sa_spill_tab) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index a8dc9510e3e9..6dae2955b847 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -83,7 +83,7 @@ typedef const struct vdev_ops { * Virtual device properties */ struct vdev_cache_entry { - char *ve_data; + abd_t *ve_data; uint64_t ve_offset; clock_t ve_lastused; avl_node_t ve_offset_node; diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index b8eff58bc615..cc534f2c0801 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -610,6 +610,7 @@ extern void delay(clock_t ticks); } while (0); #define max_ncpus 64 +#define num_online_cpus() (sysconf(_SC_NPROCESSORS_ONLN)) #define minclsyspri 60 #define maxclsyspri 99 diff --git a/include/sys/zfs_vfsops.h b/include/sys/zfs_vfsops.h index eeeffbe4c72c..4b88260de2b9 100644 --- a/include/sys/zfs_vfsops.h +++ b/include/sys/zfs_vfsops.h @@ -73,6 +73,7 @@ typedef struct zfs_sb { uint64_t z_nr_znodes; /* number of znodes in the fs */ unsigned long z_rollback_time; /* last online rollback time */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ + arc_prune_t *z_arc_prune; /* called by ARC to prune caches */ struct inode *z_ctldir; /* .zfs directory inode */ avl_tree_t z_ctldir_snaps; /* .zfs/snapshot entries */ kmutex_t z_ctldir_lock; /* .zfs ctldir lock */ diff --git a/include/sys/zio.h b/include/sys/zio.h index 18e7a40a3080..e1b1b8c705cc 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -35,6 +35,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -351,12 +352,12 @@ typedef struct zio_gang_node { } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, - zio_gang_node_t *gn, void *data); + zio_gang_node_t *gn, abd_t *data, uint64_t offset); -typedef void zio_transform_func_t(zio_t *zio, void *data, uint64_t size); +typedef void zio_transform_func_t(zio_t *zio, abd_t *data, uint64_t size); typedef struct zio_transform { - void *zt_orig_data; + abd_t *zt_orig_data; uint64_t zt_orig_size; uint64_t zt_bufsize; zio_transform_func_t *zt_transform; @@ -412,8 +413,8 @@ struct zio { blkptr_t io_bp_orig; /* Data represented by this I/O */ - void *io_data; - void *io_orig_data; + abd_t *io_data; + abd_t *io_orig_data; uint64_t io_size; uint64_t io_orig_size; @@ -463,18 +464,18 @@ extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags); -extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, +extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, const zio_prop_t *zp, + abd_t *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, + abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, @@ -490,12 +491,12 @@ extern zio_t *zio_ioctl(zio_t 
*pio, spa_t *spa, vdev_t *vd, int cmd, zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, + uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, - uint64_t size, void *data, int checksum, + uint64_t size, abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels); @@ -526,12 +527,12 @@ extern void zio_data_buf_free(void *buf, size_t size); extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, - uint64_t offset, void *data, uint64_t size, int type, + uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, zio_priority_t priority, + abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private); extern void zio_vdev_io_bypass(zio_t *zio); diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h index de89bc9a7967..dc01673e653a 100644 --- a/include/sys/zio_checksum.h +++ b/include/sys/zio_checksum.h @@ -34,7 +34,7 @@ extern "C" { /* * Signature for checksum functions. */ -typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); +typedef void zio_checksum_t(abd_t *data, uint64_t size, zio_cksum_t *zcp); /* * Information about each checksum function. @@ -61,10 +61,16 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; /* * Checksum routines. 
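The declarations that follow replace the single zio_checksum_SHA256 entry point with ABD-aware abd_checksum_SHA256() and abd_fletcher_*() routines, since zio_checksum_t now takes an abd_t. Note how the incremental fletcher functions (reworked in zfs_fletcher.h and zfs_fletcher.c further down) gain an int return value and a void *private argument: that makes them signature-compatible with abd_iterate_rfunc() callbacks. Each ABD wrapper can then plausibly be little more than init-plus-iterate; a sketch under that assumption, not the literal implementation:

	void
	abd_fletcher_4_native(abd_t *abd, uint64_t size, zio_cksum_t *zcp)
	{
		fletcher_4_native_init(zcp);
		/* feed each mapped segment to the incremental routine */
		abd_iterate_rfunc(abd, size,
		    fletcher_4_incremental_native, zcp);
	}

Because the one-shot fletcher_4_native() is itself refactored into init-plus-incremental, checksumming a buffer segment by segment yields the same digest as a single call, which is exactly what per-page iteration over a scatter ABD requires.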
*/ -extern zio_checksum_t zio_checksum_SHA256; +extern void abd_checksum_SHA256(abd_t *, uint64_t, zio_cksum_t *); +extern void abd_fletcher_2_native(abd_t *, uint64_t, zio_cksum_t *); +extern void abd_fletcher_2_byteswap(abd_t *, uint64_t, zio_cksum_t *); +extern void abd_fletcher_4_native(abd_t *, uint64_t, zio_cksum_t *); +extern void abd_fletcher_4_byteswap(abd_t *, uint64_t, zio_cksum_t *); + +extern void zio_checksum_SHA256(const void *, uint64_t, zio_cksum_t *); extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, - void *data, uint64_t size); + abd_t *data, uint64_t size); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern enum zio_checksum spa_dedup_checksum(spa_t *spa); diff --git a/include/sys/zpl.h b/include/sys/zpl.h index 3fc5d979f76e..c7701aae57d0 100644 --- a/include/sys/zpl.h +++ b/include/sys/zpl.h @@ -63,7 +63,7 @@ extern const struct file_operations zpl_file_operations; extern const struct file_operations zpl_dir_file_operations; /* zpl_super.c */ -extern void zpl_prune_sbs(int64_t bytes_to_scan, void *private); +extern void zpl_prune_sb(int64_t nr_to_scan, void *arg); typedef struct zpl_mount_data { const char *z_osname; /* Dataset name */ diff --git a/include/zfs_fletcher.h b/include/zfs_fletcher.h index b49df0cf4f0f..8f2e7bb09fcd 100644 --- a/include/zfs_fletcher.h +++ b/include/zfs_fletcher.h @@ -37,14 +37,19 @@ extern "C" { * fletcher checksum functions */ +#define fletcher_init(zcp) ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0) +#define fletcher_2_native_init fletcher_init +#define fletcher_2_byteswap_init fletcher_init +#define fletcher_4_native_init fletcher_init +#define fletcher_4_byteswap_init fletcher_init void fletcher_2_native(const void *, uint64_t, zio_cksum_t *); void fletcher_2_byteswap(const void *, uint64_t, zio_cksum_t *); +int fletcher_2_incremental_native(const void *, uint64_t, void *); +int fletcher_2_incremental_byteswap(const void *, uint64_t, void *); void fletcher_4_native(const void *, uint64_t, zio_cksum_t *); void fletcher_4_byteswap(const void *, uint64_t, zio_cksum_t *); -void fletcher_4_incremental_native(const void *, uint64_t, - zio_cksum_t *); -void fletcher_4_incremental_byteswap(const void *, uint64_t, - zio_cksum_t *); +int fletcher_4_incremental_native(const void *, uint64_t, void *); +int fletcher_4_incremental_byteswap(const void *, uint64_t, void *); #ifdef __cplusplus } diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index e3572914c997..70870f766fbf 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -3297,6 +3298,48 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t *flags, int err; int cleanup_fd; uint64_t action_handle = 0; + struct stat sb; + + /* + * The only way fstat can fail is if we do not have a valid file + * descriptor. + */ + if (fstat(infd, &sb) == -1) { + perror("fstat"); + return (-2); + } + +#ifdef __linux__ +#ifndef F_SETPIPE_SZ +#define F_SETPIPE_SZ (F_SETLEASE + 7) +#endif /* F_SETPIPE_SZ */ + +#ifndef F_GETPIPE_SZ +#define F_GETPIPE_SZ (F_GETLEASE + 7) +#endif /* F_GETPIPE_SZ */ + + /* + * It is not uncommon for gigabytes to be processed in zfs receive. + * Speculatively increase the buffer size via Linux-specific fcntl() + * call. 
+ */ + if (S_ISFIFO(sb.st_mode)) { + FILE *procf = fopen("/proc/sys/fs/pipe-max-size", "r"); + + if (procf != NULL) { + unsigned long max_psize; + long cur_psize; + if (fscanf(procf, "%lu", &max_psize) > 0) { + cur_psize = fcntl(infd, F_GETPIPE_SZ); + if (cur_psize > 0 && + max_psize > (unsigned long) cur_psize) + (void) fcntl(infd, F_SETPIPE_SZ, + max_psize); + } + fclose(procf); + } + } +#endif /* __linux__ */ cleanup_fd = open(ZFS_DEV, O_RDWR); VERIFY(cleanup_fd >= 0); diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 85bc0510a81d..07c4787409a0 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -20,6 +20,7 @@ libzpool_la_SOURCES = \ $(top_srcdir)/module/zcommon/zfs_uio.c \ $(top_srcdir)/module/zcommon/zpool_prop.c \ $(top_srcdir)/module/zcommon/zprop_common.c \ + $(top_srcdir)/module/zfs/abd.c \ $(top_srcdir)/module/zfs/arc.c \ $(top_srcdir)/module/zfs/blkptr.c \ $(top_srcdir)/module/zfs/bplist.c \ @@ -55,6 +56,7 @@ libzpool_la_SOURCES = \ $(top_srcdir)/module/zfs/lzjb.c \ $(top_srcdir)/module/zfs/lz4.c \ $(top_srcdir)/module/zfs/metaslab.c \ + $(top_srcdir)/module/zfs/multilist.c \ $(top_srcdir)/module/zfs/range_tree.c \ $(top_srcdir)/module/zfs/refcount.c \ $(top_srcdir)/module/zfs/rrwlock.c \ diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 321b6285cad6..4b3dc3666db0 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -386,7 +386,11 @@ Use \fB1\fR for yes (default) and \fB0\fR to disable. \fBzfs_arc_meta_limit\fR (ulong) .ad .RS 12n -Meta limit for arc size +The maximum size in bytes that meta data buffers are allowed to +consume in the ARC. When this limit is reached, meta data buffers will +be reclaimed even if the overall arc_c_max has not been reached. This +value defaults to 0, which indicates that 3/4 of the ARC may be used +for meta data. .sp Default value: \fB0\fR. .RE @@ -397,9 +401,14 @@ Default value: \fB0\fR. \fBzfs_arc_meta_prune\fR (int) .ad .RS 12n -Bytes of meta data to prune +The number of dentries and inodes to be scanned looking for entries +which can be dropped. This may be required when the ARC reaches the +\fBzfs_arc_meta_limit\fR because dentries and inodes can pin buffers +in the ARC. Increasing this value will cause the dentry and inode caches +to be pruned more aggressively. Setting this value to 0 will disable +pruning the inode and dentry caches. .sp -Default value: \fB1,048,576\fR. +Default value: \fB10,000\fR. .RE .sp diff --git a/man/man8/zfs.8 b/man/man8/zfs.8 index 45fafb9a9739..ab3d0214b739 100644 --- a/man/man8/zfs.8 +++ b/man/man8/zfs.8 @@ -2773,6 +2773,24 @@ be the origin snapshot, or an earlier snapshot in the origin's filesystem, or the origin's origin, etc. .RE +.sp +.ne 2 +.mk +.na +\fB\fB-e\fR\fR +.ad +.sp .6 +.RS 4n +Generate a more compact stream by using WRITE_EMBEDDED records for blocks +which are stored more compactly on disk by the \fBembedded_data\fR pool +feature. This flag has no effect if the \fBembedded_data\fR feature is +disabled. The receiving system must have the \fBembedded_data\fR feature +enabled. If the \fBlz4_compress\fR feature is active on the sending system, +then the receiving system must have that feature enabled as well. See +\fBzpool-features\fR(5) for details on ZFS feature flags and the +\fBembedded_data\fR feature. +.RE + .RE .sp .ne 2 @@ -2862,24 +2880,6 @@ Do not actually receive the stream.
This can be useful in conjunction with the \ Force a rollback of the file system to the most recent snapshot before performing the receive operation. If receiving an incremental replication stream (for example, one generated by \fBzfs send -R -[iI]\fR), destroy snapshots and file systems that do not exist on the sending side. .RE -.sp -.ne 2 -.na -\fB\fB-e\fR\fR -.ad -.sp .6 -.RS 4n -Generate a more compact stream by using WRITE_EMBEDDED records for blocks -which are stored more compactly on disk by the \fBembedded_data\fR pool -feature. This flag has no effect if the \fBembedded_data\fR feature is -disabled. The receiving system must have the \fBembedded_data\fR feature -enabled. If the \fBlz4_compress\fR feature is active on the sending system, -then the receiving system must have that feature enabled as well. See -\fBzpool-features\fR(5) for details on ZFS feature flags and the -\fBembedded_data\fR feature. -.RE -.RE - .sp .ne 2 .mk diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index edd0cbe6c611..0a7082894866 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -130,15 +130,22 @@ #include #include #include +#include -void -fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +int +fletcher_2_incremental_native(const void *buf, uint64_t size, void *private) { + zio_cksum_t *zcp = private; const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 = zcp->zc_word[0]; + a1 = zcp->zc_word[1]; + b0 = zcp->zc_word[2]; + b1 = zcp->zc_word[3]; + + for (; ip < ipend; ip += 2) { a0 += ip[0]; a1 += ip[1]; b0 += a0; @@ -146,16 +153,30 @@ fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); + return (0); } void -fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_2_native_init(zcp); + fletcher_2_incremental_native(buf, size, zcp); +} + +int +fletcher_2_incremental_byteswap(const void *buf, uint64_t size, void *private) { + zio_cksum_t *zcp = private; const uint64_t *ip = buf; const uint64_t *ipend = ip + (size / sizeof (uint64_t)); uint64_t a0, b0, a1, b1; - for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 = zcp->zc_word[0]; + a1 = zcp->zc_word[1]; + b0 = zcp->zc_word[2]; + b1 = zcp->zc_word[3]; + + for (; ip < ipend; ip += 2) { a0 += BSWAP_64(ip[0]); a1 += BSWAP_64(ip[1]); b0 += a0; @@ -163,46 +184,20 @@ fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) } ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); + return (0); } void -fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) -{ - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - for (a = b = c = d = 0; ip < ipend; ip++) { - a += ip[0]; - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); -} - -void -fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) { - const uint32_t *ip = buf; - const uint32_t *ipend = ip + (size / sizeof (uint32_t)); - uint64_t a, b, c, d; - - for (a = b = c = d = 0; ip < ipend; ip++) { - a += BSWAP_32(ip[0]); - b += a; - c += b; - d += c; - } - - ZIO_SET_CHECKSUM(zcp, a, b, c, d); + fletcher_2_byteswap_init(zcp); + fletcher_2_incremental_byteswap(buf, size, zcp); } -void 
-fletcher_4_incremental_native(const void *buf, uint64_t size, - zio_cksum_t *zcp) +int +fletcher_4_incremental_native(const void *buf, uint64_t size, void *private) { + zio_cksum_t *zcp = private; const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); uint64_t a, b, c, d; @@ -220,12 +215,20 @@ fletcher_4_incremental_native(const void *buf, uint64_t size, } ZIO_SET_CHECKSUM(zcp, a, b, c, d); + return (0); } void -fletcher_4_incremental_byteswap(const void *buf, uint64_t size, - zio_cksum_t *zcp) +fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_4_native_init(zcp); + fletcher_4_incremental_native(buf, size, zcp); +} + +int +fletcher_4_incremental_byteswap(const void *buf, uint64_t size, void *private) { + zio_cksum_t *zcp = private; const uint32_t *ip = buf; const uint32_t *ipend = ip + (size / sizeof (uint32_t)); uint64_t a, b, c, d; @@ -243,11 +246,21 @@ fletcher_4_incremental_byteswap(const void *buf, uint64_t size, } ZIO_SET_CHECKSUM(zcp, a, b, c, d); + return (0); +} + +void +fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_4_byteswap_init(zcp); + fletcher_4_incremental_byteswap(buf, size, zcp); } #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(fletcher_2_native); EXPORT_SYMBOL(fletcher_2_byteswap); +EXPORT_SYMBOL(fletcher_2_incremental_native); +EXPORT_SYMBOL(fletcher_2_incremental_byteswap); EXPORT_SYMBOL(fletcher_4_native); EXPORT_SYMBOL(fletcher_4_byteswap); EXPORT_SYMBOL(fletcher_4_incremental_native); diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 954841f33137..c33976b93bbd 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -4,6 +4,7 @@ EXTRA_CFLAGS = $(ZFS_MODULE_CFLAGS) @KERNELCPPFLAGS@ obj-$(CONFIG_ZFS) := $(MODULE).o +$(MODULE)-objs += @top_srcdir@/module/zfs/abd.o $(MODULE)-objs += @top_srcdir@/module/zfs/arc.o $(MODULE)-objs += @top_srcdir@/module/zfs/blkptr.o $(MODULE)-objs += @top_srcdir@/module/zfs/bplist.o @@ -37,6 +38,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/gzip.o $(MODULE)-objs += @top_srcdir@/module/zfs/lzjb.o $(MODULE)-objs += @top_srcdir@/module/zfs/lz4.o $(MODULE)-objs += @top_srcdir@/module/zfs/metaslab.o +$(MODULE)-objs += @top_srcdir@/module/zfs/multilist.o $(MODULE)-objs += @top_srcdir@/module/zfs/range_tree.o $(MODULE)-objs += @top_srcdir@/module/zfs/refcount.o $(MODULE)-objs += @top_srcdir@/module/zfs/rrwlock.o diff --git a/module/zfs/abd.c b/module/zfs/abd.c new file mode 100644 index 000000000000..9c59839a0bc8 --- /dev/null +++ b/module/zfs/abd.c @@ -0,0 +1,1113 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. 
+ */ + +#include +#include +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include + +#else /* _KERNEL */ + +/* + * Userspace compatibility layer + */ + +/* + * page + */ +struct page; + +#define alloc_page(gfp) \ + ((struct page *)umem_alloc_aligned(PAGE_SIZE, PAGE_SIZE, UMEM_DEFAULT)) + +#define __free_page(page) \ + umem_free(page, PAGE_SIZE) + +/* + * scatterlist + */ +struct scatterlist { + struct page *page; + int length; + int end; +}; + +static void +sg_init_table(struct scatterlist *sg, int nr) { + memset(sg, 0, nr * sizeof (struct scatterlist)); + sg[nr - 1].end = 1; +} + +static inline void +sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, + unsigned int offset) { + /* currently we don't use offset */ + ASSERT(offset == 0); + sg->page = page; + sg->length = len; +} + +static inline struct page * +sg_page(struct scatterlist *sg) { + return (sg->page); +} + +static inline struct scatterlist * +sg_next(struct scatterlist *sg) +{ + if (sg->end) + return (NULL); + return (sg + 1); +} + +/* + * misc + */ +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif + +#ifndef unlikely +#define unlikely(x) (x) +#endif + +#define kmap(page) ((void *)page) +#define kunmap(page) do { } while (0) +#define KM_USER0 (0) +#define KM_USER1 (1) +#define zfs_kmap_atomic(page, type) ((void *)page) +#define zfs_kunmap_atomic(addr, type) do { } while (0) +#define pagefault_disable() do { } while (0) +#define pagefault_enable() do { } while (0) +#define flush_kernel_dcache_page(page) do { } while (0) +#define set_current_state(state) do { } while (0) +static inline long +schedule_timeout(long timeout) +{ + sleep(timeout); + return (0); +} + +#endif /* _KERNEL */ + + +struct abd_miter { + void *addr; /* mapped addr, adjusted by offset */ + int length; /* current segment length, adjusted by offset */ + int offset; /* offset in current segment */ + int is_linear; /* the type of the abd */ + union { + struct scatterlist *sg; + void *buf; + }; + int nents; /* num of sg entries */ + int rw; /* r/w access, whether to flush cache */ +#ifndef HAVE_1ARG_KMAP_ATOMIC + int km_type; /* KM_USER0 or KM_USER1 */ +#endif +}; + +#define ABD_MITER_W (1) +#define ABD_MITER_R (0) + +/* + * Initialize the abd_miter. + * Pass ABD_MITER_W to rw if you will write to the abd buffer. + * Please use abd_miter_init or abd_miter_init2 for one or two iterators + * respectively, they will setup KM_USERx accordingly. + */ +static void +abd_miter_init_km(struct abd_miter *aiter, abd_t *abd, int rw, int km) +{ + ASSERT(abd->abd_nents != 0); + aiter->addr = NULL; + if (ABD_IS_LINEAR(abd)) { + ASSERT(abd->abd_nents == 1); + aiter->is_linear = 1; + aiter->buf = abd->abd_buf; + aiter->length = abd->abd_size; + } else { + aiter->is_linear = 0; + aiter->sg = abd->abd_sgl; + aiter->length = aiter->sg->length - abd->abd_offset; + } + aiter->offset = abd->abd_offset; + aiter->nents = abd->abd_nents; + aiter->rw = rw; +#ifndef HAVE_1ARG_KMAP_ATOMIC + aiter->km_type = (km ? KM_USER1 : KM_USER0); +#endif +} + + +#define abd_miter_init(a, abd, rw) abd_miter_init_km(a, abd, rw, 0) +#define abd_miter_init2(a, aabd, arw, b, babd, brw) \ +do { \ + abd_miter_init_km(a, aabd, arw, 0); \ + abd_miter_init_km(b, babd, brw, 1); \ +} while (0); + +/* + * Map the current page in abd_miter. + * Pass 1 to atmoic if you want to use kmap_atomic. + * This can be safely called when the aiter has already exhausted, in which + * case this does nothing. 
+ * The mapped address and length will be aiter->addr and aiter->length.
+ */
+static void
+abd_miter_map_x(struct abd_miter *aiter, int atomic)
+{
+	void *paddr;
+
+	ASSERT(!aiter->addr);
+
+	if (!aiter->nents)
+		return;
+
+	if (aiter->is_linear) {
+		paddr = aiter->buf;
+		/*
+		 * Turn off page faults to keep the context the same as
+		 * kmap_atomic.
+		 */
+		if (atomic)
+			pagefault_disable();
+	} else {
+		ASSERT(aiter->length == aiter->sg->length - aiter->offset);
+
+		if (atomic)
+			paddr = zfs_kmap_atomic(sg_page(aiter->sg),
+			    aiter->km_type);
+		else
+			paddr = kmap(sg_page(aiter->sg));
+	}
+	aiter->addr = paddr + aiter->offset;
+}
+
+/*
+ * Unmap the current page in abd_miter.
+ * Pass 1 to atomic if you want to use kmap_atomic.
+ * This can be safely called when the aiter has already been exhausted, in
+ * which case this does nothing.
+ */
+static void
+abd_miter_unmap_x(struct abd_miter *aiter, int atomic)
+{
+	void *paddr;
+
+	if (!aiter->nents)
+		return;
+
+	ASSERT(aiter->addr);
+
+	if (aiter->is_linear) {
+		/* only re-enable page faults if we disabled them in map_x */
+		if (atomic)
+			pagefault_enable();
+	} else {
+		paddr = aiter->addr - aiter->offset;
+		if (atomic) {
+			if (aiter->rw == ABD_MITER_W)
+				flush_kernel_dcache_page(sg_page(aiter->sg));
+			zfs_kunmap_atomic(paddr, aiter->km_type);
+		} else {
+			kunmap(sg_page(aiter->sg));
+		}
+	}
+	aiter->addr = NULL;
+}
+
+#define	abd_miter_map_atomic(a)		abd_miter_map_x(a, 1)
+#define	abd_miter_map(a)		abd_miter_map_x(a, 0)
+#define	abd_miter_unmap_atomic(a)	abd_miter_unmap_x(a, 1)
+#define	abd_miter_unmap(a)		abd_miter_unmap_x(a, 0)
+
+/*
+ * Use abd_miter_{,un}map_atomic2 if you want to map 2 abd_miters.
+ * You need to pass the arguments in the same order for these two.
+ */
+#define	abd_miter_map_atomic2(a, b) \
+do { \
+	abd_miter_map_atomic(a); \
+	abd_miter_map_atomic(b); \
+} while (0)
+
+#define	abd_miter_unmap_atomic2(a, b) \
+do { \
+	abd_miter_unmap_atomic(b); \
+	abd_miter_unmap_atomic(a); \
+} while (0)
+
+/*
+ * Advance the iterator by offset.
+ * Cannot be called when a page is mapped.
+ * Returns 0 if exhausted.
+ * This can be safely called when the aiter has already been exhausted, in
+ * which case this does nothing.
+ */
+static int
+abd_miter_advance(struct abd_miter *aiter, int offset)
+{
+	ASSERT(!aiter->addr);
+
+	if (!aiter->nents)
+		return (0);
+
+	aiter->offset += offset;
+	if (aiter->is_linear) {
+		aiter->length -= offset;
+		if (aiter->length <= 0) {
+			aiter->nents--;
+			aiter->length = 0;
+			return (0);
+		}
+	} else {
+		while (aiter->offset >= aiter->sg->length) {
+			aiter->offset -= aiter->sg->length;
+			aiter->nents--;
+			aiter->sg = sg_next(aiter->sg);
+			if (!aiter->nents) {
+				aiter->length = 0;
+				return (0);
+			}
+		}
+		aiter->length = aiter->sg->length - aiter->offset;
+	}
+	return (1);
+}
+
+#define	ABD_CHECK(abd) \
+( \
+{ \
+	ASSERT((abd)->abd_magic == ARC_BUF_DATA_MAGIC); \
+	ASSERT((abd)->abd_size > 0); \
+	if (ABD_IS_LINEAR(abd)) { \
+		ASSERT((abd)->abd_offset == 0); \
+		ASSERT((abd)->abd_nents == 1); \
+	} else { \
+		ASSERT((abd)->abd_offset < PAGE_SIZE); \
+		ASSERT((abd)->abd_nents > 0); \
+	} \
+} \
+)
+
+static void
+abd_iterate_func(abd_t *abd, size_t size,
+    int (*func)(void *, uint64_t, void *), void *private, int rw)
+{
+	size_t len;
+	int stop;
+	struct abd_miter aiter;
+
+	ABD_CHECK(abd);
+	ASSERT(size <= abd->abd_size);
+
+	abd_miter_init(&aiter, abd, rw);
+
+	while (size > 0) {
+		len = MIN(aiter.length, size);
+		ASSERT(len > 0);
+		/*
+		 * The iterated function likely will not do well if each
+		 * segment except the last one is not a multiple of 16.
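+		 * A conforming callback has the iterator signature, e.g.
+		 * the reworked fletcher_4_incremental_native(buf, size,
+		 * private) above, and must give the same result whether it
+		 * is fed one linear buffer or many page-sized segments.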
+		 */
+		ASSERT(size == len || (len & 15) == 0);
+
+		abd_miter_map_atomic(&aiter);
+
+		stop = func(aiter.addr, len, private);
+
+		abd_miter_unmap_atomic(&aiter);
+
+		if (stop)
+			break;
+		size -= len;
+		abd_miter_advance(&aiter, len);
+	}
+}
+
+/*
+ * Iterate over ABD and call a read function @func.
+ * @func should be implemented so that its behaviour is the same whether it
+ * is given linear or scatter segments.
+ */
+void
+abd_iterate_rfunc(abd_t *abd, size_t size,
+    int (*func)(const void *, uint64_t, void *), void *private)
+{
+	/* skip type checking on func */
+	abd_iterate_func(abd, size, (void *)func, private, ABD_MITER_R);
+}
+
+/*
+ * Iterate over ABD and call a write function @func.
+ * @func should be implemented so that its behaviour is the same whether it
+ * is given linear or scatter segments.
+ */
+void
+abd_iterate_wfunc(abd_t *abd, size_t size,
+    int (*func)(void *, uint64_t, void *), void *private)
+{
+	abd_iterate_func(abd, size, func, private, ABD_MITER_W);
+}
+
+/*
+ * Iterate over two ABDs and call @func2.
+ * @func2 should be implemented so that its behaviour is the same whether it
+ * is given linear or scatter segments.
+ */
+void
+abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t dsize, size_t ssize,
+    int (*func2)(void *, void *, uint64_t, uint64_t, void *), void *private)
+{
+	size_t dlen, slen;
+	int stop;
+	struct abd_miter daiter, saiter;
+
+	ABD_CHECK(dabd);
+	ABD_CHECK(sabd);
+
+	ASSERT(dsize <= dabd->abd_size);
+	ASSERT(ssize <= sabd->abd_size);
+
+	abd_miter_init2(&daiter, dabd, ABD_MITER_W,
+	    &saiter, sabd, ABD_MITER_W);
+
+	while (dsize > 0 || ssize > 0) {
+		dlen = MIN(daiter.length, dsize);
+		slen = MIN(saiter.length, ssize);
+
+		/* if anything remains after this run, use equal lengths */
+		if (dsize > dlen || ssize > slen) {
+			if (MIN(dlen, slen) > 0)
+				slen = dlen = MIN(dlen, slen);
+		}
+
+		/* must make progress */
+		ASSERT(dlen > 0 || slen > 0);
+		/*
+		 * The iterated function likely will not do well if each
+		 * segment except the last one is not a multiple of 16.
+ */ + ASSERT(dsize == dlen || (dlen & 15) == 0); + ASSERT(ssize == slen || (slen & 15) == 0); + + abd_miter_map_atomic2(&daiter, &saiter); + + stop = func2(daiter.addr, saiter.addr, dlen, slen, private); + + abd_miter_unmap_atomic2(&daiter, &saiter); + + if (stop) + break; + + dsize -= dlen; + ssize -= slen; + abd_miter_advance(&daiter, dlen); + abd_miter_advance(&saiter, slen); + } +} + +/* + * Copy from @sabd to @dabd + * @doff is offset in dabd + * @soff is offset in sabd + */ +void +abd_copy_off(abd_t *dabd, abd_t *sabd, size_t size, size_t doff, + size_t soff) +{ + size_t len; + struct abd_miter daiter, saiter; + + ABD_CHECK(dabd); + ABD_CHECK(sabd); + + ASSERT(size <= dabd->abd_size); + ASSERT(size <= sabd->abd_size); + + abd_miter_init2(&daiter, dabd, ABD_MITER_W, + &saiter, sabd, ABD_MITER_R); + abd_miter_advance(&daiter, doff); + abd_miter_advance(&saiter, soff); + + while (size > 0) { + len = MIN(daiter.length, size); + len = MIN(len, saiter.length); + ASSERT(len > 0); + + abd_miter_map_atomic2(&daiter, &saiter); + + memcpy(daiter.addr, saiter.addr, len); + + abd_miter_unmap_atomic2(&daiter, &saiter); + + size -= len; + abd_miter_advance(&daiter, len); + abd_miter_advance(&saiter, len); + } +} + +/* + * Copy from @buf to @abd + * @off is the offset in @abd + */ +void +abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t size, + size_t off) +{ + size_t len; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_W); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + memcpy(aiter.addr, buf, len); + + abd_miter_unmap_atomic(&aiter); + + size -= len; + buf += len; + abd_miter_advance(&aiter, len); + } +} + +/* + * Copy from @abd to @buf + * @off is the offset in @abd + */ +void +abd_copy_to_buf_off(void *buf, abd_t *abd, size_t size, size_t off) +{ + size_t len; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_R); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + memcpy(buf, aiter.addr, len); + + abd_miter_unmap_atomic(&aiter); + + size -= len; + buf += len; + abd_miter_advance(&aiter, len); + } +} + +/* + * Compare between @dabd and @sabd. + */ +int +abd_cmp(abd_t *dabd, abd_t *sabd, size_t size) +{ + size_t len; + int ret = 0; + struct abd_miter daiter, saiter; + + ABD_CHECK(dabd); + ABD_CHECK(sabd); + ASSERT(size <= dabd->abd_size); + ASSERT(size <= sabd->abd_size); + + abd_miter_init2(&daiter, dabd, ABD_MITER_R, + &saiter, sabd, ABD_MITER_R); + + while (size > 0) { + len = MIN(daiter.length, size); + len = MIN(len, saiter.length); + ASSERT(len > 0); + + abd_miter_map_atomic2(&daiter, &saiter); + + ret = memcmp(daiter.addr, saiter.addr, len); + + abd_miter_unmap_atomic2(&daiter, &saiter); + + if (ret) + break; + + size -= len; + abd_miter_advance(&daiter, len); + abd_miter_advance(&saiter, len); + } + return (ret); +} + +/* + * Compare between @abd and @buf. 
+ * @off is the offset in @abd + */ +int +abd_cmp_buf_off(abd_t *abd, const void *buf, size_t size, size_t off) +{ + size_t len; + int ret = 0; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_R); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + ret = memcmp(aiter.addr, buf, len); + + abd_miter_unmap_atomic(&aiter); + + if (ret) + break; + + size -= len; + buf += len; + abd_miter_advance(&aiter, len); + } + return (ret); +} + +/* + * Zero out @abd. + * @off is the offset in @abd + */ +void +abd_zero_off(abd_t *abd, size_t size, size_t off) +{ + size_t len; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_W); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + memset(aiter.addr, 0, len); + + abd_miter_unmap_atomic(&aiter); + + size -= len; + abd_miter_advance(&aiter, len); + } +} + +#ifdef _KERNEL +/* + * Copy from @abd to user buffer @buf. + * @off is the offset in @abd + */ +int +abd_copy_to_user_off(void __user *buf, abd_t *abd, size_t size, + size_t off) +{ + int ret = 0; + size_t len; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_R); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + ret = __copy_to_user_inatomic(buf, aiter.addr, len); + + abd_miter_unmap_atomic(&aiter); + if (ret) { + abd_miter_map(&aiter); + ret = copy_to_user(buf, aiter.addr, len); + abd_miter_unmap(&aiter); + if (ret) + break; + } + + size -= len; + buf += len; + abd_miter_advance(&aiter, len); + } + return (ret ? EFAULT : 0); +} + +/* + * Copy from user buffer @buf to @abd. + * @off is the offset in @abd + */ +int +abd_copy_from_user_off(abd_t *abd, const void __user *buf, size_t size, + size_t off) +{ + int ret = 0; + size_t len; + struct abd_miter aiter; + + ABD_CHECK(abd); + ASSERT(size <= abd->abd_size - off); + + abd_miter_init(&aiter, abd, ABD_MITER_W); + abd_miter_advance(&aiter, off); + + while (size > 0) { + len = MIN(aiter.length, size); + ASSERT(len > 0); + + abd_miter_map_atomic(&aiter); + + ret = __copy_from_user_inatomic(aiter.addr, buf, len); + + abd_miter_unmap_atomic(&aiter); + if (ret) { + abd_miter_map(&aiter); + ret = copy_from_user(aiter.addr, buf, len); + abd_miter_unmap(&aiter); + if (ret) + break; + } + + size -= len; + buf += len; + abd_miter_advance(&aiter, len); + } + return (ret ? EFAULT : 0); +} + +/* + * uiomove for ABD. 
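+ * Walks uio->uio_iov, moving cnt = MIN(iov_len, n) bytes per iovec
+ * between the abd and either user or kernel memory (depending on
+ * uio_segflg), advancing both the uio and the abd offset as it goes.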
+ * @off is the offset in @abd
+ */
+int
+abd_uiomove_off(abd_t *abd, size_t n, enum uio_rw rw, uio_t *uio,
+    size_t off)
+{
+	struct iovec *iov;
+	ulong_t cnt;
+
+	while (n && uio->uio_resid) {
+		iov = uio->uio_iov;
+		cnt = MIN(iov->iov_len, n);
+		if (cnt == 0l) {
+			uio->uio_iov++;
+			uio->uio_iovcnt--;
+			continue;
+		}
+		switch (uio->uio_segflg) {
+		case UIO_USERSPACE:
+		case UIO_USERISPACE:
+			/*
+			 * p = kernel data pointer
+			 * iov->iov_base = user data pointer
+			 */
+			if (rw == UIO_READ) {
+				if (abd_copy_to_user_off(iov->iov_base,
+				    abd, cnt, off))
+					return (EFAULT);
+			} else {
+				if (abd_copy_from_user_off(abd,
+				    iov->iov_base, cnt, off))
+					return (EFAULT);
+			}
+			break;
+		case UIO_SYSSPACE:
+			if (rw == UIO_READ)
+				abd_copy_to_buf_off(iov->iov_base, abd,
+				    cnt, off);
+			else
+				abd_copy_from_buf_off(abd, iov->iov_base,
+				    cnt, off);
+			break;
+		}
+		iov->iov_base += cnt;
+		iov->iov_len -= cnt;
+		uio->uio_resid -= cnt;
+		uio->uio_loffset += cnt;
+		off += cnt;
+		n -= cnt;
+	}
+	return (0);
+}
+
+/*
+ * uiocopy for ABD.
+ * @off is the offset in @abd
+ */
+int
+abd_uiocopy_off(abd_t *abd, size_t n, enum uio_rw rw, uio_t *uio,
+    size_t *cbytes, size_t off)
+{
+	struct iovec *iov;
+	ulong_t cnt;
+	int iovcnt;
+
+	iovcnt = uio->uio_iovcnt;
+	*cbytes = 0;
+
+	for (iov = uio->uio_iov; n && iovcnt; iov++, iovcnt--) {
+		cnt = MIN(iov->iov_len, n);
+		if (cnt == 0)
+			continue;
+
+		switch (uio->uio_segflg) {
+
+		case UIO_USERSPACE:
+		case UIO_USERISPACE:
+			/*
+			 * p = kernel data pointer
+			 * iov->iov_base = user data pointer
+			 */
+			if (rw == UIO_READ) {
+				/* UIO_READ = copy data from kernel to user */
+				if (abd_copy_to_user_off(iov->iov_base,
+				    abd, cnt, off))
+					return (EFAULT);
+			} else {
+				/* UIO_WRITE = copy data from user to kernel */
+				if (abd_copy_from_user_off(abd,
+				    iov->iov_base, cnt, off))
+					return (EFAULT);
+			}
+			break;
+
+		case UIO_SYSSPACE:
+			if (rw == UIO_READ)
+				abd_copy_to_buf_off(iov->iov_base, abd,
+				    cnt, off);
+			else
+				abd_copy_from_buf_off(abd, iov->iov_base,
+				    cnt, off);
+			break;
+		}
+		off += cnt;
+		n -= cnt;
+		*cbytes += cnt;
+	}
+	return (0);
+}
+
+/*
+ * bio_map for scatter ABD.
+ * @off is the offset in @abd
+ * Use abd_bio_map_off; it will choose the right function according
+ * to the ABD type.
+ */
+unsigned int
+abd_scatter_bio_map_off(struct bio *bio, abd_t *abd, unsigned int bio_size,
+    size_t off)
+{
+	int i;
+	size_t len;
+	struct abd_miter aiter;
+
+	ABD_CHECK(abd);
+	ASSERT_ABD_SCATTER(abd);
+	ASSERT(bio_size <= abd->abd_size - off);
+
+	abd_miter_init(&aiter, abd, ABD_MITER_R);
+	abd_miter_advance(&aiter, off);
+
+	for (i = 0; i < bio->bi_max_vecs; i++) {
+		if (bio_size <= 0)
+			break;
+
+		len = MIN(bio_size, aiter.length);
+		ASSERT(len > 0);
+
+		if (bio_add_page(bio, sg_page(aiter.sg), len,
+		    aiter.offset) != len)
+			break;
+
+		bio_size -= len;
+		abd_miter_advance(&aiter, len);
+	}
+	return (bio_size);
+}
+
+/*
+ * bio_nr_pages for ABD.
+ * @off is the offset in @abd
+ */
+unsigned long
+abd_bio_nr_pages_off(abd_t *abd, unsigned int bio_size, size_t off)
+{
+	unsigned long pos = 0;
+	ABD_CHECK(abd);
+
+	if (ABD_IS_LINEAR(abd))
+		pos = (unsigned long)abd->abd_buf + off;
+	else
+		pos = abd->abd_offset + off;
+	return (((pos + bio_size + PAGE_SIZE - 1) >> PAGE_SHIFT) -
+	    (pos >> PAGE_SHIFT));
+}
+#endif /* _KERNEL */
+
+static inline abd_t *
+abd_alloc_struct(int nr_pages)
+{
+	abd_t *abd;
+	size_t asize = sizeof (abd_t) + nr_pages * sizeof (struct scatterlist);
+	/*
+	 * If the maximum block size increases, inline sgl might not fit into
+	 * a single page.
+	 * We might want to consider using chained sgl if that's the case.
+	 */
+	ASSERT(nr_pages * sizeof (struct scatterlist) <= PAGE_SIZE);
+#ifndef DEBUG_ABD
+	abd = kmem_alloc(asize, KM_PUSHPAGE);
+#else
+	abd = umem_alloc_aligned(asize, PAGE_SIZE, UMEM_DEFAULT);
+	/* deny access to padding */
+	if (mprotect(abd, PAGE_SIZE, PROT_NONE) != 0) {
+		perror("mprotect failed");
+		ASSERT(0);
+	}
+#endif
+	ASSERT(abd);
+
+	return (abd);
+}
+
+static inline void
+abd_free_struct(abd_t *abd, int nr_pages)
+{
+#ifndef DEBUG_ABD
+	kmem_free(abd, sizeof (abd_t) + nr_pages * sizeof (struct scatterlist));
+#else
+	if (mprotect(abd, PAGE_SIZE, PROT_READ|PROT_WRITE) != 0) {
+		perror("mprotect failed");
+		ASSERT(0);
+	}
+	umem_free(abd, sizeof (abd_t) + nr_pages * sizeof (struct scatterlist));
+#endif
+}
+
+/*
+ * Allocate a new ABD to point to offset @off of the original ABD.
+ * It shares the underlying buffer with the original ABD.
+ * Use abd_put to free. The original ABD (allocated from abd_alloc) must
+ * not be freed before any of its derived ABDs.
+ */
+abd_t *
+abd_get_offset(abd_t *sabd, size_t off)
+{
+	size_t offset;
+	abd_t *abd;
+
+	ABD_CHECK(sabd);
+	ASSERT(off <= sabd->abd_size);
+
+	abd = abd_alloc_struct(0);
+
+	abd->abd_magic = ARC_BUF_DATA_MAGIC;
+	abd->abd_size = sabd->abd_size - off;
+
+	if (ABD_IS_LINEAR(sabd)) {
+		abd->abd_flags = ABD_F_LINEAR;
+		abd->abd_offset = 0;
+		abd->abd_nents = 1;
+		abd->abd_buf = sabd->abd_buf + off;
+	} else {
+		abd->abd_flags = ABD_F_SCATTER;
+		offset = sabd->abd_offset + off;
+		abd->abd_offset = offset & (PAGE_SIZE - 1);
+		/* make sure the new abd starts at sgl[0] */
+		abd->abd_sgl = &sabd->abd_sgl[offset >> PAGE_SHIFT];
+		abd->abd_nents = sabd->abd_nents - (offset >> PAGE_SHIFT);
+	}
+
+	return (abd);
+}
+
+/*
+ * Allocate a linear ABD structure for @buf.
+ * Use abd_put to free.
+ */
+abd_t *
+abd_get_from_buf(void *buf, size_t size)
+{
+	abd_t *abd;
+
+	abd = abd_alloc_struct(0);
+
+	abd->abd_magic = ARC_BUF_DATA_MAGIC;
+	abd->abd_flags = ABD_F_LINEAR;
+	abd->abd_size = size;
+	abd->abd_offset = 0;
+	abd->abd_nents = 1;
+	abd->abd_buf = buf;
+
+	return (abd);
+}
+
+/*
+ * Free an ABD allocated from abd_get_{offset,from_buf}.
+ * Must not be used on ABDs from elsewhere.
+ * Will not free the underlying scatterlist or buffer.
+ */
+void
+abd_put(abd_t *abd)
+{
+	ABD_CHECK(abd);
+	ASSERT(!(abd->abd_flags & ABD_F_OWNER));
+
+	abd->abd_magic = 0;
+	abd_free_struct(abd, 0);
+}
+
+/*
+ * Allocate a scatter ABD.
+ */
+abd_t *
+abd_alloc_scatter(size_t size)
+{
+	abd_t *abd;
+	struct page *page;
+	int i, n = DIV_ROUND_UP(size, PAGE_SIZE);
+	size_t last_size = size - ((n - 1) << PAGE_SHIFT);
+
+	abd = abd_alloc_struct(n);
+
+	abd->abd_magic = ARC_BUF_DATA_MAGIC;
+	abd->abd_flags = ABD_F_SCATTER|ABD_F_OWNER;
+	abd->abd_size = size;
+	abd->abd_offset = 0;
+	abd->abd_nents = n;
+	abd->abd_sgl = (struct scatterlist *)&abd->__abd_sgl[0];
+	sg_init_table(abd->abd_sgl, n);
+
+	for (i = 0; i < n; i++) {
+retry:
+		page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
+		if (unlikely(page == NULL)) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(1);
+			goto retry;
+		}
+		sg_set_page(&abd->abd_sgl[i], page,
+		    (i == n-1 ? last_size : PAGE_SIZE), 0);
+	}
+
+	return (abd);
+}
+
+/*
+ * Allocate a linear ABD.
+ */
+abd_t *
+abd_alloc_linear(size_t size)
+{
+	abd_t *abd;
+
+	abd = abd_alloc_struct(0);
+
+	abd->abd_magic = ARC_BUF_DATA_MAGIC;
+	abd->abd_flags = ABD_F_LINEAR|ABD_F_OWNER;
+	abd->abd_size = size;
+	abd->abd_offset = 0;
+	abd->abd_nents = 1;
+
+	abd->abd_buf = zio_buf_alloc(size);
+
+	return (abd);
+}
+
+static void
+abd_free_scatter(abd_t *abd, size_t size)
+{
+	int i, n;
+	struct page *page;
+
+	ASSERT(abd->abd_sgl == (struct scatterlist *)&abd->__abd_sgl[0]);
+	ASSERT(abd->abd_size == size);
+	ASSERT(abd->abd_nents == DIV_ROUND_UP(abd->abd_size, PAGE_SIZE));
+
+	n = abd->abd_nents;
+	abd->abd_magic = 0;
+	for (i = 0; i < n; i++) {
+		page = sg_page(&abd->abd_sgl[i]);
+		if (page)
+			__free_page(page);
+	}
+	abd_free_struct(abd, n);
+}
+
+static void
+abd_free_linear(abd_t *abd, size_t size)
+{
+	abd->abd_magic = 0;
+	zio_buf_free(abd->abd_buf, size);
+	abd_free_struct(abd, 0);
+}
+
+/*
+ * Free an ABD.
+ * Only use this on ABDs allocated with abd_alloc_{scatter,linear}.
+ */
+void
+abd_free(abd_t *abd, size_t size)
+{
+	ABD_CHECK(abd);
+	ASSERT(abd->abd_flags & ABD_F_OWNER);
+	if (ABD_IS_LINEAR(abd))
+		abd_free_linear(abd, size);
+	else
+		abd_free_scatter(abd, size);
+}
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index 9a81b4c59944..f35a7abcbd5e 100644
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -118,7 +118,7 @@
  * Note that the majority of the performance stats are manipulated
  * with atomic operations.
  *
- * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
+ * The L2ARC uses the l2ad_mtx on each vdev for the following:
  *
  *	- L2ARC buflist creation
  *	- L2ARC buflist eviction
@@ -135,6 +135,8 @@
 #include
 #include
 #include
+#include
+#include
 #ifdef _KERNEL
 #include
 #include
@@ -154,12 +156,17 @@
 boolean_t arc_watch = B_FALSE;
 #endif
 
-static kmutex_t		arc_reclaim_thr_lock;
-static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
-static uint8_t		arc_thread_exit;
+static kmutex_t		arc_reclaim_lock;
+static kcondvar_t	arc_reclaim_thread_cv;
+static boolean_t	arc_reclaim_thread_exit;
+static kcondvar_t	arc_reclaim_waiters_cv;
 
-/* number of bytes to prune from caches when at arc_meta_limit is reached */
-int zfs_arc_meta_prune = 1048576;
+static kmutex_t		arc_user_evicts_lock;
+static kcondvar_t	arc_user_evicts_cv;
+static boolean_t	arc_user_evicts_thread_exit;
+
+/* number of objects to prune from caches when arc_meta_limit is reached */
+int zfs_arc_meta_prune = 10000;
 
 typedef enum arc_reclaim_strategy {
 	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
@@ -167,13 +174,26 @@ typedef enum arc_reclaim_strategy {
 } arc_reclaim_strategy_t;
 
 /*
- * The number of iterations through arc_evict_*() before we
- * drop & reacquire the lock.
+ * The number of headers to evict in arc_evict_state_impl() before
+ * dropping the sublist lock and evicting from another sublist. A lower
+ * value means we're more likely to evict the "correct" header (i.e. the
+ * oldest header in the arc state), but comes with higher overhead
+ * (i.e. more invocations of arc_evict_state_impl()).
+ */
+int zfs_arc_evict_batch_limit = 10;
+
+/*
+ * The number of sublists used for each of the arc state lists. If this
+ * is not set to a suitable value by the user, it will be configured to
+ * the number of CPUs on the system in arc_init().
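+ * Each sublist is protected by its own lock, so (for example) on an
+ * 8-CPU system insertions and removals spread across 8 locks instead
+ * of contending on a single per-state list lock.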
  */
-int arc_evict_iterations = 100;
+int zfs_arc_num_sublists_per_state = 0;
 
 /* number of seconds before growing cache again */
-int zfs_arc_grow_retry = 5;
+int zfs_arc_grow_retry = 20;
+
+/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
+int zfs_arc_overflow_shift = 8;
 
 /* disable anon data aggressively growing arc_p */
 int zfs_arc_p_aggressive_disable = 1;
@@ -182,7 +202,7 @@ int zfs_arc_p_aggressive_disable = 1;
 int zfs_arc_p_dampener_disable = 1;
 
 /* log2(fraction of arc to reclaim) */
-int zfs_arc_shrink_shift = 5;
+int zfs_arc_shrink_shift = 4;
 
 /*
  * minimum lifespan of a prefetch block in clock ticks
@@ -199,6 +219,12 @@ int zfs_disable_dup_eviction = 0;
 /* average block used to size buf_hash_table */
 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 
+/*
+ * minimum lifespan of a prefetch block in clock ticks
+ * (initialized in arc_init())
+ */
+static int		arc_min_prefetch_lifespan;
+
 /*
  * If this percent of memory is free, don't throttle.
  */
@@ -220,6 +246,7 @@ static boolean_t arc_warm;
 unsigned long zfs_arc_max = 0;
 unsigned long zfs_arc_min = 0;
 unsigned long zfs_arc_meta_limit = 0;
+unsigned long zfs_arc_meta_min = 0;
 
 /* The 6 states: */
 static arc_state_t ARC_anon;
@@ -245,7 +272,6 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_mfu_hits;
 	kstat_named_t arcstat_mfu_ghost_hits;
 	kstat_named_t arcstat_deleted;
-	kstat_named_t arcstat_recycle_miss;
 	/*
 	 * Number of buffers that could not be evicted because the hash lock
 	 * was held by another thread. The lock may not necessarily be held
@@ -259,9 +285,15 @@ typedef struct arc_stats {
 	 * not from the spa we're trying to evict from.
 	 */
 	kstat_named_t arcstat_evict_skip;
+	/*
+	 * Number of times arc_evict_state() was unable to evict enough
+	 * buffers to reach its target amount.
+ */ + kstat_named_t arcstat_evict_not_enough; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; kstat_named_t arcstat_evict_l2_ineligible; + kstat_named_t arcstat_evict_l2_skip; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; @@ -300,9 +332,10 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; - kstat_named_t arcstat_l2_writes_hdr_miss; + kstat_named_t arcstat_l2_writes_lock_retry; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_cdata_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; @@ -327,6 +360,7 @@ typedef struct arc_stats { kstat_named_t arcstat_meta_used; kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_meta_max; + kstat_named_t arcstat_meta_min; } arc_stats_t; static arc_stats_t arc_stats = { @@ -345,12 +379,13 @@ static arc_stats_t arc_stats = { { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, - { "recycle_miss", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, + { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, + { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, @@ -389,9 +424,10 @@ static arc_stats_t arc_stats = { { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, - { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, + { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, + { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, @@ -416,6 +452,7 @@ static arc_stats_t arc_stats = { { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, + { "arc_meta_min", KSTAT_DATA_UINT64 }, }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -481,6 +518,7 @@ static arc_state_t *arc_l2c_only; #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ +#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ @@ -490,60 +528,50 @@ static arc_state_t *arc_l2c_only; static list_t arc_prune_list; static kmutex_t arc_prune_mtx; static arc_buf_t *arc_eviction_list; -static kmutex_t arc_eviction_mtx; static arc_buf_hdr_t arc_eviction_hdr; -static void arc_get_data_buf(arc_buf_t *buf); -static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); -static int arc_evict_needed(arc_buf_contents_t type); -static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type); -static void arc_buf_watch(arc_buf_t *buf); - -static boolean_t l2arc_write_eligible(uint64_t 
spa_guid, arc_buf_hdr_t *ab); #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) -/* - * Private ARC flags. These flags are private ARC only flags that will show up - * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can - * be passed in as arc_flags in things like arc_read. However, these flags - * should never be passed and should only be set by ARC code. When adding new - * public flags, make sure not to smash the private ones. - */ - -#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ -#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ -#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ -#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ -#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ -#define ARC_INDIRECT (1 << 14) /* this is an indirect block */ -#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ -#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ -#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ -#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ - -#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) -#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) -#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) -#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) -#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) -#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) -#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) -#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE) -#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ - (hdr)->b_l2hdr != NULL) -#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) -#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) -#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) +#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) +#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) +#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) +#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) +#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) +#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) + +#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) +#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) +#define HDR_L2_READING(hdr) \ + (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ + ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) +#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) +#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) +#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) + +#define HDR_ISTYPE_METADATA(hdr) \ + ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) +#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) + +#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) +#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) + +/* For storing compression mode in b_flags */ +#define HDR_COMPRESS_OFFSET 24 +#define HDR_COMPRESS_NBITS 7 + +#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET(hdr->b_flags, \ + HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS)) +#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \ + HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp)) /* * Other sizes */ -#define HDR_SIZE ((int64_t)sizeof 
(arc_buf_hdr_t)) -#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) +#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines @@ -613,7 +641,6 @@ static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ -static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ @@ -628,24 +655,16 @@ typedef struct l2arc_read_callback { enum zio_compress l2rcb_compress; /* applied compress */ } l2arc_read_callback_t; -struct l2arc_buf_hdr { - /* protected by arc_buf_hdr mutex */ - l2arc_dev_t *b_dev; /* L2ARC device */ - uint64_t b_daddr; /* disk address, offset byte */ - /* compression applied to buffer data */ - enum zio_compress b_compress; - /* real alloc'd buffer size depending on b_compress applied */ - uint32_t b_hits; - uint64_t b_asize; - /* temporary buffer holder for in-flight compressed data */ - void *b_tmp_cdata; -}; +typedef struct l2arc_write_callback { + l2arc_dev_t *l2wcb_dev; /* device info */ + arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ +} l2arc_write_callback_t; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ - void *l2df_data; + abd_t *l2df_data; size_t l2df_size; - void (*l2df_func)(void *, size_t); + void (*l2df_func)(abd_t *, size_t); list_node_t l2df_list_node; } l2arc_data_free_t; @@ -653,14 +672,20 @@ static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; -static void l2arc_read_done(zio_t *zio); -static void l2arc_hdr_stat_add(void); -static void l2arc_hdr_stat_remove(void); +static void arc_get_data_buf(arc_buf_t *); +static void arc_access(arc_buf_hdr_t *, kmutex_t *); +static boolean_t arc_is_overflowing(void); +static void arc_buf_watch(arc_buf_t *); + +static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); +static uint32_t arc_bufc_to_flags(arc_buf_contents_t); + +static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); +static void l2arc_read_done(zio_t *); -static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr); -static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, - enum zio_compress c); -static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab); +static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); +static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); +static void l2arc_release_cdata_buf(arc_buf_hdr_t *); static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) @@ -681,8 +706,7 @@ buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) #define BUF_EMPTY(buf) \ ((buf)->b_dva.dva_word[0] == 0 && \ - (buf)->b_dva.dva_word[1] == 0 && \ - (buf)->b_cksum0 == 0) + (buf)->b_dva.dva_word[1] == 0) #define BUF_EQUAL(spa, dva, birth, buf) \ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ @@ -695,7 +719,6 @@ buf_discard_identity(arc_buf_hdr_t *hdr) hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; - hdr->b_cksum0 = 0; } static arc_buf_hdr_t * @@ -705,14 +728,14 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) uint64_t birth = BP_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = 
BUF_HASH_LOCK(idx); - arc_buf_hdr_t *buf; + arc_buf_hdr_t *hdr; mutex_enter(hash_lock); - for (buf = buf_hash_table.ht_table[idx]; buf != NULL; - buf = buf->b_hash_next) { - if (BUF_EQUAL(spa, dva, birth, buf)) { + for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; + hdr = hdr->b_hash_next) { + if (BUF_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; - return (buf); + return (hdr); } } mutex_exit(hash_lock); @@ -725,29 +748,36 @@ buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. + * If lockp == NULL, the caller is assumed to already hold the hash lock. */ static arc_buf_hdr_t * -buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) +buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { - uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); - arc_buf_hdr_t *fbuf; + arc_buf_hdr_t *fhdr; uint32_t i; - ASSERT(!DVA_IS_EMPTY(&buf->b_dva)); - ASSERT(buf->b_birth != 0); - ASSERT(!HDR_IN_HASH_TABLE(buf)); - *lockp = hash_lock; - mutex_enter(hash_lock); - for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; - fbuf = fbuf->b_hash_next, i++) { - if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) - return (fbuf); + ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); + ASSERT(hdr->b_birth != 0); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + + if (lockp != NULL) { + *lockp = hash_lock; + mutex_enter(hash_lock); + } else { + ASSERT(MUTEX_HELD(hash_lock)); + } + + for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; + fhdr = fhdr->b_hash_next, i++) { + if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) + return (fhdr); } - buf->b_hash_next = buf_hash_table.ht_table[idx]; - buf_hash_table.ht_table[idx] = buf; - buf->b_flags |= ARC_IN_HASH_TABLE; + hdr->b_hash_next = buf_hash_table.ht_table[idx]; + buf_hash_table.ht_table[idx] = hdr; + hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; /* collect some hash table performance data */ if (i > 0) { @@ -765,22 +795,22 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) } static void -buf_hash_remove(arc_buf_hdr_t *buf) +buf_hash_remove(arc_buf_hdr_t *hdr) { - arc_buf_hdr_t *fbuf, **bufp; - uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); + arc_buf_hdr_t *fhdr, **hdrp; + uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); - ASSERT(HDR_IN_HASH_TABLE(buf)); + ASSERT(HDR_IN_HASH_TABLE(hdr)); - bufp = &buf_hash_table.ht_table[idx]; - while ((fbuf = *bufp) != buf) { - ASSERT(fbuf != NULL); - bufp = &fbuf->b_hash_next; + hdrp = &buf_hash_table.ht_table[idx]; + while ((fhdr = *hdrp) != hdr) { + ASSERT(fhdr != NULL); + hdrp = &fhdr->b_hash_next; } - *bufp = buf->b_hash_next; - buf->b_hash_next = NULL; - buf->b_flags &= ~ARC_IN_HASH_TABLE; + *hdrp = hdr->b_hash_next; + hdr->b_hash_next = NULL; + hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); @@ -793,9 +823,9 @@ buf_hash_remove(arc_buf_hdr_t *buf) /* * Global data structures and functions for the buf kmem cache. 
*/ -static kmem_cache_t *hdr_cache; +static kmem_cache_t *hdr_full_cache; +static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; -static kmem_cache_t *l2arc_hdr_cache; static void buf_fini(void) @@ -815,9 +845,9 @@ buf_fini(void) #endif for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); - kmem_cache_destroy(hdr_cache); + kmem_cache_destroy(hdr_full_cache); + kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); - kmem_cache_destroy(l2arc_hdr_cache); } /* @@ -826,17 +856,30 @@ buf_fini(void) */ /* ARGSUSED */ static int -hdr_cons(void *vbuf, void *unused, int kmflag) +hdr_full_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *hdr = vbuf; + + bzero(hdr, HDR_FULL_SIZE); + cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); + refcount_create(&hdr->b_l1hdr.b_refcnt); + mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + list_link_init(&hdr->b_l1hdr.b_arc_node); + list_link_init(&hdr->b_l2hdr.b_l2node); + multilist_link_init(&hdr->b_l1hdr.b_arc_node); + arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); + + return (0); +} + +/* ARGSUSED */ +static int +hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { - arc_buf_hdr_t *buf = vbuf; + arc_buf_hdr_t *hdr = vbuf; - bzero(buf, sizeof (arc_buf_hdr_t)); - refcount_create(&buf->b_refcnt); - cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - list_link_init(&buf->b_arc_node); - list_link_init(&buf->b_l2node); - arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); + bzero(hdr, HDR_L2ONLY_SIZE); + arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } @@ -860,15 +903,26 @@ buf_cons(void *vbuf, void *unused, int kmflag) */ /* ARGSUSED */ static void -hdr_dest(void *vbuf, void *unused) +hdr_full_dest(void *vbuf, void *unused) { - arc_buf_hdr_t *buf = vbuf; + arc_buf_hdr_t *hdr = vbuf; - ASSERT(BUF_EMPTY(buf)); - refcount_destroy(&buf->b_refcnt); - cv_destroy(&buf->b_cv); - mutex_destroy(&buf->b_freeze_lock); - arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); + ASSERT(BUF_EMPTY(hdr)); + cv_destroy(&hdr->b_l1hdr.b_cv); + refcount_destroy(&hdr->b_l1hdr.b_refcnt); + mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); +} + +/* ARGSUSED */ +static void +hdr_l2only_dest(void *vbuf, void *unused) +{ + ASSERTV(arc_buf_hdr_t *hdr = vbuf); + + ASSERT(BUF_EMPTY(hdr)); + arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } /* ARGSUSED */ @@ -915,12 +969,13 @@ buf_init(void) goto retry; } - hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), - 0, hdr_cons, hdr_dest, NULL, NULL, NULL, 0); + hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, + 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); + hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", + HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, + NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); - l2arc_hdr_cache = kmem_cache_create("l2arc_buf_hdr_t", L2HDR_SIZE, - 0, NULL, NULL, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) @@ -932,6 +987,94 @@ buf_init(void) } } +/* + * Transition between the two allocation states for the arc_buf_hdr struct. 
+ * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
+ * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
+ * version is used when a cache buffer is only in the L2ARC in order to reduce
+ * memory usage.
+ */
+static arc_buf_hdr_t *
+arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
+{
+	arc_buf_hdr_t *nhdr;
+	l2arc_dev_t *dev;
+
+	ASSERT(HDR_HAS_L2HDR(hdr));
+	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
+	    (old == hdr_l2only_cache && new == hdr_full_cache));
+
+	dev = hdr->b_l2hdr.b_dev;
+	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
+
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
+	buf_hash_remove(hdr);
+
+	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
+	if (new == hdr_full_cache) {
+		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
+		/*
+		 * arc_access and arc_change_state need to be aware that a
+		 * header has just come out of L2ARC, so we set its state to
+		 * l2c_only even though it's about to change.
+		 */
+		nhdr->b_l1hdr.b_state = arc_l2c_only;
+
+		/* Verify previous threads set to NULL before freeing */
+		ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+	} else {
+		ASSERT(hdr->b_l1hdr.b_buf == NULL);
+		ASSERT0(hdr->b_l1hdr.b_datacnt);
+
+		/*
+		 * If we've reached here, we must have been called from
+		 * arc_evict_hdr(); as such, we should have already been
+		 * removed from any ghost list we were previously on
+		 * (which protects us from racing with arc_evict_state),
+		 * thus no locking is needed during this check.
+		 */
+		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+
+		/*
+		 * A buffer must not be moved into the arc_l2c_only
+		 * state if it's not finished being written out to the
+		 * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
+		 * might try to be accessed, even though it was removed.
+		 */
+		VERIFY(!HDR_L2_WRITING(hdr));
+		VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+
+		nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
+	}
+	/*
+	 * The header has been reallocated so we need to re-insert it into any
+	 * lists it was on.
+	 */
+	(void) buf_hash_insert(nhdr, NULL);
+
+	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
+
+	mutex_enter(&dev->l2ad_mtx);
+
+	/*
+	 * We must place the realloc'ed header back into the list at
+	 * the same spot. Otherwise, if it's placed earlier in the list,
+	 * l2arc_write_buffers() could find it during the function's
+	 * write phase, and try to write it out to the l2arc.
+ */ + list_insert_after(&dev->l2ad_buflist, hdr, nhdr); + list_remove(&dev->l2ad_buflist, hdr); + + mutex_exit(&dev->l2ad_mtx); + + buf_discard_identity(hdr); + hdr->b_freeze_cksum = NULL; + kmem_cache_free(old, hdr); + + return (nhdr); +} + + #define ARC_MINTIME (hz>>4) /* 62 ms */ static void @@ -942,16 +1085,15 @@ arc_cksum_verify(arc_buf_t *buf) if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum == NULL || - (buf->b_hdr->b_flags & ARC_IO_ERROR)) { - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return; } - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + abd_fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) panic("buffer modified while frozen!"); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); } static int @@ -960,10 +1102,10 @@ arc_cksum_equal(arc_buf_t *buf) zio_cksum_t zc; int equal; - mutex_enter(&buf->b_hdr->b_freeze_lock); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); + abd_fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return (equal); } @@ -974,16 +1116,16 @@ arc_cksum_compute(arc_buf_t *buf, boolean_t force) if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return; } buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); - fletcher_2_native(buf->b_data, buf->b_hdr->b_size, + abd_fletcher_2_native(buf->b_data, buf->b_hdr->b_size, buf->b_hdr->b_freeze_cksum); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); arc_buf_watch(buf); } @@ -993,6 +1135,13 @@ arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) { panic("Got SIGSEGV at address: 0x%lx\n", (long) si->si_addr); } + +static int +arc_abd_watch(const void *buf, uint64_t len, void *private) +{ + ASSERT0(mprotect((void *)buf, len, *(int *)private)); + return (0); +} #endif /* ARGSUSED */ @@ -1001,8 +1150,9 @@ arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { - ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, - PROT_READ | PROT_WRITE)); + int prot = PROT_READ | PROT_WRITE; + abd_iterate_rfunc(buf->b_data, buf->b_hdr->b_size, + arc_abd_watch, &prot); } #endif } @@ -1012,29 +1162,58 @@ static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL - if (arc_watch) - ASSERT0(mprotect(buf->b_data, buf->b_hdr->b_size, PROT_READ)); + if (arc_watch) { + int prot = PROT_READ; + abd_iterate_rfunc(buf->b_data, buf->b_hdr->b_size, + arc_abd_watch, &prot); + } #endif } +static arc_buf_contents_t +arc_buf_type(arc_buf_hdr_t *hdr) +{ + if (HDR_ISTYPE_METADATA(hdr)) { + return (ARC_BUFC_METADATA); + } else { + return (ARC_BUFC_DATA); + } +} + +static uint32_t +arc_bufc_to_flags(arc_buf_contents_t type) +{ + switch (type) { + case ARC_BUFC_DATA: + /* metadata field is 0 if buffer contains normal data */ + return (0); + case ARC_BUFC_METADATA: + return 
(ARC_FLAG_BUFC_METADATA); + default: + break; + } + panic("undefined ARC buffer type!"); + return ((uint32_t)-1); +} + void arc_buf_thaw(arc_buf_t *buf) { if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_state != arc_anon) + if (buf->b_hdr->b_l1hdr.b_state != arc_anon) panic("modifying non-anon buffer!"); - if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) + if (HDR_IO_IN_PROGRESS(buf->b_hdr)) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } - mutex_enter(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); buf->b_hdr->b_freeze_cksum = NULL; } - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); arc_buf_unwatch(buf); } @@ -1051,62 +1230,72 @@ arc_buf_freeze(arc_buf_t *buf) mutex_enter(hash_lock); ASSERT(buf->b_hdr->b_freeze_cksum != NULL || - buf->b_hdr->b_state == arc_anon); + buf->b_hdr->b_l1hdr.b_state == arc_anon); arc_cksum_compute(buf, B_FALSE); mutex_exit(hash_lock); } static void -add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { + arc_state_t *state; + + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(MUTEX_HELD(hash_lock)); - if ((refcount_add(&ab->b_refcnt, tag) == 1) && - (ab->b_state != arc_anon)) { - uint64_t delta = ab->b_size * ab->b_datacnt; - list_t *list = &ab->b_state->arcs_list[ab->b_type]; - uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; - - ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); - mutex_enter(&ab->b_state->arcs_mtx); - ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(list, ab); - if (GHOST_STATE(ab->b_state)) { - ASSERT0(ab->b_datacnt); - ASSERT3P(ab->b_buf, ==, NULL); - delta = ab->b_size; + state = hdr->b_l1hdr.b_state; + + if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && + (state != arc_anon)) { + /* We don't use the L2-only state list. */ + if (state != arc_l2c_only) { + arc_buf_contents_t type = arc_buf_type(hdr); + uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; + multilist_t *list = &state->arcs_list[type]; + uint64_t *size = &state->arcs_lsize[type]; + + multilist_remove(list, hdr); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_datacnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + delta = hdr->b_size; + } + ASSERT(delta > 0); + ASSERT3U(*size, >=, delta); + atomic_add_64(size, -delta); } - ASSERT(delta > 0); - ASSERT3U(*size, >=, delta); - atomic_add_64(size, -delta); - mutex_exit(&ab->b_state->arcs_mtx); /* remove the prefetch flag if we get a reference */ - if (ab->b_flags & ARC_PREFETCH) - ab->b_flags &= ~ARC_PREFETCH; + hdr->b_flags &= ~ARC_FLAG_PREFETCH; } } static int -remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) +remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { int cnt; - arc_state_t *state = ab->b_state; + arc_state_t *state = hdr->b_l1hdr.b_state; + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); - if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && + /* + * arc_l2c_only counts as a ghost state so we don't need to explicitly + * check to prevent usage of the arc_l2c_only list. 
+ */ + if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { - uint64_t *size = &state->arcs_lsize[ab->b_type]; + arc_buf_contents_t type = arc_buf_type(hdr); + multilist_t *list = &state->arcs_list[type]; + uint64_t *size = &state->arcs_lsize[type]; - ASSERT(!MUTEX_HELD(&state->arcs_mtx)); - mutex_enter(&state->arcs_mtx); - ASSERT(!list_link_active(&ab->b_arc_node)); - list_insert_head(&state->arcs_list[ab->b_type], ab); - ASSERT(ab->b_datacnt > 0); - atomic_add_64(size, ab->b_size * ab->b_datacnt); - mutex_exit(&state->arcs_mtx); + multilist_insert(list, hdr); + + ASSERT(hdr->b_l1hdr.b_datacnt > 0); + atomic_add_64(size, hdr->b_size * + hdr->b_l1hdr.b_datacnt); } return (cnt); } @@ -1122,31 +1311,46 @@ void arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) { arc_buf_hdr_t *hdr = ab->b_hdr; - arc_state_t *state = hdr->b_state; + l1arc_buf_hdr_t *l1hdr = NULL; + l2arc_buf_hdr_t *l2hdr = NULL; + arc_state_t *state = NULL; + + if (HDR_HAS_L1HDR(hdr)) { + l1hdr = &hdr->b_l1hdr; + state = l1hdr->b_state; + } + if (HDR_HAS_L2HDR(hdr)) + l2hdr = &hdr->b_l2hdr; memset(abi, 0, sizeof (arc_buf_info_t)); abi->abi_flags = hdr->b_flags; - abi->abi_datacnt = hdr->b_datacnt; + + if (l1hdr) { + abi->abi_datacnt = l1hdr->b_datacnt; + abi->abi_access = l1hdr->b_arc_access; + abi->abi_mru_hits = l1hdr->b_mru_hits; + abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; + abi->abi_mfu_hits = l1hdr->b_mfu_hits; + abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; + abi->abi_holds = refcount_count(&l1hdr->b_refcnt); + } + + if (l2hdr) { + abi->abi_l2arc_dattr = l2hdr->b_daddr; + abi->abi_l2arc_asize = l2hdr->b_asize; + abi->abi_l2arc_compress = HDR_GET_COMPRESS(hdr); + abi->abi_l2arc_hits = l2hdr->b_hits; + } + abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; - abi->abi_state_contents = hdr->b_type; + abi->abi_state_contents = arc_buf_type(hdr); abi->abi_state_index = -1; abi->abi_size = hdr->b_size; - abi->abi_access = hdr->b_arc_access; - abi->abi_mru_hits = hdr->b_mru_hits; - abi->abi_mru_ghost_hits = hdr->b_mru_ghost_hits; - abi->abi_mfu_hits = hdr->b_mfu_hits; - abi->abi_mfu_ghost_hits = hdr->b_mfu_ghost_hits; - abi->abi_holds = refcount_count(&hdr->b_refcnt); - - if (hdr->b_l2hdr) { - abi->abi_l2arc_dattr = hdr->b_l2hdr->b_daddr; - abi->abi_l2arc_asize = hdr->b_l2hdr->b_asize; - abi->abi_l2arc_compress = hdr->b_l2hdr->b_compress; - abi->abi_l2arc_hits = hdr->b_l2hdr->b_hits; - } - if (state && state_index && list_link_active(&hdr->b_arc_node)) { - list_t *list = &state->arcs_list[hdr->b_type]; +#if 0 /* XXX tsc */ + if (l1hdr && state && state_index && + list_link_active(&l1hdr->b_arc_node)) { + multilist_t *list = &state->arcs_list[arc_buf_type(hdr)]; arc_buf_hdr_t *h; mutex_enter(&state->arcs_mtx); @@ -1157,97 +1361,113 @@ arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) } mutex_exit(&state->arcs_mtx); } +#endif } /* - * Move the supplied buffer to the indicated state. The mutex + * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. 
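+ * For example, arc_access() promotes a repeatedly accessed MRU header
+ * to the MFU state by calling this function with new_state == arc_mfu.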
*/ static void -arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, + kmutex_t *hash_lock) { - arc_state_t *old_state = ab->b_state; - int64_t refcnt = refcount_count(&ab->b_refcnt); + arc_state_t *old_state; + int64_t refcnt; + uint32_t datacnt; uint64_t from_delta, to_delta; + arc_buf_contents_t buftype = arc_buf_type(hdr); + + /* + * We almost always have an L1 hdr here, since we call arc_hdr_realloc() + * in arc_read() when bringing a buffer out of the L2ARC. However, the + * L1 hdr doesn't always exist when we change state to arc_anon before + * destroying a header, in which case reallocating to add the L1 hdr is + * pointless. + */ + if (HDR_HAS_L1HDR(hdr)) { + old_state = hdr->b_l1hdr.b_state; + refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); + datacnt = hdr->b_l1hdr.b_datacnt; + } else { + old_state = arc_l2c_only; + refcnt = 0; + datacnt = 0; + } ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); - ASSERT(refcnt == 0 || ab->b_datacnt > 0); - ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); - ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); + ASSERT(refcnt == 0 || datacnt > 0); + ASSERT(!GHOST_STATE(new_state) || datacnt == 0); + ASSERT(old_state != arc_anon || datacnt <= 1); - from_delta = to_delta = ab->b_datacnt * ab->b_size; + from_delta = to_delta = datacnt * hdr->b_size; /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { - if (old_state != arc_anon) { - int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); - uint64_t *size = &old_state->arcs_lsize[ab->b_type]; + if (old_state != arc_anon && old_state != arc_l2c_only) { + uint64_t *size = &old_state->arcs_lsize[buftype]; - if (use_mutex) - mutex_enter(&old_state->arcs_mtx); - - ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&old_state->arcs_list[ab->b_type], ab); + ASSERT(HDR_HAS_L1HDR(hdr)); + multilist_remove(&old_state->arcs_list[buftype], hdr); /* * If prefetching out of the ghost cache, * we will have a non-zero datacnt. */ - if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { + if (GHOST_STATE(old_state) && datacnt == 0) { /* ghost elements have a ghost size */ - ASSERT(ab->b_buf == NULL); - from_delta = ab->b_size; + ASSERT(hdr->b_l1hdr.b_buf == NULL); + from_delta = hdr->b_size; } ASSERT3U(*size, >=, from_delta); atomic_add_64(size, -from_delta); - - if (use_mutex) - mutex_exit(&old_state->arcs_mtx); } - if (new_state != arc_anon) { - int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); - uint64_t *size = &new_state->arcs_lsize[ab->b_type]; + if (new_state != arc_anon && new_state != arc_l2c_only) { + uint64_t *size = &new_state->arcs_lsize[buftype]; - if (use_mutex) - mutex_enter(&new_state->arcs_mtx); - - list_insert_head(&new_state->arcs_list[ab->b_type], ab); + /* + * An L1 header always exists here, since if we're + * moving to some L1-cached state (i.e. not l2c_only or + * anonymous), we realloc the header to add an L1hdr + * beforehand. 
+ */ + ASSERT(HDR_HAS_L1HDR(hdr)); + multilist_insert(&new_state->arcs_list[buftype], hdr); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { - ASSERT(ab->b_datacnt == 0); - ASSERT(ab->b_buf == NULL); - to_delta = ab->b_size; + ASSERT0(datacnt); + ASSERT(hdr->b_l1hdr.b_buf == NULL); + to_delta = hdr->b_size; } atomic_add_64(size, to_delta); - - if (use_mutex) - mutex_exit(&new_state->arcs_mtx); } } - ASSERT(!BUF_EMPTY(ab)); - if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab)) - buf_hash_remove(ab); + ASSERT(!BUF_EMPTY(hdr)); + if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) + buf_hash_remove(hdr); - /* adjust state sizes */ - if (to_delta) + /* adjust state sizes (ignore arc_l2c_only) */ + if (to_delta && new_state != arc_l2c_only) atomic_add_64(&new_state->arcs_size, to_delta); - if (from_delta) { + if (from_delta && old_state != arc_l2c_only) { ASSERT3U(old_state->arcs_size, >=, from_delta); atomic_add_64(&old_state->arcs_size, -from_delta); } - ab->b_state = new_state; + if (HDR_HAS_L1HDR(hdr)) + hdr->b_l1hdr.b_state = new_state; - /* adjust l2arc hdr stats */ - if (new_state == arc_l2c_only) - l2arc_hdr_stat_add(); - else if (old_state == arc_l2c_only) - l2arc_hdr_stat_remove(); + /* + * L2 headers should never be on the L2 state list since they don't + * have L1 headers allocated. + */ + ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && + multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); } void @@ -1275,8 +1495,11 @@ arc_space_consume(uint64_t space, arc_space_type_t type) break; } - if (type != ARC_SPACE_DATA) + if (type != ARC_SPACE_DATA) { ARCSTAT_INCR(arcstat_meta_used, space); + if (arc_meta_max < arc_meta_used) + arc_meta_max = arc_meta_used; + } atomic_add_64(&arc_size, space); } @@ -1308,8 +1531,6 @@ arc_space_return(uint64_t space, arc_space_type_t type) if (type != ARC_SPACE_DATA) { ASSERT(arc_meta_used >= space); - if (arc_meta_max < arc_meta_used) - arc_meta_max = arc_meta_used; ARCSTAT_INCR(arcstat_meta_used, -space); } @@ -1324,30 +1545,37 @@ arc_buf_alloc(spa_t *spa, uint64_t size, void *tag, arc_buf_contents_t type) arc_buf_t *buf; VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(BUF_EMPTY(hdr)); + ASSERT3P(hdr->b_freeze_cksum, ==, NULL); hdr->b_size = size; - hdr->b_type = type; hdr->b_spa = spa_load_guid(spa); - hdr->b_state = arc_anon; - hdr->b_arc_access = 0; - hdr->b_mru_hits = 0; - hdr->b_mru_ghost_hits = 0; - hdr->b_mfu_hits = 0; - hdr->b_mfu_ghost_hits = 0; - hdr->b_l2_hits = 0; + hdr->b_l1hdr.b_mru_hits = 0; + hdr->b_l1hdr.b_mru_ghost_hits = 0; + hdr->b_l1hdr.b_mfu_hits = 0; + hdr->b_l1hdr.b_mfu_ghost_hits = 0; + hdr->b_l1hdr.b_l2_hits = 0; + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = NULL; - hdr->b_buf = buf; + + hdr->b_flags = arc_bufc_to_flags(type); + hdr->b_flags |= ARC_FLAG_HAS_L1HDR; + + hdr->b_l1hdr.b_buf = buf; + hdr->b_l1hdr.b_state = arc_anon; + hdr->b_l1hdr.b_arc_access = 0; + hdr->b_l1hdr.b_datacnt = 1; + hdr->b_l1hdr.b_tmp_cdata = NULL; + arc_get_data_buf(buf); - hdr->b_datacnt = 1; - hdr->b_flags = 0; - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - (void) refcount_add(&hdr->b_refcnt, tag); + + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); return (buf); } @@ -1380,8 +1608,9 @@ arc_return_buf(arc_buf_t *buf, void *tag) arc_buf_hdr_t 
*hdr = buf->b_hdr; ASSERT(buf->b_data != NULL); - (void) refcount_add(&hdr->b_refcnt, tag); - (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); + ASSERT(HDR_HAS_L1HDR(hdr)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); + (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); atomic_add_64(&arc_loaned_bytes, -hdr->b_size); } @@ -1390,12 +1619,12 @@ arc_return_buf(arc_buf_t *buf, void *tag) void arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { - arc_buf_hdr_t *hdr; + arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(buf->b_data != NULL); - hdr = buf->b_hdr; - (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); - (void) refcount_remove(&hdr->b_refcnt, tag); + ASSERT(HDR_HAS_L1HDR(hdr)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); + (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); buf->b_efunc = NULL; buf->b_private = NULL; @@ -1409,17 +1638,18 @@ arc_buf_clone(arc_buf_t *from) arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = hdr->b_size; - ASSERT(hdr->b_state != arc_anon); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_state != arc_anon); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; - buf->b_next = hdr->b_buf; - hdr->b_buf = buf; + buf->b_next = hdr->b_l1hdr.b_buf; + hdr->b_l1hdr.b_buf = buf; arc_get_data_buf(buf); - bcopy(from->b_data, buf->b_data, size); + abd_copy(buf->b_data, from->b_data, size); /* * This buffer already exists in the arc so create a duplicate @@ -1427,11 +1657,11 @@ arc_buf_clone(arc_buf_t *from) * then track the size and number of duplicates. These stats will be * updated as duplicate buffers are created and destroyed. */ - if (hdr->b_type == ARC_BUFC_DATA) { + if (HDR_ISTYPE_DATA(hdr)) { ARCSTAT_BUMP(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); } - hdr->b_datacnt += 1; + hdr->b_l1hdr.b_datacnt += 1; return (buf); } @@ -1454,27 +1684,30 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); mutex_exit(&buf->b_evict_lock); - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); + add_reference(hdr, hash_lock, tag); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } static void -arc_buf_free_on_write(void *data, size_t size, - void (*free_func)(void *, size_t)) +arc_buf_free_on_write(abd_t *data, size_t size, + void (*free_func)(abd_t *, size_t)) { l2arc_data_free_t *df; - df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); + df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_data = data; df->l2df_size = size; df->l2df_func = free_func; @@ -1488,7 +1721,7 @@ arc_buf_free_on_write(void *data, size_t size, * the buffer is placed on l2arc_free_on_write to be freed later. 
 */
static void
-arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
+arc_buf_data_free(arc_buf_t *buf, void (*free_func)(abd_t *, size_t))
 {
     arc_buf_hdr_t *hdr = buf->b_hdr;
@@ -1500,56 +1733,95 @@ arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
     }
 }
 
-/*
- * Free up buf->b_data and if 'remove' is set, then pull the
- * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
- */
 static void
 arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
 {
-    l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
+    ASSERT(HDR_HAS_L2HDR(hdr));
+    ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
 
-    ASSERT(MUTEX_HELD(&l2arc_buflist_mtx));
+    /*
+     * The b_tmp_cdata field is linked off of the b_l1hdr, so if
+     * that doesn't exist, the header is in the arc_l2c_only state,
+     * and there isn't anything to free (it's already been freed).
+     */
+    if (!HDR_HAS_L1HDR(hdr))
+        return;
 
-    if (l2hdr->b_tmp_cdata == NULL)
+    /*
+     * The header isn't being written to the l2arc device, thus it
+     * shouldn't have a b_tmp_cdata to free.
+     */
+    if (!HDR_L2_WRITING(hdr)) {
+        ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
        return;
+    }
+
+    /*
+     * The header does not have compression enabled. This can be due
+     * to the buffer not being compressible, or because we're
+     * freeing the buffer before the second phase of
+     * l2arc_write_buffer() has started (which does the compression
+     * step). In either case, b_tmp_cdata does not point to a
+     * separately compressed buffer, so there's nothing to free (it
+     * points to the same buffer as the arc_buf_t's b_data field).
+     */
+    if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
+        hdr->b_l1hdr.b_tmp_cdata = NULL;
+        return;
+    }
+
+    /*
+     * There's nothing to free since the buffer was all zeros and
+     * compressed to a zero length buffer.
+     */
+    if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_EMPTY) {
+        ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+        return;
+    }
+
+    ASSERT(L2ARC_IS_VALID_COMPRESS(HDR_GET_COMPRESS(hdr)));
+
+    arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
+        hdr->b_size, abd_free);
 
-    ASSERT(HDR_L2_WRITING(hdr));
-    arc_buf_free_on_write(l2hdr->b_tmp_cdata, hdr->b_size,
-        zio_data_buf_free);
     ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
-    l2hdr->b_tmp_cdata = NULL;
+    hdr->b_l1hdr.b_tmp_cdata = NULL;
 }
 
+/*
+ * Free up buf->b_data and if 'remove' is set, then pull the
+ * arc_buf_t off of the arc_buf_hdr_t's list and free it.
+ */ static void -arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) +arc_buf_destroy(arc_buf_t *buf, boolean_t remove) { arc_buf_t **bufp; /* free up data associated with the buf */ - if (buf->b_data) { - arc_state_t *state = buf->b_hdr->b_state; + if (buf->b_data != NULL) { + arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; uint64_t size = buf->b_hdr->b_size; - arc_buf_contents_t type = buf->b_hdr->b_type; + arc_buf_contents_t type = arc_buf_type(buf->b_hdr); arc_cksum_verify(buf); arc_buf_unwatch(buf); - if (!recycle) { - if (type == ARC_BUFC_METADATA) { - arc_buf_data_free(buf, zio_buf_free); - arc_space_return(size, ARC_SPACE_META); - } else { - ASSERT(type == ARC_BUFC_DATA); - arc_buf_data_free(buf, zio_data_buf_free); - arc_space_return(size, ARC_SPACE_DATA); - } + if (type == ARC_BUFC_METADATA) { + arc_buf_data_free(buf, abd_free); + arc_space_return(size, ARC_SPACE_META); + } else { + ASSERT(type == ARC_BUFC_DATA); + arc_buf_data_free(buf, abd_free); + arc_space_return(size, ARC_SPACE_DATA); } - if (list_link_active(&buf->b_hdr->b_arc_node)) { + + /* protected by hash lock, if in the hash table */ + if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { uint64_t *cnt = &state->arcs_lsize[type]; - ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); - ASSERT(state != arc_anon); + ASSERT(refcount_is_zero( + &buf->b_hdr->b_l1hdr.b_refcnt)); + ASSERT(state != arc_anon && state != arc_l2c_only); ASSERT3U(*cnt, >=, size); atomic_add_64(cnt, -size); @@ -1562,13 +1834,13 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) * If we're destroying a duplicate buffer make sure * that the appropriate statistics are updated. */ - if (buf->b_hdr->b_datacnt > 1 && - buf->b_hdr->b_type == ARC_BUFC_DATA) { + if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && + HDR_ISTYPE_DATA(buf->b_hdr)) { ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); } - ASSERT(buf->b_hdr->b_datacnt > 0); - buf->b_hdr->b_datacnt -= 1; + ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); + buf->b_hdr->b_l1hdr.b_datacnt -= 1; } /* only remove the buf if requested */ @@ -1576,7 +1848,8 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) return; /* remove the buf from the hdr list */ - for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) + for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; + bufp = &(*bufp)->b_next) continue; *bufp = buf->b_next; buf->b_next = NULL; @@ -1591,85 +1864,87 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t remove) static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { - l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; - - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - ASSERT3P(hdr->b_state, ==, arc_anon); + if (HDR_HAS_L1HDR(hdr)) { + ASSERT(hdr->b_l1hdr.b_buf == NULL || + hdr->b_l1hdr.b_datacnt > 0); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + + if (HDR_HAS_L2HDR(hdr)) { + l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; + boolean_t buflist_held = MUTEX_HELD(&l2hdr->b_dev->l2ad_mtx); - if (l2hdr != NULL) { - boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); - /* - * To prevent arc_free() and l2arc_evict() from - * attempting to free the same buffer at the same time, - * a FREE_IN_PROGRESS flag is given to arc_free() to - * give it priority. l2arc_evict() can't destroy this - * header while we are waiting on l2arc_buflist_mtx. 
- * - * The hdr may be removed from l2ad_buflist before we - * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. - */ if (!buflist_held) { - mutex_enter(&l2arc_buflist_mtx); - l2hdr = hdr->b_l2hdr; + mutex_enter(&l2hdr->b_dev->l2ad_mtx); + l2hdr = &hdr->b_l2hdr; } - if (l2hdr != NULL) { - list_remove(l2hdr->b_dev->l2ad_buflist, hdr); - arc_buf_l2_cdata_free(hdr); - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); - vdev_space_update(l2hdr->b_dev->l2ad_vdev, - -l2hdr->b_asize, 0, 0); - kmem_cache_free(l2arc_hdr_cache, l2hdr); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); - if (hdr->b_state == arc_l2c_only) - l2arc_hdr_stat_remove(); - hdr->b_l2hdr = NULL; - } + list_remove(&l2hdr->b_dev->l2ad_buflist, hdr); + + /* + * We don't want to leak the b_tmp_cdata buffer that was + * allocated in l2arc_write_buffers() + */ + arc_buf_l2_cdata_free(hdr); + + arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); if (!buflist_held) - mutex_exit(&l2arc_buflist_mtx); + mutex_exit(&l2hdr->b_dev->l2ad_mtx); + + hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; } - if (!BUF_EMPTY(hdr)) { - ASSERT(!HDR_IN_HASH_TABLE(hdr)); + if (!BUF_EMPTY(hdr)) buf_discard_identity(hdr); - } - while (hdr->b_buf) { - arc_buf_t *buf = hdr->b_buf; - - if (buf->b_efunc) { - mutex_enter(&arc_eviction_mtx); - mutex_enter(&buf->b_evict_lock); - ASSERT(buf->b_hdr != NULL); - arc_buf_destroy(hdr->b_buf, FALSE, FALSE); - hdr->b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - mutex_exit(&buf->b_evict_lock); - mutex_exit(&arc_eviction_mtx); - } else { - arc_buf_destroy(hdr->b_buf, FALSE, TRUE); - } - } + if (hdr->b_freeze_cksum != NULL) { kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } - ASSERT(!list_link_active(&hdr->b_arc_node)); + if (HDR_HAS_L1HDR(hdr)) { + while (hdr->b_l1hdr.b_buf) { + arc_buf_t *buf = hdr->b_l1hdr.b_buf; + + if (buf->b_efunc != NULL) { + mutex_enter(&arc_user_evicts_lock); + mutex_enter(&buf->b_evict_lock); + ASSERT(buf->b_hdr != NULL); + arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE); + hdr->b_l1hdr.b_buf = buf->b_next; + buf->b_hdr = &arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + mutex_exit(&buf->b_evict_lock); + cv_signal(&arc_user_evicts_cv); + mutex_exit(&arc_user_evicts_lock); + } else { + arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE); + } + } + } + ASSERT3P(hdr->b_hash_next, ==, NULL); - ASSERT3P(hdr->b_acb, ==, NULL); - kmem_cache_free(hdr_cache, hdr); + if (HDR_HAS_L1HDR(hdr)) { + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + kmem_cache_free(hdr_full_cache, hdr); + } else { + kmem_cache_free(hdr_l2only_cache, hdr); + } } void arc_buf_free(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; - int hashed = hdr->b_state != arc_anon; + int hashed = hdr->b_l1hdr.b_state != arc_anon; ASSERT(buf->b_efunc == NULL); ASSERT(buf->b_data != NULL); @@ -1682,12 +1957,12 @@ arc_buf_free(arc_buf_t *buf, void *tag) ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_datacnt > 1) { - arc_buf_destroy(buf, FALSE, TRUE); + if (hdr->b_l1hdr.b_datacnt > 1) { + arc_buf_destroy(buf, TRUE); } else { - ASSERT(buf == hdr->b_buf); + ASSERT(buf == hdr->b_l1hdr.b_buf); ASSERT(buf->b_efunc == NULL); - hdr->b_flags |= ARC_BUF_AVAILABLE; + hdr->b_flags |= 
ARC_FLAG_BUF_AVAILABLE;
         }
         mutex_exit(hash_lock);
     } else if (HDR_IO_IN_PROGRESS(hdr)) {
@@ -1697,16 +1972,16 @@ arc_buf_free(arc_buf_t *buf, void *tag)
          * this buffer unless the write completes before we finish
          * decrementing the reference count.
          */
-        mutex_enter(&arc_eviction_mtx);
+        mutex_enter(&arc_user_evicts_lock);
         (void) remove_reference(hdr, NULL, tag);
-        ASSERT(refcount_is_zero(&hdr->b_refcnt));
+        ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
         destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
-        mutex_exit(&arc_eviction_mtx);
+        mutex_exit(&arc_user_evicts_lock);
         if (destroy_hdr)
             arc_hdr_destroy(hdr);
     } else {
         if (remove_reference(hdr, NULL, tag) > 0)
-            arc_buf_destroy(buf, FALSE, TRUE);
+            arc_buf_destroy(buf, TRUE);
         else
             arc_hdr_destroy(hdr);
     }
@@ -1719,8 +1994,8 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
     kmutex_t *hash_lock = NULL;
     boolean_t no_callback = (buf->b_efunc == NULL);
 
-    if (hdr->b_state == arc_anon) {
-        ASSERT(hdr->b_datacnt == 1);
+    if (hdr->b_l1hdr.b_state == arc_anon) {
+        ASSERT(hdr->b_l1hdr.b_datacnt == 1);
         arc_buf_free(buf, tag);
         return (no_callback);
     }
@@ -1728,21 +2003,22 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag)
     hash_lock = HDR_LOCK(hdr);
     mutex_enter(hash_lock);
     hdr = buf->b_hdr;
+    ASSERT(hdr->b_l1hdr.b_datacnt > 0);
     ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
-    ASSERT(hdr->b_state != arc_anon);
+    ASSERT(hdr->b_l1hdr.b_state != arc_anon);
     ASSERT(buf->b_data != NULL);
 
     (void) remove_reference(hdr, hash_lock, tag);
-    if (hdr->b_datacnt > 1) {
+    if (hdr->b_l1hdr.b_datacnt > 1) {
         if (no_callback)
-            arc_buf_destroy(buf, FALSE, TRUE);
+            arc_buf_destroy(buf, TRUE);
     } else if (no_callback) {
-        ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+        ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
         ASSERT(buf->b_efunc == NULL);
-        hdr->b_flags |= ARC_BUF_AVAILABLE;
+        hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
     }
-    ASSERT(no_callback || hdr->b_datacnt > 1 ||
-        refcount_is_zero(&hdr->b_refcnt));
+    ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
+        refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
     mutex_exit(hash_lock);
     return (no_callback);
 }
@@ -1788,7 +2064,7 @@ arc_buf_eviction_needed(arc_buf_t *buf)
         return (B_TRUE);
     }
 
-    if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
+    if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
         evict_needed = B_TRUE;
 
     mutex_exit(&buf->b_evict_lock);
@@ -1796,505 +2072,718 @@ arc_buf_eviction_needed(arc_buf_t *buf)
 }
 
 /*
- * Evict buffers from list until we've removed the specified number of
- * bytes. Move the removed buffers to the appropriate evict state.
- * If the recycle flag is set, then attempt to "recycle" a buffer:
- * - look for a buffer to evict that is `bytes' long.
- * - return the data block from this buffer rather than freeing it.
- * This flag is used by callers that are trying to make space for a
- * new buffer in a full arc cache.
+ * Evict the arc_buf_hdr that is provided as a parameter. The resultant
+ * state of the header is dependent on its state prior to entering this
+ * function. The following transitions are possible:
 *
- * This function makes a "best effort". It skips over any buffers
- * it can't get a hash_lock on, and so may not catch all candidates.
- * It may also return without evicting as much space as requested.
+ * - arc_mru -> arc_mru_ghost
+ * - arc_mfu -> arc_mfu_ghost
+ * - arc_mru_ghost -> arc_l2c_only
+ * - arc_mru_ghost -> deleted
+ * - arc_mfu_ghost -> arc_l2c_only
+ * - arc_mfu_ghost -> deleted
 */
-static void *
-arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
-    arc_buf_contents_t type)
+static int64_t
+arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 {
-    arc_state_t *evicted_state;
-    uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
-    arc_buf_hdr_t *ab, *ab_prev = NULL;
-    list_t *list = &state->arcs_list[type];
-    kmutex_t *hash_lock;
-    boolean_t have_lock;
-    void *stolen = NULL;
-    arc_buf_hdr_t marker = {{{ 0 }}};
-    int count = 0;
-
-    ASSERT(state == arc_mru || state == arc_mfu);
-
-    evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+    arc_state_t *evicted_state, *state;
+    int64_t bytes_evicted = 0;
 
-top:
-    mutex_enter(&state->arcs_mtx);
-    mutex_enter(&evicted_state->arcs_mtx);
-
-    for (ab = list_tail(list); ab; ab = ab_prev) {
-        ab_prev = list_prev(list, ab);
-        /* prefetch buffers have a minimum lifespan */
-        if (HDR_IO_IN_PROGRESS(ab) ||
-            (spa && ab->b_spa != spa) ||
-            (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
-            ddi_get_lbolt() - ab->b_arc_access <
-            zfs_arc_min_prefetch_lifespan)) {
-            skipped++;
-            continue;
-        }
-        /* "lookahead" for better eviction candidate */
-        if (recycle && ab->b_size != bytes &&
-            ab_prev && ab_prev->b_size == bytes)
-            continue;
+    ASSERT(MUTEX_HELD(hash_lock));
+    ASSERT(HDR_HAS_L1HDR(hdr));
 
-        /* ignore markers */
-        if (ab->b_spa == 0)
-            continue;
+    state = hdr->b_l1hdr.b_state;
+    if (GHOST_STATE(state)) {
+        ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+        ASSERT(hdr->b_l1hdr.b_buf == NULL);
 
         /*
-         * It may take a long time to evict all the bufs requested.
-         * To avoid blocking all arc activity, periodically drop
-         * the arcs_mtx and give other threads a chance to run
-         * before reacquiring the lock.
-         *
-         * If we are looking for a buffer to recycle, we are in
-         * the hot code path, so don't sleep.
+         * l2arc_write_buffers() relies on a header's L1 portion
+         * (i.e. its b_tmp_cdata field) during its write phase.
+         * Thus, we cannot push a header onto the arc_l2c_only
+         * state (removing its L1 piece) until the header is
+         * done being written to the l2arc.
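
The transition list above can be read as a small state function. The sketch below (plain C with invented enum names, not the patch's arc_state_t machinery) captures the decision arc_evict_hdr() makes: regular states fall to their ghost lists, while a ghost-state header either survives as l2c_only (if an L2 copy exists) or is deleted:

#include <stdbool.h>
#include <stdio.h>

enum state { MRU, MRU_GHOST, MFU, MFU_GHOST, L2C_ONLY, DELETED };

/* Destination state chosen by the eviction logic sketched above. */
static enum state
evict_dest(enum state cur, bool has_l2hdr)
{
    switch (cur) {
    case MRU:
        return (MRU_GHOST);
    case MFU:
        return (MFU_GHOST);
    case MRU_GHOST:
    case MFU_GHOST:
        return (has_l2hdr ? L2C_ONLY : DELETED);
    default:
        return (cur);   /* no eviction transition defined */
    }
}

int
main(void)
{
    printf("%d %d %d\n", evict_dest(MRU, false),
        evict_dest(MRU_GHOST, true), evict_dest(MFU_GHOST, false));
    return (0);
}
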
*/ - if (!recycle && count++ > arc_evict_iterations) { - list_insert_after(list, ab, &marker); - mutex_exit(&evicted_state->arcs_mtx); - mutex_exit(&state->arcs_mtx); - kpreempt(KPREEMPT_SYNC); - mutex_enter(&state->arcs_mtx); - mutex_enter(&evicted_state->arcs_mtx); - ab_prev = list_prev(list, &marker); - list_remove(list, &marker); - count = 0; - continue; + if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { + ARCSTAT_BUMP(arcstat_evict_l2_skip); + return (bytes_evicted); } - hash_lock = HDR_LOCK(ab); - have_lock = MUTEX_HELD(hash_lock); - if (have_lock || mutex_tryenter(hash_lock)) { - ASSERT0(refcount_count(&ab->b_refcnt)); - ASSERT(ab->b_datacnt > 0); - while (ab->b_buf) { - arc_buf_t *buf = ab->b_buf; - if (!mutex_tryenter(&buf->b_evict_lock)) { - missed += 1; - break; - } - if (buf->b_data) { - bytes_evicted += ab->b_size; - if (recycle && ab->b_type == type && - ab->b_size == bytes && - !HDR_L2_WRITING(ab)) { - stolen = buf->b_data; - recycle = FALSE; - } - } - if (buf->b_efunc) { - mutex_enter(&arc_eviction_mtx); - arc_buf_destroy(buf, - buf->b_data == stolen, FALSE); - ab->b_buf = buf->b_next; - buf->b_hdr = &arc_eviction_hdr; - buf->b_next = arc_eviction_list; - arc_eviction_list = buf; - mutex_exit(&arc_eviction_mtx); - mutex_exit(&buf->b_evict_lock); - } else { - mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, - buf->b_data == stolen, TRUE); - } - } + ARCSTAT_BUMP(arcstat_deleted); + bytes_evicted += hdr->b_size; - if (ab->b_l2hdr) { - ARCSTAT_INCR(arcstat_evict_l2_cached, - ab->b_size); - } else { - if (l2arc_write_eligible(ab->b_spa, ab)) { - ARCSTAT_INCR(arcstat_evict_l2_eligible, - ab->b_size); - } else { - ARCSTAT_INCR( - arcstat_evict_l2_ineligible, - ab->b_size); - } - } + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); - if (ab->b_datacnt == 0) { - arc_change_state(evicted_state, ab, hash_lock); - ASSERT(HDR_IN_HASH_TABLE(ab)); - ab->b_flags |= ARC_IN_HASH_TABLE; - ab->b_flags &= ~ARC_BUF_AVAILABLE; - DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); - } - if (!have_lock) - mutex_exit(hash_lock); - if (bytes >= 0 && bytes_evicted >= bytes) - break; + if (HDR_HAS_L2HDR(hdr)) { + /* + * This buffer is cached on the 2nd Level ARC; + * don't destroy the header. + */ + arc_change_state(arc_l2c_only, hdr, hash_lock); + /* + * dropping from L1+L2 cached to L2-only, + * realloc to remove the L1 header. + */ + hdr = arc_hdr_realloc(hdr, hdr_full_cache, + hdr_l2only_cache); } else { - missed += 1; + arc_change_state(arc_anon, hdr, hash_lock); + arc_hdr_destroy(hdr); } + return (bytes_evicted); } - mutex_exit(&evicted_state->arcs_mtx); - mutex_exit(&state->arcs_mtx); + ASSERT(state == arc_mru || state == arc_mfu); + evicted_state = (state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; - if (list == &state->arcs_list[ARC_BUFC_DATA] && - (bytes < 0 || bytes_evicted < bytes)) { - /* Prevent second pass from recycling metadata into data */ - recycle = FALSE; - type = ARC_BUFC_METADATA; - list = &state->arcs_list[type]; - goto top; + /* prefetch buffers have a minimum lifespan */ + if (HDR_IO_IN_PROGRESS(hdr) || + ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && + ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < + arc_min_prefetch_lifespan)) { + ARCSTAT_BUMP(arcstat_evict_skip); + return (bytes_evicted); } - if (bytes_evicted < bytes) - dprintf("only evicted %lld bytes from %x\n", - (longlong_t)bytes_evicted, state->arcs_state); - - if (skipped) - ARCSTAT_INCR(arcstat_evict_skip, skipped); + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); + ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); + while (hdr->b_l1hdr.b_buf) { + arc_buf_t *buf = hdr->b_l1hdr.b_buf; + if (!mutex_tryenter(&buf->b_evict_lock)) { + ARCSTAT_BUMP(arcstat_mutex_miss); + break; + } + if (buf->b_data != NULL) + bytes_evicted += hdr->b_size; + if (buf->b_efunc != NULL) { + mutex_enter(&arc_user_evicts_lock); + arc_buf_destroy(buf, FALSE); + hdr->b_l1hdr.b_buf = buf->b_next; + buf->b_hdr = &arc_eviction_hdr; + buf->b_next = arc_eviction_list; + arc_eviction_list = buf; + cv_signal(&arc_user_evicts_cv); + mutex_exit(&arc_user_evicts_lock); + mutex_exit(&buf->b_evict_lock); + } else { + mutex_exit(&buf->b_evict_lock); + arc_buf_destroy(buf, TRUE); + } + } - if (missed) - ARCSTAT_INCR(arcstat_mutex_miss, missed); + if (HDR_HAS_L2HDR(hdr)) { + ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); + } else { + if (l2arc_write_eligible(hdr->b_spa, hdr)) + ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size); + else + ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size); + } - /* - * Note: we have just evicted some data into the ghost state, - * potentially putting the ghost size over the desired size. Rather - * that evicting from the ghost list in this hot code path, leave - * this chore to the arc_reclaim_thread(). - */ + if (hdr->b_l1hdr.b_datacnt == 0) { + arc_change_state(evicted_state, hdr, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(hdr)); + hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; + hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); + } - return (stolen); + return (bytes_evicted); } -/* - * Remove buffers from list until we've removed the specified number of - * bytes. Destroy the buffers that are removed. 
- */ -static void -arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes, - arc_buf_contents_t type) +static uint64_t +arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, + uint64_t spa, int64_t bytes) { - arc_buf_hdr_t *ab, *ab_prev; - arc_buf_hdr_t marker; - list_t *list = &state->arcs_list[type]; + multilist_sublist_t *mls; + uint64_t bytes_evicted = 0; + arc_buf_hdr_t *hdr; kmutex_t *hash_lock; - uint64_t bytes_deleted = 0; - uint64_t bufs_skipped = 0; - int count = 0; + int evict_count = 0; - ASSERT(GHOST_STATE(state)); - bzero(&marker, sizeof (marker)); -top: - mutex_enter(&state->arcs_mtx); - for (ab = list_tail(list); ab; ab = ab_prev) { - ab_prev = list_prev(list, ab); - if (ab->b_type > ARC_BUFC_NUMTYPES) - panic("invalid ab=%p", (void *)ab); - if (spa && ab->b_spa != spa) - continue; + ASSERT3P(marker, !=, NULL); + ASSERTV(if (bytes < 0) ASSERT(bytes == ARC_EVICT_ALL)); - /* ignore markers */ - if (ab->b_spa == 0) - continue; + mls = multilist_sublist_lock(ml, idx); - hash_lock = HDR_LOCK(ab); - /* caller may be trying to modify this buffer, skip it */ - if (MUTEX_HELD(hash_lock)) - continue; + for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; + hdr = multilist_sublist_prev(mls, marker)) { + if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || + (evict_count >= zfs_arc_evict_batch_limit)) + break; /* - * It may take a long time to evict all the bufs requested. - * To avoid blocking all arc activity, periodically drop - * the arcs_mtx and give other threads a chance to run - * before reacquiring the lock. + * To keep our iteration location, move the marker + * forward. Since we're not holding hdr's hash lock, we + * must be very careful and not remove 'hdr' from the + * sublist. Otherwise, other consumers might mistake the + * 'hdr' as not being on a sublist when they call the + * multilist_link_active() function (they all rely on + * the hash lock protecting concurrent insertions and + * removals). multilist_sublist_move_forward() was + * specifically implemented to ensure this is the case + * (only 'marker' will be removed and re-inserted). */ - if (count++ > arc_evict_iterations) { - list_insert_after(list, ab, &marker); - mutex_exit(&state->arcs_mtx); - kpreempt(KPREEMPT_SYNC); - mutex_enter(&state->arcs_mtx); - ab_prev = list_prev(list, &marker); - list_remove(list, &marker); - count = 0; + multilist_sublist_move_forward(mls, marker); + + /* + * The only case where the b_spa field should ever be + * zero, is the marker headers inserted by + * arc_evict_state(). It's possible for multiple threads + * to be calling arc_evict_state() concurrently (e.g. + * dsl_pool_close() and zio_inject_fault()), so we must + * skip any markers we see from these other threads. + */ + if (hdr->b_spa == 0) + continue; + + /* we're only interested in evicting buffers of a certain spa */ + if (spa != 0 && hdr->b_spa != spa) { + ARCSTAT_BUMP(arcstat_evict_skip); continue; } + + hash_lock = HDR_LOCK(hdr); + + /* + * We aren't calling this function from any code path + * that would already be holding a hash lock, so we're + * asserting on this assumption to be defensive in case + * this ever changes. Without this check, it would be + * possible to incorrectly increment arcstat_mutex_miss + * below (e.g. if the code changed such that we called + * this function with a hash lock held). 
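
The marker trick described above is a general pattern for walking a list while periodically dropping its lock: only the marker is ever removed and re-inserted, so concurrent scanners and the link-active test stay correct. A self-contained toy version (a plain circular doubly linked list standing in for the kernel multilist, single-threaded so the locking is elided) looks like this:

#include <stdio.h>
#include <stdlib.h>

struct node {
    int val;
    int is_marker;
    struct node *prev, *next;
};

static void
insert_before(struct node *pos, struct node *n)
{
    n->prev = pos->prev;
    n->next = pos;
    pos->prev->next = n;
    pos->prev = n;
}

static void
remove_node(struct node *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
}

int
main(void)
{
    struct node head = { 0, 1, &head, &head };  /* circular sentinel */
    struct node marker = { 0, 1, NULL, NULL };
    int i;

    for (i = 1; i <= 5; i++) {
        struct node *n = malloc(sizeof (*n));
        n->val = i;
        n->is_marker = 0;
        insert_before(&head, n);    /* list is now 1..i */
    }

    /* Scan from the tail toward the head; the marker is our cursor. */
    insert_before(&head, &marker);
    while (marker.prev != &head) {
        struct node *cur = marker.prev;

        /* Step the marker past cur *before* touching cur. */
        remove_node(&marker);
        insert_before(cur, &marker);

        if (cur->is_marker)     /* another scanner's cursor */
            continue;

        printf("evicting %d\n", cur->val);
        remove_node(cur);
        free(cur);
    }
    remove_node(&marker);
    return (0);
}

Stepping the cursor past the current node before that node can disappear is the whole trick; arc_evict_state_impl() does the same thing with multilist_sublist_move_forward().
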
+ */ + ASSERT(!MUTEX_HELD(hash_lock)); + if (mutex_tryenter(hash_lock)) { - ASSERT(!HDR_IO_IN_PROGRESS(ab)); - ASSERT(ab->b_buf == NULL); - ARCSTAT_BUMP(arcstat_deleted); - bytes_deleted += ab->b_size; + uint64_t evicted = arc_evict_hdr(hdr, hash_lock); + mutex_exit(hash_lock); - if (ab->b_l2hdr != NULL) { - /* - * This buffer is cached on the 2nd Level ARC; - * don't destroy the header. - */ - arc_change_state(arc_l2c_only, ab, hash_lock); - mutex_exit(hash_lock); - } else { - arc_change_state(arc_anon, ab, hash_lock); - mutex_exit(hash_lock); - arc_hdr_destroy(ab); - } + bytes_evicted += evicted; - DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); - if (bytes >= 0 && bytes_deleted >= bytes) - break; - } else if (bytes < 0) { /* - * Insert a list marker and then wait for the - * hash lock to become available. Once its - * available, restart from where we left off. + * If evicted is zero, arc_evict_hdr() must have + * decided to skip this header, don't increment + * evict_count in this case. */ - list_insert_after(list, ab, &marker); - mutex_exit(&state->arcs_mtx); - mutex_enter(hash_lock); - mutex_exit(hash_lock); - mutex_enter(&state->arcs_mtx); - ab_prev = list_prev(list, &marker); - list_remove(list, &marker); + if (evicted != 0) + evict_count++; + + /* + * If arc_size isn't overflowing, signal any + * threads that might happen to be waiting. + * + * For each header evicted, we wake up a single + * thread. If we used cv_broadcast, we could + * wake up "too many" threads causing arc_size + * to significantly overflow arc_c; since + * arc_get_data_buf() doesn't check for overflow + * when it's woken up (it doesn't because it's + * possible for the ARC to be overflowing while + * full of un-evictable buffers, and the + * function should proceed in this case). + * + * If threads are left sleeping, due to not + * using cv_broadcast, they will be woken up + * just before arc_reclaim_thread() sleeps. + */ + mutex_enter(&arc_reclaim_lock); + if (!arc_is_overflowing()) + cv_signal(&arc_reclaim_waiters_cv); + mutex_exit(&arc_reclaim_lock); } else { - bufs_skipped += 1; + ARCSTAT_BUMP(arcstat_mutex_miss); } } - mutex_exit(&state->arcs_mtx); - if (list == &state->arcs_list[ARC_BUFC_DATA] && - (bytes < 0 || bytes_deleted < bytes)) { - list = &state->arcs_list[ARC_BUFC_METADATA]; - goto top; - } - - if (bufs_skipped) { - ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); - ASSERT(bytes >= 0); - } + multilist_sublist_unlock(mls); - if (bytes_deleted < bytes) - dprintf("only deleted %lld bytes from %p\n", - (longlong_t)bytes_deleted, state); + return (bytes_evicted); } -static void -arc_adjust(void) +/* + * Evict buffers from the given arc state, until we've removed the + * specified number of bytes. Move the removed buffers to the + * appropriate evict state. + * + * This function makes a "best effort". It skips over any buffers + * it can't get a hash_lock on, and so, may not catch all candidates. + * It may also return without evicting as much space as requested. + * + * If bytes is specified using the special value ARC_EVICT_ALL, this + * will evict all available (i.e. unlocked and evictable) buffers from + * the given arc state; which is used by arc_flush(). 
+ */ +static uint64_t +arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, + arc_buf_contents_t type) { - int64_t adjustment, delta; + uint64_t total_evicted = 0; + multilist_t *ml = &state->arcs_list[type]; + int num_sublists; + arc_buf_hdr_t **markers; + int i; + + ASSERTV(if (bytes < 0) ASSERT(bytes == ARC_EVICT_ALL)); + + num_sublists = multilist_get_num_sublists(ml); /* - * Adjust MRU size + * If we've tried to evict from each sublist, made some + * progress, but still have not hit the target number of bytes + * to evict, we want to keep trying. The markers allow us to + * pick up where we left off for each individual sublist, rather + * than starting from the tail each time. */ + markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); + for (i = 0; i < num_sublists; i++) { + multilist_sublist_t *mls; - adjustment = MIN((int64_t)(arc_size - arc_c), - (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p)); + markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); - if (adjustment > 0 && arc_mru->arcs_size > 0) { - delta = MIN(arc_mru->arcs_size, adjustment); - (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA); + /* + * A b_spa of 0 is used to indicate that this header is + * a marker. This fact is used in arc_adjust_type() and + * arc_evict_state_impl(). + */ + markers[i]->b_spa = 0; + + mls = multilist_sublist_lock(ml, i); + multilist_sublist_insert_tail(mls, markers[i]); + multilist_sublist_unlock(mls); } /* - * Adjust MFU size + * While we haven't hit our target number of bytes to evict, or + * we're evicting all available buffers. */ + while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { + /* + * Start eviction using a randomly selected sublist, + * this is to try and evenly balance eviction across all + * sublists. Always starting at the same sublist + * (e.g. index 0) would cause evictions to favor certain + * sublists over others. + */ + int sublist_idx = multilist_get_random_index(ml); + uint64_t scan_evicted = 0; - adjustment = arc_size - arc_c; + for (i = 0; i < num_sublists; i++) { + uint64_t bytes_remaining; + uint64_t bytes_evicted; - if (adjustment > 0 && arc_mfu->arcs_size > 0) { - delta = MIN(arc_mfu->arcs_size, adjustment); - (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA); - } + if (bytes == ARC_EVICT_ALL) + bytes_remaining = ARC_EVICT_ALL; + else if (total_evicted < bytes) + bytes_remaining = bytes - total_evicted; + else + break; - /* - * Adjust ghost lists - */ + bytes_evicted = arc_evict_state_impl(ml, sublist_idx, + markers[sublist_idx], spa, bytes_remaining); - adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; + scan_evicted += bytes_evicted; + total_evicted += bytes_evicted; - if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { - delta = MIN(arc_mru_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_DATA); + /* we've reached the end, wrap to the beginning */ + if (++sublist_idx >= num_sublists) + sublist_idx = 0; + } + + /* + * If we didn't evict anything during this scan, we have + * no reason to believe we'll evict more during another + * scan, so break the loop. + */ + if (scan_evicted == 0) { + /* This isn't possible, let's make that obvious */ + ASSERT3S(bytes, !=, 0); + + /* + * When bytes is ARC_EVICT_ALL, the only way to + * break the loop is when scan_evicted is zero. + * In that case, we actually have evicted enough, + * so we don't want to increment the kstat. 
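
The sweep described above (start at a random sublist, wrap around, and keep making passes until either the target is met or a full pass evicts nothing) reduces to a short loop. A sketch, with rand() standing in for multilist_get_random_index() and a toy evict_from() in place of arc_evict_state_impl():

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NUM_SUBLISTS 4

/* Toy stand-in: each visit to a sublist can yield at most 100 bytes. */
static unsigned
evict_from(int idx, unsigned want)
{
    unsigned avail = 100;

    (void) idx;     /* a real implementation would scan sublist idx */
    return (want < avail ? want : avail);
}

int
main(void)
{
    unsigned target = 350, total = 0;
    int i;

    srand((unsigned)time(NULL));
    while (total < target) {
        /* random starting sublist avoids always favoring index 0 */
        int start = rand() % NUM_SUBLISTS;
        unsigned scan = 0;

        for (i = 0; i < NUM_SUBLISTS && total < target; i++) {
            int idx = (start + i) % NUM_SUBLISTS;   /* wrap around */
            unsigned got = evict_from(idx, target - total);

            scan += got;
            total += got;
        }
        if (scan == 0)  /* a full pass evicted nothing; give up */
            break;
    }
    printf("evicted %u of %u\n", total, target);
    return (0);
}
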
+ */ + if (bytes != ARC_EVICT_ALL) { + ASSERT3S(total_evicted, <, bytes); + ARCSTAT_BUMP(arcstat_evict_not_enough); + } + + break; + } } - adjustment = - arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; + for (i = 0; i < num_sublists; i++) { + multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_remove(mls, markers[i]); + multilist_sublist_unlock(mls); - if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { - delta = MIN(arc_mfu_ghost->arcs_size, adjustment); - arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_DATA); + kmem_cache_free(hdr_full_cache, markers[i]); } + kmem_free(markers, sizeof (*markers) * num_sublists); + + return (total_evicted); } /* - * Request that arc user drop references so that N bytes can be released - * from the cache. This provides a mechanism to ensure the arc can honor - * the arc_meta_limit and reclaim buffers which are pinned in the cache - * by higher layers. (i.e. the zpl) + * Flush all "evictable" data of the given type from the arc state + * specified. This will not evict any "active" buffers (i.e. referenced). + * + * When 'retry' is set to FALSE, the function will make a single pass + * over the state and evict any buffers that it can. Since it doesn't + * continually retry the eviction, it might end up leaving some buffers + * in the ARC due to lock misses. + * + * When 'retry' is set to TRUE, the function will continually retry the + * eviction until *all* evictable buffers have been removed from the + * state. As a result, if concurrent insertions into the state are + * allowed (e.g. if the ARC isn't shutting down), this function might + * wind up in an infinite loop, continually trying to evict buffers. */ -static void -arc_do_user_prune(int64_t adjustment) +static uint64_t +arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, + boolean_t retry) { - arc_prune_func_t *func; - void *private; - arc_prune_t *cp, *np; - - mutex_enter(&arc_prune_mtx); + uint64_t evicted = 0; - cp = list_head(&arc_prune_list); - while (cp != NULL) { - func = cp->p_pfunc; - private = cp->p_private; - np = list_next(&arc_prune_list, cp); - refcount_add(&cp->p_refcnt, func); - mutex_exit(&arc_prune_mtx); + while (state->arcs_lsize[type] != 0) { + evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); - if (func != NULL) - func(adjustment, private); + if (!retry) + break; + } - mutex_enter(&arc_prune_mtx); + return (evicted); +} - /* User removed prune callback concurrently with execution */ - if (refcount_remove(&cp->p_refcnt, func) == 0) { - ASSERT(!list_link_active(&cp->p_node)); - refcount_destroy(&cp->p_refcnt); - kmem_free(cp, sizeof (*cp)); - } +/* + * Evict the specified number of bytes from the state specified, + * restricting eviction to the spa and type given. This function + * prevents us from trying to evict more from a state's list than + * is "evictable", and to skip evicting altogether when passed a + * negative value for "bytes". In contrast, arc_evict_state() will + * evict everything it can, when passed a negative value for "bytes". 
+ */
+static uint64_t
+arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
+    arc_buf_contents_t type)
+{
+    int64_t delta;
 
-        cp = np;
+    if (bytes > 0 && state->arcs_lsize[type] > 0) {
+        delta = MIN(state->arcs_lsize[type], bytes);
+        return (arc_evict_state(state, spa, delta, type));
     }
 
-    ARCSTAT_BUMP(arcstat_prune);
-    mutex_exit(&arc_prune_mtx);
+    return (0);
 }
 
-static void
-arc_do_user_evicts(void)
+/*
+ * Evict metadata buffers from the cache, such that arc_meta_used is
+ * capped by the arc_meta_limit tunable.
+ */
+static uint64_t
+arc_adjust_meta(void)
 {
-    mutex_enter(&arc_eviction_mtx);
-    while (arc_eviction_list != NULL) {
-        arc_buf_t *buf = arc_eviction_list;
-        arc_eviction_list = buf->b_next;
-        mutex_enter(&buf->b_evict_lock);
-        buf->b_hdr = NULL;
-        mutex_exit(&buf->b_evict_lock);
-        mutex_exit(&arc_eviction_mtx);
+    uint64_t total_evicted = 0;
+    int64_t target;
 
-        if (buf->b_efunc != NULL)
-            VERIFY0(buf->b_efunc(buf->b_private));
+    /*
+     * If we're over the meta limit, we want to evict enough
+     * metadata to get back under the meta limit. We don't want to
+     * evict so much that we drop the MRU below arc_p, though. If
+     * we're over the meta limit more than we're over arc_p, we
+     * evict some from the MRU here, and some from the MFU below.
+     */
+    target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+        (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size - arc_p));
 
-        buf->b_efunc = NULL;
-        buf->b_private = NULL;
-        kmem_cache_free(buf_cache, buf);
-        mutex_enter(&arc_eviction_mtx);
-    }
-    mutex_exit(&arc_eviction_mtx);
+    total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+
+    /*
+     * Similar to the above, we want to evict enough bytes to get us
+     * below the meta limit, but not so much as to drop us below the
+     * space allotted to the MFU (which is defined as arc_c - arc_p).
+     */
+    target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+        (int64_t)(arc_mfu->arcs_size - (arc_c - arc_p)));
+
+    total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+
+    return (total_evicted);
 }
 
 /*
- * Evict only meta data objects from the cache leaving the data objects.
- * This is only used to enforce the tunable arc_meta_limit, if we are
- * unable to evict enough buffers notify the user via the prune callback.
+ * Return the type of the oldest buffer in the given arc state
+ *
+ * This function will select a random sublist of type ARC_BUFC_DATA and
+ * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
+ * is compared, and the type which contains the "older" buffer will be
+ * returned.
 */
-static void
-arc_adjust_meta(void)
+static arc_buf_contents_t
+arc_adjust_type(arc_state_t *state)
 {
-    int64_t adjustmnt, delta;
+    multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
+    multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
+    int data_idx = multilist_get_random_index(data_ml);
+    int meta_idx = multilist_get_random_index(meta_ml);
+    multilist_sublist_t *data_mls;
+    multilist_sublist_t *meta_mls;
+    arc_buf_contents_t type;
+    arc_buf_hdr_t *data_hdr;
+    arc_buf_hdr_t *meta_hdr;
 
     /*
-     * This slightly differs than the way we evict from the mru in
-     * arc_adjust because we don't have a "target" value (i.e. no
-     * "meta" arc_p). As a result, I think we can completely
-     * cannibalize the metadata in the MRU before we evict the
-     * metadata from the MFU. I think we probably need to implement a
-     * "metadata arc_p" value to do this properly.
+ * We keep the sublist lock until we're finished, to prevent + * the headers from being destroyed via arc_evict_state(). */ - adjustmnt = arc_meta_used - arc_meta_limit; + data_mls = multilist_sublist_lock(data_ml, data_idx); + meta_mls = multilist_sublist_lock(meta_ml, meta_idx); - if (adjustmnt > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { - delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustmnt); - arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_METADATA); - adjustmnt -= delta; + /* + * These two loops are to ensure we skip any markers that + * might be at the tail of the lists due to arc_evict_state(). + */ + + for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; + data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { + if (data_hdr->b_spa != 0) + break; + } + + for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; + meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { + if (meta_hdr->b_spa != 0) + break; } + if (data_hdr == NULL && meta_hdr == NULL) { + type = ARC_BUFC_DATA; + } else if (data_hdr == NULL) { + ASSERT3P(meta_hdr, !=, NULL); + type = ARC_BUFC_METADATA; + } else if (meta_hdr == NULL) { + ASSERT3P(data_hdr, !=, NULL); + type = ARC_BUFC_DATA; + } else { + ASSERT3P(data_hdr, !=, NULL); + ASSERT3P(meta_hdr, !=, NULL); + + /* The headers can't be on the sublist without an L1 header */ + ASSERT(HDR_HAS_L1HDR(data_hdr)); + ASSERT(HDR_HAS_L1HDR(meta_hdr)); + + if (data_hdr->b_l1hdr.b_arc_access < + meta_hdr->b_l1hdr.b_arc_access) { + type = ARC_BUFC_DATA; + } else { + type = ARC_BUFC_METADATA; + } + } + + multilist_sublist_unlock(meta_mls); + multilist_sublist_unlock(data_mls); + + return (type); +} + +/* + * Evict buffers from the cache, such that arc_size is capped by arc_c. + */ +static uint64_t +arc_adjust(void) +{ + uint64_t total_evicted = 0; + uint64_t bytes; + int64_t target; + + /* + * If we're over arc_meta_limit, we want to correct that before + * potentially evicting data buffers below. + */ + total_evicted += arc_adjust_meta(); + /* - * We can't afford to recalculate adjustmnt here. If we do, - * new metadata buffers can sneak into the MRU or ANON lists, - * thus penalize the MFU metadata. Although the fudge factor is - * small, it has been empirically shown to be significant for - * certain workloads (e.g. creating many empty directories). As - * such, we use the original calculation for adjustmnt, and - * simply decrement the amount of data evicted from the MRU. + * Adjust MRU size + * + * If we're over the target cache size, we want to evict enough + * from the list to get back to our target size. We don't want + * to evict too much from the MRU, such that it drops below + * arc_p. So, if we're over our target cache size more than + * the MRU is over arc_p, we'll evict enough to get back to + * arc_p here, and then evict more from the MFU below. */ + target = MIN((int64_t)(arc_size - arc_c), + (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - + arc_p)); - if (adjustmnt > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { - delta = MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], adjustmnt); - arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_METADATA); + /* + * If we're below arc_meta_min, always prefer to evict data. + * Otherwise, try to satisfy the requested number of bytes to + * evict from the type which contains older buffers; in an + * effort to keep newer buffers in the cache regardless of their + * type. If we cannot satisfy the number of bytes from this + * type, spill over into the next type. 
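
The prefer-then-spill policy sketched in the comment above amounts to two chained, clamped eviction requests. A minimal standalone rendering (the evict() helper and byte counts are invented for illustration):

#include <stdio.h>

enum bufc { DATA, META };

/* Toy: each type holds a fixed amount of evictable bytes. */
static long evictable[2] = { 300, 120 };

static long
evict(enum bufc t, long want)
{
    long got = want;

    if (got < 0)        /* mirror arc_adjust_impl()'s bytes > 0 guard */
        got = 0;
    if (got > evictable[t])
        got = evictable[t];
    evictable[t] -= got;
    return (got);
}

int
main(void)
{
    long target = 350, total = 0, got;
    enum bufc prefer = META, other = DATA;  /* "older" type first */

    got = evict(prefer, target);
    total += got;
    target -= got;
    total += evict(other, target);  /* spill into the other type */
    printf("evicted %ld\n", total);
    return (0);
}
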
+     */
+    if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
+        arc_meta_used > arc_meta_min) {
+        bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+        total_evicted += bytes;
+
+        /*
+         * If we couldn't evict our target number of bytes from
+         * metadata, we try to get the rest from data.
+         */
+        target -= bytes;
+
+        total_evicted +=
+            arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+    } else {
+        bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+        total_evicted += bytes;
+
+        /*
+         * If we couldn't evict our target number of bytes from
+         * data, we try to get the rest from metadata.
+         */
+        target -= bytes;
+
+        total_evicted +=
+            arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
     }
 
-    adjustmnt = arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
-        arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
+    /*
+     * Adjust MFU size
+     *
+     * Now that we've tried to evict enough from the MRU to get its
+     * size back to arc_p, if we're still above the target cache
+     * size, we evict the rest from the MFU.
+     */
+    target = arc_size - arc_c;
+
+    if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
+        arc_meta_used > arc_meta_min) {
+        bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+        total_evicted += bytes;
+
+        /*
+         * If we couldn't evict our target number of bytes from
+         * metadata, we try to get the rest from data.
+         */
+        target -= bytes;
+
+        total_evicted +=
+            arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+    } else {
+        bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+        total_evicted += bytes;
+
+        /*
+         * If we couldn't evict our target number of bytes from
+         * data, we try to get the rest from metadata.
+         */
+        target -= bytes;
 
-    if (adjustmnt > 0 && arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) {
-        delta = MIN(adjustmnt,
-            arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA]);
-        arc_evict_ghost(arc_mru_ghost, 0, delta, ARC_BUFC_METADATA);
+        total_evicted +=
+            arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
     }
 
-    adjustmnt = arc_mru_ghost->arcs_lsize[ARC_BUFC_METADATA] +
-        arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] - arc_meta_limit;
+    /*
+     * Adjust ghost lists
+     *
+     * In addition to the above, the ARC also defines target values
+     * for the ghost lists. The sum of the mru list and mru ghost
+     * list should never exceed the target size of the cache, and
+     * the sum of the mru list, mfu list, mru ghost list, and mfu
+     * ghost list should never exceed twice the target size of the
+     * cache. The following logic enforces these limits on the ghost
+     * caches, and evicts from them as needed.
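
The two ghost-list invariants reduce to the pair of eviction targets computed below; the sizes are arbitrary sample numbers, not real kstat values, and the sketch pretends each eviction fully succeeds:

#include <stdio.h>

int
main(void)
{
    long arc_c = 1000;      /* target cache size (sample) */
    long mru = 400;         /* sample resident list sizes */
    long mru_ghost = 700;
    long mfu_ghost = 500;
    long t1, t2;

    /* Enforce: mru + mru_ghost <= arc_c */
    t1 = mru + mru_ghost - arc_c;
    if (t1 > 0)
        mru_ghost -= t1;    /* pretend the eviction succeeded */

    /*
     * Enforce: mru_ghost + mfu_ghost <= arc_c, the simplified form
     * of "everything <= 2 * arc_c" given that mru + mfu <= arc_c.
     */
    t2 = mru_ghost + mfu_ghost - arc_c;

    printf("evict %ld from mru ghost, then %ld from mfu ghost\n",
        t1 > 0 ? t1 : 0, t2 > 0 ? t2 : 0);
    return (0);
}
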
+ */ + target = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; + + bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; + + target -= bytes; + + total_evicted += + arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); + + /* + * We assume the sum of the mru list and mfu list is less than + * or equal to arc_c (we enforced this above), which means we + * can use the simpler of the two equations below: + * + * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c + * mru ghost + mfu ghost <= arc_c + */ + target = arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; + + bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); + total_evicted += bytes; + + target -= bytes; + + total_evicted += + arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); + + return (total_evicted); +} + +static void +arc_do_user_evicts(void) +{ + mutex_enter(&arc_user_evicts_lock); + while (arc_eviction_list != NULL) { + arc_buf_t *buf = arc_eviction_list; + arc_eviction_list = buf->b_next; + mutex_enter(&buf->b_evict_lock); + buf->b_hdr = NULL; + mutex_exit(&buf->b_evict_lock); + mutex_exit(&arc_user_evicts_lock); + + if (buf->b_efunc != NULL) + VERIFY0(buf->b_efunc(buf->b_private)); - if (adjustmnt > 0 && arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA] > 0) { - delta = MIN(adjustmnt, - arc_mfu_ghost->arcs_lsize[ARC_BUFC_METADATA]); - arc_evict_ghost(arc_mfu_ghost, 0, delta, ARC_BUFC_METADATA); + buf->b_efunc = NULL; + buf->b_private = NULL; + kmem_cache_free(buf_cache, buf); + mutex_enter(&arc_user_evicts_lock); } - - if (arc_meta_used > arc_meta_limit) - arc_do_user_prune(zfs_arc_meta_prune); + mutex_exit(&arc_user_evicts_lock); } -/* - * Flush all *evictable* data from the cache for the given spa. - * NOTE: this will not touch "active" (i.e. referenced) data. - */ void -arc_flush(spa_t *spa) +arc_flush(spa_t *spa, boolean_t retry) { uint64_t guid = 0; - if (spa) + /* + * If retry is TRUE, a spa must not be specified since we have + * no good way to determine if all of a spa's buffers have been + * evicted from an arc state. 
+ */ + ASSERT(!retry || spa == 0); + + if (spa != NULL) guid = spa_load_guid(spa); - while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { - (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); - if (spa) - break; - } - while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { - (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); - if (spa) - break; - } - while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { - (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); - if (spa) - break; - } - while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { - (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); - if (spa) - break; - } + (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); - arc_evict_ghost(arc_mru_ghost, guid, -1, ARC_BUFC_DATA); - arc_evict_ghost(arc_mfu_ghost, guid, -1, ARC_BUFC_DATA); + (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); + + (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); + + (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); + (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); - mutex_enter(&arc_reclaim_thr_lock); arc_do_user_evicts(); - mutex_exit(&arc_reclaim_thr_lock); ASSERT(spa || arc_eviction_list == NULL); } @@ -2327,7 +2816,7 @@ arc_shrink(uint64_t bytes) } if (arc_size > arc_c) - arc_adjust(); + (void) arc_adjust(); } static void @@ -2358,7 +2847,8 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) } kmem_cache_reap_now(buf_cache); - kmem_cache_reap_now(hdr_cache); + kmem_cache_reap_now(hdr_full_cache); + kmem_cache_reap_now(hdr_l2only_cache); } /* @@ -2367,19 +2857,39 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat, uint64_t bytes) * reclamation has been entirely delegated to the arc_shrinker_func() * which is registered with the VM. To reflect this change in behavior * the arc_reclaim thread has been renamed to arc_adapt. + * + * The following comment from arc_reclaim_thread() in illumos is still + * applicable: + * + * Threads can block in arc_get_data_buf() waiting for this thread to evict + * enough data and signal them to proceed. When this happens, the threads in + * arc_get_data_buf() are sleeping while holding the hash lock for their + * particular arc header. Thus, we must be careful to never sleep on a + * hash lock in this thread. This is to prevent the following deadlock: + * + * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L", + * waiting for the reclaim thread to signal it. + * + * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, + * fails, and goes to sleep forever. + * + * This possible deadlock is avoided by always acquiring a hash lock + * using mutex_tryenter() from arc_reclaim_thread(). 
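
Outside the kernel, the "never block on a hash lock from the reclaim side" rule described above is the classic trylock-and-skip pattern. A POSIX-threads analogy (this is an analogy, not the SPL mutex API):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hash_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long mutex_miss;

/* Called from the reclaim path: must never sleep on hash_lock. */
static int
try_evict_one(void)
{
    if (pthread_mutex_trylock(&hash_lock) != 0) {
        /*
         * The holder may itself be asleep waiting for the
         * reclaim thread to signal it; blocking here could
         * deadlock, so count a miss and skip this entry.
         */
        mutex_miss++;
        return (0);
    }
    /* ... evict the entry protected by hash_lock ... */
    pthread_mutex_unlock(&hash_lock);
    return (1);
}

int
main(void)
{
    printf("evicted=%d misses=%lu\n", try_evict_one(), mutex_miss);
    return (0);
}
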
*/ static void arc_adapt_thread(void) { callb_cpr_t cpr; + uint64_t arc_evicted; - CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); + CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); - mutex_enter(&arc_reclaim_thr_lock); - while (arc_thread_exit == 0) { + mutex_enter(&arc_reclaim_lock); + while (arc_reclaim_thread_exit == 0) { #ifndef _KERNEL arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; + mutex_exit(&arc_reclaim_lock); if (spa_get_random(100) == 0) { if (arc_no_grow) { @@ -2401,6 +2911,8 @@ arc_adapt_thread(void) arc_kmem_reap_now(last_reclaim, 0); arc_warm = B_TRUE; } +#else /* _KERNEL */ + mutex_exit(&arc_reclaim_lock); #endif /* !_KERNEL */ /* No recent memory pressure allow the ARC to grow. */ @@ -2408,18 +2920,23 @@ arc_adapt_thread(void) ddi_time_after_eq(ddi_get_lbolt(), arc_grow_time)) arc_no_grow = FALSE; - arc_adjust_meta(); + arc_evicted = arc_adjust(); - arc_adjust(); + /* + * We're either no longer overflowing, or we + * can't evict anything more, so we should wake + * up any threads before we go to sleep. + */ + if (arc_size <= arc_c || arc_evicted == 0) + cv_broadcast(&arc_reclaim_waiters_cv); - if (arc_eviction_list != NULL) - arc_do_user_evicts(); + mutex_enter(&arc_reclaim_lock); /* block until needed, or one second, whichever is shorter */ CALLB_CPR_SAFE_BEGIN(&cpr); - (void) cv_timedwait_interruptible(&arc_reclaim_thr_cv, - &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); - CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); + (void) cv_timedwait_interruptible(&arc_reclaim_thread_cv, + &arc_reclaim_lock, (ddi_get_lbolt() + hz)); + CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); /* Allow the module options to be changed */ @@ -2437,14 +2954,56 @@ arc_adapt_thread(void) zfs_arc_meta_limit <= arc_c_max && zfs_arc_meta_limit != arc_meta_limit) arc_meta_limit = zfs_arc_meta_limit; + } + + arc_reclaim_thread_exit = 0; + cv_broadcast(&arc_reclaim_thread_cv); + CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ + thread_exit(); +} + +static void +arc_user_evicts_thread(void) +{ + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG); + + mutex_enter(&arc_user_evicts_lock); + while (!arc_user_evicts_thread_exit) { + mutex_exit(&arc_user_evicts_lock); + + arc_do_user_evicts(); + /* + * This is necessary in order for the mdb ::arc dcmd to + * show up to date information. Since the ::arc command + * does not call the kstat's update function, without + * this call, the command may show stale stats for the + * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even + * with this change, the data might be up to 1 second + * out of date; but that should suffice. The arc_state_t + * structures can be queried directly if more accurate + * information is needed. + */ + if (arc_ksp != NULL) + arc_ksp->ks_update(arc_ksp, KSTAT_READ); + mutex_enter(&arc_user_evicts_lock); + /* + * Block until signaled, or after one second (we need to + * call the arc's kstat update function regularly). 
+ */ + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_interruptible(&arc_user_evicts_cv, + &arc_user_evicts_lock, ddi_get_lbolt() + hz); + CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock); } - arc_thread_exit = 0; - cv_broadcast(&arc_reclaim_thr_cv); - CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ + arc_user_evicts_thread_exit = FALSE; + cv_broadcast(&arc_user_evicts_cv); + CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */ thread_exit(); } @@ -2545,9 +3104,11 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) return (SHRINK_STOP); /* Reclaim in progress */ - if (mutex_tryenter(&arc_reclaim_thr_lock) == 0) + if (mutex_tryenter(&arc_reclaim_lock) == 0) return (SHRINK_STOP); + mutex_exit(&arc_reclaim_lock); + /* * Evict the requested number of pages by shrinking arc_c the * requested amount. If there is nothing left to evict just @@ -2566,6 +3127,11 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) pages = SHRINK_STOP; } + /* + * We've reaped what we can, wake up threads. + */ + cv_broadcast(&arc_reclaim_waiters_cv); + /* * When direct reclaim is observed it usually indicates a rapid * increase in memory pressure. This occurs because the kswapd @@ -2581,8 +3147,6 @@ __arc_shrinker_func(struct shrinker *shrink, struct shrink_control *sc) ARCSTAT_BUMP(arcstat_memory_direct_count); } - mutex_exit(&arc_reclaim_thr_lock); - return (pages); } SPL_SHRINKER_CALLBACK_WRAPPER(arc_shrinker_func); @@ -2657,153 +3221,108 @@ arc_adapt(int bytes, arc_state_t *state) } /* - * Check if the cache has reached its limits and eviction is required - * prior to insert. + * Check if arc_size has grown past our upper threshold, determined by + * zfs_arc_overflow_shift. */ -static int -arc_evict_needed(arc_buf_contents_t type) +static boolean_t +arc_is_overflowing(void) { - if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) - return (1); + /* Always allow at least one block of overflow */ + uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, + arc_c >> zfs_arc_overflow_shift); - if (arc_no_grow) - return (1); - - return (arc_size > arc_c); + return (arc_size >= arc_c + overflow); } /* - * The buffer, supplied as the first argument, needs a data block. - * So, if we are at cache max, determine which cache should be victimized. - * We have the following cases: - * - * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> - * In this situation if we're out of space, but the resident size of the MFU is - * under the limit, victimize the MFU cache to satisfy this insertion request. - * - * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> - * Here, we've used up all of the available space for the MRU, so we need to - * evict from our own cache instead. Evict from the set of resident MRU - * entries. - * - * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> - * c minus p represents the MFU space in the cache, since p is the size of the - * cache that is dedicated to the MRU. In this situation there's still space on - * the MFU side, so the MRU side needs to be victimized. - * - * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> - * MFU's resident set is consuming more space than it has been allotted. In - * this situation, we must victimize our own cache, the MFU, for this insertion. + * The buffer, supplied as the first argument, needs a data block. If we + * are hitting the hard limit for the cache size, we must sleep, waiting + * for the eviction thread to catch up. 
If we're past the target size
+ * but below the hard limit, we'll only signal the reclaim
+ * thread and continue on.
 */
static void
arc_get_data_buf(arc_buf_t *buf)
{
-	arc_state_t *state = buf->b_hdr->b_state;
+	arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
	uint64_t size = buf->b_hdr->b_size;
-	arc_buf_contents_t type = buf->b_hdr->b_type;
-	arc_buf_contents_t evict = ARC_BUFC_DATA;
-	boolean_t recycle = TRUE;
+	arc_buf_contents_t type = arc_buf_type(buf->b_hdr);

	arc_adapt(size, state);

	/*
-	 * We have not yet reached cache maximum size,
-	 * just allocate a new buffer.
+	 * If arc_size is currently overflowing, and has grown past our
+	 * upper limit, we must be adding data faster than the evict
+	 * thread can evict. Thus, to ensure we don't compound the
+	 * problem by adding more data and forcing arc_size to grow even
+	 * further past its target size, we halt and wait for the
+	 * eviction thread to catch up.
+	 *
+	 * It's also possible that the reclaim thread is unable to evict
+	 * enough buffers to get arc_size below the overflow limit (e.g.
+	 * due to buffers being un-evictable, or hash lock collisions).
+	 * In this case, we want to proceed regardless of whether we're
+	 * overflowing; thus we don't use a while loop here.
	 */
-	if (!arc_evict_needed(type)) {
-		if (type == ARC_BUFC_METADATA) {
-			buf->b_data = zio_buf_alloc(size);
-			arc_space_consume(size, ARC_SPACE_META);
-		} else {
-			ASSERT(type == ARC_BUFC_DATA);
-			buf->b_data = zio_data_buf_alloc(size);
-			arc_space_consume(size, ARC_SPACE_DATA);
+	if (arc_is_overflowing()) {
+		mutex_enter(&arc_reclaim_lock);
+
+		/*
+		 * Now that we've acquired the lock, we may no longer be
+		 * over the overflow limit, let's check.
+		 *
+		 * We're ignoring the case of spurious wake-ups. If that
+		 * were to happen, it'd let this thread consume an ARC
+		 * buffer before it should have (i.e. before we're under
+		 * the overflow limit and were signalled by the reclaim
+		 * thread). As long as that is a rare occurrence, it
+		 * shouldn't cause any harm.
+		 */
+		if (arc_is_overflowing()) {
+			cv_signal(&arc_reclaim_thread_cv);
+			cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
		}
-		goto out;
-	}
-
-	/*
-	 * If we are prefetching from the mfu ghost list, this buffer
-	 * will end up on the mru list; so steal space from there.
-	 */
-	if (state == arc_mfu_ghost)
-		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
-	else if (state == arc_mru_ghost)
-		state = arc_mru;
-
-	if (state == arc_mru || state == arc_anon) {
-		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
-		state = (arc_mfu->arcs_lsize[type] >= size &&
-		    arc_p > mru_used) ? arc_mfu : arc_mru;
-	} else {
-		/* MFU cases */
-		uint64_t mfu_space = arc_c - arc_p;
-		state = (arc_mru->arcs_lsize[type] >= size &&
-		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
+		mutex_exit(&arc_reclaim_lock);
	}

-	/*
-	 * Evict data buffers prior to metadata buffers, unless we're
-	 * over the metadata limit and adding a metadata buffer.
-	 */
	if (type == ARC_BUFC_METADATA) {
-		if (arc_meta_used >= arc_meta_limit)
-			evict = ARC_BUFC_METADATA;
-		else
-			/*
-			 * In this case, we're evicting data while
-			 * adding metadata. Thus, to prevent recycling a
-			 * data buffer into a metadata buffer, recycling
-			 * is disabled in the following arc_evict call.
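
For a sense of the threshold arc_is_overflowing() computes: with the default zfs_arc_overflow_shift of 8 and a 4 GiB arc_c, allocating threads only stall once arc_size exceeds arc_c by 16 MiB, and never by less than one maximum-sized block. A compilable restatement, with the globals reduced to stand-ins and the MAX() spelled out:

#include <stdbool.h>
#include <stdint.h>

#define	SPA_MAXBLOCKSIZE	(128ULL << 10)	/* 128 KiB */

static uint64_t arc_size;		/* current ARC size (stand-in) */
static uint64_t arc_c = 4ULL << 30;	/* target size: 4 GiB (stand-in) */
static int zfs_arc_overflow_shift = 8;

static bool
arc_is_overflowing_sketch(void)
{
	/* Always allow at least one max-sized block of overflow. */
	uint64_t overflow = arc_c >> zfs_arc_overflow_shift;

	if (overflow < SPA_MAXBLOCKSIZE)
		overflow = SPA_MAXBLOCKSIZE;
	return (arc_size >= arc_c + overflow);
}

Note the unlocked check is deliberately racy; the re-check under arc_reclaim_lock above is what makes the decision to sleep safe.
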
- */ - recycle = FALSE; + buf->b_data = abd_alloc_linear(size); + arc_space_consume(size, ARC_SPACE_META); + } else { + ASSERT(type == ARC_BUFC_DATA); + buf->b_data = abd_alloc_scatter(size); + arc_space_consume(size, ARC_SPACE_DATA); } - if ((buf->b_data = arc_evict(state, 0, size, recycle, evict)) == NULL) { - if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); - arc_space_consume(size, ARC_SPACE_META); - - /* - * If we are unable to recycle an existing meta buffer - * signal the reclaim thread. It will notify users - * via the prune callback to drop references. The - * prune callback in run in the context of the reclaim - * thread to avoid deadlocking on the hash_lock. - * Of course, only do this when recycle is true. - */ - if (recycle) - cv_signal(&arc_reclaim_thr_cv); - } else { - ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); - arc_space_consume(size, ARC_SPACE_DATA); - } - - /* Only bump this if we tried to recycle and failed */ - if (recycle) - ARCSTAT_BUMP(arcstat_recycle_miss); - } - ASSERT(buf->b_data != NULL); -out: /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ - if (!GHOST_STATE(buf->b_hdr->b_state)) { + if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { arc_buf_hdr_t *hdr = buf->b_hdr; - atomic_add_64(&hdr->b_state->arcs_size, size); - if (list_link_active(&hdr->b_arc_node)) { - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - atomic_add_64(&hdr->b_state->arcs_lsize[type], size); + atomic_add_64(&hdr->b_l1hdr.b_state->arcs_size, size); + + /* + * If this is reached via arc_read, the link is + * protected by the hash lock. If reached via + * arc_buf_alloc, the header should not be accessed by + * any other thread. And, if reached via arc_read_done, + * the hash lock will protect it if it's found in the + * hash table; otherwise no other thread should be + * trying to [add|remove]_reference it. + */ + if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], + size); } /* * If we are growing the cache, and we are adding anonymous * data, and we have outgrown arc_p, update arc_p */ - if (!zfs_arc_p_aggressive_disable && - arc_size < arc_c && hdr->b_state == arc_anon && + if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && arc_anon->arcs_size + arc_mru->arcs_size > arc_p) arc_p = MIN(arc_c, arc_p + size); } @@ -2814,25 +3333,26 @@ arc_get_data_buf(arc_buf_t *buf) * NOTE: the hash lock is dropped in this function. */ static void -arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) +arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { clock_t now; ASSERT(MUTEX_HELD(hash_lock)); + ASSERT(HDR_HAS_L1HDR(hdr)); - if (buf->b_state == arc_anon) { + if (hdr->b_l1hdr.b_state == arc_anon) { /* * This buffer is not in the cache, and does not * appear in our "ghost" list. Add the new buffer * to the MRU state. 
*/ - ASSERT(buf->b_arc_access == 0); - buf->b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); - arc_change_state(arc_mru, buf, hash_lock); + ASSERT0(hdr->b_l1hdr.b_arc_access); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mru, hdr, hash_lock); - } else if (buf->b_state == arc_mru) { + } else if (hdr->b_l1hdr.b_state == arc_mru) { now = ddi_get_lbolt(); /* @@ -2843,15 +3363,17 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ - if ((buf->b_flags & ARC_PREFETCH) != 0) { - if (refcount_count(&buf->b_refcnt) == 0) { - ASSERT(list_link_active(&buf->b_arc_node)); + if (HDR_PREFETCH(hdr)) { + if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { + /* link protected by hash lock */ + ASSERT(multilist_link_active( + &hdr->b_l1hdr.b_arc_node)); } else { - buf->b_flags &= ~ARC_PREFETCH; - atomic_inc_32(&buf->b_mru_hits); + hdr->b_flags &= ~ARC_FLAG_PREFETCH; + atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); } - buf->b_arc_access = now; + hdr->b_l1hdr.b_arc_access = now; return; } @@ -2860,19 +3382,20 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * but it is still in the cache. Move it to the MFU * state. */ - if (ddi_time_after(now, buf->b_arc_access + ARC_MINTIME)) { + if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + + ARC_MINTIME)) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. */ - buf->b_arc_access = now; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - arc_change_state(arc_mfu, buf, hash_lock); + hdr->b_l1hdr.b_arc_access = now; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mfu, hdr, hash_lock); } - atomic_inc_32(&buf->b_mru_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mru_hits); ARCSTAT_BUMP(arcstat_mru_hits); - } else if (buf->b_state == arc_mru_ghost) { + } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been "accessed" recently, but @@ -2880,22 +3403,22 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * MFU state. */ - if (buf->b_flags & ARC_PREFETCH) { + if (HDR_PREFETCH(hdr)) { new_state = arc_mru; - if (refcount_count(&buf->b_refcnt) > 0) - buf->b_flags &= ~ARC_PREFETCH; - DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); + if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) + hdr->b_flags &= ~ARC_FLAG_PREFETCH; + DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } - buf->b_arc_access = ddi_get_lbolt(); - arc_change_state(new_state, buf, hash_lock); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + arc_change_state(new_state, hdr, hash_lock); - atomic_inc_32(&buf->b_mru_ghost_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits); ARCSTAT_BUMP(arcstat_mru_ghost_hits); - } else if (buf->b_state == arc_mfu) { + } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and is * still in the cache. Keep it in the MFU state. @@ -2905,14 +3428,15 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * If it was a prefetch, we will explicitly move it to * the head of the list now. 
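
The MRU branch above reduces to a small policy: a prefetch hit only refreshes the access stamp, while a demand hit promotes the buffer to MFU once it has sat in MRU for at least ARC_MINTIME (about 125 ms, per the comment above). A rough restatement as a pure function, with hypothetical names standing in for the header fields:

#include <stdbool.h>

enum mru_action { MRU_STAY, MRU_PROMOTE_TO_MFU };

/* 'now' and 'b_arc_access' play the role of ddi_get_lbolt() tick values. */
static enum mru_action
mru_hit_policy(bool is_prefetch, long now, long b_arc_access, long mintime)
{
	if (is_prefetch)
		return (MRU_STAY);	/* just refresh the access time */
	return ((now - b_arc_access > mintime) ?
	    MRU_PROMOTE_TO_MFU : MRU_STAY);
}
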
*/ - if ((buf->b_flags & ARC_PREFETCH) != 0) { - ASSERT(refcount_count(&buf->b_refcnt) == 0); - ASSERT(list_link_active(&buf->b_arc_node)); + if ((HDR_PREFETCH(hdr)) != 0) { + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + /* link protected by hash_lock */ + ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); } - atomic_inc_32(&buf->b_mfu_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits); ARCSTAT_BUMP(arcstat_mfu_hits); - buf->b_arc_access = ddi_get_lbolt(); - } else if (buf->b_state == arc_mfu_ghost) { + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* * This buffer has been accessed more than once but has @@ -2920,31 +3444,32 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * MFU state. */ - if (buf->b_flags & ARC_PREFETCH) { + if (HDR_PREFETCH(hdr)) { /* * This is a prefetch access... * move this block back to the MRU state. */ - ASSERT0(refcount_count(&buf->b_refcnt)); + ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); new_state = arc_mru; } - buf->b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - arc_change_state(new_state, buf, hash_lock); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(new_state, hdr, hash_lock); - atomic_inc_32(&buf->b_mfu_ghost_hits); + atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); - } else if (buf->b_state == arc_l2c_only) { + } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC. */ - buf->b_arc_access = ddi_get_lbolt(); - DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); - arc_change_state(arc_mfu, buf, hash_lock); + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mfu, hdr, hash_lock); } else { - cmn_err(CE_PANIC, "invalid arc state 0x%p", buf->b_state); + cmn_err(CE_PANIC, "invalid arc state 0x%p", + hdr->b_l1hdr.b_state); } } @@ -2954,7 +3479,7 @@ void arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) { if (zio == NULL || zio->io_error == 0) - bcopy(buf->b_data, arg, buf->b_hdr->b_size); + abd_copy_to_buf(arg, buf->b_data, buf->b_hdr->b_size); VERIFY(arc_buf_remove_ref(buf, arg)); } @@ -3012,26 +3537,32 @@ arc_read_done(zio_t *zio) (found == hdr && HDR_L2_READING(hdr))); } - hdr->b_flags &= ~ARC_L2_EVICTED; - if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) - hdr->b_flags &= ~ARC_L2CACHE; + hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; + if (l2arc_noprefetch && HDR_PREFETCH(hdr)) + hdr->b_flags &= ~ARC_FLAG_L2CACHE; /* byteswap if necessary */ - callback_list = hdr->b_acb; + callback_list = hdr->b_l1hdr.b_acb; ASSERT(callback_list != NULL); if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); + + void *ziobuf = abd_borrow_buf_copy(zio->io_data, zio->io_size); + if (BP_GET_LEVEL(zio->io_bp) > 0) - byteswap_uint64_array(buf->b_data, hdr->b_size); + byteswap_uint64_array(ziobuf, hdr->b_size); else - dmu_ot_byteswap[bswap].ob_func(buf->b_data, hdr->b_size); + dmu_ot_byteswap[bswap].ob_func(ziobuf, hdr->b_size); + + abd_return_buf_copy(zio->io_data, ziobuf, zio->io_size); } arc_cksum_compute(buf, B_FALSE); arc_buf_watch(buf); - if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { + if (hash_lock && zio->io_error == 0 && + hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. 
This is because * if we've issued an I/O for an evicted buffer, we've already @@ -3053,24 +3584,25 @@ arc_read_done(zio_t *zio) abuf = NULL; } } - hdr->b_acb = NULL; - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + hdr->b_l1hdr.b_acb = NULL; + hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; ASSERT(!HDR_BUF_AVAILABLE(hdr)); if (abuf == buf) { ASSERT(buf->b_efunc == NULL); - ASSERT(hdr->b_datacnt == 1); - hdr->b_flags |= ARC_BUF_AVAILABLE; + ASSERT(hdr->b_l1hdr.b_datacnt == 1); + hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; } - ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || + callback_list != NULL); if (zio->io_error != 0) { - hdr->b_flags |= ARC_IO_ERROR; - if (hdr->b_state != arc_anon) + hdr->b_flags |= ARC_FLAG_IO_ERROR; + if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr, hash_lock); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); - freeable = refcount_is_zero(&hdr->b_refcnt); + freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* @@ -3078,9 +3610,9 @@ arc_read_done(zio_t *zio) * that the hdr (and hence the cv) might be freed before we get to * the cv_broadcast(). */ - cv_broadcast(&hdr->b_cv); + cv_broadcast(&hdr->b_l1hdr.b_cv); - if (hash_lock) { + if (hash_lock != NULL) { mutex_exit(hash_lock); } else { /* @@ -3089,8 +3621,8 @@ arc_read_done(zio_t *zio) * moved to the anonymous state (so that it won't show up * in the cache). */ - ASSERT3P(hdr->b_state, ==, arc_anon); - freeable = refcount_is_zero(&hdr->b_refcnt); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); } /* execute each callback and free its structure */ @@ -3131,8 +3663,8 @@ arc_read_done(zio_t *zio) */ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, - void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags, - const zbookmark_phys_t *zb) + void *private, zio_priority_t priority, int zio_flags, + arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; arc_buf_t *buf = NULL; @@ -3153,18 +3685,18 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, hdr = buf_hash_find(guid, bp, &hash_lock); } - if (hdr != NULL && hdr->b_datacnt > 0) { + if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { - *arc_flags |= ARC_CACHED; + *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { - if (*arc_flags & ARC_WAIT) { - cv_wait(&hdr->b_cv, hash_lock); + if (*arc_flags & ARC_FLAG_WAIT) { + cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); mutex_exit(hash_lock); goto top; } - ASSERT(*arc_flags & ARC_NOWAIT); + ASSERT(*arc_flags & ARC_FLAG_NOWAIT); if (done) { arc_callback_t *acb = NULL; @@ -3178,8 +3710,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, spa, NULL, NULL, NULL, zio_flags); ASSERT(acb->acb_done != NULL); - acb->acb_next = hdr->b_acb; - hdr->b_acb = acb; + acb->acb_next = hdr->b_l1hdr.b_acb; + hdr->b_l1hdr.b_acb = acb; add_reference(hdr, hash_lock, private); mutex_exit(hash_lock); goto out; @@ -3188,7 +3720,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, goto out; } - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); if (done) { add_reference(hdr, hash_lock, private); @@ -3197,30 +3730,30 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, * copy of the data so that we will be guaranteed * that arc_release() 
will always succeed. */ - buf = hdr->b_buf; + buf = hdr->b_l1hdr.b_buf; ASSERT(buf); ASSERT(buf->b_data); if (HDR_BUF_AVAILABLE(hdr)) { ASSERT(buf->b_efunc == NULL); - hdr->b_flags &= ~ARC_BUF_AVAILABLE; + hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; } else { buf = arc_buf_clone(buf); } - } else if (*arc_flags & ARC_PREFETCH && - refcount_count(&hdr->b_refcnt) == 0) { - hdr->b_flags |= ARC_PREFETCH; + } else if (*arc_flags & ARC_FLAG_PREFETCH && + refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { + hdr->b_flags |= ARC_FLAG_PREFETCH; } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); - if (*arc_flags & ARC_L2CACHE) - hdr->b_flags |= ARC_L2CACHE; - if (*arc_flags & ARC_L2COMPRESS) - hdr->b_flags |= ARC_L2COMPRESS; + if (*arc_flags & ARC_FLAG_L2CACHE) + hdr->b_flags |= ARC_FLAG_L2CACHE; + if (*arc_flags & ARC_FLAG_L2COMPRESS) + hdr->b_flags |= ARC_FLAG_L2COMPRESS; mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); if (done) @@ -3232,7 +3765,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, uint64_t addr = 0; boolean_t devw = B_FALSE; enum zio_compress b_compress = ZIO_COMPRESS_OFF; - uint64_t b_asize = 0; + int32_t b_asize = 0; /* * Gracefully handle a damaged logical block size as a @@ -3259,7 +3792,6 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, if (!BP_IS_EMBEDDED(bp)) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_PHYSICAL_BIRTH(bp); - hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { @@ -3269,63 +3801,73 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, (void) arc_buf_remove_ref(buf, private); goto top; /* restart the IO request */ } + /* if this is a prefetch, we don't have a reference */ - if (*arc_flags & ARC_PREFETCH) { + if (*arc_flags & ARC_FLAG_PREFETCH) { (void) remove_reference(hdr, hash_lock, private); - hdr->b_flags |= ARC_PREFETCH; + hdr->b_flags |= ARC_FLAG_PREFETCH; } - if (*arc_flags & ARC_L2CACHE) - hdr->b_flags |= ARC_L2CACHE; - if (*arc_flags & ARC_L2COMPRESS) - hdr->b_flags |= ARC_L2COMPRESS; + if (*arc_flags & ARC_FLAG_L2CACHE) + hdr->b_flags |= ARC_FLAG_L2CACHE; + if (*arc_flags & ARC_FLAG_L2COMPRESS) + hdr->b_flags |= ARC_FLAG_L2COMPRESS; if (BP_GET_LEVEL(bp) > 0) - hdr->b_flags |= ARC_INDIRECT; + hdr->b_flags |= ARC_FLAG_INDIRECT; } else { - /* this block is in the ghost cache */ - ASSERT(GHOST_STATE(hdr->b_state)); + /* + * This block is in the ghost cache. If it was L2-only + * (and thus didn't have an L1 hdr), we realloc the + * header to add an L1 hdr. 
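
The realloc mentioned above is the payoff of splitting arc_buf_hdr_t into an always-present portion plus an optional L1 portion. A toy layout, not the patch's exact one, showing how a single struct definition can back two kmem caches, with the truncated cache paying for none of the in-core state:

#include <stddef.h>
#include <stdint.h>

typedef struct l2hdr_part { uint64_t b_daddr; int32_t b_asize; } l2hdr_part_t;
typedef struct l1hdr_part { void *b_buf; uint64_t b_refcnt; } l1hdr_part_t;

typedef struct hdr {
	uint64_t	b_dva[2];
	uint64_t	b_birth;
	uint32_t	b_flags;
	l2hdr_part_t	b_l2hdr;	/* valid whenever on an L2 device */
	l1hdr_part_t	b_l1hdr;	/* must be last: L2-only hdrs omit it */
} hdr_t;

/*
 * Full headers come from a sizeof-based cache; ghosts that live only
 * on the L2 device come from a cache truncated at the L1 portion.
 */
#define	HDR_FULL_SIZE	(sizeof (hdr_t))
#define	HDR_L2ONLY_SIZE	(offsetof(hdr_t, b_l1hdr))

Reviving such a ghost then amounts to allocating from the full cache and copying the shared prefix, which is presumably the job arc_hdr_realloc() is given above.
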
+ */ + if (!HDR_HAS_L1HDR(hdr)) { + hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, + hdr_full_cache); + } + + ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT0(refcount_count(&hdr->b_refcnt)); - ASSERT(hdr->b_buf == NULL); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); /* if this is a prefetch, we don't have a reference */ - if (*arc_flags & ARC_PREFETCH) - hdr->b_flags |= ARC_PREFETCH; + if (*arc_flags & ARC_FLAG_PREFETCH) + hdr->b_flags |= ARC_FLAG_PREFETCH; else add_reference(hdr, hash_lock, private); - if (*arc_flags & ARC_L2CACHE) - hdr->b_flags |= ARC_L2CACHE; - if (*arc_flags & ARC_L2COMPRESS) - hdr->b_flags |= ARC_L2COMPRESS; + if (*arc_flags & ARC_FLAG_L2CACHE) + hdr->b_flags |= ARC_FLAG_L2CACHE; + if (*arc_flags & ARC_FLAG_L2COMPRESS) + hdr->b_flags |= ARC_FLAG_L2COMPRESS; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = NULL; - hdr->b_buf = buf; - ASSERT(hdr->b_datacnt == 0); - hdr->b_datacnt = 1; + hdr->b_l1hdr.b_buf = buf; + ASSERT0(hdr->b_l1hdr.b_datacnt); + hdr->b_l1hdr.b_datacnt = 1; arc_get_data_buf(buf); arc_access(hdr, hash_lock); } - ASSERT(!GHOST_STATE(hdr->b_state)); + ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; - ASSERT(hdr->b_acb == NULL); - hdr->b_acb = acb; - hdr->b_flags |= ARC_IO_IN_PROGRESS; + ASSERT(hdr->b_l1hdr.b_acb == NULL); + hdr->b_l1hdr.b_acb = acb; + hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; - if (hdr->b_l2hdr != NULL && - (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { - devw = hdr->b_l2hdr->b_dev->l2ad_writing; - addr = hdr->b_l2hdr->b_daddr; - b_compress = hdr->b_l2hdr->b_compress; - b_asize = hdr->b_l2hdr->b_asize; + if (HDR_HAS_L2HDR(hdr) && + (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { + devw = hdr->b_l2hdr.b_dev->l2ad_writing; + addr = hdr->b_l2hdr.b_daddr; + b_compress = HDR_GET_COMPRESS(hdr); + b_asize = hdr->b_l2hdr.b_asize; /* * Lock out device removal. */ @@ -3345,8 +3887,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, size, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); - ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), - demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { @@ -3359,14 +3901,14 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, * also have invalidated the vdev. * 5. This isn't prefetch and l2arc_noprefetch is set. 
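
Conditions 1-5 above collapse into the single guard used just below; as a standalone predicate, with booleans standing in for the HDR_* flag tests and the l2arc_noprefetch tunable:

#include <stdbool.h>

static bool
l2arc_read_ok(bool has_l2hdr, bool l2_writing, bool l2_evicted,
    bool noprefetch_tunable, bool is_prefetch)
{
	return (has_l2hdr && !l2_writing && !l2_evicted &&
	    !(noprefetch_tunable && is_prefetch));
}
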
*/ - if (hdr->b_l2hdr != NULL && + if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); - atomic_inc_32(&hdr->b_l2hdr->b_hits); + atomic_inc_32(&hdr->b_l2hdr.b_hits); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); @@ -3408,12 +3950,12 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); - if (*arc_flags & ARC_NOWAIT) { + if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); goto out; } - ASSERT(*arc_flags & ARC_WAIT); + ASSERT(*arc_flags & ARC_FLAG_WAIT); if (zio_wait(rzio) == 0) goto out; @@ -3439,12 +3981,12 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, zio_flags, zb); - if (*arc_flags & ARC_WAIT) { + if (*arc_flags & ARC_FLAG_WAIT) { rc = zio_wait(rzio); goto out; } - ASSERT(*arc_flags & ARC_NOWAIT); + ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } @@ -3488,8 +4030,9 @@ void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) { ASSERT(buf->b_hdr != NULL); - ASSERT(buf->b_hdr->b_state != arc_anon); - ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); + ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); + ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || + func == NULL); ASSERT(buf->b_efunc == NULL); ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); @@ -3513,9 +4056,9 @@ arc_freed(spa_t *spa, const blkptr_t *bp) if (hdr == NULL) return; if (HDR_BUF_AVAILABLE(hdr)) { - arc_buf_t *buf = hdr->b_buf; + arc_buf_t *buf = hdr->b_l1hdr.b_buf; add_reference(hdr, hash_lock, FTAG); - hdr->b_flags &= ~ARC_BUF_AVAILABLE; + hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; mutex_exit(hash_lock); arc_release(buf, FTAG); @@ -3571,18 +4114,20 @@ arc_clear_callback(arc_buf_t *buf) hdr = buf->b_hdr; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); - ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); - ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); + ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, + hdr->b_l1hdr.b_datacnt); + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); buf->b_efunc = NULL; buf->b_private = NULL; - if (hdr->b_datacnt > 1) { + if (hdr->b_l1hdr.b_datacnt > 1) { mutex_exit(&buf->b_evict_lock); - arc_buf_destroy(buf, FALSE, TRUE); + arc_buf_destroy(buf, TRUE); } else { - ASSERT(buf == hdr->b_buf); - hdr->b_flags |= ARC_BUF_AVAILABLE; + ASSERT(buf == hdr->b_l1hdr.b_buf); + hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; mutex_exit(&buf->b_evict_lock); } @@ -3600,10 +4145,9 @@ arc_clear_callback(arc_buf_t *buf) void arc_release(arc_buf_t *buf, void *tag) { - arc_buf_hdr_t *hdr; - kmutex_t *hash_lock = NULL; - l2arc_buf_hdr_t *l2hdr; - uint64_t buf_size = 0; + kmutex_t *hash_lock; + arc_state_t *state; + arc_buf_hdr_t *hdr = buf->b_hdr; /* * It would be nice to assert that if it's DMU metadata (level > @@ -3612,57 +4156,98 @@ arc_release(arc_buf_t *buf, void *tag) */ mutex_enter(&buf->b_evict_lock); - hdr = buf->b_hdr; - /* this buffer is not on any list */ - ASSERT(refcount_count(&hdr->b_refcnt) > 0); + ASSERT(HDR_HAS_L1HDR(hdr)); - if (hdr->b_state == arc_anon) { - /* this buffer is already released */ - ASSERT(buf->b_efunc == NULL); - } else { - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - hdr = buf->b_hdr; - ASSERT3P(hash_lock, ==, 
HDR_LOCK(hdr)); + /* + * We don't grab the hash lock prior to this check, because if + * the buffer's header is in the arc_anon state, it won't be + * linked into the hash table. + */ + if (hdr->b_l1hdr.b_state == arc_anon) { + mutex_exit(&buf->b_evict_lock); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + ASSERT(!HDR_HAS_L2HDR(hdr)); + ASSERT(BUF_EMPTY(hdr)); + + ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); + ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + + ASSERT3P(buf->b_efunc, ==, NULL); + ASSERT3P(buf->b_private, ==, NULL); + + hdr->b_l1hdr.b_arc_access = 0; + arc_buf_thaw(buf); + + return; } - l2hdr = hdr->b_l2hdr; - if (l2hdr) { - mutex_enter(&l2arc_buflist_mtx); + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + + /* + * This assignment is only valid as long as the hash_lock is + * held, we must be careful not to reference state or the + * b_state field after dropping the lock. + */ + state = hdr->b_l1hdr.b_state; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + ASSERT3P(state, !=, arc_anon); + + /* this buffer is not on any list */ + ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); + + if (HDR_HAS_L2HDR(hdr)) { + ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + + mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); + list_remove(&hdr->b_l2hdr.b_dev->l2ad_buflist, hdr); + + /* + * We don't want to leak the b_tmp_cdata buffer that was + * allocated in l2arc_write_buffers() + */ arc_buf_l2_cdata_free(hdr); - hdr->b_l2hdr = NULL; - list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + + mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); + + hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; } - buf_size = hdr->b_size; /* * Do we have more than one buf? */ - if (hdr->b_datacnt > 1) { + if (hdr->b_l1hdr.b_datacnt > 1) { arc_buf_hdr_t *nhdr; arc_buf_t **bufp; uint64_t blksz = hdr->b_size; uint64_t spa = hdr->b_spa; - arc_buf_contents_t type = hdr->b_type; + arc_buf_contents_t type = arc_buf_type(hdr); uint32_t flags = hdr->b_flags; - ASSERT(hdr->b_buf != buf || buf->b_next != NULL); + ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); /* * Pull the data off of this hdr and attach it to * a new anonymous hdr. */ (void) remove_reference(hdr, hash_lock, tag); - bufp = &hdr->b_buf; + bufp = &hdr->b_l1hdr.b_buf; while (*bufp != buf) bufp = &(*bufp)->b_next; *bufp = buf->b_next; buf->b_next = NULL; - ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); - atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); - if (refcount_is_zero(&hdr->b_refcnt)) { - uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; + ASSERT3P(state, !=, arc_l2c_only); + ASSERT3U(state->arcs_size, >=, hdr->b_size); + atomic_add_64(&state->arcs_size, -hdr->b_size); + if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { + uint64_t *size; + + ASSERT3P(state, !=, arc_l2c_only); + size = &state->arcs_lsize[type]; ASSERT3U(*size, >=, hdr->b_size); atomic_add_64(size, -hdr->b_size); } @@ -3671,68 +4256,61 @@ arc_release(arc_buf_t *buf, void *tag) * We're releasing a duplicate user data buffer, update * our statistics accordingly. 
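
The bufp walk above is the classic pointer-to-pointer unlink for a singly linked list; it needs no special case for removing the head element. In isolation (buf_t is a hypothetical stand-in):

#include <stddef.h>

typedef struct buf {
	struct buf *b_next;
} buf_t;

static void
unlink_buf(buf_t **headp, buf_t *buf)
{
	buf_t **bufp = headp;

	/* The caller guarantees 'buf' is on the list. */
	while (*bufp != buf)
		bufp = &(*bufp)->b_next;
	*bufp = buf->b_next;	/* splice it out, head or not */
	buf->b_next = NULL;
}
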
*/ - if (hdr->b_type == ARC_BUFC_DATA) { + if (HDR_ISTYPE_DATA(hdr)) { ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, -hdr->b_size); } - hdr->b_datacnt -= 1; + hdr->b_l1hdr.b_datacnt -= 1; arc_cksum_verify(buf); arc_buf_unwatch(buf); mutex_exit(hash_lock); - nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); nhdr->b_size = blksz; nhdr->b_spa = spa; - nhdr->b_type = type; - nhdr->b_buf = buf; - nhdr->b_state = arc_anon; - nhdr->b_arc_access = 0; - nhdr->b_mru_hits = 0; - nhdr->b_mru_ghost_hits = 0; - nhdr->b_mfu_hits = 0; - nhdr->b_mfu_ghost_hits = 0; - nhdr->b_l2_hits = 0; - nhdr->b_flags = flags & ARC_L2_WRITING; - nhdr->b_l2hdr = NULL; - nhdr->b_datacnt = 1; + + nhdr->b_l1hdr.b_mru_hits = 0; + nhdr->b_l1hdr.b_mru_ghost_hits = 0; + nhdr->b_l1hdr.b_mfu_hits = 0; + nhdr->b_l1hdr.b_mfu_ghost_hits = 0; + nhdr->b_l1hdr.b_l2_hits = 0; + nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; + nhdr->b_flags |= arc_bufc_to_flags(type); + nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; + + nhdr->b_l1hdr.b_buf = buf; + nhdr->b_l1hdr.b_datacnt = 1; + nhdr->b_l1hdr.b_state = arc_anon; + nhdr->b_l1hdr.b_arc_access = 0; + nhdr->b_l1hdr.b_tmp_cdata = NULL; nhdr->b_freeze_cksum = NULL; - (void) refcount_add(&nhdr->b_refcnt, tag); + + (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; mutex_exit(&buf->b_evict_lock); atomic_add_64(&arc_anon->arcs_size, blksz); } else { mutex_exit(&buf->b_evict_lock); - ASSERT(refcount_count(&hdr->b_refcnt) == 1); - ASSERT(!list_link_active(&hdr->b_arc_node)); + ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); + /* protected by hash lock, or hdr is on arc_anon */ + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - if (hdr->b_state != arc_anon) - arc_change_state(arc_anon, hdr, hash_lock); - hdr->b_arc_access = 0; - hdr->b_mru_hits = 0; - hdr->b_mru_ghost_hits = 0; - hdr->b_mfu_hits = 0; - hdr->b_mfu_ghost_hits = 0; - hdr->b_l2_hits = 0; - if (hash_lock) - mutex_exit(hash_lock); + hdr->b_l1hdr.b_mru_hits = 0; + hdr->b_l1hdr.b_mru_ghost_hits = 0; + hdr->b_l1hdr.b_mfu_hits = 0; + hdr->b_l1hdr.b_mfu_ghost_hits = 0; + hdr->b_l1hdr.b_l2_hits = 0; + arc_change_state(arc_anon, hdr, hash_lock); + hdr->b_l1hdr.b_arc_access = 0; + mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } buf->b_efunc = NULL; buf->b_private = NULL; - - if (l2hdr) { - ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); - vdev_space_update(l2hdr->b_dev->l2ad_vdev, - -l2hdr->b_asize, 0, 0); - kmem_cache_free(l2arc_hdr_cache, l2hdr); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); - ARCSTAT_INCR(arcstat_l2_size, -buf_size); - mutex_exit(&l2arc_buflist_mtx); - } } int @@ -3741,7 +4319,8 @@ arc_released(arc_buf_t *buf) int released; mutex_enter(&buf->b_evict_lock); - released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); + released = (buf->b_data != NULL && + buf->b_hdr->b_l1hdr.b_state == arc_anon); mutex_exit(&buf->b_evict_lock); return (released); } @@ -3753,7 +4332,7 @@ arc_referenced(arc_buf_t *buf) int referenced; mutex_enter(&buf->b_evict_lock); - referenced = (refcount_count(&buf->b_hdr->b_refcnt)); + referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); mutex_exit(&buf->b_evict_lock); return (referenced); } @@ -3766,7 +4345,9 @@ arc_write_ready(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); + ASSERT(HDR_HAS_L1HDR(hdr)); + 
ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); + ASSERT(hdr->b_l1hdr.b_datacnt > 0); callback->awcb_ready(zio, buf, callback->awcb_private); /* @@ -3776,15 +4357,15 @@ arc_write_ready(zio_t *zio) * accounting for any re-write attempt. */ if (HDR_IO_IN_PROGRESS(hdr)) { - mutex_enter(&hdr->b_freeze_lock); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_freeze_cksum != NULL) { kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_freeze_cksum = NULL; } - mutex_exit(&hdr->b_freeze_lock); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } arc_cksum_compute(buf, B_FALSE); - hdr->b_flags |= ARC_IO_IN_PROGRESS; + hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; } /* @@ -3806,7 +4387,7 @@ arc_write_done(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - ASSERT(hdr->b_acb == NULL); + ASSERT(hdr->b_l1hdr.b_acb == NULL); if (zio->io_error == 0) { if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { @@ -3814,7 +4395,6 @@ arc_write_done(zio_t *zio) } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); - hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; } } else { ASSERT(BUF_EMPTY(hdr)); @@ -3835,7 +4415,7 @@ arc_write_done(zio_t *zio) arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); - if (exists) { + if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove @@ -3845,7 +4425,8 @@ arc_write_done(zio_t *zio) if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); - ASSERT(refcount_is_zero(&exists->b_refcnt)); + ASSERT(refcount_is_zero( + &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists, hash_lock); mutex_exit(hash_lock); arc_hdr_destroy(exists); @@ -3859,22 +4440,22 @@ arc_write_done(zio_t *zio) (void *)hdr, (void *)exists); } else { /* Dedup */ - ASSERT(hdr->b_datacnt == 1); - ASSERT(hdr->b_state == arc_anon); + ASSERT(hdr->b_l1hdr.b_datacnt == 1); + ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); } } - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; /* if it's not anon, we are doing a scrub */ - if (!exists && hdr->b_state == arc_anon) + if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; + hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; } - ASSERT(!refcount_is_zero(&hdr->b_refcnt)); + ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); callback->awcb_done(zio, buf, callback->awcb_private); kmem_free(callback, sizeof (arc_write_callback_t)); @@ -3894,12 +4475,13 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, ASSERT(ready != NULL); ASSERT(done != NULL); ASSERT(!HDR_IO_ERROR(hdr)); - ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); - ASSERT(hdr->b_acb == NULL); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT(hdr->b_l1hdr.b_acb == NULL); + ASSERT(hdr->b_l1hdr.b_datacnt > 0); if (l2arc) - hdr->b_flags |= ARC_L2CACHE; + hdr->b_flags |= ARC_FLAG_L2CACHE; if (l2arc_compress) - hdr->b_flags |= ARC_L2COMPRESS; + hdr->b_flags |= ARC_FLAG_L2COMPRESS; callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_physdone = physdone; @@ -4036,11 +4618,50 @@ arc_kstat_update(kstat_t *ksp, int rw) return (0); } +/* + * This function *must* return indices evenly distributed between all + * sublists of the multilist. 
This is needed due to how the ARC eviction
+ * code is laid out; arc_evict_state() assumes ARC buffers are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+unsigned int
+arc_state_multilist_index_func(multilist_t *ml, void *obj)
+{
+	arc_buf_hdr_t *hdr = obj;
+
+	/*
+	 * We rely on b_dva to generate evenly distributed index
+	 * numbers using buf_hash below. So, as an added precaution,
+	 * let's make sure we never add empty buffers to the arc lists.
+	 */
+	ASSERT(!BUF_EMPTY(hdr));
+
+	/*
+	 * The assumption here is that the hash value for a given
+	 * arc_buf_hdr_t will remain constant throughout its lifetime
+	 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
+	 * Thus, we don't need to store the header's sublist index
+	 * on insertion, as this index can be recalculated on removal.
+	 *
+	 * Also, the low-order bits of the hash value are thought to be
+	 * distributed evenly. Otherwise, in the case that the multilist
+	 * has a power of two number of sublists, each sublist's usage
+	 * would not be evenly distributed.
+	 */
+	return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
+	    multilist_get_num_sublists(ml));
+}
+
 void
 arc_init(void)
 {
-	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
-	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
+
+	mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);

	/* Convert seconds to clock ticks */
	zfs_arc_min_prefetch_lifespan = 1 * hz;
@@ -4088,6 +4709,9 @@ arc_init(void)
	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
		arc_meta_limit = zfs_arc_meta_limit;

+	if (zfs_arc_num_sublists_per_state < 1)
+		zfs_arc_num_sublists_per_state = num_online_cpus();
+
	/* if kmem_flags are set, lets try to use less memory */
	if (kmem_debugging())
		arc_c = arc_c / 2;
@@ -4102,33 +4726,46 @@ arc_init(void)
	arc_l2c_only = &ARC_l2c_only;
	arc_size = 0;

-	mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
-
-	list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-	list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-	list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-	list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
-	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
-	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- 
list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); + multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), + zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); arc_anon->arcs_state = ARC_STATE_ANON; arc_mru->arcs_state = ARC_STATE_MRU; @@ -4139,12 +4776,12 @@ arc_init(void) buf_init(); - arc_thread_exit = 0; + arc_reclaim_thread_exit = FALSE; + arc_user_evicts_thread_exit = FALSE; list_create(&arc_prune_list, sizeof (arc_prune_t), offsetof(arc_prune_t, p_node)); arc_eviction_list = NULL; mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, @@ -4159,6 +4796,9 @@ arc_init(void) (void) thread_create(NULL, 0, arc_adapt_thread, NULL, 0, &p0, TS_RUN, minclsyspri); + (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); + arc_dead = FALSE; arc_warm = B_FALSE; @@ -4187,17 +4827,36 @@ arc_fini(void) { arc_prune_t *p; - mutex_enter(&arc_reclaim_thr_lock); #ifdef _KERNEL spl_unregister_shrinker(&arc_shrinker); #endif /* _KERNEL */ - arc_thread_exit = 1; - while (arc_thread_exit != 0) - cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); - mutex_exit(&arc_reclaim_thr_lock); + mutex_enter(&arc_reclaim_lock); + arc_reclaim_thread_exit = TRUE; + /* + * The reclaim thread will set arc_reclaim_thread_exit back to + * FALSE when it is finished 
exiting; we're waiting for that. + */ + while (arc_reclaim_thread_exit) { + cv_signal(&arc_reclaim_thread_cv); + cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); + } + mutex_exit(&arc_reclaim_lock); + + mutex_enter(&arc_user_evicts_lock); + arc_user_evicts_thread_exit = TRUE; + /* + * The user evicts thread will set arc_user_evicts_thread_exit + * to FALSE when it is finished exiting; we're waiting for that. + */ + while (arc_user_evicts_thread_exit) { + cv_signal(&arc_user_evicts_cv); + cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock); + } + mutex_exit(&arc_user_evicts_lock); - arc_flush(NULL); + /* Use TRUE to ensure *all* buffers are evicted */ + arc_flush(NULL, TRUE); arc_dead = TRUE; @@ -4217,29 +4876,27 @@ arc_fini(void) list_destroy(&arc_prune_list); mutex_destroy(&arc_prune_mtx); - mutex_destroy(&arc_eviction_mtx); - mutex_destroy(&arc_reclaim_thr_lock); - cv_destroy(&arc_reclaim_thr_cv); - - list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); - - mutex_destroy(&arc_anon->arcs_mtx); - mutex_destroy(&arc_mru->arcs_mtx); - mutex_destroy(&arc_mru_ghost->arcs_mtx); - mutex_destroy(&arc_mfu->arcs_mtx); - mutex_destroy(&arc_mfu_ghost->arcs_mtx); - mutex_destroy(&arc_l2c_only->arcs_mtx); + mutex_destroy(&arc_reclaim_lock); + cv_destroy(&arc_reclaim_thread_cv); + cv_destroy(&arc_reclaim_waiters_cv); + + mutex_destroy(&arc_user_evicts_lock); + cv_destroy(&arc_user_evicts_cv); + + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); buf_fini(); - ASSERT(arc_loaned_bytes == 0); + ASSERT0(arc_loaned_bytes); } /* @@ -4389,7 +5046,7 @@ arc_fini(void) */ static boolean_t -l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) +l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: @@ -4398,8 +5055,8 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). 
*/ - if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL || - HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) + if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || + HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) return (B_FALSE); return (B_TRUE); @@ -4451,20 +5108,6 @@ l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) return (next); } -static void -l2arc_hdr_stat_add(void) -{ - ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE); - ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); -} - -static void -l2arc_hdr_stat_remove(void) -{ - ARCSTAT_INCR(arcstat_l2_hdr_size, -HDR_SIZE); - ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); -} - /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. @@ -4560,8 +5203,7 @@ l2arc_write_done(zio_t *zio) l2arc_write_callback_t *cb; l2arc_dev_t *dev; list_t *buflist; - arc_buf_hdr_t *head, *ab, *ab_prev; - l2arc_buf_hdr_t *abl2; + arc_buf_hdr_t *head, *hdr, *hdr_prev; kmutex_t *hash_lock; int64_t bytes_dropped = 0; @@ -4571,7 +5213,7 @@ l2arc_write_done(zio_t *zio) ASSERT(dev != NULL); head = cb->l2wcb_head; ASSERT(head != NULL); - buflist = dev->l2ad_buflist; + buflist = &dev->l2ad_buflist; ASSERT(buflist != NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); @@ -4579,57 +5221,88 @@ l2arc_write_done(zio_t *zio) if (zio->io_error != 0) ARCSTAT_BUMP(arcstat_l2_writes_error); - mutex_enter(&l2arc_buflist_mtx); - /* * All writes completed, or an error was hit. */ - for (ab = list_prev(buflist, head); ab; ab = ab_prev) { - ab_prev = list_prev(buflist, ab); - abl2 = ab->b_l2hdr; +top: + mutex_enter(&dev->l2ad_mtx); + for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { + hdr_prev = list_prev(buflist, hdr); + + hash_lock = HDR_LOCK(hdr); /* - * Release the temporary compressed buffer as soon as possible. + * We cannot use mutex_enter or else we can deadlock + * with l2arc_write_buffers (due to swapping the order + * the hash lock and l2ad_mtx are taken). */ - if (abl2->b_compress != ZIO_COMPRESS_OFF) - l2arc_release_cdata_buf(ab); - - hash_lock = HDR_LOCK(ab); if (!mutex_tryenter(hash_lock)) { /* - * This buffer misses out. It may be in a stage - * of eviction. Its ARC_L2_WRITING flag will be - * left set, denying reads to this buffer. + * Missed the hash lock. We must retry so we + * don't leave the ARC_FLAG_L2_WRITING bit set. */ - ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); - continue; + ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); + + /* + * We don't want to rescan the headers we've + * already marked as having been written out, so + * we reinsert the head node so we can pick up + * where we left off. + */ + list_remove(buflist, head); + list_insert_after(buflist, hdr, head); + + mutex_exit(&dev->l2ad_mtx); + + /* + * We wait for the hash lock to become available + * to try and prevent busy waiting, and increase + * the chance we'll be able to acquire the lock + * the next time around. + */ + mutex_enter(hash_lock); + mutex_exit(hash_lock); + goto top; } + /* + * We could not have been moved into the arc_l2c_only + * state while in-flight due to our ARC_FLAG_L2_WRITING + * bit being set. Let's just ensure that's being enforced. + */ + ASSERT(HDR_HAS_L1HDR(hdr)); + + /* + * We may have allocated a buffer for L2ARC compression, + * we must release it to avoid leaking this data. + */ + l2arc_release_cdata_buf(hdr); + if (zio->io_error != 0) { /* * Error - drop L2ARC entry. 
*/ - list_remove(buflist, ab); - ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize); - bytes_dropped += abl2->b_asize; - ab->b_l2hdr = NULL; - kmem_cache_free(l2arc_hdr_cache, abl2); - arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS); - ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + list_remove(buflist, hdr); + hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; + + ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); } /* - * Allow ARC to begin reads to this L2ARC entry. + * Allow ARC to begin reads and ghost list evictions to + * this L2ARC entry. */ - ab->b_flags &= ~ARC_L2_WRITING; + hdr->b_flags &= ~ARC_FLAG_L2_WRITING; mutex_exit(hash_lock); } atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); - kmem_cache_free(hdr_cache, head); - mutex_exit(&l2arc_buflist_mtx); + ASSERT(!HDR_HAS_L1HDR(head)); + kmem_cache_free(hdr_l2only_cache, head); + mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); @@ -4726,35 +5399,37 @@ l2arc_read_done(zio_t *zio) * the data lists. This function returns a locked list, and also returns * the lock pointer. */ -static list_t * -l2arc_list_locked(int list_num, kmutex_t **lock) +static multilist_sublist_t * +l2arc_sublist_lock(int list_num) { - list_t *list = NULL; + multilist_t *ml = NULL; + unsigned int idx; ASSERT(list_num >= 0 && list_num <= 3); switch (list_num) { case 0: - list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; - *lock = &arc_mfu->arcs_mtx; + ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: - list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; - *lock = &arc_mru->arcs_mtx; + ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: - list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; - *lock = &arc_mfu->arcs_mtx; + ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: - list = &arc_mru->arcs_list[ARC_BUFC_DATA]; - *lock = &arc_mru->arcs_mtx; + ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; break; } - ASSERT(!(MUTEX_HELD(*lock))); - mutex_enter(*lock); - return (list); + /* + * Return a randomly-selected sublist. This is acceptable + * because the caller feeds only a little bit of data for each + * call (8MB). Subsequent calls will result in different + * sublists being selected. + */ + idx = multilist_get_random_index(ml); + return (multilist_sublist_lock(ml, idx)); } /* @@ -4767,16 +5442,12 @@ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; - l2arc_buf_hdr_t *abl2; - arc_buf_hdr_t *ab, *ab_prev; + arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; int64_t bytes_evicted = 0; - buflist = dev->l2ad_buflist; - - if (buflist == NULL) - return; + buflist = &dev->l2ad_buflist; if (!all && dev->l2ad_first) { /* @@ -4799,35 +5470,41 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) uint64_t, taddr, boolean_t, all); top: - mutex_enter(&l2arc_buflist_mtx); - for (ab = list_tail(buflist); ab; ab = ab_prev) { - ab_prev = list_prev(buflist, ab); + mutex_enter(&dev->l2ad_mtx); + for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { + hdr_prev = list_prev(buflist, hdr); + + hash_lock = HDR_LOCK(hdr); - hash_lock = HDR_LOCK(ab); + /* + * We cannot use mutex_enter or else we can deadlock + * with l2arc_write_buffers (due to swapping the order + * the hash lock and l2ad_mtx are taken). + */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. 
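
Both l2arc_write_done() and l2arc_evict() above use the same trick: holding the device list lock, mutex_tryenter() the hash lock, and on failure drop everything, briefly enter/exit the hash lock so the retry is not a busy-wait, and start over. A userspace sketch of that pattern (pthread names, hypothetical):

#include <pthread.h>

static void
with_both_locks(pthread_mutex_t *list_lock, pthread_mutex_t *hash_lock)
{
	for (;;) {
		pthread_mutex_lock(list_lock);
		if (pthread_mutex_trylock(hash_lock) == 0)
			break;		/* both held, order respected */
		/*
		 * The established order is hash lock before list lock,
		 * so we may not block on the hash lock while holding
		 * the list lock. Drop it, wait out the current hash
		 * lock holder, and retry.
		 */
		pthread_mutex_unlock(list_lock);
		pthread_mutex_lock(hash_lock);
		pthread_mutex_unlock(hash_lock);
	}
	/* ... guarded work ... */
	pthread_mutex_unlock(hash_lock);
	pthread_mutex_unlock(list_lock);
}
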
 */
			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
-			mutex_exit(&l2arc_buflist_mtx);
+			mutex_exit(&dev->l2ad_mtx);
			mutex_enter(hash_lock);
			mutex_exit(hash_lock);
			goto top;
		}

-		if (HDR_L2_WRITE_HEAD(ab)) {
+		if (HDR_L2_WRITE_HEAD(hdr)) {
			/*
			 * We hit a write head node. Leave it for
			 * l2arc_write_done().
			 */
-			list_remove(buflist, ab);
+			list_remove(buflist, hdr);
			mutex_exit(hash_lock);
			continue;
		}

-		if (!all && ab->b_l2hdr != NULL &&
-		    (ab->b_l2hdr->b_daddr > taddr ||
-		    ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
+		if (!all && HDR_HAS_L2HDR(hdr) &&
+		    (hdr->b_l2hdr.b_daddr > taddr ||
+		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
			/*
			 * We've evicted to the target address,
			 * or the end of the device.
@@ -4836,62 +5513,45 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
			break;
		}

-		if (HDR_FREE_IN_PROGRESS(ab)) {
-			/*
-			 * Already on the path to destruction.
-			 */
-			mutex_exit(hash_lock);
-			continue;
-		}
-
-		if (ab->b_state == arc_l2c_only) {
-			ASSERT(!HDR_L2_READING(ab));
+		ASSERT(HDR_HAS_L2HDR(hdr));
+		if (!HDR_HAS_L1HDR(hdr)) {
+			ASSERT(!HDR_L2_READING(hdr));
			/*
			 * This doesn't exist in the ARC. Destroy.
			 * arc_hdr_destroy() will call list_remove()
			 * and decrement arcstat_l2_size.
			 */
-			arc_change_state(arc_anon, ab, hash_lock);
-			arc_hdr_destroy(ab);
+			arc_change_state(arc_anon, hdr, hash_lock);
+			arc_hdr_destroy(hdr);
		} else {
+			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
+			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
			/*
			 * Invalidate issued or about to be issued
			 * reads, since we may be about to write
			 * over this location.
			 */
-			if (HDR_L2_READING(ab)) {
+			if (HDR_L2_READING(hdr)) {
				ARCSTAT_BUMP(arcstat_l2_evict_reading);
-				ab->b_flags |= ARC_L2_EVICTED;
+				hdr->b_flags |= ARC_FLAG_L2_EVICTED;
			}

			/*
			 * Tell ARC this no longer exists in L2ARC.
			 */
-			if (ab->b_l2hdr != NULL) {
-				abl2 = ab->b_l2hdr;
-				ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
-				bytes_evicted += abl2->b_asize;
-				ab->b_l2hdr = NULL;
-				/*
-				 * We are destroying l2hdr, so ensure that
-				 * its compressed buffer, if any, is not leaked.
-				 */
-				ASSERT(abl2->b_tmp_cdata == NULL);
-				kmem_cache_free(l2arc_hdr_cache, abl2);
-				arc_space_return(L2HDR_SIZE, ARC_SPACE_L2HDRS);
-				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
-			}
-			list_remove(buflist, ab);
+			ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
+			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+			hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
+			list_remove(buflist, hdr);

-			/*
-			 * This may have been leftover after a
-			 * failed write.
-			 */
-			ab->b_flags &= ~ARC_L2_WRITING;
+			/* Ensure this header has finished being written */
+			ASSERT(!HDR_L2_WRITING(hdr));
+			ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
		}
		mutex_exit(hash_lock);
	}
-	mutex_exit(&l2arc_buflist_mtx);
+	mutex_exit(&dev->l2ad_mtx);

	vdev_space_update(dev->l2ad_vdev, -bytes_evicted, 0, 0);
	dev->l2ad_evict = taddr;
@@ -4900,7 +5560,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
 /*
  * Find and write ARC buffers to the L2ARC device.
  *
- * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
+ * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
 * for reading until they have completed writing.
 * The headroom_boost is an in-out parameter used to maintain headroom boost
 * state between calls to this function.
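
l2arc_sublist_lock() above replaces the old global list locks: each feed cycle locks just one randomly chosen sublist of a multilist. A stand-in showing the shape (types hypothetical; the real code uses multilist_get_random_index() rather than rand()):

#include <pthread.h>
#include <stdlib.h>

typedef struct sublist {
	pthread_mutex_t sl_lock;
	/* the per-sublist list head would live here */
} sublist_t;

typedef struct multilist {
	sublist_t	*ml_sublists;
	unsigned	ml_num;
} multilist_t;

static sublist_t *
sublist_lock_random(multilist_t *ml)
{
	sublist_t *mls = &ml->ml_sublists[rand() % ml->ml_num];

	pthread_mutex_lock(&mls->sl_lock);
	return (mls);
}

Since each call feeds only on the order of 8 MB, a random pick per cycle is enough to touch all sublists over time without ever serializing against every inserter at once.
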
@@ -4912,12 +5572,10 @@ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, boolean_t *headroom_boost) { - arc_buf_hdr_t *ab, *ab_prev, *head; - list_t *list; + arc_buf_hdr_t *hdr, *hdr_prev, *head; uint64_t write_asize, write_psize, write_sz, headroom, buf_compress_minsz; - void *buf_data; - kmutex_t *list_lock = NULL; + abd_t *buf_data; boolean_t full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; @@ -4933,8 +5591,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, pio = NULL; write_sz = write_asize = write_psize = 0; full = B_FALSE; - head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); - head->b_flags |= ARC_L2_WRITE_HEAD; + head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); + head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; + head->b_flags |= ARC_FLAG_HAS_L2HDR; /* * We will want to try to compress buffers that are at least 2x the @@ -4945,12 +5604,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* * Copy buffers for L2ARC writing. */ - mutex_enter(&l2arc_buflist_mtx); for (try = 0; try <= 3; try++) { + multilist_sublist_t *mls = l2arc_sublist_lock(try); uint64_t passed_sz = 0; - list = l2arc_list_locked(try, &list_lock); - /* * L2ARC fast warmup. * @@ -4958,25 +5615,24 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * head of the ARC lists rather than the tail. */ if (arc_warm == B_FALSE) - ab = list_head(list); + hdr = multilist_sublist_head(mls); else - ab = list_tail(list); + hdr = multilist_sublist_tail(mls); headroom = target_sz * l2arc_headroom; if (do_headroom_boost) headroom = (headroom * l2arc_headroom_boost) / 100; - for (; ab; ab = ab_prev) { - l2arc_buf_hdr_t *l2hdr; + for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; uint64_t buf_sz; if (arc_warm == B_FALSE) - ab_prev = list_next(list, ab); + hdr_prev = multilist_sublist_next(mls, hdr); else - ab_prev = list_prev(list, ab); + hdr_prev = multilist_sublist_prev(mls, hdr); - hash_lock = HDR_LOCK(ab); + hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { /* * Skip this buffer rather than waiting. @@ -4984,7 +5640,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, continue; } - passed_sz += ab->b_size; + passed_sz += hdr->b_size; if (passed_sz > headroom) { /* * Searched too far. @@ -4993,12 +5649,12 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, break; } - if (!l2arc_write_eligible(guid, ab)) { + if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); continue; } - if ((write_sz + ab->b_size) > target_sz) { + if ((write_sz + hdr->b_size) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; @@ -5010,7 +5666,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * l2arc_write_done() can find where the * write buffers begin without searching. */ - list_insert_head(dev->l2ad_buflist, head); + mutex_enter(&dev->l2ad_mtx); + list_insert_head(&dev->l2ad_buflist, head); + mutex_exit(&dev->l2ad_mtx); cb = kmem_alloc(sizeof (l2arc_write_callback_t), KM_SLEEP); @@ -5023,44 +5681,42 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* * Create and add a new L2ARC header. 
 */
- l2hdr = kmem_cache_alloc(l2arc_hdr_cache, KM_SLEEP);
- l2hdr->b_dev = dev;
- l2hdr->b_daddr = 0;
- arc_space_consume(L2HDR_SIZE, ARC_SPACE_L2HDRS);
-
- ab->b_flags |= ARC_L2_WRITING;
-
+ hdr->b_l2hdr.b_dev = dev;
+ arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
+ hdr->b_flags |= ARC_FLAG_L2_WRITING;
 /*
 * Temporarily stash the data buffer in b_tmp_cdata.
 * The subsequent write step will pick it up from
- * there. This is because can't access ab->b_buf
+ * there. This is because we can't access b_l1hdr.b_buf
 * without holding the hash_lock, which we in turn
 * can't access without holding the ARC list locks
 * (which we want to avoid during compression/writing)
 */
- l2hdr->b_compress = ZIO_COMPRESS_OFF;
- l2hdr->b_asize = ab->b_size;
- l2hdr->b_tmp_cdata = ab->b_buf->b_data;
- l2hdr->b_hits = 0;
+ HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
+ hdr->b_l2hdr.b_asize = hdr->b_size;
+ hdr->b_l2hdr.b_hits = 0;
+ hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
- buf_sz = ab->b_size;
- ab->b_l2hdr = l2hdr;
+ buf_sz = hdr->b_size;
+ hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
- list_insert_head(dev->l2ad_buflist, ab);
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_head(&dev->l2ad_buflist, hdr);
+ mutex_exit(&dev->l2ad_mtx);
 /*
 * Compute and store the buffer cksum before
 * writing. On debug the cksum is verified first.
 */
- arc_cksum_verify(ab->b_buf);
- arc_cksum_compute(ab->b_buf, B_TRUE);
+ arc_cksum_verify(hdr->b_l1hdr.b_buf);
+ arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
 mutex_exit(hash_lock);
 write_sz += buf_sz;
 }
- mutex_exit(list_lock);
+ multilist_sublist_unlock(mls);
 if (full == B_TRUE)
 break;
@@ -5069,33 +5725,42 @@
 /* No buffers selected for writing? */
 if (pio == NULL) {
 ASSERT0(write_sz);
- mutex_exit(&l2arc_buflist_mtx);
- kmem_cache_free(hdr_cache, head);
+ ASSERT(!HDR_HAS_L1HDR(head));
+ kmem_cache_free(hdr_l2only_cache, head);
 return (0);
 }
+ mutex_enter(&dev->l2ad_mtx);
+
 /*
 * Now start writing the buffers. We're starting at the write head
 * and work backwards, retracing the course of the buffer selector
 * loop above.
 */
- for (ab = list_prev(dev->l2ad_buflist, head); ab;
- ab = list_prev(dev->l2ad_buflist, ab)) {
- l2arc_buf_hdr_t *l2hdr;
+ for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
+ hdr = list_prev(&dev->l2ad_buflist, hdr)) {
 uint64_t buf_sz;
+ /*
+ * We rely on the L1 portion of the header below, so
+ * it's invalid for this header to have been evicted out
+ * of the ghost cache, prior to being written out. The
+ * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
 /*
 * We shouldn't need to lock the buffer here, since we flagged
- * it as ARC_L2_WRITING in the previous step, but we must take
- * care to only access its L2 cache parameters. In particular,
- * ab->b_buf may be invalid by now due to ARC eviction.
+ * it as ARC_FLAG_L2_WRITING in the previous step, but we must
+ * take care to only access its L2 cache parameters. In
+ * particular, hdr->b_l1hdr.b_buf may be invalid by now due to
+ * ARC eviction.
 */
- l2hdr = ab->b_l2hdr;
- l2hdr->b_daddr = dev->l2ad_hand;
+ hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
- if (!l2arc_nocompress && (ab->b_flags & ARC_L2COMPRESS) &&
- l2hdr->b_asize >= buf_compress_minsz) {
- if (l2arc_compress_buf(l2hdr)) {
+ if ((!l2arc_nocompress && HDR_L2COMPRESS(hdr)) &&
+ hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
+ if (l2arc_compress_buf(hdr)) {
 /*
 * If compression succeeded, enable headroom
 * boost on the next scan cycle.
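The selection loop above now drains one locked sublist of the ARC's multilists per pass rather than a single globally locked list. The body of l2arc_sublist_lock() is not part of these hunks; the following is only a plausible sketch of its shape, and it assumes that the per-state arcs_list arrays have become multilist_t, that a multilist_get_random_index() helper exists, and that multilist_sublist_lock(ml, idx) locks and returns sublist idx — all assumptions here.

/*
 * Sketch: map the feed thread's pass number (0..3) to one of the
 * four candidate lists, then lock a randomly chosen sublist of it.
 */
static multilist_sublist_t *
l2arc_sublist_lock(int try)
{
	multilist_t *ml = NULL;
	unsigned int idx;

	ASSERT(try >= 0 && try <= 3);

	switch (try) {
	case 0:
		ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
		break;
	case 1:
		ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
		break;
	case 2:
		ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
		break;
	case 3:
		ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
		break;
	}

	/*
	 * A random starting sublist spreads successive feed-thread
	 * passes across sublists instead of always contending on the
	 * same sublist lock.
	 */
	idx = multilist_get_random_index(ml);
	return (multilist_sublist_lock(ml, idx));
}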
@@ -5108,16 +5773,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * Pick up the buffer data we had previously stashed away * (and now potentially also compressed). */ - buf_data = l2hdr->b_tmp_cdata; - buf_sz = l2hdr->b_asize; - - /* - * If the data has not been compressed, then clear b_tmp_cdata - * to make sure that it points only to a temporary compression - * buffer. - */ - if (!L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress)) - l2hdr->b_tmp_cdata = NULL; + buf_data = hdr->b_l1hdr.b_tmp_cdata; + buf_sz = hdr->b_l2hdr.b_asize; /* Compression may have squashed the buffer to zero length. */ if (buf_sz != 0) { @@ -5142,7 +5799,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, } } - mutex_exit(&l2arc_buflist_mtx); + mutex_exit(&dev->l2ad_mtx); ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); @@ -5170,7 +5827,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, /* * Compresses an L2ARC buffer. - * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its + * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its * size in l2hdr->b_asize. This routine tries to compress the data and * depending on the compression result there are three possible outcomes: * *) The buffer was incompressible. The original l2hdr contents were left @@ -5188,31 +5845,43 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, * buffer was incompressible). */ static boolean_t -l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) +l2arc_compress_buf(arc_buf_hdr_t *hdr) { - void *cdata; + abd_t *cdata; + void *ddata; size_t csize, len, rounded; + l2arc_buf_hdr_t *l2hdr; - ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF); - ASSERT(l2hdr->b_tmp_cdata != NULL); + ASSERT(HDR_HAS_L2HDR(hdr)); + + l2hdr = &hdr->b_l2hdr; + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF); + ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); len = l2hdr->b_asize; - cdata = zio_data_buf_alloc(len); - csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata, - cdata, l2hdr->b_asize); + cdata = abd_alloc_linear(len); + ASSERT3P(cdata, !=, NULL); + ddata = abd_borrow_buf_copy(hdr->b_l1hdr.b_tmp_cdata, l2hdr->b_asize); + + csize = zio_compress_data(ZIO_COMPRESS_LZ4, ddata, + ABD_TO_BUF(cdata), l2hdr->b_asize); + + abd_return_buf(hdr->b_l1hdr.b_tmp_cdata, ddata, l2hdr->b_asize); rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE); if (rounded > csize) { - bzero((char *)cdata + csize, rounded - csize); + abd_zero_off(cdata, rounded - csize, csize); csize = rounded; } if (csize == 0) { /* zero block, indicate that there's nothing to write */ - zio_data_buf_free(cdata, len); - l2hdr->b_compress = ZIO_COMPRESS_EMPTY; + abd_free(cdata, len); + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_EMPTY); l2hdr->b_asize = 0; - l2hdr->b_tmp_cdata = NULL; + hdr->b_l1hdr.b_tmp_cdata = NULL; ARCSTAT_BUMP(arcstat_l2_compress_zeros); return (B_TRUE); } else if (csize > 0 && csize < len) { @@ -5220,9 +5889,9 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) * Compression succeeded, we'll keep the cdata around for * writing and release it afterwards. */ - l2hdr->b_compress = ZIO_COMPRESS_LZ4; + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_LZ4); l2hdr->b_asize = csize; - l2hdr->b_tmp_cdata = cdata; + hdr->b_l1hdr.b_tmp_cdata = cdata; ARCSTAT_BUMP(arcstat_l2_compress_successes); return (B_TRUE); } else { @@ -5230,7 +5899,7 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) * Compression failed, release the compressed buffer. 
* l2hdr will be left unmodified. */ - zio_data_buf_free(cdata, len); + abd_free(cdata, len); ARCSTAT_BUMP(arcstat_l2_compress_failures); return (B_FALSE); } @@ -5270,10 +5939,11 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) * need to fill its io_data after we're done restoring the * buffer's contents. */ - ASSERT(hdr->b_buf != NULL); - bzero(hdr->b_buf->b_data, hdr->b_size); - zio->io_data = zio->io_orig_data = hdr->b_buf->b_data; + ASSERT(hdr->b_l1hdr.b_buf != NULL); + abd_zero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); + zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; } else { + void *ddata; ASSERT(zio->io_data != NULL); /* * We copy the compressed data from the start of the arc buffer @@ -5287,10 +5957,15 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) */ csize = zio->io_size; cdata = zio_data_buf_alloc(csize); - bcopy(zio->io_data, cdata, csize); - if (zio_decompress_data(c, cdata, zio->io_data, csize, + + abd_copy_to_buf(cdata, zio->io_data, csize); + ddata = abd_borrow_buf(zio->io_data, hdr->b_size); + + if (zio_decompress_data(c, cdata, ddata, csize, hdr->b_size) != 0) zio->io_error = SET_ERROR(EIO); + + abd_return_buf_copy(zio->io_data, ddata, hdr->b_size); zio_data_buf_free(cdata, csize); } @@ -5305,22 +5980,38 @@ l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) * done, we can dispose of it. */ static void -l2arc_release_cdata_buf(arc_buf_hdr_t *ab) +l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) { - l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr; + enum zio_compress comp = HDR_GET_COMPRESS(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp)); - ASSERT(L2ARC_IS_VALID_COMPRESS(l2hdr->b_compress)); - if (l2hdr->b_compress != ZIO_COMPRESS_EMPTY) { + if (comp == ZIO_COMPRESS_OFF) { + /* + * In this case, b_tmp_cdata points to the same buffer + * as the arc_buf_t's b_data field. We don't want to + * free it, since the arc_buf_t will handle that. + */ + hdr->b_l1hdr.b_tmp_cdata = NULL; + } else if (comp == ZIO_COMPRESS_EMPTY) { + /* + * In this case, b_tmp_cdata was compressed to an empty + * buffer, thus there's nothing to free and b_tmp_cdata + * should have been set to NULL in l2arc_write_buffers(). + */ + ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); + } else { /* * If the data was compressed, then we've allocated a * temporary buffer for it, so now we need to release it. */ - ASSERT(l2hdr->b_tmp_cdata != NULL); - zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size); - l2hdr->b_tmp_cdata = NULL; - } else { - ASSERT(l2hdr->b_tmp_cdata == NULL); + ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); + abd_free(hdr->b_l1hdr.b_tmp_cdata, + hdr->b_size); + hdr->b_l1hdr.b_tmp_cdata = NULL; } + } /* @@ -5462,13 +6153,13 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) adddev->l2ad_writing = B_FALSE; list_link_init(&adddev->l2ad_node); + mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* * This is a list of all ARC buffers that are still valid on the * device. */ - adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); - list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), - offsetof(arc_buf_hdr_t, b_l2node)); + list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); @@ -5514,8 +6205,8 @@ l2arc_remove_vdev(vdev_t *vd) * Clear all buflists and ARC references. L2ARC device flush. 
*/ l2arc_evict(remdev, 0, B_TRUE); - list_destroy(remdev->l2ad_buflist); - kmem_free(remdev->l2ad_buflist, sizeof (list_t)); + list_destroy(&remdev->l2ad_buflist); + mutex_destroy(&remdev->l2ad_mtx); kmem_free(remdev, sizeof (l2arc_dev_t)); } @@ -5530,7 +6221,6 @@ l2arc_init(void) mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; @@ -5555,7 +6245,6 @@ l2arc_fini(void) mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_dev_mtx); - mutex_destroy(&l2arc_buflist_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); @@ -5605,8 +6294,11 @@ MODULE_PARM_DESC(zfs_arc_max, "Max arc size"); module_param(zfs_arc_meta_limit, ulong, 0644); MODULE_PARM_DESC(zfs_arc_meta_limit, "Meta limit for arc size"); +module_param(zfs_arc_meta_min, ulong, 0644); +MODULE_PARM_DESC(zfs_arc_meta_min, "Min arc metadata"); + module_param(zfs_arc_meta_prune, int, 0644); -MODULE_PARM_DESC(zfs_arc_meta_prune, "Bytes of meta data to prune"); +MODULE_PARM_DESC(zfs_arc_meta_prune, "Meta objects to scan for prune"); module_param(zfs_arc_grow_retry, int, 0644); MODULE_PARM_DESC(zfs_arc_grow_retry, "Seconds before growing arc size"); diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c index 7c8f932f5cf3..166eb33e4d07 100644 --- a/module/zfs/bpobj.c +++ b/module/zfs/bpobj.c @@ -126,7 +126,7 @@ bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) ASSERT3U(offset, >=, dbuf->db_offset); ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - objarray = dbuf->db_data; + objarray = ABD_TO_BUF(dbuf->db_data); bpobj_free(os, objarray[blkoff], tx); } if (dbuf) { @@ -170,7 +170,7 @@ bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT; bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0); bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1); - bpo->bpo_phys = bpo->bpo_dbuf->db_data; + bpo->bpo_phys = ABD_TO_BUF(bpo->bpo_dbuf->db_data); return (0); } @@ -234,7 +234,7 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, ASSERT3U(offset, >=, dbuf->db_offset); ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - bparray = dbuf->db_data; + bparray = ABD_TO_BUF(dbuf->db_data); bp = &bparray[blkoff]; err = func(arg, bp, tx); if (err) @@ -294,7 +294,7 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, ASSERT3U(offset, >=, dbuf->db_offset); ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); - objarray = dbuf->db_data; + objarray = ABD_TO_BUF(dbuf->db_data); err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]); if (err) break; @@ -433,7 +433,8 @@ bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) numsubsub * sizeof (subobj)); dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), - numsubsub * sizeof (subobj), subdb->db_data, tx); + numsubsub * sizeof (subobj), + ABD_TO_BUF(subdb->db_data), tx); dmu_buf_rele(subdb, FTAG); bpo->bpo_phys->bpo_num_subobjs += numsubsub; @@ -501,7 +502,7 @@ bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx) } dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); - bparray = bpo->bpo_cached_dbuf->db_data; + bparray = ABD_TO_BUF(bpo->bpo_cached_dbuf->db_data); bparray[blkoff] = stored_bp; 
 dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c
index d6ea9d7c6451..acd22f7d8f70 100644
--- a/module/zfs/bptree.c
+++ b/module/zfs/bptree.c
@@ -74,7 +74,7 @@ bptree_alloc(objset_t *os, dmu_tx_t *tx)
 */
 VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
 dmu_buf_will_dirty(db, tx);
- bt = db->db_data;
+ bt = ABD_TO_BUF(db->db_data);
 bt->bt_begin = 0;
 bt->bt_end = 0;
 bt->bt_bytes = 0;
@@ -92,7 +92,7 @@ bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
 bptree_phys_t *bt;
 VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
- bt = db->db_data;
+ bt = ABD_TO_BUF(db->db_data);
 ASSERT3U(bt->bt_begin, ==, bt->bt_end);
 ASSERT0(bt->bt_bytes);
 ASSERT0(bt->bt_comp);
@@ -110,7 +110,7 @@ bptree_is_empty(objset_t *os, uint64_t obj)
 boolean_t rv;
 VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
- bt = db->db_data;
+ bt = ABD_TO_BUF(db->db_data);
 rv = (bt->bt_begin == bt->bt_end);
 dmu_buf_rele(db, FTAG);
 return (rv);
@@ -132,7 +132,7 @@ bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
 ASSERT(dmu_tx_is_syncing(tx));
 VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
- bt = db->db_data;
+ bt = ABD_TO_BUF(db->db_data);
 bte = kmem_zalloc(sizeof (*bte), KM_SLEEP);
 bte->be_birth_txg = birth_txg;
@@ -203,7 +203,7 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
 if (free)
 dmu_buf_will_dirty(db, tx);
- ba.ba_phys = db->db_data;
+ ba.ba_phys = ABD_TO_BUF(db->db_data);
 ba.ba_free = free;
 ba.ba_func = func;
 ba.ba_arg = arg;
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index f10a04d112a8..6df9d94307ff 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -255,8 +255,9 @@ dbuf_evict_user(dmu_buf_impl_t *db)
 if (db->db_level != 0 || db->db_evict_func == NULL)
 return;
- if (db->db_user_data_ptr_ptr)
- *db->db_user_data_ptr_ptr = db->db.db_data;
+ /*
+ * XXX ABD: db_data might already be freed at this point, so
+ * the user pointer update is disabled until that is resolved:
+ *
+ * if (db->db_user_data_ptr_ptr)
+ * *db->db_user_data_ptr_ptr = ABD_TO_BUF(db->db.db_data);
+ */
 db->db_evict_func(&db->db, db->db_user_ptr);
 db->db_user_ptr = NULL;
 db->db_user_data_ptr_ptr = NULL;
@@ -363,6 +364,21 @@ dbuf_fini(void)
 */
#ifdef ZFS_DEBUG
+static int
+checkzero(const void *p, uint64_t size, void *private)
+{
+ int i;
+ const uint64_t *x = p;
+ uint64_t *t = private;
+ for (i = 0; i < size >> 3; i++) {
+ if (x[i] != 0) {
+ *t = x[i];
+ return (1);
+ }
+ }
+ return (0);
+}
+
 static void
 dbuf_verify(dmu_buf_impl_t *db)
 {
@@ -446,7 +462,8 @@ dbuf_verify(dmu_buf_impl_t *db)
 */
 if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 ASSERT3P(db->db_blkptr, ==,
- ((blkptr_t *)db->db_parent->db.db_data +
+ ((blkptr_t *)
+ ABD_TO_BUF(db->db_parent->db.db_data) +
 db->db_blkid % epb));
 }
 }
@@ -461,12 +478,10 @@ dbuf_verify(dmu_buf_impl_t *db)
 * data when we evict this buffer.
 */
 if (db->db_dirtycnt == 0) {
- ASSERTV(uint64_t *buf = db->db.db_data);
- int i;
-
- for (i = 0; i < db->db.db_size >> 3; i++) {
- ASSERT(buf[i] == 0);
- }
+ uint64_t tmp = 0;
+ abd_iterate_rfunc(db->db.db_data, db->db.db_size,
+ checkzero, &tmp);
+ ASSERT(tmp == 0);
 }
 }
 DB_DNODE_EXIT(db);
@@ -479,7 +494,7 @@ dbuf_update_data(dmu_buf_impl_t *db)
 ASSERT(MUTEX_HELD(&db->db_mtx));
 if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
 ASSERT(!refcount_is_zero(&db->db_holds));
- *db->db_user_data_ptr_ptr = db->db.db_data;
+ *db->db_user_data_ptr_ptr = ABD_TO_BUF(db->db.db_data);
 }
 }
@@ -517,7 +532,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
 mutex_exit(&db->db_mtx);
 abuf = arc_loan_buf(spa, blksz);
- bcopy(db->db.db_data, abuf->b_data, blksz);
+ abd_copy(abuf->b_data, db->db.db_data, blksz);
 } else {
 abuf = db->db_buf;
 arc_loan_inuse_buf(abuf, db);
@@ -554,7 +569,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 if (db->db_level == 0 && db->db_freed_in_flight) {
 /* we were freed in flight; disregard any error */
 arc_release(buf, db);
- bzero(buf->b_data, db->db.db_size);
+ abd_zero(buf->b_data, db->db.db_size);
 arc_buf_freeze(buf);
 db->db_freed_in_flight = FALSE;
 dbuf_set_data(db, buf);
@@ -577,7 +592,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 {
 dnode_t *dn;
 zbookmark_phys_t zb;
- uint32_t aflags = ARC_NOWAIT;
+ arc_flags_t aflags = ARC_FLAG_NOWAIT;
 int err;
 DB_DNODE_ENTER(db);
@@ -593,12 +608,13 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
 ASSERT3U(bonuslen, <=, db->db.db_size);
- db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+ db->db.db_data = abd_alloc_linear(DN_MAX_BONUSLEN);
 arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 if (bonuslen < DN_MAX_BONUSLEN)
- bzero(db->db.db_data, DN_MAX_BONUSLEN);
+ abd_zero(db->db.db_data, DN_MAX_BONUSLEN);
 if (bonuslen)
- bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+ abd_copy_from_buf(db->db.db_data,
+ DN_BONUS(dn->dn_phys), bonuslen);
 DB_DNODE_EXIT(db);
 dbuf_update_data(db);
 db->db_state = DB_CACHED;
@@ -619,7 +635,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 DB_DNODE_EXIT(db);
 dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
 db->db.db_size, db, type));
- bzero(db->db.db_data, db->db.db_size);
+ abd_zero(db->db.db_data, db->db.db_size);
 db->db_state = DB_CACHED;
 *flags |= DB_RF_CACHED;
 mutex_exit(&db->db_mtx);
@@ -632,9 +648,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 mutex_exit(&db->db_mtx);
 if (DBUF_IS_L2CACHEABLE(db))
- aflags |= ARC_L2CACHE;
+ aflags |= ARC_FLAG_L2CACHE;
 if (DBUF_IS_L2COMPRESSIBLE(db))
- aflags |= ARC_L2COMPRESS;
+ aflags |= ARC_FLAG_L2COMPRESS;
 SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
 db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
@@ -646,7 +662,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
 (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
 &aflags, &zb);
- if (aflags & ARC_CACHED)
+ if (aflags & ARC_FLAG_CACHED)
 *flags |= DB_RF_CACHED;
 return (SET_ERROR(err));
@@ -794,7 +810,8 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 if (dr == NULL ||
 (dr->dt.dl.dr_data !=
- ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
+ ((db->db_blkid == DMU_BONUS_BLKID) ?
ABD_TO_BUF(db->db.db_data) : + db->db_buf))) return; /* @@ -809,14 +826,15 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); - bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); + abd_copy_to_buf(dr->dt.dl.dr_data, db->db.db_data, + DN_MAX_BONUSLEN); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); - bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); + abd_copy(dr->dt.dl.dr_data->b_data, db->db.db_data, size); } else { dbuf_set_data(db, NULL); } @@ -957,7 +975,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); - bzero(db->db.db_data, db->db.db_size); + abd_zero(db->db.db_data, db->db.db_size); arc_buf_freeze(db->db_buf); } @@ -1034,10 +1052,10 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) /* copy old block data to the new block */ obuf = db->db_buf; - bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); + abd_copy(buf->b_data, obuf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) - bzero((uint8_t *)buf->b_data + osize, size - osize); + abd_zero_off(buf->b_data, size - osize, osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); @@ -1206,7 +1224,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (db->db_state != DB_NOFILL) { if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_fix_old_data(db, tx->tx_txg); - data_old = db->db.db_data; + data_old = ABD_TO_BUF(db->db.db_data); } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { /* * Release the data buffer from the cache so @@ -1493,7 +1511,7 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? 
*/ - bzero(db->db.db_data, db->db.db_size); + abd_zero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; } db->db_state = DB_CACHED; @@ -1563,7 +1581,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); - bcopy(buf->b_data, db->db.db_data, db->db.db_size); + abd_copy(db->db.db_data, buf->b_data, db->db.db_size); VERIFY(arc_buf_remove_ref(buf, db)); xuio_stat_wbuf_copied(); return; @@ -1629,7 +1647,7 @@ dbuf_clear(dmu_buf_impl_t *db) if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); if (db->db_blkid == DMU_BONUS_BLKID) { - zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + abd_free(db->db.db_data, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db.db_data = NULL; @@ -1737,7 +1755,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, *parentp = NULL; return (err); } - *bpp = ((blkptr_t *)(*parentp)->db.db_data) + + *bpp = ((blkptr_t *)ABD_TO_BUF((*parentp)->db.db_data)) + (blkid & ((1ULL << epbs) - 1)); return (0); } else { @@ -1931,7 +1949,8 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) { if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + arc_flags_t aflags = + ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, @@ -2023,8 +2042,9 @@ __dbuf_hold_impl(struct dbuf_hold_impl_data *dh) dbuf_set_data(dh->dh_db, arc_buf_alloc(dh->dh_dn->dn_objset->os_spa, dh->dh_db->db.db_size, dh->dh_db, dh->dh_type)); - bcopy(dh->dh_dr->dt.dl.dr_data->b_data, - dh->dh_db->db.db_data, dh->dh_db->db.db_size); + abd_copy(dh->dh_db->db.db_data, + dh->dh_dr->dt.dl.dr_data->b_data, + dh->dh_db->db.db_size); } } @@ -2430,7 +2450,7 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) mutex_enter(&db->db_mtx); db->db_parent = parent; } - db->db_blkptr = (blkptr_t *)parent->db.db_data + + db->db_blkptr = (blkptr_t *)ABD_TO_BUF(parent->db.db_data) + (db->db_blkid & ((1ULL << epbs) - 1)); DBUF_VERIFY(db); } @@ -2515,7 +2535,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(db->db.db_data == NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ - ASSERT(db->db.db_data != dr->dt.dl.dr_data); + ASSERT(!ABD_IS_LINEAR(db->db.db_data) || + ABD_TO_BUF(db->db.db_data) != dr->dt.dl.dr_data); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } @@ -2545,7 +2566,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); DB_DNODE_EXIT(db); - if (*datap != db->db.db_data) { + if (*datap != ABD_TO_BUF(db->db.db_data)) { zio_buf_free(*datap, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } @@ -2606,7 +2627,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) int blksz = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); *datap = arc_buf_alloc(os->os_spa, blksz, db, type); - bcopy(db->db.db_data, (*datap)->b_data, blksz); + abd_copy((*datap)->b_data, db->db.db_data, blksz); } db->db_data_pending = dr; @@ -2705,7 +2726,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) mutex_exit(&dn->dn_mtx); if (dn->dn_type == DMU_OT_DNODE) { - dnode_phys_t *dnp = db->db.db_data; + dnode_phys_t *dnp = 
ABD_TO_BUF(db->db.db_data);
 for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
 i--, dnp++) {
 if (dnp->dn_type != DMU_OT_NONE)
@@ -2719,7 +2740,7 @@
 }
 }
 } else {
- blkptr_t *ibp = db->db.db_data;
+ blkptr_t *ibp = ABD_TO_BUF(db->db.db_data);
 ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
 if (BP_IS_HOLE(ibp))
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 18557ffb5c1f..41edceae4058 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -37,9 +37,6 @@
 #include <sys/zio_compress.h>
 #include <sys/dsl_scan.h>
-static kmem_cache_t *ddt_cache;
-static kmem_cache_t *ddt_entry_cache;
-
 /*
 * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
 */
@@ -517,7 +514,7 @@ ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
 {
 ddt_histogram_t *ddh_total;
- ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+ ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_PUSHPAGE);
 ddt_get_dedup_histogram(spa, ddh_total);
 ddt_histogram_stat(dds_total, ddh_total);
 kmem_free(ddh_total, sizeof (ddt_histogram_t));
@@ -664,29 +661,12 @@ ddt_exit(ddt_t *ddt)
 mutex_exit(&ddt->ddt_lock);
 }
-void
-ddt_init(void)
-{
- ddt_cache = kmem_cache_create("ddt_cache",
- sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
- ddt_entry_cache = kmem_cache_create("ddt_entry_cache",
- sizeof (ddt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-}
-
-void
-ddt_fini(void)
-{
- kmem_cache_destroy(ddt_entry_cache);
- kmem_cache_destroy(ddt_cache);
-}
-
 static ddt_entry_t *
 ddt_alloc(const ddt_key_t *ddk)
 {
 ddt_entry_t *dde;
- dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP);
- bzero(dde, sizeof (ddt_entry_t));
+ dde = kmem_zalloc(sizeof (ddt_entry_t), KM_PUSHPAGE);
 cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
 dde->dde_key = *ddk;
@@ -705,11 +685,10 @@ ddt_free(ddt_entry_t *dde)
 ASSERT(dde->dde_lead_zio[p] == NULL);
 if (dde->dde_repair_data != NULL)
- zio_buf_free(dde->dde_repair_data,
- DDK_GET_PSIZE(&dde->dde_key));
+ abd_free(dde->dde_repair_data, DDK_GET_PSIZE(&dde->dde_key));
 cv_destroy(&dde->dde_cv);
- kmem_cache_free(ddt_entry_cache, dde);
+ kmem_free(dde, sizeof (*dde));
 }
 void
@@ -834,8 +813,7 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
 {
 ddt_t *ddt;
- ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP);
- bzero(ddt, sizeof (ddt_t));
+ ddt = kmem_zalloc(sizeof (*ddt), KM_PUSHPAGE);
 mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
 avl_create(&ddt->ddt_tree, ddt_entry_compare,
@@ -857,7 +835,7 @@ ddt_table_free(ddt_t *ddt)
 avl_destroy(&ddt->ddt_tree);
 avl_destroy(&ddt->ddt_repair_tree);
 mutex_destroy(&ddt->ddt_lock);
- kmem_cache_free(ddt_cache, ddt);
+ kmem_free(ddt, sizeof (*ddt));
 }
 void
@@ -937,20 +915,20 @@ ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
 return (B_TRUE);
 ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
- dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP);
+ dde = kmem_alloc(sizeof (ddt_entry_t), KM_PUSHPAGE);
 ddt_key_fill(&(dde->dde_key), bp);
 for (type = 0; type < DDT_TYPES; type++) {
 for (class = 0; class <= max_class; class++) {
 if (ddt_object_lookup(ddt, type, class, dde) == 0) {
- kmem_cache_free(ddt_entry_cache, dde);
+ kmem_free(dde, sizeof (ddt_entry_t));
 return (B_TRUE);
 }
 }
 }
- kmem_cache_free(ddt_entry_cache, dde);
+ kmem_free(dde, sizeof (ddt_entry_t));
 return (B_FALSE);
 }
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 1501ae8046ad..084e0b970bd2 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -793,7 +793,7 @@ dmu_read(objset_t *os,
uint64_t object, uint64_t offset, uint64_t size, bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - bcopy((char *)db->db_data + bufoff, buf, tocpy); + abd_copy_to_buf_off(buf, db->db_data, tocpy, bufoff); offset += tocpy; size -= tocpy; @@ -835,7 +835,7 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, else dmu_buf_will_dirty(db, tx); - (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); + abd_copy_from_buf_off(db->db_data, buf, tocpy, bufoff); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); @@ -960,6 +960,7 @@ dmu_xuio_fini(xuio_t *xuio) * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } * and increase priv->next by 1. */ +/* TODO: abd handle xuio */ int dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) { @@ -1044,7 +1045,8 @@ xuio_stat_wbuf_nocopy() * return value is the number of bytes successfully copied to arg_buf. */ static int -dmu_req_copy(void *arg_buf, int size, struct request *req, size_t req_offset) +dmu_req_copy(abd_t *db_data, int db_offset, int size, struct request *req, + size_t req_offset) { struct bio_vec bv, *bvp; struct req_iterator iter; @@ -1078,9 +1080,11 @@ dmu_req_copy(void *arg_buf, int size, struct request *req, size_t req_offset) ASSERT3P(bv_buf, !=, NULL); if (rq_data_dir(req) == WRITE) - memcpy(arg_buf + offset, bv_buf, tocpy); + abd_copy_from_buf_off(db_data, bv_buf, tocpy, + db_offset + offset); else - memcpy(bv_buf, arg_buf + offset, tocpy); + abd_copy_to_buf_off(bv_buf, db_data, tocpy, + db_offset + offset); offset += tocpy; } @@ -1118,7 +1122,7 @@ dmu_read_req(objset_t *os, uint64_t object, struct request *req) if (tocpy == 0) break; - didcpy = dmu_req_copy(db->db_data + bufoff, tocpy, req, + didcpy = dmu_req_copy(db->db_data, bufoff, tocpy, req, req_offset); if (didcpy < tocpy) @@ -1173,7 +1177,7 @@ dmu_write_req(objset_t *os, uint64_t object, struct request *req, dmu_tx_t *tx) else dmu_buf_will_dirty(db, tx); - didcpy = dmu_req_copy(db->db_data + bufoff, tocpy, req, + didcpy = dmu_req_copy(db->db_data, bufoff, tocpy, req, req_offset); if (tocpy == db->db_size) @@ -1236,8 +1240,8 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) else XUIOSTAT_BUMP(xuiostat_rbuf_copied); } else { - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_READ, uio); + err = abd_uiomove_off(db->db_data, tocpy, UIO_READ, + uio, bufoff); } if (err) break; @@ -1285,8 +1289,8 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) * to lock the pages in memory, so that uiomove won't * block. 
*/ - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_WRITE, uio); + err = abd_uiomove_off(db->db_data, tocpy, UIO_WRITE, uio, + bufoff); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); @@ -1399,6 +1403,7 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, } else { objset_t *os; uint64_t object; + void *tmp_buf; DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); @@ -1407,7 +1412,13 @@ dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, DB_DNODE_EXIT(dbuf); dbuf_rele(db, FTAG); - dmu_write(os, object, offset, blksz, buf->b_data, tx); + + tmp_buf = abd_borrow_buf_copy(buf->b_data, blksz); + + dmu_write(os, object, offset, blksz, tmp_buf, tx); + + abd_return_buf(buf->b_data, tmp_buf, blksz); + dmu_return_arcbuf(buf); XUIOSTAT_BUMP(xuiostat_wbuf_copied); } diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c index 30fabbb07957..32838bbb63db 100644 --- a/module/zfs/dmu_diff.c +++ b/module/zfs/dmu_diff.c @@ -129,7 +129,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } else if (zb->zb_level == 0) { dnode_phys_t *blk; arc_buf_t *abuf; - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; int blksz = BP_GET_LSIZE(bp); int i; @@ -138,7 +138,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, &aflags, zb) != 0) return (SET_ERROR(EIO)); - blk = abuf->b_data; + blk = ABD_TO_BUF(abuf->b_data); for (i = 0; i < blksz >> DNODE_SHIFT; i++) { uint64_t dnobj = (zb->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index f438ca62a11f..a9afe5772d83 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -284,15 +284,15 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, os->os_spa = spa; os->os_rootbp = bp; if (!BP_IS_HOLE(os->os_rootbp)) { - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); if (DMU_OS_IS_L2CACHEABLE(os)) - aflags |= ARC_L2CACHE; + aflags |= ARC_FLAG_L2CACHE; if (DMU_OS_IS_L2COMPRESSIBLE(os)) - aflags |= ARC_L2COMPRESS; + aflags |= ARC_FLAG_L2COMPRESS; dprintf_bp(os->os_rootbp, "reading %s", ""); err = arc_read(NULL, spa, os->os_rootbp, @@ -312,22 +312,22 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, arc_buf_t *buf = arc_buf_alloc(spa, sizeof (objset_phys_t), &os->os_phys_buf, ARC_BUFC_METADATA); - bzero(buf->b_data, sizeof (objset_phys_t)); - bcopy(os->os_phys_buf->b_data, buf->b_data, + abd_zero(buf->b_data, sizeof (objset_phys_t)); + abd_copy(buf->b_data, os->os_phys_buf->b_data, arc_buf_size(os->os_phys_buf)); (void) arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; } - os->os_phys = os->os_phys_buf->b_data; + os->os_phys = ABD_TO_BUF(os->os_phys_buf->b_data); os->os_flags = os->os_phys->os_flags; } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? 
sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; os->os_phys_buf = arc_buf_alloc(spa, size, &os->os_phys_buf, ARC_BUFC_METADATA); - os->os_phys = os->os_phys_buf->b_data; + os->os_phys = ABD_TO_BUF(os->os_phys_buf->b_data); bzero(os->os_phys, size); } @@ -1219,7 +1219,7 @@ dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) void *data; if (db->db_dirtycnt == 0) - return (db->db.db_data); /* Nothing is changing */ + return (ABD_TO_BUF(db->db.db_data)); /* Nothing is changing */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg == tx->tx_txg) @@ -1235,7 +1235,7 @@ dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) if (dn->dn_bonuslen == 0 && dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) - data = dr->dt.dl.dr_data->b_data; + data = ABD_TO_BUF(dr->dt.dl.dr_data->b_data); else data = dr->dt.dl.dr_data; @@ -1284,7 +1284,7 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) FTAG, (dmu_buf_t **)&db); ASSERT(error == 0); mutex_enter(&db->db_mtx); - data = (before) ? db->db.db_data : + data = (before) ? ABD_TO_BUF(db->db.db_data) : dmu_objset_userquota_find_data(db, tx); have_spill = B_TRUE; } else { diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 1f61368c5d65..4e4a0c379db1 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -446,6 +446,16 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) +static int +fillbadblock(void *p, uint64_t size, void *private) +{ + int i; + uint64_t *x = p; + for (i = 0; i < size >> 3; i++) + x[i] = 0x2f5baddb10cULL; + return (0); +} + /* ARGSUSED */ static int backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, @@ -481,7 +491,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, dnode_phys_t *blk; int i; int blksz = BP_GET_LSIZE(bp); - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, @@ -489,7 +499,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, &aflags, zb) != 0) return (SET_ERROR(EIO)); - blk = abuf->b_data; + blk = ABD_TO_BUF(abuf->b_data); for (i = 0; i < blksz >> DNODE_SHIFT; i++) { uint64_t dnobj = (zb->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; @@ -499,7 +509,7 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, } (void) arc_buf_remove_ref(abuf, &abuf); } else if (type == DMU_OT_SA) { - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); @@ -508,7 +518,8 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, &aflags, zb) != 0) return (SET_ERROR(EIO)); - err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); + err = dump_spill(dsp, zb->zb_object, blksz, + ABD_TO_BUF(abuf->b_data)); (void) arc_buf_remove_ref(abuf, &abuf); } else if (backup_do_embed(dsp, bp)) { /* it's an embedded level-0 block of a regular object */ @@ -516,9 +527,10 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, err = dump_write_embedded(dsp, zb->zb_object, zb->zb_blkid * blksz, blksz, bp); } else { /* it's a level-0 block of a regular object */ - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); + void *buf; ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT0(zb->zb_level); @@ -526,21 +538,21 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) { if (zfs_send_corrupt_data) { - uint64_t *ptr; /* Send a block filled with 0x"zfs badd bloc" */ abuf = arc_buf_alloc(spa, blksz, &abuf, ARC_BUFC_DATA); - for (ptr = abuf->b_data; - (char *)ptr < (char *)abuf->b_data + blksz; - ptr++) - *ptr = 0x2f5baddb10cULL; + + abd_iterate_wfunc(abuf->b_data, blksz, + fillbadblock, NULL); } else { return (SET_ERROR(EIO)); } } + buf = abd_borrow_buf_copy(abuf->b_data, blksz); err = dump_write(dsp, type, zb->zb_object, zb->zb_blkid * blksz, - blksz, bp, abuf->b_data); + blksz, bp, buf); + abd_return_buf(abuf->b_data, buf, blksz); (void) arc_buf_remove_ref(abuf, &abuf); } @@ -1431,12 +1443,12 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); - bcopy(data, db->db_data, drro->drr_bonuslen); + bcopy(data, ABD_TO_BUF(db->db_data), drro->drr_bonuslen); if (ra->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drro->drr_bonustype); - dmu_ot_byteswap[byteswap].ob_func(db->db_data, - drro->drr_bonuslen); + dmu_ot_byteswap[byteswap].ob_func( + ABD_TO_BUF(db->db_data), drro->drr_bonuslen); } dmu_buf_rele(db, FTAG); } @@ -1476,7 +1488,7 @@ restore_write(struct restorearg *ra, objset_t *os, dmu_tx_t *tx; dmu_buf_t *bonus; arc_buf_t *abuf; - void *data; + void *data, *lbuf; int err; if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || @@ -1490,9 +1502,11 @@ restore_write(struct restorearg *ra, objset_t *os, return (SET_ERROR(EINVAL)); abuf = dmu_request_arcbuf(bonus, drrw->drr_length); + lbuf = abd_borrow_buf(abuf->b_data, drrw->drr_length); - data = restore_read(ra, drrw->drr_length, abuf->b_data); + data = restore_read(ra, drrw->drr_length, lbuf); if (data == NULL) { + abd_return_buf(abuf->b_data, lbuf, drrw->drr_length); dmu_return_arcbuf(abuf); dmu_buf_rele(bonus, FTAG); return (ra->err); @@ -1504,6 +1518,7 @@ restore_write(struct restorearg *ra, objset_t *os, drrw->drr_offset, drrw->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { + abd_return_buf(abuf->b_data, lbuf, drrw->drr_length); dmu_return_arcbuf(abuf); dmu_buf_rele(bonus, FTAG); dmu_tx_abort(tx); @@ -1514,6 +1529,7 @@ restore_write(struct restorearg *ra, objset_t *os, DMU_OT_BYTESWAP(drrw->drr_type); dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length); } + abd_return_buf_copy(abuf->b_data, lbuf, drrw->drr_length); dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); dmu_tx_commit(tx); dmu_buf_rele(bonus, FTAG); @@ -1538,6 +1554,7 @@ restore_write_byref(struct restorearg *ra, objset_t *os, avl_index_t where; objset_t *ref_os = NULL; dmu_buf_t *dbp; + void *buf; if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) return (SET_ERROR(EINVAL)); @@ -1572,8 +1589,10 @@ restore_write_byref(struct restorearg *ra, objset_t *os, dmu_tx_abort(tx); return (err); } + buf = abd_borrow_buf_copy(dbp->db_data, drrwbr->drr_length); dmu_write(os, drrwbr->drr_object, - drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); + drrwbr->drr_offset, drrwbr->drr_length, buf, tx); + abd_return_buf(dbp->db_data, buf, drrwbr->drr_length); dmu_buf_rele(dbp, FTAG); dmu_tx_commit(tx); return (0); @@ -1662,7 +1681,7 @@ restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) if (db_spill->db_size < drrs->drr_length) VERIFY(0 == dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); - bcopy(data, db_spill->db_data, drrs->drr_length); + abd_copy_from_buf(db_spill->db_data, data, drrs->drr_length); 
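The receive functions above use both pairings of the borrow/return ABD API that this series introduces: borrow without a copy-in when the caller overwrites the whole buffer (returning with a copy-out), and copy-in on borrow with a plain return when the borrowed data is only read or has already been pushed back by other means. A condensed sketch of that discipline, with fill_data() and consume_data() as hypothetical stand-in helpers:

/*
 * Producer: the temporary buffer starts out undefined, gets filled,
 * and the result is copied back into the abd on return.
 */
static void
overwrite_abd(abd_t *abd, size_t len)
{
	void *buf = abd_borrow_buf(abd, len);
	fill_data(buf, len);
	abd_return_buf_copy(abd, buf, len);
}

/*
 * Consumer: the abd's contents are copied into the temporary buffer
 * up front; nothing needs to be copied back on return.
 */
static void
inspect_abd(abd_t *abd, size_t len)
{
	void *buf = abd_borrow_buf_copy(abd, len);
	consume_data(buf, len);
	abd_return_buf(abd, buf, len);
}

Mixing the pairings is how restore_write() above handles its error paths: the stream was read into the borrowed buffer, so the failure paths return it without a copy, while the success path uses abd_return_buf_copy() so the arc buffer sees the (possibly byteswapped) bytes.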
 dmu_buf_rele(db, FTAG);
 dmu_buf_rele(db_spill, FTAG);
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index b5c1ec758f8b..2989c1283da7 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -177,7 +177,7 @@ static void
 traverse_prefetch_metadata(traverse_data_t *td,
 const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
- uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
+ arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
 return;
@@ -278,7 +278,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 }
 if (BP_GET_LEVEL(bp) > 0) {
- uint32_t flags = ARC_WAIT;
+ arc_flags_t flags = ARC_FLAG_WAIT;
 int32_t i;
 int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
 zbookmark_phys_t *czb;
@@ -295,7 +295,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 zb->zb_level - 1,
 zb->zb_blkid * epb + i);
 traverse_prefetch_metadata(td,
- &((blkptr_t *)buf->b_data)[i], czb);
+ &((blkptr_t *)ABD_TO_BUF(buf->b_data))[i], czb);
 }
 /* recursively visitbp() blocks below this */
@@ -304,7 +304,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
 zb->zb_level - 1,
 zb->zb_blkid * epb + i);
 err = traverse_visitbp(td, dnp,
- &((blkptr_t *)buf->b_data)[i], czb);
+ &((blkptr_t *)ABD_TO_BUF(buf->b_data))[i], czb);
 if (err != 0)
 break;
 }
@@ -312,7 +312,7 @@
 kmem_free(czb, sizeof (zbookmark_phys_t));
 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
- uint32_t flags = ARC_WAIT;
+ arc_flags_t flags = ARC_FLAG_WAIT;
 int32_t i;
 int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
@@ -320,7 +320,7 @@
 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
 if (err != 0)
 goto post;
- dnp = buf->b_data;
+ dnp = ABD_TO_BUF(buf->b_data);
 for (i = 0; i < epb; i++) {
 prefetch_dnode_metadata(td, &dnp[i], zb->zb_objset,
@@ -335,7 +335,7 @@
 break;
 }
 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
- uint32_t flags = ARC_WAIT;
+ arc_flags_t flags = ARC_FLAG_WAIT;
 objset_phys_t *osp;
 dnode_phys_t *dnp;
@@ -344,7 +344,7 @@
 if (err != 0)
 goto post;
- osp = buf->b_data;
+ osp = ABD_TO_BUF(buf->b_data);
 dnp = &osp->os_meta_dnode;
 prefetch_dnode_metadata(td, dnp, zb->zb_objset,
 DMU_META_DNODE_OBJECT);
@@ -451,7 +451,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 prefetch_data_t *pfd = arg;
- uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
+ arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 ASSERT(pfd->pd_blks_fetched >= 0);
 if (pfd->pd_cancel)
@@ -542,7 +542,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
 /* See comment on ZIL traversal in dsl_scan_visitds.
*/ if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; @@ -552,7 +552,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, if (err != 0) return (err); - osp = buf->b_data; + osp = ABD_TO_BUF(buf->b_data); traverse_zil(td, &osp->os_zil_header); (void) arc_buf_remove_ref(buf, &buf); } diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index cdf5a6d0fcfa..4b828e5bbc0a 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -548,7 +548,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) break; } - bp = dbuf->db.db_data; + bp = ABD_TO_BUF(dbuf->db.db_data); bp += blkoff; for (i = 0; i < tochk; i++) { diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index ef74621a0f6c..eb8de04470d4 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -235,7 +235,7 @@ dnode_verify(dnode_t *dn) ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); if (dn->dn_dbuf != NULL) { ASSERT3P(dn->dn_phys, ==, - (dnode_phys_t *)dn->dn_dbuf->db.db_data + + (dnode_phys_t *)ABD_TO_BUF(dn->dn_dbuf->db.db_data) + (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); } if (drop_struct_lock) @@ -1089,7 +1089,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, dnh = &children_dnodes->dnc_children[idx]; zrl_add(&dnh->dnh_zrlock); if ((dn = dnh->dnh_dnode) == NULL) { - dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; + dnode_phys_t *phys = + (dnode_phys_t *)ABD_TO_BUF(db->db.db_data) + idx; dnode_t *winner; dn = dnode_create(os, phys, db, object, dnh); @@ -1509,16 +1510,13 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) head = len; if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, FTAG, &db) == 0) { - caddr_t data; - /* don't dirty if it isn't on disk and isn't dirty */ if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - data = db->db.db_data; - bzero(data + blkoff, head); + abd_zero_off(db->db.db_data, head, blkoff); } dbuf_rele(db, FTAG); } @@ -1553,7 +1551,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) rw_exit(&dn->dn_struct_rwlock); dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - bzero(db->db.db_data, tail); + abd_zero(db->db.db_data, tail); } dbuf_rele(db, FTAG); } @@ -1780,7 +1778,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, dbuf_rele(db, FTAG); return (error); } - data = db->db.db_data; + data = ABD_TO_BUF(db->db.db_data); } diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c index 1825e983551c..efd52c8f6f1f 100644 --- a/module/zfs/dnode_sync.c +++ b/module/zfs/dnode_sync.c @@ -69,7 +69,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) ASSERT(db->db.db_data); ASSERT(arc_released(db->db_buf)); ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); - bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, + bcopy(dn->dn_phys->dn_blkptr, ABD_TO_BUF(db->db.db_data), sizeof (blkptr_t) * nblkptr); arc_buf_freeze(db->db_buf); } @@ -98,7 +98,8 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) child->db_parent = db; dbuf_add_ref(db, child); if (db->db.db_data) - child->db_blkptr = (blkptr_t *)db->db.db_data + i; + child->db_blkptr = + (blkptr_t *)ABD_TO_BUF(db->db.db_data) + i; else child->db_blkptr = NULL; dprintf_dbuf_bp(child, 
child->db_blkptr, @@ -159,6 +160,21 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) } #ifdef ZFS_DEBUG +static int +checkzero(const void *p, uint64_t size, void *private) +{ + int i; + const uint64_t *x = p; + uint64_t *t = private; + for (i = 0; i < size >> 3; i++) { + if (x[i] != 0) { + *t = x[i]; + return (1); + } + } + return (0); +} + static void free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) { @@ -181,10 +197,9 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) ASSERT(db->db_blkptr != NULL); for (i = off; i < off+num; i++) { - uint64_t *buf; + abd_t *buf; dmu_buf_impl_t *child; dbuf_dirty_record_t *dr; - int j; ASSERT(db->db_level == 1); @@ -203,13 +218,14 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) /* data_old better be zeroed */ if (dr) { + uint64_t tmp = 0; buf = dr->dt.dl.dr_data->b_data; - for (j = 0; j < child->db.db_size >> 3; j++) { - if (buf[j] != 0) { - panic("freed data not zero: " - "child=%p i=%d off=%d num=%d\n", - (void *)child, i, off, num); - } + abd_iterate_rfunc(buf, child->db.db_size, + checkzero, &tmp); + if (tmp) { + panic("freed data not zero: " + "child=%p i=%d off=%d num=%d\n", + (void *)child, i, off, num); } } @@ -221,12 +237,13 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) buf = child->db.db_data; if (buf != NULL && child->db_state != DB_FILL && child->db_last_dirty == NULL) { - for (j = 0; j < child->db.db_size >> 3; j++) { - if (buf[j] != 0) { - panic("freed data not zero: " - "child=%p i=%d off=%d num=%d\n", - (void *)child, i, off, num); - } + uint64_t tmp = 0; + abd_iterate_rfunc(buf, child->db.db_size, + checkzero, &tmp); + if (tmp) { + panic("freed data not zero: " + "child=%p i=%d off=%d num=%d\n", + (void *)child, i, off, num); } } mutex_exit(&child->db_mtx); @@ -257,7 +274,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); dbuf_release_bp(db); - bp = db->db.db_data; + bp = (blkptr_t *)ABD_TO_BUF(db->db.db_data); DB_DNODE_ENTER(db); dn = DB_DNODE(db); @@ -296,13 +313,14 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, } /* If this whole block is free, free ourself too. 
*/ - for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { + bp = ABD_TO_BUF(db->db.db_data); + for (i = 0; i < 1 << epbs; i++, bp++) { if (!BP_IS_HOLE(bp)) break; } if (i == 1 << epbs) { /* didn't find any non-holes */ - bzero(db->db.db_data, db->db.db_size); + bzero(ABD_TO_BUF(db->db.db_data), db->db.db_size); free_blocks(dn, db->db_blkptr, 1, tx); } else { /* diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 79cb6a3a25e5..322809e8ecd3 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -290,7 +290,7 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds) FTAG, &headdbuf); if (err != 0) return (err); - headphys = headdbuf->db_data; + headphys = ABD_TO_BUF(headdbuf->db_data); err = zap_value_search(dp->dp_meta_objset, headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); dmu_buf_rele(headdbuf, FTAG); @@ -368,7 +368,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); ds->ds_dbuf = dbuf; ds->ds_object = dsobj; - ds->ds_phys = dbuf->db_data; + ds->ds_phys = ABD_TO_BUF(dbuf->db_data); list_link_init(&ds->ds_synced_link); mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); @@ -459,7 +459,7 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, } } ASSERT3P(ds->ds_dbuf, ==, dbuf); - ASSERT3P(ds->ds_phys, ==, dbuf->db_data); + ASSERT3P(ds->ds_phys, ==, ABD_TO_BUF(dbuf->db_data)); ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 || spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); @@ -658,7 +658,7 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; + dsphys = ABD_TO_BUF(dbuf->db_data); bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = dd->dd_object; dsphys->ds_flags = flags; @@ -1064,7 +1064,7 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); - dsphys = dbuf->db_data; + dsphys = ABD_TO_BUF(dbuf->db_data); bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = ds->ds_dir->dd_object; dsphys->ds_fsid_guid = unique_create(); diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 8a4362ff9c1a..6a34f4e7a01e 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -111,7 +111,7 @@ dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) } dl->dl_oldfmt = B_FALSE; - dl->dl_phys = dl->dl_dbuf->db_data; + dl->dl_phys = ABD_TO_BUF(dl->dl_dbuf->db_data); dl->dl_havetree = B_FALSE; } @@ -482,7 +482,7 @@ dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) zap_cursor_fini(&zc); VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); - dlp = bonus->db_data; + dlp = ABD_TO_BUF(bonus->db_data); dmu_buf_will_dirty(bonus, tx); bzero(dlp, sizeof (*dlp)); dmu_buf_rele(bonus, FTAG); diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index b94b68e15774..b61b3a67ff41 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -101,7 +101,7 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_object = ddobj; dd->dd_dbuf = dbuf; dd->dd_pool = dp; - dd->dd_phys = dbuf->db_data; + dd->dd_phys = ABD_TO_BUF(dbuf->db_data); mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&dd->dd_prop_cbs, sizeof 
(dsl_prop_cb_record_t), @@ -148,7 +148,7 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus); if (err != 0) goto errout; - origin_phys = origin_bonus->db_data; + origin_phys = ABD_TO_BUF(origin_bonus->db_data); dd->dd_origin_txg = origin_phys->ds_creation_txg; dmu_buf_rele(origin_bonus, FTAG); @@ -405,7 +405,7 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, } VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); - ddphys = dbuf->db_data; + ddphys = ABD_TO_BUF(dbuf->db_data); ddphys->dd_creation_time = gethrestime_sec(); if (pds) diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index b54c03bc33fe..bee9f4cab392 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -316,7 +316,14 @@ dsl_pool_close(dsl_pool_t *dp) txg_list_destroy(&dp->dp_sync_tasks); txg_list_destroy(&dp->dp_dirty_dirs); - arc_flush(dp->dp_spa); + /* + * We can't set retry to TRUE since we're explicitly specifying + * a spa to flush. This is good enough; any missed buffers for + * this spa won't cause trouble, and they'll eventually fall + * out of the ARC just like any other unused buffer. + */ + arc_flush(dp->dp_spa, FALSE); + txg_fini(dp); dsl_scan_fini(dp); rrw_destroy(&dp->dp_config_rwlock); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 8b166bcc68eb..7938ef2b5524 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -571,7 +571,7 @@ dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, uint64_t objset, uint64_t object, uint64_t blkid) { zbookmark_phys_t czb; - uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; + arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; if (zfs_no_scrub_prefetch) return; @@ -636,7 +636,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, int err; if (BP_GET_LEVEL(bp) > 0) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; @@ -648,11 +648,13 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, scn->scn_phys.scn_errors++; return (err); } - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { + cbp = ABD_TO_BUF(buf->b_data); + for (i = 0; i < epb; i++, cbp++) { dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, zb->zb_object, zb->zb_blkid * epb + i); } - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { + cbp = ABD_TO_BUF(buf->b_data); + for (i = 0; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, @@ -663,7 +665,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, } (void) arc_buf_remove_ref(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; int i, j; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; @@ -675,21 +677,23 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, scn->scn_phys.scn_errors++; return (err); } - for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { + cdnp = ABD_TO_BUF(buf->b_data); + for (i = 0; i < epb; i++, cdnp++) { for (j = 0; j < cdnp->dn_nblkptr; j++) { blkptr_t *cbp = &cdnp->dn_blkptr[j]; dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, zb->zb_blkid * epb + i, j); } } - for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { + cdnp = ABD_TO_BUF(buf->b_data); + for (i = 0; i < epb; i++, cdnp++) { dsl_scan_visitdnode(scn, ds, ostype, cdnp, zb->zb_blkid * epb + i, tx); } 
(void) arc_buf_remove_ref(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - uint32_t flags = ARC_WAIT; + arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; @@ -700,7 +704,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, return (err); } - osp = buf->b_data; + osp = ABD_TO_BUF(buf->b_data); dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx); @@ -1717,7 +1721,7 @@ dsl_scan_scrub_done(zio_t *zio) { spa_t *spa = zio->io_spa; - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -1801,7 +1805,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, if (needs_io && !zfs_no_scrub_io) { vdev_t *rvd = spa->spa_root_vdev; uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; - void *data = zio_data_buf_alloc(size); + abd_t *data = abd_alloc_linear(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= maxinflight) diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c new file mode 100644 index 000000000000..7f48297f2503 --- /dev/null +++ b/module/zfs/multilist.c @@ -0,0 +1,372 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + */ + +#include +#include + +/* needed for spa_get_random() */ +#include + +/* + * Given an object on the list, return a pointer to the + * multilist_node_t structure it contains. + */ +static multilist_node_t * +multilist_d2l(multilist_t *ml, void *obj) +{ + return ((multilist_node_t *)((char *)obj + ml->ml_offset)); } + +/* + * Initialize a new multilist using the parameters specified. + * + * - 'size' denotes the size of the structure containing the + * multilist_node_t. + * - 'offset' denotes the byte offset of the multilist_node_t within + * the structure that contains it. + * - 'num' specifies the number of internal sublists to create. + * - 'index_func' is used to determine which sublist to insert into + * when the multilist_insert() function is called, as well as which + * sublist to remove from when multilist_remove() is called. The + * requirements this function must meet are the following: + * + * - It must always return the same value when called on the same + * object (to ensure the object is removed from the list it was + * inserted into). + * + * - It must return a value in the range [0, number of sublists). + * The multilist_get_num_sublists() function may be used to + * determine the number of sublists in the multilist. + * + * Also, in order to reduce internal contention between the sublists + * during insertion and removal, this function should choose evenly + * among all available sublists when inserting. This isn't a hard + * requirement, but a general rule of thumb in order to garner the + * best multi-threaded performance out of the data structure.
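+ *
+ * As a purely illustrative sketch (this helper is hypothetical, not
+ * part of this patch), an index_func that spreads objects across the
+ * sublists by hashing their address could look like:
+ *
+ *	static unsigned int
+ *	obj_index_func(multilist_t *ml, void *obj)
+ *	{
+ *		return (((uintptr_t)obj >> 6) %
+ *		    multilist_get_num_sublists(ml));
+ *	}
+ *
+ * Discarding the low (allocation-aligned) bits before taking the
+ * modulus keeps the distribution roughly even while still returning
+ * a stable value in [0, number of sublists) for any given object.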
+ */ +void +multilist_create(multilist_t *ml, size_t size, size_t offset, unsigned int num, + multilist_sublist_index_func_t *index_func) +{ + int i; + + ASSERT3P(ml, !=, NULL); + ASSERT3U(size, >, 0); + ASSERT3U(size, >=, offset + sizeof (multilist_node_t)); + ASSERT3U(num, >, 0); + ASSERT3P(index_func, !=, NULL); + + ml->ml_offset = offset; + ml->ml_num_sublists = num; + ml->ml_index_func = index_func; + + ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) * + ml->ml_num_sublists, KM_SLEEP); + + ASSERT3P(ml->ml_sublists, !=, NULL); + + for (i = 0; i < ml->ml_num_sublists; i++) { + multilist_sublist_t *mls = &ml->ml_sublists[i]; + mutex_init(&mls->mls_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&mls->mls_list, size, offset); + } +} + +/* + * Destroy the given multilist object, and free up any memory it holds. + */ +void +multilist_destroy(multilist_t *ml) +{ + int i; + + ASSERT(multilist_is_empty(ml)); + + for (i = 0; i < ml->ml_num_sublists; i++) { + multilist_sublist_t *mls = &ml->ml_sublists[i]; + + ASSERT(list_is_empty(&mls->mls_list)); + + list_destroy(&mls->mls_list); + mutex_destroy(&mls->mls_lock); + } + + ASSERT3P(ml->ml_sublists, !=, NULL); + kmem_free(ml->ml_sublists, + sizeof (multilist_sublist_t) * ml->ml_num_sublists); + + ml->ml_num_sublists = 0; + ml->ml_offset = 0; +} + +/* + * Insert the given object into the multilist. + * + * This function will insert the object specified into the sublist + * determined using the function given at multilist creation time. + * + * The sublist locks are automatically acquired if not already held, to + * ensure consistency when inserting and removing from multiple threads. + */ +void +multilist_insert(multilist_t *ml, void *obj) +{ + unsigned int sublist_idx = ml->ml_index_func(ml, obj); + multilist_sublist_t *mls; + boolean_t need_lock; + + DTRACE_PROBE3(multilist__insert, multilist_t *, ml, + unsigned int, sublist_idx, void *, obj); + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + + mls = &ml->ml_sublists[sublist_idx]; + + /* + * Note: Callers may already hold the sublist lock by calling + * multilist_sublist_lock(). Here we rely on MUTEX_HELD() + * returning TRUE if and only if the current thread holds the + * lock. While it's a little ugly to make the lock recursive in + * this way, it works and allows the calling code to be much + * simpler -- otherwise it would have to pass around a flag + * indicating that it already has the lock. + */ + need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + ASSERT(!multilist_link_active(multilist_d2l(ml, obj))); + + multilist_sublist_insert_head(mls, obj); + + if (need_lock) + mutex_exit(&mls->mls_lock); +} + +/* + * Remove the given object from the multilist. + * + * This function will remove the object specified from the sublist + * determined using the function given at multilist creation time. + * + * The necessary sublist locks are automatically acquired, to ensure + * consistency when inserting and removing from multiple threads. + */ +void +multilist_remove(multilist_t *ml, void *obj) +{ + unsigned int sublist_idx = ml->ml_index_func(ml, obj); + multilist_sublist_t *mls; + boolean_t need_lock; + + DTRACE_PROBE3(multilist__remove, multilist_t *, ml, + unsigned int, sublist_idx, void *, obj); + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + + mls = &ml->ml_sublists[sublist_idx]; + /* See comment in multilist_insert(). 
*/ + need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + ASSERT(multilist_link_active(multilist_d2l(ml, obj))); + + multilist_sublist_remove(mls, obj); + + if (need_lock) + mutex_exit(&mls->mls_lock); +} + +/* + * Check to see if this multilist object is empty. + * + * This will return TRUE if it finds all of the sublists of this + * multilist to be empty, and FALSE otherwise. Each sublist lock will be + * automatically acquired as necessary. + * + * If concurrent insertions and removals are occurring, the semantics + * of this function become a little fuzzy. Instead of locking all + * sublists for the entire call time of the function, each sublist is + * only locked as it is individually checked for emptiness. Thus, it's + * possible for this function to return TRUE with non-empty sublists at + * the time the function returns. This would be due to another thread + * inserting into a given sublist, after that specific sublist was + * checked and deemed empty, but before all sublists have been checked. + */ +int +multilist_is_empty(multilist_t *ml) +{ + int i; + + for (i = 0; i < ml->ml_num_sublists; i++) { + multilist_sublist_t *mls = &ml->ml_sublists[i]; + /* See comment in multilist_insert(). */ + boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + if (!list_is_empty(&mls->mls_list)) { + if (need_lock) + mutex_exit(&mls->mls_lock); + + return (FALSE); + } + + if (need_lock) + mutex_exit(&mls->mls_lock); + } + + return (TRUE); +} + +/* Return the number of sublists composing this multilist */ +unsigned int +multilist_get_num_sublists(multilist_t *ml) +{ + return (ml->ml_num_sublists); } + +/* Return a randomly selected, valid sublist index for this multilist */ +unsigned int +multilist_get_random_index(multilist_t *ml) +{ + return (spa_get_random(ml->ml_num_sublists)); } + +/* Lock and return the sublist specified at the given index */ +multilist_sublist_t * +multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) +{ + multilist_sublist_t *mls; + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + mls = &ml->ml_sublists[sublist_idx]; + mutex_enter(&mls->mls_lock); + + return (mls); } + +void +multilist_sublist_unlock(multilist_sublist_t *mls) +{ + mutex_exit(&mls->mls_lock); +} + +/* + * We're allowing any object to be inserted into this specific sublist, + * but this can lead to trouble if multilist_remove() is called to + * remove this object. Specifically, if calling ml_index_func on this + * object returns a sublist index different from the one passed as a + * parameter here, any call to multilist_remove() with this newly + * inserted object is undefined! (the call to multilist_remove() will + * remove the object from a list that doesn't contain it) + */ +void +multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_head(&mls->mls_list, obj); +} + +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_tail(&mls->mls_list, obj); +} + +/* + * Move the object one element forward in the list. + * + * This function will move the given object forward in the list (towards + * the head) by one object. So, in essence, it will swap its position in + * the list with its "prev" pointer.
If the given object is already at the + * head of the list, it cannot be moved forward any more than it already + * is, so no action is taken. + * + * NOTE: This function **must not** remove any object from the list other + * than the object given as the parameter. This is relied upon in + * arc_evict_state_impl(). + */ +void +multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj) +{ + void *prev = list_prev(&mls->mls_list, obj); + + ASSERT(MUTEX_HELD(&mls->mls_lock)); + ASSERT(!list_is_empty(&mls->mls_list)); + + /* 'obj' must be at the head of the list, nothing to do */ + if (prev == NULL) + return; + + list_remove(&mls->mls_list, obj); + list_insert_before(&mls->mls_list, prev, obj); +} + +void +multilist_sublist_remove(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_remove(&mls->mls_list, obj); +} + +void * +multilist_sublist_head(multilist_sublist_t *mls) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_head(&mls->mls_list)); +} + +void * +multilist_sublist_tail(multilist_sublist_t *mls) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_tail(&mls->mls_list)); +} + +void * +multilist_sublist_next(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_next(&mls->mls_list, obj)); +} + +void * +multilist_sublist_prev(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_prev(&mls->mls_list, obj)); +} + +void +multilist_link_init(multilist_node_t *link) +{ + list_link_init(link); +} + +int +multilist_link_active(multilist_node_t *link) +{ + return (list_link_active(link)); +} diff --git a/module/zfs/sa.c b/module/zfs/sa.c index 9063d1dae449..60a4c586fb9f 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -724,8 +724,9 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, } /* setup starting pointers to lay down data */ - data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize); - sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data; + data_start = + (void *)((uintptr_t)ABD_TO_BUF(hdl->sa_bonus->db_data) + hdrsize); + sahdr = (sa_hdr_phys_t *)ABD_TO_BUF(hdl->sa_bonus->db_data); buftype = SA_BONUS; attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, @@ -753,7 +754,9 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, hash = -1ULL; len_idx = 0; - sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data; + sahdr = (sa_hdr_phys_t *) + ABD_TO_BUF(hdl->sa_spill->db_data); + sahdr->sa_magic = SA_MAGIC; data_start = (void *)((uintptr_t)sahdr + spillhdrsize); @@ -1676,7 +1679,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, if (dn->dn_bonuslen != 0) { bonus_data_size = hdl->sa_bonus->db_size; old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); - bcopy(hdl->sa_bonus->db_data, old_data[0], + abd_copy_to_buf(old_data[0], hdl->sa_bonus->db_data, hdl->sa_bonus->db_size); bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; } else { @@ -1689,7 +1692,7 @@ sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, if ((error = sa_get_spill(hdl)) == 0) { spill_data_size = hdl->sa_spill->db_size; old_data[1] = zio_buf_alloc(spill_data_size); - bcopy(hdl->sa_spill->db_data, old_data[1], + abd_copy_to_buf(old_data[1], hdl->sa_spill->db_data, hdl->sa_spill->db_size); spill_attr_count = hdl->sa_spill_tab->sa_layout->lot_attr_count; diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 8bee15094685..10a4ba8349a5 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1583,7 
+1583,7 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) if (error) return (error); - nvsize = *(uint64_t *)db->db_data; + nvsize = *(uint64_t *)ABD_TO_BUF(db->db_data); dmu_buf_rele(db, FTAG); packed = vmem_alloc(nvsize, KM_SLEEP); @@ -1866,7 +1866,7 @@ spa_load_verify_done(zio_t *zio) else atomic_add_64(&sle->sle_data_count, 1); } - zio_data_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; @@ -1889,7 +1889,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, { zio_t *rio; size_t size; - void *data; + abd_t *data; if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); @@ -1905,7 +1905,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, rio = arg; size = BP_GET_PSIZE(bp); - data = zio_data_buf_alloc(size); + data = abd_alloc_linear(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) @@ -5961,7 +5961,7 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); - *(uint64_t *)db->db_data = nvsize; + *(uint64_t *)ABD_TO_BUF(db->db_data) = nvsize; dmu_buf_rele(db, FTAG); } diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c index 14e681e77d8b..875c671e5bd8 100644 --- a/module/zfs/spa_history.c +++ b/module/zfs/spa_history.c @@ -99,7 +99,7 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) VERIFY(0 == dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); ASSERT(dbp->db_size >= sizeof (spa_history_phys_t)); - shpp = dbp->db_data; + shpp = ABD_TO_BUF(dbp->db_data); dmu_buf_will_dirty(dbp, tx); /* @@ -222,7 +222,7 @@ spa_history_log_sync(void *arg, dmu_tx_t *tx) * Update the offset when the write completes. 
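 * Note that the bonus buffer is now an abd_t, so the physical
 * history structure (shpp) below is reached through the buffer's
 * linear mapping (ABD_TO_BUF) rather than through db_data directly.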
*/ VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); - shpp = dbp->db_data; + shpp = ABD_TO_BUF(dbp->db_data); dmu_buf_will_dirty(dbp, tx); @@ -361,7 +361,7 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) return (err); - shpp = dbp->db_data; + shpp = ABD_TO_BUF(dbp->db_data); #ifdef ZFS_DEBUG { diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index ec0c0195947c..d97f35dd60f3 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1738,7 +1738,6 @@ spa_init(int mode) refcount_init(); unique_init(); range_tree_init(); - ddt_init(); zio_init(); dmu_init(); zil_init(); @@ -1763,7 +1762,6 @@ spa_fini(void) zil_fini(); dmu_fini(); zio_fini(); - ddt_fini(); range_tree_fini(); unique_fini(); refcount_fini(); diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c index 3e39dba2c2e6..2b8559b5d276 100644 --- a/module/zfs/spa_stats.c +++ b/module/zfs/spa_stats.c @@ -200,7 +200,7 @@ spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) if (zfs_read_history == 0 && ssh->size == 0) return; - if (zfs_read_history_hits == 0 && (aflags & ARC_CACHED)) + if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED)) return; srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP); diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index b3aa469bf45b..d13f55ee3f18 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -349,7 +349,7 @@ space_map_open_impl(space_map_t *sm) return (error); dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks); - sm->sm_phys = sm->sm_dbuf->db_data; + sm->sm_phys = ABD_TO_BUF(sm->sm_dbuf->db_data); return (0); } diff --git a/module/zfs/trace.c b/module/zfs/trace.c index 470cf18bff30..7bef71b980e4 100644 --- a/module/zfs/trace.c +++ b/module/zfs/trace.c @@ -23,6 +23,7 @@ * (and only one) C file, so this dummy file exists for that purpose. 
*/ +#include #include #include #include diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 52198261e434..e53af316f3dc 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -945,12 +945,12 @@ vdev_probe_done(zio_t *zio) ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; @@ -1067,7 +1067,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_pad2)), - VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), + VDEV_PAD_SIZE, abd_alloc_linear(VDEV_PAD_SIZE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c index 389fa6fd9d07..2d0a878dc653 100644 --- a/module/zfs/vdev_cache.c +++ b/module/zfs/vdev_cache.c @@ -146,7 +146,7 @@ vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) avl_remove(&vc->vc_lastused_tree, ve); avl_remove(&vc->vc_offset_tree, ve); - zio_buf_free(ve->ve_data, VCBS); + abd_free(ve->ve_data, VCBS); kmem_free(ve, sizeof (vdev_cache_entry_t)); } @@ -183,7 +183,7 @@ vdev_cache_allocate(zio_t *zio) ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); ve->ve_offset = offset; ve->ve_lastused = ddi_get_lbolt(); - ve->ve_data = zio_buf_alloc(VCBS); + ve->ve_data = abd_alloc_scatter(VCBS); avl_add(&vc->vc_offset_tree, ve); avl_add(&vc->vc_lastused_tree, ve); @@ -206,7 +206,8 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) } ve->ve_hits++; - bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size); + abd_copy_off(zio->io_data, ve->ve_data, zio->io_size, + 0, cache_phase); } /* @@ -357,8 +358,8 @@ vdev_cache_write(zio_t *zio) if (ve->ve_fill_io != NULL) { ve->ve_missed_update = 1; } else { - bcopy((char *)zio->io_data + start - io_start, - ve->ve_data + start - ve->ve_offset, end - start); + abd_copy_off(ve->ve_data, zio->io_data, end - start, + start - ve->ve_offset, start - io_start); } ve = AVL_NEXT(&vc->vc_offset_tree, ve); } diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 7f2263457177..6dfaf2578daa 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -507,12 +507,15 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, size_t kbuf_size, uint64_t kbuf_offset, int flags) { dio_request_t *dr; - caddr_t bio_ptr; + uint64_t zio_offset; uint64_t bio_offset; int bio_size, bio_count = 16; int i = 0, error = 0; + abd_t *zio_data = NULL; ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size); + /* we take either zio or kbuf_ptr, not both */ + ASSERT((!zio || !kbuf_ptr) && (zio || kbuf_ptr)); retry: dr = vdev_disk_dio_alloc(bio_count); @@ -532,9 +535,14 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, * their volume block size to match the maximum request size and * the common case will be one bio per vdev IO request. 
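 * When a raw kernel buffer is passed in via kbuf_ptr rather than a
 * zio, it is wrapped in a temporary abd_t with abd_get_from_buf()
 * below, and that wrapper is released with abd_put() once the bio's
 * have been constructed (or on error).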
*/ - bio_ptr = kbuf_ptr; + if (zio) + zio_data = zio->io_data; + else + zio_data = abd_get_from_buf(kbuf_ptr, kbuf_size); + + zio_offset = 0; bio_offset = kbuf_offset; - bio_size = kbuf_size; + bio_size = kbuf_size; for (i = 0; i <= dr->dr_bio_count; i++) { /* Finished constructing bio's for given buffer */ @@ -553,9 +561,11 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, } dr->dr_bio[i] = bio_alloc(GFP_NOIO, - bio_nr_pages(bio_ptr, bio_size)); + abd_bio_nr_pages_off(zio_data, bio_size, zio_offset)); /* bio_alloc() with __GFP_WAIT never returns NULL */ if (unlikely(dr->dr_bio[i] == NULL)) { + if (kbuf_ptr) + abd_put(zio_data); vdev_disk_dio_free(dr); return (ENOMEM); } @@ -570,10 +580,11 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, dr->dr_bio[i]->bi_private = dr; /* Remaining size is returned to become the new size */ - bio_size = bio_map(dr->dr_bio[i], bio_ptr, bio_size); + bio_size = abd_bio_map_off(dr->dr_bio[i], zio_data, + bio_size, zio_offset); /* Advance in buffer and construct another bio if needed */ - bio_ptr += BIO_BI_SIZE(dr->dr_bio[i]); + zio_offset += BIO_BI_SIZE(dr->dr_bio[i]); bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); } @@ -601,6 +612,9 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, ASSERT3S(atomic_read(&dr->dr_ref), ==, 1); } + if (kbuf_ptr) + abd_put(zio_data); + (void) vdev_disk_dio_put(dr); return (error); @@ -712,7 +726,7 @@ vdev_disk_io_start(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } - error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_data, + error = __vdev_disk_physio(vd->vd_bdev, zio, NULL, zio->io_size, zio->io_offset, flags); if (error) { zio->io_error = error; diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index 7f43ad8001f4..296e27ebd889 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -149,12 +149,23 @@ vdev_file_io_strategy(void *arg) vdev_t *vd = zio->io_vd; vdev_file_t *vf = vd->vdev_tsd; ssize_t resid; + void *buf; + + if (zio->io_type == ZIO_TYPE_READ) + buf = abd_borrow_buf(zio->io_data, zio->io_size); + else + buf = abd_borrow_buf_copy(zio->io_data, zio->io_size); zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? 
- UIO_READ : UIO_WRITE, vf->vf_vnode, zio->io_data, + UIO_READ : UIO_WRITE, vf->vf_vnode, buf, zio->io_size, zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_data, buf, zio->io_size); + else + abd_return_buf(zio->io_data, buf, zio->io_size); + if (resid != 0 && zio->io_error == 0) zio->io_error = SET_ERROR(ENOSPC); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 7f588ed6b0b5..7dcfb95436bd 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -178,7 +178,7 @@ vdev_label_number(uint64_t psize, uint64_t offset) } static void -vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, +vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) == @@ -192,7 +192,7 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, } static void -vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, +vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags) { ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL || @@ -430,6 +430,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) spa_t *spa = vd->vdev_spa; nvlist_t *config = NULL; vdev_phys_t *vp; + abd_t *vp_abd; zio_t *zio; uint64_t best_txg = 0; int error = 0; @@ -442,7 +443,8 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) if (!vdev_readable(vd)) return (NULL); - vp = zio_buf_alloc(sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t)); + vp = ABD_TO_BUF(vp_abd); retry: for (l = 0; l < VDEV_LABELS; l++) { @@ -450,7 +452,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) zio = zio_root(spa, NULL, NULL, flags); - vdev_label_read(zio, vd, l, vp, + vdev_label_read(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); @@ -489,7 +491,7 @@ vdev_label_read_config(vdev_t *vd, uint64_t txg) goto retry; } - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd, sizeof (vdev_phys_t)); return (config); } @@ -627,6 +629,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) vdev_phys_t *vp; char *pad2; uberblock_t *ub; + abd_t *vp_abd, *pad2_abd, *ub_abd; zio_t *zio; char *buf; size_t buflen; @@ -710,7 +713,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Initialize its label. */ - vp = zio_buf_alloc(sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t)); + vp = ABD_TO_BUF(vp_abd); bzero(vp, sizeof (vdev_phys_t)); /* @@ -771,7 +775,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); if (error != 0) { nvlist_free(label); - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd, sizeof (vdev_phys_t)); /* EFAULT means nvlist_pack ran out of room */ return (error == EFAULT ? ENAMETOOLONG : EINVAL); } @@ -779,13 +783,15 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Initialize uberblock template. */ - ub = zio_buf_alloc(VDEV_UBERBLOCK_RING); + ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING); + ub = ABD_TO_BUF(ub_abd); bzero(ub, VDEV_UBERBLOCK_RING); *ub = spa->spa_uberblock; ub->ub_txg = 0; /* Initialize the 2nd padding area. 
*/ - pad2 = zio_buf_alloc(VDEV_PAD_SIZE); + pad2_abd = abd_alloc_linear(VDEV_PAD_SIZE); + pad2 = ABD_TO_BUF(pad2_abd); bzero(pad2, VDEV_PAD_SIZE); /* @@ -796,7 +802,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) for (l = 0; l < VDEV_LABELS; l++) { - vdev_label_write(zio, vd, l, vp, + vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); @@ -805,11 +811,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * Zero out the 2nd padding area where it might have * left over data from previous filesystem format. */ - vdev_label_write(zio, vd, l, pad2, + vdev_label_write(zio, vd, l, pad2_abd, offsetof(vdev_label_t, vl_pad2), VDEV_PAD_SIZE, NULL, NULL, flags); - vdev_label_write(zio, vd, l, ub, + vdev_label_write(zio, vd, l, ub_abd, offsetof(vdev_label_t, vl_uberblock), VDEV_UBERBLOCK_RING, NULL, NULL, flags); } @@ -822,9 +828,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) } nvlist_free(label); - zio_buf_free(pad2, VDEV_PAD_SIZE); - zio_buf_free(ub, VDEV_UBERBLOCK_RING); - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(pad2_abd, VDEV_PAD_SIZE); + abd_free(ub_abd, VDEV_UBERBLOCK_RING); + abd_free(vp_abd, sizeof (vdev_phys_t)); /* * If this vdev hasn't been previously identified as a spare, then we @@ -888,7 +894,7 @@ vdev_uberblock_load_done(zio_t *zio) vdev_t *vd = zio->io_vd; spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; - uberblock_t *ub = zio->io_data; + uberblock_t *ub = ABD_TO_BUF(zio->io_data); struct ubl_cbdata *cbp = rio->io_private; ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); @@ -909,7 +915,7 @@ vdev_uberblock_load_done(zio_t *zio) mutex_exit(&rio->io_lock); } - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); } static void @@ -925,7 +931,7 @@ vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, for (l = 0; l < VDEV_LABELS; l++) { for (n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { vdev_label_read(zio, vd, l, - zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), + abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd)), VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_load_done, zio, flags); @@ -993,6 +999,7 @@ vdev_uberblock_sync_done(zio_t *zio) static void vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) { + abd_t *ub_abd; uberblock_t *ubbuf; int c, l, n; @@ -1007,17 +1014,18 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); - ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); + ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd)); + ubbuf = ABD_TO_BUF(ub_abd); bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); *ubbuf = *ub; for (l = 0; l < VDEV_LABELS; l++) - vdev_label_write(zio, vd, l, ubbuf, + vdev_label_write(zio, vd, l, ub_abd, VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), vdev_uberblock_sync_done, zio->io_private, flags | ZIO_FLAG_DONT_PROPAGATE); - zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); + abd_free(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); } /* Sync the uberblocks to all vdevs in svd[] */ @@ -1094,6 +1102,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) { nvlist_t *label; vdev_phys_t *vp; + abd_t *vp_abd; char *buf; size_t buflen; int c; @@ -1112,7 +1121,8 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) */ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); - vp = zio_buf_alloc(sizeof (vdev_phys_t)); + vp_abd = abd_alloc_linear(sizeof 
(vdev_phys_t)); + vp = ABD_TO_BUF(vp_abd); bzero(vp, sizeof (vdev_phys_t)); buf = vp->vp_nvlist; @@ -1120,7 +1130,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) { for (; l < VDEV_LABELS; l += 2) { - vdev_label_write(zio, vd, l, vp, + vdev_label_write(zio, vd, l, vp_abd, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), vdev_label_sync_done, zio->io_private, @@ -1128,7 +1138,7 @@ vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) } } - zio_buf_free(vp, sizeof (vdev_phys_t)); + abd_free(vp_abd, sizeof (vdev_phys_t)); nvlist_free(label); } diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index 77c3d8d385e9..bbdf923f1692 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -262,13 +262,12 @@ vdev_mirror_scrub_done(zio_t *zio) while ((pio = zio_walk_parents(zio)) != NULL) { mutex_enter(&pio->io_lock); ASSERT3U(zio->io_size, >=, pio->io_size); - bcopy(zio->io_data, pio->io_data, pio->io_size); + abd_copy(pio->io_data, zio->io_data, pio->io_size); mutex_exit(&pio->io_lock); } mutex_exit(&zio->io_lock); } - - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); mc->mc_error = zio->io_error; mc->mc_tried = 1; @@ -345,10 +344,16 @@ vdev_mirror_io_start(zio_t *zio) * data into zio->io_data in vdev_mirror_scrub_done. */ for (c = 0; c < mm->mm_children; c++) { + abd_t *tmp; mc = &mm->mm_child[c]; + if (ABD_IS_LINEAR(zio->io_data)) + tmp = abd_alloc_linear(zio->io_size); + else + tmp = abd_alloc_scatter(zio->io_size); + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, mc->mc_vd, mc->mc_offset, - zio_buf_alloc(zio->io_size), zio->io_size, + tmp, zio->io_size, zio->io_type, zio->io_priority, 0, vdev_mirror_scrub_done, mc)); } diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 3fa4219f260e..6984cce610d1 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -448,12 +448,12 @@ vdev_queue_agg_io_done(zio_t *aio) if (aio->io_type == ZIO_TYPE_READ) { zio_t *pio; while ((pio = zio_walk_parents(aio)) != NULL) { - bcopy((char *)aio->io_data + (pio->io_offset - - aio->io_offset), pio->io_data, pio->io_size); + abd_copy_off(pio->io_data, aio->io_data, pio->io_size, + 0, pio->io_offset - aio->io_offset); } } - zio_buf_free(aio->io_data, aio->io_size); + abd_free(aio->io_data, aio->io_size); } /* @@ -591,7 +591,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) ASSERT3U(size, <=, zfs_vdev_aggregation_limit); aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, - zio_buf_alloc(size), size, first->io_type, zio->io_priority, + abd_alloc_scatter(size), size, first->io_type, zio->io_priority, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; @@ -604,12 +604,11 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) if (dio->io_flags & ZIO_FLAG_NODATA) { ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); - bzero((char *)aio->io_data + (dio->io_offset - - aio->io_offset), dio->io_size); + abd_zero_off(aio->io_data, dio->io_size, + dio->io_offset - aio->io_offset); } else if (dio->io_type == ZIO_TYPE_WRITE) { - bcopy(dio->io_data, (char *)aio->io_data + - (dio->io_offset - aio->io_offset), - dio->io_size); + abd_copy_off(aio->io_data, dio->io_data, dio->io_size, + dio->io_offset - aio->io_offset, 0); } zio_add_child(dio, aio); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 493b332c4405..bdf4171445cd 100644 --- 
a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -103,7 +103,7 @@ typedef struct raidz_col { uint64_t rc_devidx; /* child device index for I/O */ uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ - void *rc_data; /* I/O data */ + abd_t *rc_data; /* I/O data */ void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ @@ -120,7 +120,7 @@ typedef struct raidz_map { uint64_t rm_firstdatacol; /* First data column/parity count */ uint64_t rm_nskip; /* Skipped sectors for padding */ uint64_t rm_skipstart; /* Column index of padding start */ - void *rm_datacopy; /* rm_asize-buffer of copied data */ + abd_t *rm_datacopy; /* rm_asize-buffer of copied data */ uintptr_t rm_reports; /* # of referencing checksum reports */ uint8_t rm_freed; /* map no longer has referencing ZIO */ uint8_t rm_ecksuminjected; /* checksum error was injected */ @@ -258,7 +258,7 @@ vdev_raidz_map_free(raidz_map_t *rm) size_t size; for (c = 0; c < rm->rm_firstdatacol; c++) { - zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); + abd_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); if (rm->rm_col[c].rc_gdata != NULL) zio_buf_free(rm->rm_col[c].rc_gdata, @@ -266,11 +266,13 @@ vdev_raidz_map_free(raidz_map_t *rm) } size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + abd_put(rm->rm_col[c].rc_data); size += rm->rm_col[c].rc_size; + } if (rm->rm_datacopy != NULL) - zio_buf_free(rm->rm_datacopy, size); + abd_free(rm->rm_datacopy, size); kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); } @@ -307,7 +309,7 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) size_t x; const char *good = NULL; - const char *bad = rm->rm_col[c].rc_data; + char *bad; if (good_data == NULL) { zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); @@ -321,8 +323,9 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) * data never changes for a given logical ZIO) */ if (rm->rm_col[0].rc_gdata == NULL) { - char *bad_parity[VDEV_RAIDZ_MAXPARITY]; + abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; char *buf; + int offset; /* * Set up the rm_col[]s to generate the parity for @@ -331,14 +334,19 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) */ for (x = 0; x < rm->rm_firstdatacol; x++) { bad_parity[x] = rm->rm_col[x].rc_data; - rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = + rm->rm_col[x].rc_gdata = zio_buf_alloc(rm->rm_col[x].rc_size); + rm->rm_col[x].rc_data = + abd_get_from_buf(rm->rm_col[x].rc_gdata, + rm->rm_col[x].rc_size); } /* fill in the data columns from good_data */ buf = (char *)good_data; for (; x < rm->rm_cols; x++) { - rm->rm_col[x].rc_data = buf; + abd_put(rm->rm_col[x].rc_data); + rm->rm_col[x].rc_data = abd_get_from_buf(buf, + rm->rm_col[x].rc_size); buf += rm->rm_col[x].rc_size; } @@ -348,13 +356,17 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) vdev_raidz_generate_parity(rm); /* restore everything back to its original state */ - for (x = 0; x < rm->rm_firstdatacol; x++) + for (x = 0; x < rm->rm_firstdatacol; x++) { + abd_put(rm->rm_col[x].rc_data); rm->rm_col[x].rc_data = bad_parity[x]; + } - buf = rm->rm_datacopy; + offset = 0; for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { - rm->rm_col[x].rc_data = buf; - buf += rm->rm_col[x].rc_size; + abd_put(rm->rm_col[x].rc_data); + rm->rm_col[x].rc_data = abd_get_offset( + 
rm->rm_datacopy, offset); + offset += rm->rm_col[x].rc_size; } } @@ -368,8 +380,10 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) good += rm->rm_col[x].rc_size; } + bad = abd_borrow_buf_copy(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); /* we drop the ereport if it ends up that the data was good */ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); + abd_return_buf(rm->rm_col[c].rc_data, bad, rm->rm_col[c].rc_size); } /* @@ -382,7 +396,7 @@ static void vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) { size_t c = (size_t)(uintptr_t)arg; - caddr_t buf; + size_t offset; raidz_map_t *rm = zio->io_vsd; size_t size; @@ -412,17 +426,22 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) size += rm->rm_col[c].rc_size; - buf = rm->rm_datacopy = zio_buf_alloc(size); + if (ABD_IS_LINEAR(rm->rm_col[rm->rm_firstdatacol].rc_data)) + rm->rm_datacopy = abd_alloc_linear(size); + else + rm->rm_datacopy = abd_alloc_scatter(size); - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { raidz_col_t *col = &rm->rm_col[c]; + abd_t *tmp = abd_get_offset(rm->rm_datacopy, offset); - bcopy(col->rc_data, buf, col->rc_size); - col->rc_data = buf; + abd_copy(tmp, col->rc_data, col->rc_size); + abd_put(col->rc_data); + col->rc_data = tmp; - buf += col->rc_size; + offset += col->rc_size; } - ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); + ASSERT3U(offset, ==, size); } static const zio_vsd_ops_t vdev_raidz_vsd_ops = { @@ -451,6 +470,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + uint64_t off = 0; /* * "Quotient": The number of data sectors for this stripe on all but @@ -534,13 +554,16 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, ASSERT3U(rm->rm_nskip, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) - rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); + rm->rm_col[c].rc_data = + abd_alloc_linear(rm->rm_col[c].rc_size); - rm->rm_col[c].rc_data = zio->io_data; + rm->rm_col[c].rc_data = abd_get_offset(zio->io_data, 0); + off = rm->rm_col[c].rc_size; - for (c = c + 1; c < acols; c++) - rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + - rm->rm_col[c - 1].rc_size; + for (c = c + 1; c < acols; c++) { + rm->rm_col[c].rc_data = abd_get_offset(zio->io_data, off); + off += rm->rm_col[c].rc_size; + } /* * If all data stored spans all columns, there's a danger that parity @@ -582,29 +605,81 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, return (rm); } +struct pqr_struct { + uint64_t *p; + uint64_t *q; + uint64_t *r; +}; + +static int +vdev_raidz_p_func(const void *buf, uint64_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && !pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++) + *pqr->p ^= *src; + return (0); +} + +static int +vdev_raidz_pq_func(const void *buf, uint64_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { + *pqr->p ^= *src; + 
VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + } + return (0); +} + +static int +vdev_raidz_pqr_func(const void *buf, uint64_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { + *pqr->p ^= *src; + VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + VDEV_RAIDZ_64MUL_4(*pqr->r, mask); + *pqr->r ^= *src; + } + return (0); +} + static void vdev_raidz_generate_parity_p(raidz_map_t *rm) { - uint64_t *p, *src, pcount, ccount, i; + uint64_t *p; int c; - - pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + abd_t *src; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + p = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_P].rc_data); if (c == rm->rm_firstdatacol) { - ASSERT(ccount == pcount); - for (i = 0; i < ccount; i++, src++, p++) { - *p = *src; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); } else { - ASSERT(ccount <= pcount); - for (i = 0; i < ccount; i++, src++, p++) { - *p ^= *src; - } + struct pqr_struct pqr = { p, NULL, NULL }; + abd_iterate_rfunc(src, rm->rm_col[c].rc_size, + vdev_raidz_p_func, &pqr); } } } @@ -612,50 +687,43 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) static void vdev_raidz_generate_parity_pq(raidz_map_t *rm) { - uint64_t *p, *q, *src, pcnt, ccnt, mask, i; + uint64_t *p, *q, pcnt, ccnt, mask, i; int c; + abd_t *src; - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + p = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_P].rc_data); + q = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_Q].rc_data); + + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); - ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccnt == pcnt || ccnt == 0); - for (i = 0; i < ccnt; i++, src++, p++, q++) { - *p = *src; - *q = *src; - } - for (; i < pcnt; i++, src++, p++, q++) { - *p = 0; - *q = 0; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + memcpy(q, p, rm->rm_col[c].rc_size); } else { - ASSERT(ccnt <= pcnt); - - /* - * Apply the algorithm described above by multiplying - * the previous result and adding in the new value. - */ - for (i = 0; i < ccnt; i++, src++, p++, q++) { - *p ^= *src; + struct pqr_struct pqr = { p, q, NULL }; + abd_iterate_rfunc(src, rm->rm_col[c].rc_size, + vdev_raidz_pq_func, &pqr); + } - VDEV_RAIDZ_64MUL_2(*q, mask); - *q ^= *src; + if (c == rm->rm_firstdatacol) { + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; } - + } else { /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. 
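 * (A zero data word leaves P unchanged, but Q is built by
 * multiplying the running parity by 2 in GF(2^8) for every column,
 * so the VDEV_RAIDZ_64MUL_2() below must still be applied.)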
*/ - for (; i < pcnt; i++, q++) { - VDEV_RAIDZ_64MUL_2(*q, mask); + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); } } } @@ -664,10 +732,11 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) static void vdev_raidz_generate_parity_pqr(raidz_map_t *rm) { - uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; + uint64_t *p, *q, *r, pcnt, ccnt, mask, i; int c; + abd_t *src; - pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == @@ -675,48 +744,36 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { src = rm->rm_col[c].rc_data; - p = rm->rm_col[VDEV_RAIDZ_P].rc_data; - q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - r = rm->rm_col[VDEV_RAIDZ_R].rc_data; + p = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_P].rc_data); + q = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_Q].rc_data); + r = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_R].rc_data); - ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccnt == pcnt || ccnt == 0); - for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { - *p = *src; - *q = *src; - *r = *src; - } - for (; i < pcnt; i++, src++, p++, q++, r++) { - *p = 0; - *q = 0; - *r = 0; - } + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + memcpy(q, p, rm->rm_col[c].rc_size); + memcpy(r, p, rm->rm_col[c].rc_size); } else { - ASSERT(ccnt <= pcnt); - - /* - * Apply the algorithm described above by multiplying - * the previous result and adding in the new value. - */ - for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { - *p ^= *src; - - VDEV_RAIDZ_64MUL_2(*q, mask); - *q ^= *src; + struct pqr_struct pqr = { p, q, r }; + abd_iterate_rfunc(src, rm->rm_col[c].rc_size, + vdev_raidz_pqr_func, &pqr); + } - VDEV_RAIDZ_64MUL_4(*r, mask); - *r ^= *src; + if (c == rm->rm_firstdatacol) { + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; + r[i] = 0; } - + } else { /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. 
*/ - for (; i < pcnt; i++, q++, r++) { - VDEV_RAIDZ_64MUL_2(*q, mask); - VDEV_RAIDZ_64MUL_4(*r, mask); + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); + VDEV_RAIDZ_64MUL_4(r[i], mask); } } } @@ -744,40 +801,126 @@ vdev_raidz_generate_parity(raidz_map_t *rm) } } +static int +vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, uint64_t dsize, + uint64_t ssize, void *private) +{ + uint64_t *dst = dbuf, *src = sbuf; + int i, cnt = dsize / sizeof (src[0]); + + ASSERT(dsize == ssize); + + for (i = 0; i < cnt; i++) { + dst[i] ^= src[i]; + } + + return (0); +} + +static int +vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, uint64_t dsize, + uint64_t ssize, void *private) +{ + uint64_t *dst = dbuf, *src = sbuf; + uint64_t mask; + int i, dcnt = dsize / sizeof (src[0]), scnt = ssize / sizeof (src[0]); + + ASSERT(dsize >= ssize); + + for (i = 0; i < scnt; i++, dst++, src++) { + VDEV_RAIDZ_64MUL_2(*dst, mask); + *dst ^= *src; + } + + for (; i < dcnt; i++, dst++) { + VDEV_RAIDZ_64MUL_2(*dst, mask); + } + return (0); +} + +struct reconst_q_struct { + uint64_t *q; + int exp; +}; + +static int +vdev_raidz_reconst_q_post_func(void *buf, uint64_t size, void *private) +{ + struct reconst_q_struct *rq = private; + uint64_t *dst = buf; + uint8_t *b; + int i, j, cnt = size / sizeof (dst[0]); + + for (i = 0; i < cnt; i++, dst++, rq->q++) { + *dst ^= *rq->q; + for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { + *b = vdev_raidz_exp2(*b, rq->exp); + } + } + return (0); +} + +struct reconst_pq_struct { + uint8_t *p; + uint8_t *q; + uint8_t *pxy; + uint8_t *qxy; + int aexp; + int bexp; +}; + +static int +vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, uint64_t xsize, + uint64_t ysize, void *private) +{ + struct reconst_pq_struct *rpq = private; + uint8_t *xd = xbuf, *yd = ybuf; + int i; + + ASSERT(xsize >= ysize); + + for (i = 0; i < xsize; i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, + xd++, yd++) { + *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ + vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); + + if (i < ysize) + *yd = *rpq->p ^ *rpq->pxy ^ *xd; + } + return (0); +} + static int vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) { - uint64_t *dst, *src, xcount, ccount, count, i; int x = tgts[0]; int c; + abd_t *dst, *src; ASSERT(ntgts == 1); ASSERT(x >= rm->rm_firstdatacol); ASSERT(x < rm->rm_cols); - xcount = rm->rm_col[x].rc_size / sizeof (src[0]); - ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); - ASSERT(xcount > 0); + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); + ASSERT(rm->rm_col[x].rc_size > 0); src = rm->rm_col[VDEV_RAIDZ_P].rc_data; dst = rm->rm_col[x].rc_data; - for (i = 0; i < xcount; i++, dst++, src++) { - *dst = *src; - } + + abd_copy_from_buf(dst, ABD_TO_BUF(src), rm->rm_col[x].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + uint64_t size = MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); + src = rm->rm_col[c].rc_data; dst = rm->rm_col[x].rc_data; if (c == x) continue; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); - count = MIN(ccount, xcount); - - for (i = 0; i < count; i++, dst++, src++) { - *dst ^= *src; - } + abd_iterate_func2(dst, src, size, size, + vdev_raidz_reconst_p_func, NULL); } return (1 << VDEV_RAIDZ_P); @@ -786,44 +929,30 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) static int vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) { - uint64_t *dst, *src, xcount, ccount, count, mask, i; - uint8_t *b; int x = tgts[0]; - int c, j, 
exp; + int c, exp; + abd_t *dst, *src; + struct reconst_q_struct rq; ASSERT(ntgts == 1); - xcount = rm->rm_col[x].rc_size / sizeof (src[0]); - ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); + src = rm->rm_col[c].rc_data; dst = rm->rm_col[x].rc_data; - if (c == x) - ccount = 0; - else - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); - - count = MIN(ccount, xcount); - if (c == rm->rm_firstdatacol) { - for (i = 0; i < count; i++, dst++, src++) { - *dst = *src; - } - for (; i < xcount; i++, dst++) { - *dst = 0; - } - + abd_copy(dst, src, size); + if (rm->rm_col[x].rc_size > size) + abd_zero_off(dst, rm->rm_col[x].rc_size - size, + size); } else { - for (i = 0; i < count; i++, dst++, src++) { - VDEV_RAIDZ_64MUL_2(*dst, mask); - *dst ^= *src; - } - - for (; i < xcount; i++, dst++) { - VDEV_RAIDZ_64MUL_2(*dst, mask); - } + abd_iterate_func2(dst, src, rm->rm_col[x].rc_size, + size, vdev_raidz_reconst_q_pre_func, NULL); } } @@ -831,12 +960,9 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) dst = rm->rm_col[x].rc_data; exp = 255 - (rm->rm_cols - 1 - x); - for (i = 0; i < xcount; i++, dst++, src++) { - *dst ^= *src; - for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { - *b = vdev_raidz_exp2(*b, exp); - } - } + rq = (struct reconst_q_struct) { ABD_TO_BUF(src), exp }; + abd_iterate_wfunc(dst, rm->rm_col[x].rc_size, + vdev_raidz_reconst_q_post_func, &rq); return (1 << VDEV_RAIDZ_Q); } @@ -844,11 +970,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) static int vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) { - uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; - void *pdata, *qdata; - uint64_t xsize, ysize, i; + uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; + abd_t *pdata, *qdata; + uint64_t xsize, ysize; int x = tgts[0]; int y = tgts[1]; + abd_t *xd, *yd; + struct reconst_pq_struct rpq; ASSERT(ntgts == 2); ASSERT(x < y); @@ -870,9 +998,9 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) ysize = rm->rm_col[y].rc_size; rm->rm_col[VDEV_RAIDZ_P].rc_data = - zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size); rm->rm_col[VDEV_RAIDZ_Q].rc_data = - zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size); rm->rm_col[x].rc_size = 0; rm->rm_col[y].rc_size = 0; @@ -881,10 +1009,10 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) rm->rm_col[x].rc_size = xsize; rm->rm_col[y].rc_size = ysize; - p = pdata; - q = qdata; - pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; - qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + p = ABD_TO_BUF(pdata); + q = ABD_TO_BUF(qdata); + pxy = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_P].rc_data); + qxy = ABD_TO_BUF(rm->rm_col[VDEV_RAIDZ_Q].rc_data); xd = rm->rm_col[x].rc_data; yd = rm->rm_col[y].rc_data; @@ -910,17 +1038,13 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; - for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { - *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ - vdev_raidz_exp2(*q ^ *qxy, bexp); + rpq = (struct reconst_pq_struct) { p, q, pxy, qxy, aexp, bexp }; + abd_iterate_func2(xd, yd, xsize, ysize, + vdev_raidz_reconst_pq_func, &rpq); - if 
(i < ysize) - *yd = *p ^ *pxy ^ *xd; - } - - zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, + abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, rm->rm_col[VDEV_RAIDZ_P].rc_size); - zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, + abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, rm->rm_col[VDEV_RAIDZ_Q].rc_size); /* @@ -1245,7 +1369,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, c = used[i]; ASSERT3U(c, <, rm->rm_cols); - src = rm->rm_col[c].rc_data; + src = ABD_TO_BUF(rm->rm_col[c].rc_data); ccount = rm->rm_col[c].rc_size; for (j = 0; j < nmissing; j++) { cc = missing[j] + rm->rm_firstdatacol; @@ -1253,7 +1377,7 @@ vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, ASSERT3U(cc, <, rm->rm_cols); ASSERT3U(cc, !=, c); - dst[j] = rm->rm_col[cc].rc_data; + dst[j] = ABD_TO_BUF(rm->rm_col[cc].rc_data); dcount[j] = rm->rm_col[cc].rc_size; } @@ -1301,8 +1425,25 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; uint8_t *used; + abd_t **bufs = NULL; + int code = 0; + /* + * Matrix reconstruction can't use scatter buffers yet, so we + * allocate temporary linear abds. + */ + if (!ABD_IS_LINEAR(rm->rm_col[rm->rm_firstdatacol].rc_data)) { + bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + bufs[c] = col->rc_data; + col->rc_data = abd_alloc_linear(col->rc_size); + abd_copy(col->rc_data, bufs[c], col->rc_size); + } + } n = rm->rm_cols - rm->rm_firstdatacol; @@ -1389,6 +1530,20 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) kmem_free(p, psize); + /* + * copy back from temporary linear abds and free them + */ + if (bufs) { + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + abd_copy(bufs[c], col->rc_data, col->rc_size); + abd_free(col->rc_data, col->rc_size); + col->rc_data = bufs[c]; + } + kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); + } + return (code); } @@ -1661,6 +1816,7 @@ vdev_raidz_io_start(zio_t *zio) static void raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) { + void *buf; vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -1674,9 +1830,11 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; + buf = abd_borrow_buf_copy(rc->rc_data, rc->rc_size); zfs_ereport_post_checksum(zio->io_spa, vd, zio, - rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, + rc->rc_offset, rc->rc_size, buf, bad_data, &zbc); + abd_return_buf(rc->rc_data, buf, rc->rc_size); } } @@ -1718,7 +1876,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) if (!rc->rc_tried || rc->rc_error != 0) continue; orig[c] = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig[c], rc->rc_size); + abd_copy_to_buf(orig[c], rc->rc_data, rc->rc_size); } vdev_raidz_generate_parity(rm); @@ -1727,7 +1885,7 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; - if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { + if (bcmp(orig[c], ABD_TO_BUF(rc->rc_data), rc->rc_size) != 0) { raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; @@ -1835,7 +1993,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) ASSERT3S(c, >=, 0); ASSERT3S(c, <, rm->rm_cols); rc = &rm->rm_col[c]; - bcopy(rc->rc_data, orig[i], rc->rc_size); +
abd_copy_to_buf(orig[i], rc->rc_data, + rc->rc_size); } /* @@ -1866,7 +2025,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) for (i = 0; i < n; i++) { c = tgts[i]; rc = &rm->rm_col[c]; - bcopy(orig[i], rc->rc_data, rc->rc_size); + abd_copy_from_buf(rc->rc_data, orig[i], + rc->rc_size); } do { diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 5ffa138a6b4b..5ebd3b3d3988 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -91,7 +91,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) * explicitly zero it since it might be coming from an * initialized microzap */ - bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size); + bzero(ABD_TO_BUF(zap->zap_dbuf->db_data), zap->zap_dbuf->db_size); zp->zap_block_type = ZBT_HEADER; zp->zap_magic = ZAP_MAGIC; @@ -117,7 +117,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); l->l_dbuf = db; - l->l_phys = db->db_data; + l->l_phys = ABD_TO_BUF(db->db_data); zap_leaf_init(l, zp->zap_normflags != 0); @@ -181,15 +181,16 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); - transfer_func(db_old->db_data, db_new->db_data, hepb); + transfer_func(ABD_TO_BUF(db_old->db_data), + ABD_TO_BUF(db_new->db_data), hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); - transfer_func((uint64_t *)db_old->db_data + hepb, - db_new->db_data, hepb); + transfer_func((uint64_t *)ABD_TO_BUF(db_old->db_data) + hepb, + ABD_TO_BUF(db_new->db_data), hepb); dmu_buf_rele(db_new, FTAG); dmu_buf_rele(db_old, FTAG); @@ -253,12 +254,11 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, return (err); } dmu_buf_will_dirty(db2, tx); - ((uint64_t *)db2->db_data)[off2] = val; - ((uint64_t *)db2->db_data)[off2+1] = val; + ((uint64_t *)ABD_TO_BUF(db2->db_data))[off2] = val; + ((uint64_t *)ABD_TO_BUF(db2->db_data))[off2+1] = val; dmu_buf_rele(db2, FTAG); } - - ((uint64_t *)db->db_data)[off] = val; + ((uint64_t *)ABD_TO_BUF(db->db_data))[off] = val; dmu_buf_rele(db, FTAG); return (0); @@ -281,7 +281,7 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err) return (err); - *valp = ((uint64_t *)db->db_data)[off]; + *valp = ((uint64_t *)ABD_TO_BUF(db->db_data))[off]; dmu_buf_rele(db, FTAG); if (tbl->zt_nextblk != 0) { @@ -350,7 +350,8 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) return (err); dmu_buf_will_dirty(db_new, tx); zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), - db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + ABD_TO_BUF(db_new->db_data), + 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); dmu_buf_rele(db_new, FTAG); zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk; @@ -530,7 +531,7 @@ zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, dmu_buf_will_dirty(db, tx); ASSERT3U(l->l_blkid, ==, blkid); ASSERT3P(l->l_dbuf, ==, db); - ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data); + ASSERT3P(l->l_phys, ==, ABD_TO_BUF(l->l_dbuf->db_data)); ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF); ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); @@ -576,7 +577,7 @@ zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, 
krw_t lt, zap_leaf_t **lp) int err; ASSERT(zap->zap_dbuf == NULL || - zap->zap_f.zap_phys == zap->zap_dbuf->db_data); + zap->zap_f.zap_phys == ABD_TO_BUF(zap->zap_dbuf->db_data)); ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC); idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); err = zap_idx_to_blk(zap, idx, &blk); @@ -1324,7 +1325,7 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { - zap_stats_ptrtbl(zap, db->db_data, + zap_stats_ptrtbl(zap, ABD_TO_BUF(db->db_data), 1<<(bs-3), zs); dmu_buf_rele(db, FTAG); } diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index dfa7c6615659..b7f1654f7a0a 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -372,7 +372,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) zap->zap_object = obj; zap->zap_dbuf = db; - if (*(uint64_t *)db->db_data != ZBT_MICRO) { + if (*(uint64_t *)ABD_TO_BUF(db->db_data) != ZBT_MICRO) { mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1; } else { @@ -531,7 +531,7 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags) sz = zap->zap_dbuf->db_size; mzp = zio_buf_alloc(sz); - bcopy(zap->zap_dbuf->db_data, mzp, sz); + bcopy(ABD_TO_BUF(zap->zap_dbuf->db_data), mzp, sz); nchunks = zap->zap_m.zap_num_chunks; if (!flags) { @@ -587,7 +587,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, #endif dmu_buf_will_dirty(db, tx); - zp = db->db_data; + zp = ABD_TO_BUF(db->db_data); zp->mz_block_type = ZBT_MICRO; zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; zp->mz_normflags = normflags; diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c index 6ca61b87242f..67154cf93dab 100644 --- a/module/zfs/zfs_fuid.c +++ b/module/zfs/zfs_fuid.c @@ -120,7 +120,7 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, ASSERT(fuid_obj != 0); VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db)); - fuid_size = *(uint64_t *)db->db_data; + fuid_size = *(uint64_t *)ABD_TO_BUF(db->db_data); dmu_buf_rele(db, FTAG); if (fuid_size) { @@ -281,7 +281,7 @@ zfs_fuid_sync(zfs_sb_t *zsb, dmu_tx_t *tx) VERIFY(0 == dmu_bonus_hold(zsb->z_os, zsb->z_fuid_obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); - *(uint64_t *)db->db_data = zsb->z_fuid_size; + *(uint64_t *)ABD_TO_BUF(db->db_data) = zsb->z_fuid_size; dmu_buf_rele(db, FTAG); zsb->z_fuid_dirty = B_FALSE; diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c index 257ab4254bbd..c15098373c9b 100644 --- a/module/zfs/zfs_sa.c +++ b/module/zfs/zfs_sa.c @@ -77,14 +77,14 @@ zfs_sa_readlink(znode_t *zp, uio_t *uio) bufsz = zp->z_size; if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) { - error = uiomove((caddr_t)db->db_data + - ZFS_OLD_ZNODE_PHYS_SIZE, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + error = abd_uiomove_off(db->db_data, + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio, + ZFS_OLD_ZNODE_PHYS_SIZE); } else { dmu_buf_t *dbp; if ((error = dmu_buf_hold(ZTOZSB(zp)->z_os, zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) { - error = uiomove(dbp->db_data, + error = abd_uiomove(dbp->db_data, MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); dmu_buf_rele(dbp, FTAG); } @@ -101,8 +101,8 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) VERIFY(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0); if (len) { - bcopy(link, (caddr_t)db->db_data + - ZFS_OLD_ZNODE_PHYS_SIZE, len); + 
abd_copy_from_buf_off(db->db_data, link, len, + ZFS_OLD_ZNODE_PHYS_SIZE); } } else { dmu_buf_t *dbp; @@ -114,7 +114,7 @@ zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) dmu_buf_will_dirty(dbp, tx); ASSERT3U(len, <=, dbp->db_size); - bcopy(link, dbp->db_data, len); + abd_copy_from_buf(dbp->db_data, link, len); dmu_buf_rele(dbp, FTAG); } } @@ -145,9 +145,9 @@ zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap) ZFS_OLD_ZNODE_PHYS_SIZE; if (len <= doi.doi_bonus_size) { - (void) memcpy(xoap->xoa_av_scanstamp, - (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - sizeof (xoap->xoa_av_scanstamp)); + abd_copy_to_buf_off(xoap->xoa_av_scanstamp, + db->db_data, sizeof (xoap->xoa_av_scanstamp), + ZFS_OLD_ZNODE_PHYS_SIZE); } } XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); @@ -175,8 +175,8 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) ZFS_OLD_ZNODE_PHYS_SIZE; if (len > doi.doi_bonus_size) VERIFY(dmu_set_bonus(db, len, tx) == 0); - (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp)); + abd_copy_from_buf_off(db->db_data, xoap->xoa_av_scanstamp, + sizeof (xoap->xoa_av_scanstamp), ZFS_OLD_ZNODE_PHYS_SIZE); zp->z_pflags |= ZFS_BONUS_SCANSTAMP; VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zsb), @@ -375,8 +375,8 @@ zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) /* if scanstamp then add scanstamp */ if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { - bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - scanstamp, AV_SCANSTAMP_SZ); + abd_copy_to_buf_off(scanstamp, db->db_data, + AV_SCANSTAMP_SZ, ZFS_OLD_ZNODE_PHYS_SIZE); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zsb), NULL, scanstamp, AV_SCANSTAMP_SZ); zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index 4df324a68f88..e98f4bf6a120 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -1068,29 +1068,52 @@ zfs_root(zfs_sb_t *zsb, struct inode **ipp) } EXPORT_SYMBOL(zfs_root); -#if defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK) +/* + * The ARC has requested that the filesystem drop entries from the dentry + * and inode caches. This can occur when the ARC needs to free meta data + * blocks but can't because they are all pinned by entries in these caches. + */ int zfs_sb_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) { zfs_sb_t *zsb = sb->s_fs_info; + int error = 0; +#if defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK) struct shrinker *shrinker = &sb->s_shrink; struct shrink_control sc = { .nr_to_scan = nr_to_scan, .gfp_mask = GFP_KERNEL, }; +#endif ZFS_ENTER(zsb); -#ifdef HAVE_SPLIT_SHRINKER_CALLBACK + +#if defined(HAVE_SPLIT_SHRINKER_CALLBACK) *objects = (*shrinker->scan_objects)(shrinker, &sc); -#else +#elif defined(HAVE_SHRINK) *objects = (*shrinker->shrink)(shrinker, &sc); +#else + /* + * Linux kernels older than 3.1 do not support a per-filesystem + * shrinker. Therefore, we must fall back to the only available + * interface which is to discard all unused dentries and inodes. + * This behavior clearly isn't ideal but it's required so the ARC + * may free memory. The performance impact is mitigated by the + * fact that the frequently accessed dentry and inode buffers will + * still be in the ARC making them relatively cheap to recreate. 
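+ * shrink_dcache_parent() walks the dentry tree below sb->s_root
+ * and frees every unused dentry it finds; the inodes those
+ * dentries were pinning can then be reclaimed as well.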
+ */ + *objects = 0; + shrink_dcache_parent(sb->s_root); #endif ZFS_EXIT(zsb); - return (0); + dprintf_ds(zsb->z_os->os_dsl_dataset, + "pruning, nr_to_scan=%lu objects=%d error=%d\n", + nr_to_scan, *objects, error); + + return (error); } EXPORT_SYMBOL(zfs_sb_prune); -#endif /* defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK) */ /* * Teardown the zfs_sb_t. @@ -1286,6 +1309,8 @@ zfs_domount(struct super_block *sb, void *data, int silent) if (!zsb->z_issnap) zfsctl_create(zsb); + + zsb->z_arc_prune = arc_add_prune_callback(zpl_prune_sb, sb); out: if (error) { dmu_objset_disown(zsb->z_os, zsb); @@ -1324,6 +1349,7 @@ zfs_umount(struct super_block *sb) zfs_sb_t *zsb = sb->s_fs_info; objset_t *os; + arc_remove_prune_callback(zsb->z_arc_prune); VERIFY(zfs_sb_teardown(zsb, B_TRUE) == 0); os = zsb->z_os; bdi_destroy(sb->s_bdi); @@ -1682,7 +1708,6 @@ zfs_init(void) zfs_znode_init(); dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); register_filesystem(&zpl_fs_type); - (void) arc_add_prune_callback(zpl_prune_sbs, NULL); } void diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 723d6210f26f..f14387aaa36d 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -711,7 +711,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) error = SET_ERROR(EDQUOT); break; } - + /* TODO: abd can't handle xuio */ if (xuio && abuf == NULL) { ASSERT(i_iov < iovcnt); aiov = &iovp[i_iov]; @@ -738,7 +738,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); - if ((error = uiocopy(abuf->b_data, max_blksz, + if ((error = abd_uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes))) { dmu_return_arcbuf(abuf); break; @@ -800,6 +800,7 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) * block-aligned, use assign_arcbuf(). Otherwise, * write via dmu_write(). 
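+ * (The xuio branches below still assume abuf->b_data is a raw
+ * buffer address and have not been converted to ABD; hence the
+ * TODO markers.)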
*/ + /* TODO: abd can't handle xuio */ if (tx_bytes < max_blksz && (!write_eof || aiov->iov_base != abuf->b_data)) { ASSERT(xuio); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 15897b363de6..c363e7fae87e 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -204,7 +204,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, char **end) { enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; @@ -235,7 +235,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, cksum.zc_word[ZIL_ZC_SEQ]++; if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = abuf->b_data; + zil_chain_t *zilc = ABD_TO_BUF(abuf->b_data); char *lr = (char *)(zilc + 1); uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); @@ -243,12 +243,12 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { error = SET_ERROR(ECKSUM); } else { - bcopy(lr, dst, len); + memcpy(dst, lr, len); *end = (char *)dst + len; *nbp = zilc->zc_next_blk; } } else { - char *lr = abuf->b_data; + char *lr = ABD_TO_BUF(abuf->b_data); uint64_t size = BP_GET_LSIZE(bp); zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; @@ -257,7 +257,7 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { - bcopy(lr, dst, zilc->zc_nused); + memcpy(dst, lr, zilc->zc_nused); *end = (char *)dst + zilc->zc_nused; *nbp = zilc->zc_next_blk; } @@ -277,7 +277,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) { enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; const blkptr_t *bp = &lr->lr_blkptr; - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; @@ -299,7 +299,7 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) if (error == 0) { if (wbuf != NULL) - bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); + abd_copy_to_buf(wbuf, abuf->b_data, arc_buf_size(abuf)); (void) arc_buf_remove_ref(abuf, &abuf); } @@ -887,6 +887,7 @@ zil_lwb_write_done(zio_t *zio) * one in zil_commit_writer(). zil_sync() will only remove * the lwb if lwb_buf is null. 
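+ * zio->io_data is only an abd wrapper created around lwb_buf in
+ * zil_lwb_write_init(); drop the wrapper with abd_put() before
+ * freeing the buffer it points into.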
*/ + abd_put(zio->io_data); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); lwb->lwb_zio = NULL; @@ -923,12 +924,16 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ mutex_enter(&zilog->zl_lock); if (lwb->lwb_zio == NULL) { + abd_t *lwb_abd; if (!lwb->lwb_fastwrite) { metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); lwb->lwb_fastwrite = 1; } + + lwb_abd = abd_get_from_buf(lwb->lwb_buf, + BP_GET_LSIZE(&lwb->lwb_blk)); lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, - 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), + 0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_FASTWRITE, &zb); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 49e2d93b6783..ac88c2642cc4 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -256,11 +256,16 @@ zio_data_buf_free(void *buf, size_t size) * ========================================================================== */ static void -zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize, +zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform) { zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); + if (ABD_IS_LINEAR(zio->io_data)) + ASSERT_ABD_LINEAR(data); + else + ASSERT_ABD_SCATTER(data); + zt->zt_orig_data = zio->io_data; zt->zt_orig_size = zio->io_size; zt->zt_bufsize = bufsize; @@ -284,7 +289,7 @@ zio_pop_transforms(zio_t *zio) zt->zt_orig_data, zt->zt_orig_size); if (zt->zt_bufsize != 0) - zio_buf_free(zio->io_data, zt->zt_bufsize); + abd_free(zio->io_data, zt->zt_bufsize); zio->io_data = zt->zt_orig_data; zio->io_size = zt->zt_orig_size; @@ -300,21 +305,29 @@ zio_pop_transforms(zio_t *zio) * ========================================================================== */ static void -zio_subblock(zio_t *zio, void *data, uint64_t size) +zio_subblock(zio_t *zio, abd_t *data, uint64_t size) { ASSERT(zio->io_size > size); if (zio->io_type == ZIO_TYPE_READ) - bcopy(zio->io_data, data, size); + abd_copy(data, zio->io_data, size); } static void -zio_decompress(zio_t *zio, void *data, uint64_t size) +zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { - if (zio->io_error == 0 && - zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), - zio->io_data, data, zio->io_size, size) != 0) - zio->io_error = SET_ERROR(EIO); + void *buf1, *buf2; + if (zio->io_error == 0) { + buf1 = abd_borrow_buf_copy(zio->io_data, zio->io_size); + buf2 = abd_borrow_buf(data, size); + + if (zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), + buf1, buf2, zio->io_size, size) != 0) + zio->io_error = SET_ERROR(EIO); + + abd_return_buf_copy(data, buf2, size); + abd_return_buf(zio->io_data, buf1, zio->io_size); + } } /* @@ -483,7 +496,7 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c) */ static zio_t * zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, - void *data, uint64_t size, zio_done_func_t *done, void *private, + abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_type_t type, zio_priority_t priority, enum zio_flag flags, vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb, enum zio_stage stage, enum zio_stage pipeline) @@ -595,7 +608,7 @@ zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - void *data, uint64_t size, 
zio_done_func_t *done, void *private, + abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) { zio_t *zio; @@ -611,7 +624,7 @@ zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, const zio_prop_t *zp, + abd_t *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) @@ -650,7 +663,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, } zio_t * -zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, +zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t size, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) { @@ -802,7 +815,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int checksum, zio_done_func_t *done, void *private, + abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -823,7 +836,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, - void *data, int checksum, zio_done_func_t *done, void *private, + abd_t *data, int checksum, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -846,8 +859,13 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. */ - void *wbuf = zio_buf_alloc(size); - bcopy(data, wbuf, size); + abd_t *wbuf; + if (ABD_IS_LINEAR(data)) + wbuf = abd_alloc_linear(size); + else + wbuf = abd_alloc_scatter(size); + abd_copy(wbuf, data, size); + zio_push_transform(zio, wbuf, size, size, NULL); } @@ -859,7 +877,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, zio_priority_t priority, + abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; @@ -903,7 +921,7 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, } zio_t * -zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, +zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, int type, zio_priority_t priority, enum zio_flag flags, zio_done_func_t *done, void *private) { @@ -961,14 +979,24 @@ zio_read_bp_init(zio_t *zio) !(zio->io_flags & ZIO_FLAG_RAW)) { uint64_t psize = BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); - void *cbuf = zio_buf_alloc(psize); + abd_t *cbuf; + if (ABD_IS_LINEAR(zio->io_data)) + cbuf = abd_alloc_linear(psize); + else + cbuf = abd_alloc_scatter(psize); zio_push_transform(zio, cbuf, psize, psize, zio_decompress); } if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { + void *data; + int psize; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - decode_embedded_bp_compressed(bp, zio->io_data); + + psize = BPE_GET_PSIZE(bp); + data = abd_borrow_buf(zio->io_data, psize); + decode_embedded_bp_compressed(bp, data); + abd_return_buf_copy(zio->io_data, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); } @@ -1072,11 +1100,25 @@ zio_write_bp_init(zio_t *zio) } if (compress != ZIO_COMPRESS_OFF) { - void *cbuf = zio_buf_alloc(lsize); - psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + void *cbuf; + void *dbuf; + abd_t *cdata; + + if (ABD_IS_LINEAR(zio->io_data)) + cdata = abd_alloc_linear(lsize); + else + cdata = abd_alloc_scatter(lsize); + + cbuf = abd_borrow_buf(cdata, lsize); + + dbuf = abd_borrow_buf_copy(zio->io_data, lsize); + psize = zio_compress_data(compress, dbuf, cbuf, lsize); + abd_return_buf(zio->io_data, dbuf, lsize); + if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; - zio_buf_free(cbuf, lsize); + abd_return_buf(cdata, cbuf, lsize); + abd_free(cdata, lsize); } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { @@ -1085,7 +1127,8 @@ zio_write_bp_init(zio_t *zio) BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); BP_SET_TYPE(bp, zio->io_prop.zp_type); BP_SET_LEVEL(bp, zio->io_prop.zp_level); - zio_buf_free(cbuf, lsize); + abd_return_buf(cdata, cbuf, lsize); + abd_free(cdata, lsize); bp->blk_birth = zio->io_txg; zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; ASSERT(spa_feature_is_active(spa, @@ -1098,15 +1141,16 @@ zio_write_bp_init(zio_t *zio) */ size_t rounded = P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE); + abd_return_buf_copy(cdata, cbuf, lsize); if (rounded > psize) { - bzero((char *)cbuf + psize, rounded - psize); + abd_zero_off(cdata, rounded - psize, psize); psize = rounded; } if (psize == lsize) { compress = ZIO_COMPRESS_OFF; - zio_buf_free(cbuf, lsize); + abd_free(cdata, lsize); } else { - zio_push_transform(zio, cbuf, + zio_push_transform(zio, cdata, psize, lsize, NULL); } } @@ -1600,26 +1644,38 @@ zio_resume_wait(spa_t *spa) * ========================================================================== */ +static void +zio_gang_issue_func_done(zio_t *zio) +{ + abd_put(zio->io_data); +} + static zio_t * -zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { if (gn != NULL) return (pio); - return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp), - NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), + BP_GET_PSIZE(bp), zio_gang_issue_func_done, + NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } -zio_t * -zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { zio_t *zio; + abd_t *gbh_abd; if (gn != NULL) { + gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - gn->gn_gbh, 
SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority, - ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, + pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark); /* * As we rewrite each gang header, the pipeline will compute * a new gang block header checksum for it; but no one will @@ -1630,8 +1686,12 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) * this is just good hygiene.) */ if (gn != pio->io_gang_leader->io_gang_tree) { + abd_t *buf = abd_get_offset(data, offset); + zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), - data, BP_GET_PSIZE(bp)); + buf, BP_GET_PSIZE(bp)); + + abd_put(buf); } /* * If we are here to damage data for testing purposes, @@ -1641,7 +1701,8 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, - data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, + abd_get_offset(data, offset), BP_GET_PSIZE(bp), + zio_gang_issue_func_done, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); } @@ -1649,16 +1710,18 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) } /* ARGSUSED */ -zio_t * -zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, ZIO_GANG_CHILD_FLAGS(pio))); } /* ARGSUSED */ -zio_t * -zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) +static zio_t * +zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) { return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); @@ -1722,13 +1785,14 @@ static void zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); + abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); - zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, - SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, - gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); + zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_gang_tree_assemble_done, gn, gio->io_priority, + ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void @@ -1746,12 +1810,14 @@ zio_gang_tree_assemble_done(zio_t *zio) return; if (BP_SHOULD_BYTESWAP(bp)) - byteswap_uint64_array(zio->io_data, zio->io_size); + byteswap_uint64_array(ABD_TO_BUF(zio->io_data), zio->io_size); - ASSERT(zio->io_data == gn->gn_gbh); + ASSERT(ABD_TO_BUF(zio->io_data) == gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + abd_put(zio->io_data); + for (g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) @@ -1761,7 +1827,8 @@ zio_gang_tree_assemble_done(zio_t *zio) } static void -zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) +zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, + uint64_t offset) { zio_t *gio = pio->io_gang_leader; zio_t *zio; @@ -1775,7 +1842,7 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) * If you're a gang header, your data is in gn->gn_gbh. 
* If you're a gang member, your data is in 'data' and gn == NULL. */ - zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); + zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); if (gn != NULL) { ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); @@ -1784,13 +1851,14 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (BP_IS_HOLE(gbp)) continue; - zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data); - data = (char *)data + BP_GET_PSIZE(gbp); + zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, + offset); + offset += BP_GET_PSIZE(gbp); } } if (gn == gio->io_gang_tree) - ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); + ASSERT3U(gio->io_size, ==, offset); if (zio != pio) zio_nowait(zio); @@ -1823,7 +1891,8 @@ zio_gang_issue(zio_t *zio) ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) - zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); + zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data, + 0); else zio_gang_tree_free(&zio->io_gang_tree); @@ -1863,6 +1932,12 @@ zio_write_gang_member_ready(zio_t *zio) mutex_exit(&pio->io_lock); } +static void +zio_write_gang_done(zio_t *zio) +{ + abd_put(zio->io_data); +} + static int zio_write_gang_block(zio_t *pio) { @@ -1872,6 +1947,7 @@ zio_write_gang_block(zio_t *pio) zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; + abd_t *gbh_abd; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; @@ -1898,12 +1974,14 @@ zio_write_gang_block(zio_t *pio) gn = zio_gang_node_alloc(gnpp); gbh = gn->gn_gbh; bzero(gbh, SPA_GANGBLOCKSIZE); + gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); /* * Create the gang header. */ - zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL, - pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_write_gang_done, NULL, pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); /* * Create and nowait the gang children. 
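+ * Each child write is handed a borrowed window into the parent's
+ * data via abd_get_offset(); zio_write_gang_done() releases it
+ * with abd_put().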
@@ -1923,10 +2001,10 @@ zio_write_gang_block(zio_t *pio) zp.zp_nopwrite = B_FALSE; zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], - (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, - zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g], - pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), - &pio->io_bookmark)); + abd_get_offset(pio->io_data, pio->io_size - resid), lsize, + &zp, zio_write_gang_member_ready, NULL, + zio_write_gang_done, &gn->gn_child[g], pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark)); } /* @@ -2017,10 +2095,11 @@ zio_ddt_child_read_done(zio_t *zio) ddp = ddt_phys_select(dde, bp); if (zio->io_error == 0) ddt_phys_clear(ddp); /* this ddp doesn't need repair */ + if (zio->io_error == 0 && dde->dde_repair_data == NULL) dde->dde_repair_data = zio->io_data; else - zio_buf_free(zio->io_data, zio->io_size); + abd_free(zio->io_data, zio->io_size); mutex_exit(&pio->io_lock); } @@ -2053,10 +2132,10 @@ zio_ddt_read_start(zio_t *zio) ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, - zio_buf_alloc(zio->io_size), zio->io_size, - zio_ddt_child_read_done, dde, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, - &zio->io_bookmark)); + abd_alloc_linear(zio->io_size), + zio->io_size, zio_ddt_child_read_done, dde, + zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | + ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); } return (ZIO_PIPELINE_CONTINUE); } @@ -2093,7 +2172,8 @@ zio_ddt_read_done(zio_t *zio) return (ZIO_PIPELINE_STOP); } if (dde->dde_repair_data != NULL) { - bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); + abd_copy(zio->io_data, dde->dde_repair_data, + zio->io_size); zio->io_child_error[ZIO_CHILD_DDT] = 0; } ddt_repair_done(ddt, dde); @@ -2122,7 +2202,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || - bcmp(zio->io_orig_data, lio->io_orig_data, + abd_cmp(zio->io_orig_data, lio->io_orig_data, zio->io_orig_size) != 0); } } @@ -2132,7 +2212,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL; - uint32_t aflags = ARC_WAIT; + arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; @@ -2147,7 +2227,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (error == 0) { if (arc_buf_size(abuf) != zio->io_orig_size || - bcmp(abuf->b_data, zio->io_orig_data, + abd_cmp(abuf->b_data, zio->io_orig_data, zio->io_orig_size) != 0) error = SET_ERROR(EEXIST); VERIFY(arc_buf_remove_ref(abuf, &abuf)); @@ -2574,11 +2654,15 @@ zio_vdev_io_start(zio_t *zio) P2PHASE(zio->io_size, align) != 0) { /* Transform logical writes to be a full physical block size. 
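+ * The bounce buffer must match io_data's abd type (linear vs.
+ * scatter) so the assertion in zio_push_transform() holds.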
*/ uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio_buf_alloc(asize); + abd_t *abuf; + if (ABD_IS_LINEAR(zio->io_data)) + abuf = abd_alloc_linear(asize); + else + abuf = abd_alloc_scatter(asize); ASSERT(vd == vd->vdev_top); if (zio->io_type == ZIO_TYPE_WRITE) { - bcopy(zio->io_data, abuf, zio->io_size); - bzero(abuf + zio->io_size, asize - zio->io_size); + abd_copy(abuf, zio->io_data, zio->io_size); + abd_zero_off(abuf, asize - zio->io_size, zio->io_size); } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } @@ -2703,7 +2787,7 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) { void *buf = zio_buf_alloc(zio->io_size); - bcopy(zio->io_data, buf, zio->io_size); + abd_copy_to_buf(buf, zio->io_data, zio->io_size); zcr->zcr_cbinfo = zio->io_size; zcr->zcr_cbdata = buf; @@ -3028,21 +3112,27 @@ zio_done(zio_t *zio) zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(zio->io_size, align); - char *abuf = zio->io_data; + char *abuf; + abd_t *adata = zio->io_data; if (asize != zio->io_size) { - abuf = zio_buf_alloc(asize); - bcopy(zio->io_data, abuf, zio->io_size); - bzero(abuf+zio->io_size, asize-zio->io_size); + adata = abd_alloc_linear(asize); + abd_copy(adata, zio->io_data, zio->io_size); + abd_zero_off(adata, asize-zio->io_size, + zio->io_size); } + abuf = abd_borrow_buf_copy(adata, asize); + zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; zcr->zcr_finish(zcr, abuf); zfs_ereport_free_checksum(zcr); + abd_return_buf(adata, abuf, asize); + if (asize != zio->io_size) - zio_buf_free(abuf, asize); + abd_free(adata, asize); } } diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index 3a5c73a6a1e9..b96149f96a00 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -28,6 +28,7 @@ #include #include #include +#include #include /* @@ -62,22 +63,58 @@ /*ARGSUSED*/ static void -zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) +abd_checksum_off(abd_t *abd, uint64_t size, zio_cksum_t *zcp) { ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } +void +abd_checksum_SHA256(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + void *buf = abd_borrow_buf_copy(abd, size); + zio_checksum_SHA256(buf, size, zcp); + abd_return_buf(abd, buf, size); +} + +void +abd_fletcher_2_native(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_2_native_init(zcp); + abd_iterate_rfunc(abd, size, fletcher_2_incremental_native, zcp); +} + +void +abd_fletcher_2_byteswap(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_2_byteswap_init(zcp); + abd_iterate_rfunc(abd, size, fletcher_2_incremental_byteswap, zcp); +} + +void +abd_fletcher_4_native(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_4_native_init(zcp); + abd_iterate_rfunc(abd, size, fletcher_4_incremental_native, zcp); +} + +void +abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_4_byteswap_init(zcp); + abd_iterate_rfunc(abd, size, fletcher_4_incremental_byteswap, zcp); +} + zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { - {{NULL, NULL}, 0, 0, 0, "inherit"}, - {{NULL, NULL}, 0, 0, 0, "on"}, - {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"}, - {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"}, - {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"}, - {{fletcher_4_native, 
fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"}, - {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"}, +{{NULL, NULL}, 0, 0, 0, "inherit"}, +{{NULL, NULL}, 0, 0, 0, "on"}, +{{abd_checksum_off, abd_checksum_off}, 0, 0, 0, "off"}, +{{abd_checksum_SHA256, abd_checksum_SHA256}, 1, 1, 0, "label"}, +{{abd_checksum_SHA256, abd_checksum_SHA256}, 1, 1, 0, "gang_header"}, +{{abd_fletcher_2_native, abd_fletcher_2_byteswap}, 0, 1, 0, "zilog"}, +{{abd_fletcher_2_native, abd_fletcher_2_byteswap}, 0, 0, 0, "fletcher2"}, +{{abd_fletcher_4_native, abd_fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, +{{abd_checksum_SHA256, abd_checksum_SHA256}, 1, 0, 1, "sha256"}, +{{abd_fletcher_4_native, abd_fletcher_4_byteswap}, 0, 1, 0, "zilog2"}, }; enum zio_checksum @@ -150,7 +187,7 @@ zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) */ void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, - void *data, uint64_t size) + abd_t *data, uint64_t size) { blkptr_t *bp = zio->io_bp; uint64_t offset = zio->io_offset; @@ -162,15 +199,16 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, if (ci->ci_eck) { zio_eck_t *eck; + void *buf = ABD_TO_BUF(data); if (checksum == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = data; + zil_chain_t *zilc = buf; size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ, uint64_t); eck = &zilc->zc_eck; } else { - eck = (zio_eck_t *)((char *)data + size) - 1; + eck = (zio_eck_t *)((char *)buf + size) - 1; } if (checksum == ZIO_CHECKSUM_GANG_HEADER) zio_checksum_gang_verifier(&eck->zec_cksum, bp); @@ -197,7 +235,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) uint64_t size = (bp == NULL ? zio->io_size : (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); uint64_t offset = zio->io_offset; - void *data = zio->io_data; + abd_t *data = zio->io_data; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t actual_cksum, expected_cksum, verifier; @@ -206,9 +244,10 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) if (ci->ci_eck) { zio_eck_t *eck; + void *buf = ABD_TO_BUF(data); if (checksum == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = data; + zil_chain_t *zilc = buf; uint64_t nused; eck = &zilc->zc_eck; @@ -224,7 +263,7 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); } else { - eck = (zio_eck_t *)((char *)data + size) - 1; + eck = (zio_eck_t *)((char *)buf + size) - 1; } if (checksum == ZIO_CHECKSUM_GANG_HEADER) diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c index c168f3b47f2e..e629c43b94c0 100644 --- a/module/zfs/zio_inject.c +++ b/module/zfs/zio_inject.c @@ -439,7 +439,11 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) * fault injection isn't a performance critical path. */ if (flags & ZINJECT_FLUSH_ARC) - arc_flush(NULL); + /* + * We must use FALSE to ensure arc_flush returns, since + * we're not preventing concurrent ARC insertions. 
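+ * Passing TRUE would make arc_flush() retry until the lists are
+ * completely drained, which is only safe when no new ARC buffers
+ * can be inserted concurrently.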
+ */ + arc_flush(NULL, FALSE); return (0); } diff --git a/module/zfs/zpl_super.c b/module/zfs/zpl_super.c index 47cc2fcf4608..ef0f9d311e38 100644 --- a/module/zfs/zpl_super.c +++ b/module/zfs/zpl_super.c @@ -109,6 +109,12 @@ zpl_evict_inode(struct inode *ip) #else +static void +zpl_drop_inode(struct inode *ip) +{ + generic_delete_inode(ip); +} + static void zpl_clear_inode(struct inode *ip) { @@ -125,7 +131,6 @@ zpl_inode_delete(struct inode *ip) truncate_setsize(ip, 0); clear_inode(ip); } - #endif /* HAVE_EVICT_INODE */ static void @@ -276,37 +281,13 @@ zpl_kill_sb(struct super_block *sb) #endif /* HAVE_S_INSTANCES_LIST_HEAD */ } -#if defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK) -/* - * Linux 3.1 - 3.x API - * - * The Linux 3.1 API introduced per-sb cache shrinkers to replace the - * global ones. This allows us a mechanism to cleanly target a specific - * zfs file system when the dnode and inode caches grow too large. - * - * In addition, the 3.0 kernel added the iterate_supers_type() helper - * function which is used to safely walk all of the zfs file systems. - */ -static void -zpl_prune_sb(struct super_block *sb, void *arg) -{ - int objects = 0; - int error; - - error = -zfs_sb_prune(sb, *(unsigned long *)arg, &objects); - ASSERT3S(error, <=, 0); -} -#endif /* defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK) */ - void -zpl_prune_sbs(int64_t bytes_to_scan, void *private) +zpl_prune_sb(int64_t nr_to_scan, void *arg) { -#if defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK) - unsigned long nr_to_scan = (bytes_to_scan / sizeof (znode_t)); + struct super_block *sb = (struct super_block *)arg; + int objects = 0; - iterate_supers_type(&zpl_fs_type, zpl_prune_sb, &nr_to_scan); - kmem_reap(); -#endif /* defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK) */ + (void) -zfs_sb_prune(sb, nr_to_scan, &objects); } #ifdef HAVE_NR_CACHED_OBJECTS @@ -343,10 +324,10 @@ const struct super_operations zpl_super_operations = { .destroy_inode = zpl_inode_destroy, .dirty_inode = zpl_dirty_inode, .write_inode = NULL, - .drop_inode = NULL, #ifdef HAVE_EVICT_INODE .evict_inode = zpl_evict_inode, #else + .drop_inode = zpl_drop_inode, .clear_inode = zpl_clear_inode, .delete_inode = zpl_inode_delete, #endif /* HAVE_EVICT_INODE */