Skip to content
Permalink
Browse files

Persistent L2ARC

This commit makes the L2ARC persistent across reboots. It is largely
based on issue 3525 in Illumos.

Co-authored-by: Saso Kiselkov <skiselkov@gmail.com>
Co-authored-by: Jorgen Lundman <lundman@lundman.net>
Co-authored-by: George Amanakis <gamanakis@gmail.com>
Ported-by: Yuxuan Shui <yshuiv7@gmail.com>
Signed-off-by: George Amanakis <gamanakis@gmail.com>
  • Loading branch information
3 people committed Mar 6, 2019
1 parent 6e1c594 commit e1e917f43bdf4d6c65182e15a105c56eefff493b
@@ -334,6 +334,7 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/no_space/Makefile
tests/zfs-tests/tests/functional/nopwrite/Makefile
tests/zfs-tests/tests/functional/online_offline/Makefile
tests/zfs-tests/tests/functional/persist_l2arc/Makefile
tests/zfs-tests/tests/functional/pool_checkpoint/Makefile
tests/zfs-tests/tests/functional/pool_names/Makefile
tests/zfs-tests/tests/functional/poolversion/Makefile
@@ -300,13 +300,14 @@ void arc_fini(void);
* Level 2 ARC
*/

void l2arc_add_vdev(spa_t *spa, vdev_t *vd);
void l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild);
void l2arc_remove_vdev(vdev_t *vd);
boolean_t l2arc_vdev_present(vdev_t *vd);
void l2arc_init(void);
void l2arc_fini(void);
void l2arc_start(void);
void l2arc_stop(void);
void l2arc_spa_rebuild_start(spa_t *spa);

#ifndef _KERNEL
extern boolean_t arc_watch;
@@ -176,6 +176,172 @@ typedef struct l1arc_buf_hdr {
abd_t *b_pabd;
} l1arc_buf_hdr_t;

typedef enum l2arc_dev_hdr_flags_t {
L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */
} l2arc_dev_hdr_flags_t;

/*
* Pointer used in persistent L2ARC (for pointing to log blocks & ARC buffers).
* It records where a log block lives on the cache device, its packed
* size/compression/checksum properties, and the checksum itself.
*/
typedef struct l2arc_log_blkptr {
/*
* Offset of log block within the device, in bytes
*/
uint64_t lbp_daddr;
/*
* lbp_prop has the following format:
* * logical size (in sectors)
* * physical (compressed) size (in sectors)
* * compression algorithm (we always LZ4-compress l2arc logs)
* * checksum algorithm (used for lbp_cksum)
*
* The individual fields are read and written with the
* BLKPROP_{GET,SET}_{LSIZE,PSIZE,COMPRESS,CHECKSUM} macros.
*/
uint64_t lbp_prop;
zio_cksum_t lbp_cksum; /* checksum of log */
} l2arc_log_blkptr_t;

/*
* The persistent L2ARC device header.
* Byte order of magic determines whether 64-bit bswap of fields is necessary.
*
* The structure is padded so that its size is exactly SPA_MINBLOCKSIZE
* (enforced by the CTASSERT_GLOBAL below).
*/
typedef struct l2arc_dev_hdr_phys {
uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */
uint64_t dh_version; /* Persistent L2ARC version */

/*
* Global L2ARC device state and metadata.
*/
uint64_t dh_spa_guid; /* presumably the guid of the owning spa - TODO confirm */
uint64_t dh_alloc_space; /* vdev allocated bytes */
uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */

/*
* Start of log block chain. [0] -> newest log, [1] -> one older (used
* for initiating prefetch).
*/
l2arc_log_blkptr_t dh_start_lbps[2];
uint64_t dh_log_blk_ent; /* entries per log blk */
const uint64_t dh_pad[41]; /* pad to 512 bytes */
zio_eck_t dh_tail; /* NOTE(review): looks like the embedded checksum trailer - confirm */
} l2arc_dev_hdr_phys_t;
/* The on-disk header must occupy exactly one minimum-size block. */
CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);

/*
* A single ARC buffer header entry in a l2arc_log_blk_phys_t.
* Each entry describes one buffer stored on the L2ARC device and is padded
* to 128 bytes.
*/
typedef struct l2arc_log_ent_phys {
dva_t le_dva; /* dva of buffer */
uint64_t le_birth; /* birth txg of buffer */
/*
* le_prop has the following format:
* * logical size (in sectors)
* * physical (compressed) size (in sectors)
* * compression algorithm
* * checksum algorithm (used for le_cksum)
* * object type (used to restore arc_buf_contents_t)
* * protected status (used for encryption)
* * prefetch status (used in l2arc_read_done())
*
* The individual fields are read and written with the
* BLKPROP_{GET,SET}_* macros.
*/
uint64_t le_prop;
uint64_t le_daddr; /* buf location on l2dev */
zio_cksum_t le_cksum; /* checksum of log entry */
const uint64_t le_pad[7]; /* pad to 128 bytes */
} l2arc_log_ent_phys_t;

/* Number of l2arc_log_ent_phys_t entries held by one log block. */
#define L2ARC_LOG_BLK_MAX_ENTRIES (1023)

/*
* A log block of up to 1023 ARC buffer log entries, chained into the
* persistent L2ARC metadata linked list. Byte order of magic determines
* whether 64-bit bswap of fields is necessary.
*/
typedef struct l2arc_log_blk_phys {
/*
* Header - see L2ARC_LOG_BLK_HEADER_LEN above
* NOTE(review): L2ARC_LOG_BLK_HEADER_LEN is not defined in the visible
* part of this file - confirm that it exists. The header consists of
* lb_magic, lb_prev_lbp and lb_pad.
*/
uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */
/*
* There are 2 chains (headed by dh_start_lbps[2]), and this field
* points back to the previous block in this chain. We alternate
* which chain we append to, so they are time-wise and offset-wise
* interleaved, but that is an optimization rather than for
* correctness.
*/
l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */
uint64_t lb_pad[9]; /* resv'd for future use */
/* Payload */
l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES];
} l2arc_log_blk_phys_t; /* 128K total */
/* A log block must fit within the block size limits SPA_MINBLOCKSIZE..SPA_MAXBLOCKSIZE. */
CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE);
CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE);

/*
* These structures hold in-flight abd buffers for log blocks as they're being
* written to the L2ARC device.
*/
typedef struct l2arc_log_blk_abd {
abd_t *abd; /* buffer of the log block being written */
list_node_t node; /* linkage for keeping these structures on a list_t */
} l2arc_log_blk_abd_t;

/*
* Macros for getting and setting fields in le_prop and lbp_prop.
*
* LSIZE, PSIZE, COMPRESS and CHECKSUM apply to both le_prop and lbp_prop.
* TYPE, PROTECTED and PREFETCH are only listed in the le_prop format
* (see l2arc_log_ent_phys_t).
*/
#define BLKPROP_GET_LSIZE(field) \
BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define BLKPROP_SET_LSIZE(field, x) \
BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define BLKPROP_GET_PSIZE(field) \
BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)
#define BLKPROP_SET_PSIZE(field, x) \
BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x)
#define BLKPROP_GET_COMPRESS(field) BF64_GET((field), 32, 7)
#define BLKPROP_SET_COMPRESS(field, x) BF64_SET((field), 32, 7, x)
#define BLKPROP_GET_CHECKSUM(field) BF64_GET((field), 40, 8)
#define BLKPROP_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x)
#define BLKPROP_GET_TYPE(field) BF64_GET((field), 48, 8)
#define BLKPROP_SET_TYPE(field, x) BF64_SET((field), 48, 8, x)
#define BLKPROP_GET_PROTECTED(field) BF64_GET((field), 56, 1)
#define BLKPROP_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x)
#define BLKPROP_GET_PREFETCH(field) BF64_GET((field), 57, 1)
#define BLKPROP_SET_PREFETCH(field, x) BF64_SET((field), 57, 1, x)

/*
* Swap the values of two pointer lvalues.
*
* x and y must be modifiable pointer lvalues; each is evaluated twice, so
* they must not have side effects. The temporary uses a name that is
* unlikely to collide with a caller's variable: with a plain "tmp",
* PTR_SWAP(tmp, other) would expand to "void *tmp = (tmp);" and silently
* fail to swap. The arguments are parenthesized so that the assignments
* bind to the whole argument expression.
*/
#define PTR_SWAP(x, y) \
do { \
void *ptr_swap_tmp_ = (x); \
(x) = (y); \
(y) = ptr_swap_tmp_; \
_NOTE(CONSTCOND) \
} while (0)

/*
* Magic numbers stored in dh_magic (device header) and lb_magic (log block).
* The byte order in which a magic is read back determines whether the
* 64-bit fields of the structure need to be byte-swapped.
*/
#define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */
#define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */

/*
* L2ARC Internals
*
* In-core state of a single L2ARC device, including the state needed to
* keep its contents persistent (device header and currently open log block).
*/
typedef struct l2arc_dev {
vdev_t *l2ad_vdev; /* vdev */
spa_t *l2ad_spa; /* spa */
uint64_t l2ad_hand; /* next write location */
uint64_t l2ad_start; /* first addr on device */
uint64_t l2ad_end; /* last addr on device */
boolean_t l2ad_first; /* first sweep through */
boolean_t l2ad_writing; /* currently writing */
kmutex_t l2ad_mtx; /* lock for buffer list */
list_t l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */
zfs_refcount_t l2ad_alloc; /* allocated bytes */
/*
* Persistence-related stuff
*/
l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */
uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */
/* embedded by value; l2arc_log_blk_phys_t is about 128K in size */
l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */
int l2ad_log_ent_idx; /* index into cur log blk */
/* number of bytes in current log block's payload */
uint64_t l2ad_log_blk_payload_asize;
/* flag indicating whether a rebuild is scheduled or is going on */
boolean_t l2ad_rebuild;
/* NOTE(review): presumably requests that a rebuild be aborted - confirm */
boolean_t l2ad_rebuild_cancel;
/* NOTE(review): presumably set once the rebuild has started - confirm */
boolean_t l2ad_rebuild_began;
} l2arc_dev_t;

/*
* Encrypted blocks will need to be stored encrypted on the L2ARC
* disk as they appear in the main pool. In order for this to work we
@@ -206,20 +372,6 @@ typedef struct arc_buf_hdr_crypt {
uint8_t b_mac[ZIO_DATA_MAC_LEN];
} arc_buf_hdr_crypt_t;

/*
* In-core state of a single L2ARC device (definition without the
* persistence-related fields).
*/
typedef struct l2arc_dev {
vdev_t *l2ad_vdev; /* vdev */
spa_t *l2ad_spa; /* spa */
uint64_t l2ad_hand; /* next write location */
uint64_t l2ad_start; /* first addr on device */
uint64_t l2ad_end; /* last addr on device */
boolean_t l2ad_first; /* first sweep through */
boolean_t l2ad_writing; /* currently writing */
kmutex_t l2ad_mtx; /* lock for buffer list */
list_t l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */
zfs_refcount_t l2ad_alloc; /* allocated bytes */
} l2arc_dev_t;

typedef struct l2arc_buf_hdr {
/* protected by arc_buf_hdr mutex */
l2arc_dev_t *b_dev; /* L2ARC device */
@@ -232,6 +384,9 @@ typedef struct l2arc_buf_hdr {
/*
* State carried by an L2ARC write: the target device, the head of the
* list of buffers being written, and the buffers used for the persistent
* metadata (device header and log blocks).
*/
typedef struct l2arc_write_callback {
l2arc_dev_t *l2wcb_dev; /* device info */
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
abd_t *l2wcb_abd; /* abd for L2ARC dev header */
/* in-flight list of log blocks */
/* NOTE(review): elements are presumably l2arc_log_blk_abd_t - confirm */
list_t l2wcb_abd_list;
} l2arc_write_callback_t;

struct arc_buf_hdr {
@@ -532,6 +687,76 @@ typedef struct arc_stats {
kstat_named_t arcstat_l2_psize;
/* Not updated directly; only synced in arc_kstat_update. */
kstat_named_t arcstat_l2_hdr_size;
/*
* Number of L2ARC log blocks written. These are used for restoring the
* L2ARC.
* Updated during writing of L2ARC log blocks.
*/
kstat_named_t arcstat_l2_log_blk_writes;
/*
* Moving average of the size of the L2ARC log blocks, in bytes.
* Updated during L2ARC rebuild and during writing of L2ARC log blocks.
*/
kstat_named_t arcstat_l2_log_blk_avg_size;
/*
* Moving average of the ratio of the physical size of L2ARC restored
* data, in bytes, to the physical size of their metadata in ARC, in bytes.
* Updated during L2ARC rebuild and during writing of L2ARC log blocks.
*/
kstat_named_t arcstat_l2_data_to_meta_ratio;
/*
* Number of times the L2ARC rebuild was successful for an L2ARC device.
*/
kstat_named_t arcstat_l2_rebuild_success;
/*
* Number of times the L2ARC rebuild failed because the device header
* was in an unsupported format.
*/
kstat_named_t arcstat_l2_rebuild_abort_unsupported;
/*
* Number of times the L2ARC rebuild failed because of IO errors
* while reading the device header.
*/
kstat_named_t arcstat_l2_rebuild_abort_io_errors;
/*
* Number of times the L2ARC rebuild failed because the device header
* was invalid (either not initialized or corrupted or the IO failed).
*/
kstat_named_t arcstat_l2_rebuild_abort_dh_errors;
/*
* Number of L2ARC log blocks which had none of their log entries
* (buffers) restored in ARC due to checksum errors.
*/
kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors;
/*
* Number of L2ARC log entries (buffers) which failed to be restored
* in ARC due to checksum errors.
*/
kstat_named_t arcstat_l2_rebuild_abort_cksum_le_errors;
/*
* Number of times the L2ARC rebuild was aborted due to low system
* memory.
*/
kstat_named_t arcstat_l2_rebuild_abort_lowmem;
/* Logical size of L2ARC restored data, in bytes. */
kstat_named_t arcstat_l2_rebuild_size;
/*
* Number of L2ARC log entries (buffers) that were successfully
* restored in ARC.
*/
kstat_named_t arcstat_l2_rebuild_bufs;
/*
* Number of L2ARC log entries (buffers) already cached in ARC. These
* were not restored again.
*/
kstat_named_t arcstat_l2_rebuild_bufs_precached;
/* Physical size of L2ARC restored data, in bytes. */
kstat_named_t arcstat_l2_rebuild_psize;
/*
* Number of L2ARC log blocks that were restored successfully. Each
* log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers.
*/
kstat_named_t arcstat_l2_rebuild_log_blks;
kstat_named_t arcstat_memory_throttle_count;
kstat_named_t arcstat_memory_direct_count;
kstat_named_t arcstat_memory_indirect_count;
@@ -573,6 +573,11 @@ typedef enum zfs_key_location {
#define ZPL_VERSION_USERSPACE ZPL_VERSION_4
#define ZPL_VERSION_SA ZPL_VERSION_5

/* Persistent L2ARC version */
#define L2ARC_PERSISTENT_VERSION_1 1ULL
#define L2ARC_PERSISTENT_VERSION L2ARC_PERSISTENT_VERSION_1
#define L2ARC_PERSISTENT_VERSION_STRING "1"

/* Rewind policy information */
#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */
#define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */
@@ -687,6 +692,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_PHYS_PATH "phys_path"
#define ZPOOL_CONFIG_IS_LOG "is_log"
#define ZPOOL_CONFIG_L2CACHE "l2cache"
#define ZPOOL_CONFIG_L2CACHE_PERSISTENT "l2cache_persistent"
#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array"
#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children"
#define ZPOOL_CONFIG_IS_HOLE "is_hole"
@@ -787,6 +787,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
#define SPA_ASYNC_INITIALIZE_RESTART 0x100
#define SPA_ASYNC_TRIM_RESTART 0x200
#define SPA_ASYNC_AUTOTRIM_RESTART 0x400
#define SPA_ASYNC_L2CACHE_REBUILD 0x800

/*
* Controls the behavior of spa_vdev_remove().
@@ -203,7 +203,7 @@ Default value: \fB200\fR%.
.ad
.RS 12n
Do not write buffers to L2ARC if they were prefetched but not used by
applications
applications.
.sp
Use \fB1\fR for yes (default) and \fB0\fR to disable.
.RE
@@ -214,7 +214,7 @@ Use \fB1\fR for yes (default) and \fB0\fR to disable.
\fBl2arc_norw\fR (int)
.ad
.RS 12n
No reads during writes
No reads during writes.
.sp
Use \fB1\fR for yes and \fB0\fR for no (default).
.RE
@@ -237,11 +237,25 @@ Default value: \fB8,388,608\fR.
\fBl2arc_write_max\fR (ulong)
.ad
.RS 12n
Max write bytes per interval
Max write bytes per interval.
.sp
Default value: \fB8,388,608\fR.
.RE

.sp
.ne 2
.na
\fBl2arc_rebuild_enabled\fR (int)
.ad
.RS 12n
Rebuild the L2ARC when importing a pool. This can be disabled if there are
problems importing a pool or attaching an L2ARC device (e.g. the L2ARC device
is slow in reading stored log metadata, or the metadata has somehow become
fragmented/unusable).
.sp
Use \fB1\fR for yes (default) and \fB0\fR for no.
.RE

.sp
.ne 2
.na
@@ -323,8 +323,9 @@ If a read error is encountered on a cache device, that read I/O is reissued to
the original storage pool device, which might be part of a mirrored or raidz
configuration.
.Pp
The content of the cache devices is considered volatile, as is the case with
other system caches.
The content of the cache devices is persistent across reboots. This can be
disabled by setting
.Sy l2arc_rebuild_enabled = 0 .
.Ss Pool checkpoint
Before starting critical procedures that include destructive actions (e.g
.Nm zfs Cm destroy

0 comments on commit e1e917f

Please sign in to comment.
You can’t perform that action at this time.