Skip to content

Commit b9541d6

Browse files
cwill authored and behlendorf committed
Illumos 5408 - managing ZFS cache devices requires lots of RAM
5408 managing ZFS cache devices requires lots of RAM

Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Don Brady <dev.fs.zfs@gmail.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Approved by: Garrett D'Amore <garrett@damore.org>

Porting notes:

Due to the restructuring of the ARC-related structures, this patch
conflicts with at least the following existing ZoL commits:

    6e1d727 Fix inaccurate arcstat_l2_hdr_size calculations

        The ARC_SPACE_HDRS constant no longer exists and has been
        somewhat equivalently replaced by HDR_L2ONLY_SIZE.

    e0b0ca9 Add visibility in to cached dbufs

        The new layering of l{1,2}arc_buf_hdr_t within the arc_buf_hdr
        struct requires additional structure member names to be used when
        referencing the inner items. Also, the presence of L1 or L2 inner
        member is indicated by flags using the new HDR_HAS_L{1,2}HDR macros.

Ported by: Tim Chase <tim@chase2k.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
1 parent 2a43241 commit b9541d6

File tree

5 files changed: 948 additions (+948) and 632 deletions (-632)

cmd/ztest/ztest.c

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -4042,7 +4042,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
40424042
* assign an arcbuf to a dbuf.
40434043
*/
40444044
for (j = 0; j < s; j++) {
4045-
if (i != 5) {
4045+
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
40464046
bigbuf_arcbufs[j] =
40474047
dmu_request_arcbuf(bonus_db, chunksize);
40484048
} else {
@@ -4066,7 +4066,8 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
40664066
umem_free(packbuf, packsize);
40674067
umem_free(bigbuf, bigsize);
40684068
for (j = 0; j < s; j++) {
4069-
if (i != 5) {
4069+
if (i != 5 ||
4070+
chunksize < (SPA_MINBLOCKSIZE * 2)) {
40704071
dmu_return_arcbuf(bigbuf_arcbufs[j]);
40714072
} else {
40724073
dmu_return_arcbuf(
@@ -4111,7 +4112,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
41114112
}
41124113
for (off = bigoff, j = 0; j < s; j++, off += chunksize) {
41134114
dmu_buf_t *dbt;
4114-
if (i != 5) {
4115+
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
41154116
bcopy((caddr_t)bigbuf + (off - bigoff),
41164117
bigbuf_arcbufs[j]->b_data, chunksize);
41174118
} else {
@@ -4128,7 +4129,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id)
41284129
VERIFY(dmu_buf_hold(os, bigobj, off,
41294130
FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0);
41304131
}
4131-
if (i != 5) {
4132+
if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) {
41324133
dmu_assign_arcbuf(bonus_db, off,
41334134
bigbuf_arcbufs[j], tx);
41344135
} else {

include/sys/arc.h

Lines changed: 23 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -81,10 +81,29 @@ typedef enum arc_flags
8181
ARC_FLAG_FREED_IN_READ = 1 << 10, /* freed during read */
8282
ARC_FLAG_BUF_AVAILABLE = 1 << 11, /* block not in use */
8383
ARC_FLAG_INDIRECT = 1 << 12, /* indirect block */
84-
ARC_FLAG_FREE_IN_PROGRESS = 1 << 13, /* about to be freed */
85-
ARC_FLAG_L2_WRITING = 1 << 14, /* write in progress */
86-
ARC_FLAG_L2_EVICTED = 1 << 15, /* evicted during I/O */
87-
ARC_FLAG_L2_WRITE_HEAD = 1 << 16, /* head of write list */
84+
ARC_FLAG_L2_WRITING = 1 << 13, /* write in progress */
85+
ARC_FLAG_L2_EVICTED = 1 << 14, /* evicted during I/O */
86+
ARC_FLAG_L2_WRITE_HEAD = 1 << 15, /* head of write list */
87+
/* indicates that the buffer contains metadata (otherwise, data) */
88+
ARC_FLAG_BUFC_METADATA = 1 << 16,
89+
90+
/* Flags specifying whether optional hdr struct fields are defined */
91+
ARC_FLAG_HAS_L1HDR = 1 << 17,
92+
ARC_FLAG_HAS_L2HDR = 1 << 18,
93+
94+
/*
95+
* The arc buffer's compression mode is stored in the top 7 bits of the
96+
* flags field, so these dummy flags are included so that MDB can
97+
* interpret the enum properly.
98+
*/
99+
ARC_FLAG_COMPRESS_0 = 1 << 24,
100+
ARC_FLAG_COMPRESS_1 = 1 << 25,
101+
ARC_FLAG_COMPRESS_2 = 1 << 26,
102+
ARC_FLAG_COMPRESS_3 = 1 << 27,
103+
ARC_FLAG_COMPRESS_4 = 1 << 28,
104+
ARC_FLAG_COMPRESS_5 = 1 << 29,
105+
ARC_FLAG_COMPRESS_6 = 1 << 30
106+
88107
} arc_flags_t;
89108

90109
struct arc_buf {

include/sys/arc_impl.h

Lines changed: 74 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -74,8 +74,6 @@ typedef struct arc_state {
7474
arc_state_type_t arcs_state;
7575
} arc_state_t;
7676

77-
typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
78-
7977
typedef struct arc_callback arc_callback_t;
8078

8179
struct arc_callback {
@@ -96,27 +94,45 @@ struct arc_write_callback {
9694
arc_buf_t *awcb_buf;
9795
};
9896

99-
struct arc_buf_hdr {
100-
/* protected by hash lock */
101-
dva_t b_dva;
102-
uint64_t b_birth;
103-
uint64_t b_cksum0;
104-
97+
/*
98+
* ARC buffers are separated into multiple structs as a memory saving measure:
99+
* - Common fields struct, always defined, and embedded within it:
100+
* - L2-only fields, always allocated but undefined when not in L2ARC
101+
* - L1-only fields, only allocated when in L1ARC
102+
*
103+
* Buffer in L1 Buffer only in L2
104+
* +------------------------+ +------------------------+
105+
* | arc_buf_hdr_t | | arc_buf_hdr_t |
106+
* | | | |
107+
* | | | |
108+
* | | | |
109+
* +------------------------+ +------------------------+
110+
* | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
111+
* | (undefined if L1-only) | | |
112+
* +------------------------+ +------------------------+
113+
* | l1arc_buf_hdr_t |
114+
* | |
115+
* | |
116+
* | |
117+
* | |
118+
* +------------------------+
119+
*
120+
* Because it's possible for the L2ARC to become extremely large, we can wind
121+
* up eating a lot of memory in L2ARC buffer headers, so the size of a header
122+
* is minimized by only allocating the fields necessary for an L1-cached buffer
123+
* when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
124+
* l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
125+
* words in pointers. arc_hdr_realloc() is used to switch a header between
126+
* these two allocation states.
127+
*/
128+
typedef struct l1arc_buf_hdr {
105129
kmutex_t b_freeze_lock;
106-
zio_cksum_t *b_freeze_cksum;
107130

108-
arc_buf_hdr_t *b_hash_next;
109131
arc_buf_t *b_buf;
110-
arc_flags_t b_flags;
111132
uint32_t b_datacnt;
112-
113-
arc_callback_t *b_acb;
133+
/* for waiting on writes to complete */
114134
kcondvar_t b_cv;
115135

116-
/* immutable */
117-
arc_buf_contents_t b_type;
118-
uint64_t b_size;
119-
uint64_t b_spa;
120136

121137
/* protected by arc state mutex */
122138
arc_state_t *b_state;
@@ -133,9 +149,10 @@ struct arc_buf_hdr {
133149
/* self protecting */
134150
refcount_t b_refcnt;
135151

136-
l2arc_buf_hdr_t *b_l2hdr;
137-
list_node_t b_l2node;
138-
};
152+
arc_callback_t *b_acb;
153+
/* temporary buffer holder for in-flight compressed data */
154+
void *b_tmp_cdata;
155+
} l1arc_buf_hdr_t;
139156

140157
typedef struct l2arc_dev {
141158
vdev_t *l2ad_vdev; /* vdev */
@@ -146,15 +163,51 @@ typedef struct l2arc_dev {
146163
uint64_t l2ad_evict; /* last addr eviction reached */
147164
boolean_t l2ad_first; /* first sweep through */
148165
boolean_t l2ad_writing; /* currently writing */
149-
list_t *l2ad_buflist; /* buffer list */
166+
kmutex_t l2ad_mtx; /* lock for buffer list */
167+
list_t l2ad_buflist; /* buffer list */
150168
list_node_t l2ad_node; /* device list node */
151169
} l2arc_dev_t;
152170

171+
typedef struct l2arc_buf_hdr {
172+
/* protected by arc_buf_hdr mutex */
173+
l2arc_dev_t *b_dev; /* L2ARC device */
174+
uint64_t b_daddr; /* disk address, offset byte */
175+
/* real alloc'd buffer size depending on b_compress applied */
176+
uint32_t b_hits;
177+
int32_t b_asize;
178+
179+
list_node_t b_l2node;
180+
} l2arc_buf_hdr_t;
181+
153182
typedef struct l2arc_write_callback {
154183
l2arc_dev_t *l2wcb_dev; /* device info */
155184
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
156185
} l2arc_write_callback_t;
157186

187+
struct arc_buf_hdr {
188+
/* protected by hash lock */
189+
dva_t b_dva;
190+
uint64_t b_birth;
191+
/*
192+
* Even though this checksum is only set/verified when a buffer is in
193+
* the L1 cache, it needs to be in the set of common fields because it
194+
* must be preserved from the time before a buffer is written out to
195+
* L2ARC until after it is read back in.
196+
*/
197+
zio_cksum_t *b_freeze_cksum;
198+
199+
arc_buf_hdr_t *b_hash_next;
200+
arc_flags_t b_flags;
201+
202+
/* immutable */
203+
int32_t b_size;
204+
uint64_t b_spa;
205+
206+
/* L2ARC fields. Undefined when not in L2ARC. */
207+
l2arc_buf_hdr_t b_l2hdr;
208+
/* L1ARC fields. Undefined when in l2arc_only state */
209+
l1arc_buf_hdr_t b_l1hdr;
210+
};
158211
#ifdef __cplusplus
159212
}
160213
#endif

include/sys/trace_arc.h

Lines changed: 24 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,6 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
4545
TP_STRUCT__entry(
4646
__array(uint64_t, hdr_dva_word, 2)
4747
__field(uint64_t, hdr_birth)
48-
__field(uint64_t, hdr_cksum0)
4948
__field(uint32_t, hdr_flags)
5049
__field(uint32_t, hdr_datacnt)
5150
__field(arc_buf_contents_t, hdr_type)
@@ -64,27 +63,25 @@ DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class,
6463
__entry->hdr_dva_word[0] = ab->b_dva.dva_word[0];
6564
__entry->hdr_dva_word[1] = ab->b_dva.dva_word[1];
6665
__entry->hdr_birth = ab->b_birth;
67-
__entry->hdr_cksum0 = ab->b_cksum0;
6866
__entry->hdr_flags = ab->b_flags;
69-
__entry->hdr_datacnt = ab->b_datacnt;
70-
__entry->hdr_type = ab->b_type;
67+
__entry->hdr_datacnt = ab->b_l1hdr.b_datacnt;
7168
__entry->hdr_size = ab->b_size;
7269
__entry->hdr_spa = ab->b_spa;
73-
__entry->hdr_state_type = ab->b_state->arcs_state;
74-
__entry->hdr_access = ab->b_arc_access;
75-
__entry->hdr_mru_hits = ab->b_mru_hits;
76-
__entry->hdr_mru_ghost_hits = ab->b_mru_ghost_hits;
77-
__entry->hdr_mfu_hits = ab->b_mfu_hits;
78-
__entry->hdr_mfu_ghost_hits = ab->b_mfu_ghost_hits;
79-
__entry->hdr_l2_hits = ab->b_l2_hits;
80-
__entry->hdr_refcount = ab->b_refcnt.rc_count;
70+
__entry->hdr_state_type = ab->b_l1hdr.b_state->arcs_state;
71+
__entry->hdr_access = ab->b_l1hdr.b_arc_access;
72+
__entry->hdr_mru_hits = ab->b_l1hdr.b_mru_hits;
73+
__entry->hdr_mru_ghost_hits = ab->b_l1hdr.b_mru_ghost_hits;
74+
__entry->hdr_mfu_hits = ab->b_l1hdr.b_mfu_hits;
75+
__entry->hdr_mfu_ghost_hits = ab->b_l1hdr.b_mfu_ghost_hits;
76+
__entry->hdr_l2_hits = ab->b_l1hdr.b_l2_hits;
77+
__entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count;
8178
),
82-
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx "
79+
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
8380
"flags 0x%x datacnt %u type %u size %llu spa %llu "
8481
"state_type %u access %lu mru_hits %u mru_ghost_hits %u "
8582
"mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }",
8683
__entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
87-
__entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags,
84+
__entry->hdr_birth, __entry->hdr_flags,
8885
__entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size,
8986
__entry->hdr_spa, __entry->hdr_state_type,
9087
__entry->hdr_access, __entry->hdr_mru_hits,
@@ -261,7 +258,6 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
261258
TP_STRUCT__entry(
262259
__array(uint64_t, hdr_dva_word, 2)
263260
__field(uint64_t, hdr_birth)
264-
__field(uint64_t, hdr_cksum0)
265261
__field(uint32_t, hdr_flags)
266262
__field(uint32_t, hdr_datacnt)
267263
__field(arc_buf_contents_t, hdr_type)
@@ -292,20 +288,18 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
292288
__entry->hdr_dva_word[0] = hdr->b_dva.dva_word[0];
293289
__entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1];
294290
__entry->hdr_birth = hdr->b_birth;
295-
__entry->hdr_cksum0 = hdr->b_cksum0;
296291
__entry->hdr_flags = hdr->b_flags;
297-
__entry->hdr_datacnt = hdr->b_datacnt;
298-
__entry->hdr_type = hdr->b_type;
292+
__entry->hdr_datacnt = hdr->b_l1hdr.b_datacnt;
299293
__entry->hdr_size = hdr->b_size;
300294
__entry->hdr_spa = hdr->b_spa;
301-
__entry->hdr_state_type = hdr->b_state->arcs_state;
302-
__entry->hdr_access = hdr->b_arc_access;
303-
__entry->hdr_mru_hits = hdr->b_mru_hits;
304-
__entry->hdr_mru_ghost_hits = hdr->b_mru_ghost_hits;
305-
__entry->hdr_mfu_hits = hdr->b_mfu_hits;
306-
__entry->hdr_mfu_ghost_hits = hdr->b_mfu_ghost_hits;
307-
__entry->hdr_l2_hits = hdr->b_l2_hits;
308-
__entry->hdr_refcount = hdr->b_refcnt.rc_count;
295+
__entry->hdr_state_type = hdr->b_l1hdr.b_state->arcs_state;
296+
__entry->hdr_access = hdr->b_l1hdr.b_arc_access;
297+
__entry->hdr_mru_hits = hdr->b_l1hdr.b_mru_hits;
298+
__entry->hdr_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
299+
__entry->hdr_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
300+
__entry->hdr_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
301+
__entry->hdr_l2_hits = hdr->b_l1hdr.b_l2_hits;
302+
__entry->hdr_refcount = hdr->b_l1hdr.b_refcnt.rc_count;
309303

310304
__entry->bp_dva0[0] = bp->blk_dva[0].dva_word[0];
311305
__entry->bp_dva0[1] = bp->blk_dva[0].dva_word[1];
@@ -325,17 +319,17 @@ DECLARE_EVENT_CLASS(zfs_arc_miss_class,
325319
__entry->zb_level = zb->zb_level;
326320
__entry->zb_blkid = zb->zb_blkid;
327321
),
328-
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu cksum0 0x%llx "
329-
"flags 0x%x datacnt %u type %u size %llu spa %llu state_type %u "
322+
TP_printk("hdr { dva 0x%llx:0x%llx birth %llu "
323+
"flags 0x%x datacnt %u size %llu spa %llu state_type %u "
330324
"access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u "
331325
"mfu_ghost_hits %u l2_hits %u refcount %lli } "
332326
"bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 "
333327
"0x%llx:0x%llx cksum 0x%llx:0x%llx:0x%llx:0x%llx "
334328
"lsize %llu } zb { objset %llu object %llu level %lli "
335329
"blkid %llu }",
336330
__entry->hdr_dva_word[0], __entry->hdr_dva_word[1],
337-
__entry->hdr_birth, __entry->hdr_cksum0, __entry->hdr_flags,
338-
__entry->hdr_datacnt, __entry->hdr_type, __entry->hdr_size,
331+
__entry->hdr_birth, __entry->hdr_flags,
332+
__entry->hdr_datacnt, __entry->hdr_size,
339333
__entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access,
340334
__entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits,
341335
__entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits,

0 commit comments

Comments
 (0)