/*-
* Copyright (c) 2014-2015 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
* See the file LICENSE for redistribution information.
*/
#include "wt_internal.h"
struct __rec_boundary; typedef struct __rec_boundary WT_BOUNDARY;
struct __rec_dictionary; typedef struct __rec_dictionary WT_DICTIONARY;
struct __rec_kv; typedef struct __rec_kv WT_KV;
/*
* Reconciliation is the process of taking an in-memory page, walking each entry
* in the page, building a backing disk image in a temporary buffer representing
* that information, and writing that buffer to disk. What could be simpler?
*
* WT_RECONCILE --
* Information tracking a single page reconciliation.
*/
typedef struct {
WT_REF *ref; /* Page being reconciled */
WT_PAGE *page;
uint32_t flags; /* Caller's configuration */
WT_ITEM dsk; /* Temporary disk-image buffer */
/* Track whether all changes to the page are written. */
uint64_t max_txn;
uint64_t skipped_txn;
uint32_t orig_write_gen;
/*
* If page updates are skipped because they are as yet unresolved, or
* the page has updates we cannot discard, the page is left "dirty":
* the page cannot be discarded and a subsequent reconciliation will
* be necessary to discard the page.
*/
int leave_dirty;
/*
* Raw compression (don't get me started, as if normal reconciliation
* wasn't bad enough). If an application wants absolute control over
* what gets written to disk, we give it a list of byte strings and it
* gives us back an image that becomes a file block. Because we don't
* know the number of items we're storing in a block until we've done
* a lot of work, we turn off most compression: dictionary, copy-cell,
* prefix and row-store internal page suffix compression are all off.
*/
int raw_compression;
uint32_t raw_max_slots; /* Raw compression array sizes */
uint32_t *raw_entries; /* Raw compression slot entries */
uint32_t *raw_offsets; /* Raw compression slot offsets */
uint64_t *raw_recnos; /* Raw compression recno count */
WT_ITEM raw_destination; /* Raw compression destination buffer */
/*
* Track if reconciliation has seen any overflow items. If a leaf page
* with no overflow items is written, the parent page's address cell is
* set to the leaf-no-overflow type. This means we can delete the leaf
* page without reading it because we don't have to discard any overflow
* items it might reference.
*
* The test is per-page reconciliation, that is, once we see an
* overflow item on the page, all subsequent leaf pages written for the
* page will not be leaf-no-overflow type, regardless of whether or not
* they contain overflow items. In other words, leaf-no-overflow is not
* guaranteed to be set on every page that doesn't contain an overflow
* item, only that if it is set, the page contains no overflow items.
*
* The reason is raw compression: there's no easy/fast way to
* figure out if the rows selected by raw compression included overflow
* items, and the optimization isn't worth another pass over the data.
*/
int ovfl_items;
/*
* Track if reconciliation of a row-store leaf page has seen empty (zero
* length) values. We don't write out anything for empty values, so if
* there are empty values on a page, we have to make two passes over the
* page when it's read to figure out how many keys it has, expensive in
* the common case of no empty values and (entries / 2) keys. Likewise,
* a page with only empty values is another common data set, and keys on
* that page will be equal to the number of entries. In both cases, set
* a flag in the page's on-disk header.
*
* The test is per-page reconciliation as described above for the
* overflow-item test.
*/
int all_empty_value, any_empty_value;
/*
* Reconciliation gets tricky if we have to split a page, which happens
* when the disk image we create exceeds the page type's maximum disk
* image size.
*
* First, the sizes of the page we're building. If WiredTiger is doing
* page layout, page_size is the same as page_size_orig. We accumulate
* a "page size" of raw data and when we reach that size, we split the
* page into multiple chunks, eventually compressing those chunks. When
* the application is doing page layout (raw compression is configured),
* page_size can continue to grow past page_size_orig, and we keep
* accumulating raw data until the raw compression callback accepts it.
*/
uint32_t page_size; /* Set page size */
uint32_t page_size_orig; /* Saved set page size */
/*
* Second, the split size: if we're doing the page layout, split to a
* smaller-than-maximum page size when a split is required so we don't
* repeatedly split a packed page.
*/
uint32_t split_size; /* Split page size */
/*
* The problem with splits is that by the time we realize we're going
* to have to split, we've done a lot of work; we don't want to start
* over.
*
* To keep from having to start over when we hit the maximum page size,
* we track the page information when we approach a split boundary.
* If we eventually have to split, we walk this structure and pretend
* we were splitting all along. After that, we continue to append to
* this structure, and eventually walk it to create a new internal page
* that references all of our split pages.
*/
struct __rec_boundary {
/*
* Offset is the byte offset in the initial split buffer of the
* first byte of the split chunk, recorded before we decide to
* split the page; the difference between chunk[1]'s offset and
* chunk[0]'s offset is chunk[0]'s length.
*
* Once we split a page, we stop filling in offset values, we're
* writing the split chunks as we find them.
*/
size_t offset; /* Split's first byte */
/*
* The recno and entries fields are the starting record number
* of the split chunk (for column-store splits), and the number
* of entries in the split chunk. These fields are used both
* to write the split chunk, and to create a new internal page
* to reference the split pages.
*/
uint64_t recno; /* Split's starting record */
uint32_t entries; /* Split's entries */
WT_ADDR addr; /* Split's written location */
uint32_t size; /* Split's size */
uint32_t cksum; /* Split's checksum */
void *dsk; /* Split's disk image */
/*
* When busy pages get large, we need to be able to evict them
* even when they contain unresolved updates, or updates which
* cannot be evicted because of running transactions. In such
* cases, break the page into multiple blocks, write the blocks
* that can be evicted, saving lists of updates for blocks that
* cannot be evicted, then re-instantiate the blocks that cannot
* be evicted as new, in-memory pages, restoring the updates on
* those pages.
*/
WT_UPD_SKIPPED *skip; /* Skipped updates */
uint32_t skip_next;
size_t skip_allocated;
/*
* The key for a row-store page; no column-store key is needed
* because the page's recno, stored in the recno field, is the
* column-store key.
*/
WT_ITEM key; /* Promoted row-store key */
/*
* During wrapup, after reconciling the root page, we write a
* final block as part of a checkpoint. If raw compression
* was configured, that block may have already been compressed.
*/
int already_compressed;
} *bnd; /* Saved boundaries */
uint32_t bnd_next; /* Next boundary slot */
uint32_t bnd_next_max; /* Maximum boundary slots used */
size_t bnd_entries; /* Total boundary slots */
size_t bnd_allocated; /* Bytes allocated */
/*
* We track the total number of page entries copied into split chunks
* so we can easily figure out how many entries are in the current
* split chunk.
*/
uint32_t total_entries; /* Total entries in splits */
/*
* There's also state information about where we are in this process:
* (1) tracking split boundaries because we can still fit more split
* chunks into the maximum page size, (2) tracking the maximum page
* size boundary because we can't fit any more split chunks into the
* maximum page size, (3) not performing boundary checks because it's
* either not useful with the current page size configuration, or
* because we've already been forced to split.
*/
enum { SPLIT_BOUNDARY=0, /* Next: a split page boundary */
SPLIT_MAX=1, /* Next: the maximum page boundary */
SPLIT_TRACKING_OFF=2, /* No boundary checks */
SPLIT_TRACKING_RAW=3 } /* Underlying compression decides */
bnd_state;
/*
* We track the current record number, the
* number of entries copied into the temporary buffer, where we are
* in the temporary buffer, and how much memory remains. Those items
* are packaged here rather than passing pointers to stack locations
* around the code.
*/
uint64_t recno; /* Current record number */
uint32_t entries; /* Current number of entries */
uint8_t *first_free; /* Current first free byte */
size_t space_avail; /* Remaining space in this chunk */
/*
* While reviewing updates for each page, we store skipped updates here,
* and then move them to per-block areas as the blocks are defined.
*/
WT_UPD_SKIPPED *skip; /* Skipped updates */
uint32_t skip_next;
size_t skip_allocated;
/*
* We don't need to keep the 0th key around on internal pages; the
* search code ignores it, as nothing can sort less by definition.
* There's some trickiness here, see the code for comments on how
* these fields work.
*/
int cell_zero; /* Row-store internal page 0th key */
/*
* WT_DICTIONARY --
* We optionally build a dictionary of row-store values for leaf
* pages. Where two value cells are identical, we only write the value
* once; the second and subsequent copies point to the original cell.
* The dictionary is fixed size, but organized in a skip-list to make
* searches faster (see the sketch following this structure).
*/
struct __rec_dictionary {
uint64_t hash; /* Hash value */
void *cell; /* Matching cell */
u_int depth; /* Skiplist */
WT_DICTIONARY *next[0];
} **dictionary; /* Dictionary */
u_int dictionary_next, dictionary_slots; /* Next, max entries */
/* Skiplist head. */
WT_DICTIONARY *dictionary_head[WT_SKIP_MAXDEPTH];
/*
* WT_KV --
* An on-page key/value item we're building.
*/
struct __rec_kv {
WT_ITEM buf; /* Data */
WT_CELL cell; /* Cell and cell's length */
size_t cell_len;
size_t len; /* Total length of cell + data */
} k, v; /* Key/Value being built */
WT_ITEM *cur, _cur; /* Key/Value being built */
WT_ITEM *last, _last; /* Last key/value built */
int key_pfx_compress; /* If can prefix-compress next key */
int key_pfx_compress_conf; /* If prefix compression configured */
int key_sfx_compress; /* If can suffix-compress next key */
int key_sfx_compress_conf; /* If suffix compression configured */
int is_bulk_load; /* If it's a bulk load */
WT_SALVAGE_COOKIE *salvage; /* If it's a salvage operation */
int tested_ref_state; /* Debugging information */
} WT_RECONCILE;
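/*
 * Illustrative sketch, not part of this file's logic: the dictionary
 * above is searched with a standard skip-list walk keyed on the value
 * cell's hash. The "r" and "hash" variables here are hypothetical;
 * the real search code appears with the dictionary functions later in
 * this file. Starting at the highest level, follow forward pointers
 * while the next node's hash sorts before the search hash, otherwise
 * drop down a level:
 *
 *	WT_DICTIONARY **head, *next;
 *	int depth;
 *
 *	head = r->dictionary_head;
 *	for (depth = WT_SKIP_MAXDEPTH - 1; depth >= 0;)
 *		if ((next = head[depth]) != NULL && next->hash < hash)
 *			head = next->next;
 *		else
 *			--depth;
 */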
static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, int);
static void __rec_cell_build_addr(
WT_RECONCILE *, const void *, size_t, u_int, uint64_t);
static int __rec_cell_build_int_key(WT_SESSION_IMPL *,
WT_RECONCILE *, const void *, size_t, int *);
static int __rec_cell_build_leaf_key(WT_SESSION_IMPL *,
WT_RECONCILE *, const void *, size_t, int *);
static int __rec_cell_build_ovfl(WT_SESSION_IMPL *,
WT_RECONCILE *, WT_KV *, uint8_t, uint64_t);
static int __rec_cell_build_val(WT_SESSION_IMPL *,
WT_RECONCILE *, const void *, size_t, uint64_t);
static int __rec_child_deleted(
WT_SESSION_IMPL *, WT_RECONCILE *, WT_REF *, int *);
static int __rec_col_fix(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_col_fix_slvg(WT_SESSION_IMPL *,
WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
static int __rec_col_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_col_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_col_var(WT_SESSION_IMPL *,
WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
static int __rec_col_var_helper(WT_SESSION_IMPL *, WT_RECONCILE *,
WT_SALVAGE_COOKIE *, WT_ITEM *, int, uint8_t, uint64_t);
static int __rec_destroy_session(WT_SESSION_IMPL *);
static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t);
static int __rec_row_int(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_row_leaf(WT_SESSION_IMPL *,
WT_RECONCILE *, WT_PAGE *, WT_SALVAGE_COOKIE *);
static int __rec_row_leaf_insert(
WT_SESSION_IMPL *, WT_RECONCILE *, WT_INSERT *);
static int __rec_row_merge(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_split_col(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_split_discard(WT_SESSION_IMPL *, WT_PAGE *);
static int __rec_split_fixup(WT_SESSION_IMPL *, WT_RECONCILE *);
static int __rec_split_row(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_split_row_promote(
WT_SESSION_IMPL *, WT_RECONCILE *, WT_ITEM *, uint8_t);
static int __rec_split_write(WT_SESSION_IMPL *,
WT_RECONCILE *, WT_BOUNDARY *, WT_ITEM *, int);
static int __rec_write_init(WT_SESSION_IMPL *,
WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *);
static int __rec_write_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static int __rec_write_wrapup_err(
WT_SESSION_IMPL *, WT_RECONCILE *, WT_PAGE *);
static void __rec_dictionary_free(WT_SESSION_IMPL *, WT_RECONCILE *);
static int __rec_dictionary_init(WT_SESSION_IMPL *, WT_RECONCILE *, u_int);
static int __rec_dictionary_lookup(
WT_SESSION_IMPL *, WT_RECONCILE *, WT_KV *, WT_DICTIONARY **);
static void __rec_dictionary_reset(WT_RECONCILE *);
/*
* __wt_reconcile --
* Reconcile an in-memory page into its on-disk format, and write it.
*/
int
__wt_reconcile(WT_SESSION_IMPL *session,
WT_REF *ref, WT_SALVAGE_COOKIE *salvage, uint32_t flags)
{
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
WT_RECONCILE *r;
int locked;
conn = S2C(session);
page = ref->page;
mod = page->modify;
/* We shouldn't get called with a clean page; that's an error. */
if (!__wt_page_is_modified(page))
WT_RET_MSG(session, WT_ERROR,
"Attempt to reconcile a clean page.");
WT_RET(__wt_verbose(session,
WT_VERB_RECONCILE, "%s", __wt_page_type_string(page->type)));
WT_STAT_FAST_CONN_INCR(session, rec_pages);
WT_STAT_FAST_DATA_INCR(session, rec_pages);
if (LF_ISSET(WT_EVICTING)) {
WT_STAT_FAST_CONN_INCR(session, rec_pages_eviction);
WT_STAT_FAST_DATA_INCR(session, rec_pages_eviction);
}
#ifdef HAVE_DIAGNOSTIC
{
/*
* Check that transaction time always moves forward for a given page.
* If this check fails, reconciliation can free something that a future
* reconciliation will need.
*/
uint64_t oldest_id = __wt_txn_oldest_id(session);
WT_ASSERT(session, WT_TXNID_LE(mod->last_oldest_id, oldest_id));
mod->last_oldest_id = oldest_id;
}
#endif
/* Record the most recent transaction ID we will *not* write. */
mod->disk_snap_min = session->txn.snap_min;
/* Initialize the reconciliation structure for each new run. */
WT_RET(__rec_write_init(
session, ref, flags, salvage, &session->reconcile));
r = session->reconcile;
/*
* The compaction process looks at the page's modification information;
* if compaction is running, lock the page down.
*
* Otherwise, flip on the scanning flag: obsolete updates cannot be
* freed while reconciliation is in progress.
*/
locked = 0;
if (conn->compact_in_memory_pass) {
locked = 1;
WT_PAGE_LOCK(session, page);
} else
for (;;) {
F_CAS_ATOMIC(page, WT_PAGE_SCANNING, ret);
if (ret == 0)
break;
__wt_yield();
}
/* Reconcile the page. */
switch (page->type) {
case WT_PAGE_COL_FIX:
if (salvage != NULL)
ret = __rec_col_fix_slvg(session, r, page, salvage);
else
ret = __rec_col_fix(session, r, page);
break;
case WT_PAGE_COL_INT:
WT_WITH_PAGE_INDEX(session,
ret = __rec_col_int(session, r, page));
break;
case WT_PAGE_COL_VAR:
ret = __rec_col_var(session, r, page, salvage);
break;
case WT_PAGE_ROW_INT:
WT_WITH_PAGE_INDEX(session,
ret = __rec_row_int(session, r, page));
break;
case WT_PAGE_ROW_LEAF:
ret = __rec_row_leaf(session, r, page, salvage);
break;
WT_ILLEGAL_VALUE_SET(session);
}
/* Wrap up the page reconciliation. */
if (ret == 0)
ret = __rec_write_wrapup(session, r, page);
else
WT_TRET(__rec_write_wrapup_err(session, r, page));
/* Release the page lock if we're holding one. */
if (locked)
WT_PAGE_UNLOCK(session, page);
else
F_CLR_ATOMIC(page, WT_PAGE_SCANNING);
/*
* Clean up the boundary structures: some workloads result in millions
* of these structures, and if associated with some random session that
* got roped into doing forced eviction, they won't be discarded for the
* life of the session.
*/
__rec_bnd_cleanup(session, r, 0);
WT_RET(ret);
/*
* Root pages are special: splits have to be done now, we can't put
* them off as the parent's problem any more.
*/
if (__wt_ref_is_root(ref)) {
WT_WITH_PAGE_INDEX(session,
ret = __rec_root_write(session, page, flags));
return (ret);
}
/*
* Otherwise, mark the page's parent dirty.
* Don't mark the tree dirty: if this reconciliation is in service of a
* checkpoint, it's cleared the tree's dirty flag, and we don't want to
* set it again as part of that walk.
*/
return (__wt_page_parent_modify_set(session, ref, 1));
}
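/*
 * Illustrative sketch, not part of this file's logic: a typical
 * eviction-path call into reconciliation. The flags are the real ones
 * used in this file; the surrounding code is hypothetical, and
 * eviction may pass different combinations:
 *
 *	WT_RET(__wt_reconcile(session, ref, NULL,
 *	    WT_EVICTING | WT_SKIP_UPDATE_RESTORE));
 *
 * A checkpoint-driven call, by contrast, would not set WT_EVICTING,
 * and a salvage pass supplies a WT_SALVAGE_COOKIE in place of the
 * NULL.
 */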
/*
* __rec_root_write --
* Handle the write of a root page.
*/
static int
__rec_root_write(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
{
WT_DECL_RET;
WT_PAGE *next;
WT_PAGE_INDEX *pindex;
WT_PAGE_MODIFY *mod;
WT_REF fake_ref;
uint32_t i;
mod = page->modify;
/*
* If a single root page was written (either an empty page or there was
* a 1-for-1 page swap), we've written root and checkpoint, we're done.
* If the root page split, write the resulting WT_REF array. We already
* have infrastructure for writing pages, so create a fake root page and
* write it instead of adding code to write blocks based on the list of
* blocks resulting from a multiblock reconciliation.
*/
switch (F_ISSET(mod, WT_PM_REC_MASK)) {
case WT_PM_REC_EMPTY: /* Page is empty */
case WT_PM_REC_REPLACE: /* 1-for-1 page swap */
case WT_PM_REC_REWRITE: /* Rewrite */
return (0);
case WT_PM_REC_MULTIBLOCK: /* Multiple blocks */
break;
WT_ILLEGAL_VALUE(session);
}
WT_RET(__wt_verbose(session, WT_VERB_SPLIT,
"root page split -> %" PRIu32 " pages", mod->mod_multi_entries));
/*
* Create a new root page, initialize the array of child references,
* mark it dirty, then write it.
*/
switch (page->type) {
case WT_PAGE_COL_INT:
WT_RET(__wt_page_alloc(session,
WT_PAGE_COL_INT, 1, mod->mod_multi_entries, 0, &next));
break;
case WT_PAGE_ROW_INT:
WT_RET(__wt_page_alloc(session,
WT_PAGE_ROW_INT, 0, mod->mod_multi_entries, 0, &next));
break;
WT_ILLEGAL_VALUE(session);
}
WT_INTL_INDEX_GET(session, next, pindex);
for (i = 0; i < mod->mod_multi_entries; ++i) {
WT_ERR(__wt_multi_to_ref(session,
next, &mod->mod_multi[i], &pindex->index[i], NULL));
pindex->index[i]->home = next;
}
/*
* We maintain a list of pages written for the root in order to free the
* backing blocks the next time the root is written.
*/
mod->mod_root_split = next;
/*
* Mark the page dirty.
* Don't mark the tree dirty: if this reconciliation is in service of a
* checkpoint, it's cleared the tree's dirty flag, and we don't want to
* set it again as part of that walk.
*/
WT_ERR(__wt_page_modify_init(session, next));
__wt_page_only_modify_set(session, next);
/*
* Fake up a reference structure, and write the next root page.
*/
__wt_root_ref_init(&fake_ref, next, page->type == WT_PAGE_COL_INT);
return (__wt_reconcile(session, &fake_ref, NULL, flags));
err: __wt_page_out(session, &next);
return (ret);
}
/*
* __rec_raw_compression_config --
* Configure raw compression.
*/
static inline int
__rec_raw_compression_config(
WT_SESSION_IMPL *session, WT_PAGE *page, WT_SALVAGE_COOKIE *salvage)
{
WT_BTREE *btree;
btree = S2BT(session);
/* Check if raw compression configured. */
if (btree->compressor == NULL ||
btree->compressor->compress_raw == NULL)
return (0);
/* Only for row-store and variable-length column-store objects. */
if (page->type == WT_PAGE_COL_FIX)
return (0);
/*
* Raw compression cannot support dictionary compression. (Technically,
* we could still use the raw callback on column-store variable length
* internal pages with dictionary compression configured, because
* dictionary compression only applies to column-store leaf pages, but
* that seems an unlikely use case.)
*/
if (btree->dictionary != 0)
return (0);
/* Raw compression cannot support prefix compression. */
if (btree->prefix_compression != 0)
return (0);
/*
* Raw compression is also turned off during salvage: we can't allow
* pages to split during salvage, and raw compression has no point if it
* can't manipulate the page size.
*/
if (salvage != NULL)
return (0);
return (1);
}
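/*
 * Illustrative sketch, not part of this file's logic and not the real
 * WT_COMPRESSOR interface: the heart of the raw-compression contract
 * checked above is a callback that sees cell boundaries and decides
 * how many cells make up a block. A hypothetical callback shape,
 * where cell i's image ends at src + offsets[i]:
 *
 *	uint32_t
 *	pick_cells(const uint8_t *src, const uint32_t *offsets,
 *	    uint32_t slots, uint8_t *dst, size_t *dst_lenp);
 *
 * Reconciliation keeps accumulating cells until the callback accepts
 * a prefix of them, writes that prefix as a file block, and carries
 * the remainder forward into the next block.
 */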
/*
* __rec_write_init --
* Initialize the reconciliation structure.
*/
static int
__rec_write_init(WT_SESSION_IMPL *session,
WT_REF *ref, uint32_t flags, WT_SALVAGE_COOKIE *salvage, void *reconcilep)
{
WT_BTREE *btree;
WT_PAGE *page;
WT_RECONCILE *r;
btree = S2BT(session);
page = ref->page;
if ((r = *(WT_RECONCILE **)reconcilep) == NULL) {
WT_RET(__wt_calloc_one(session, &r));
*(WT_RECONCILE **)reconcilep = r;
session->reconcile_cleanup = __rec_destroy_session;
/* Connect pointers/buffers. */
r->cur = &r->_cur;
r->last = &r->_last;
/* Disk buffers need to be aligned for writing. */
F_SET(&r->dsk, WT_ITEM_ALIGNED);
}
/* Remember the configuration. */
r->ref = ref;
r->page = page;
r->flags = flags;
/* Track if the page can be marked clean. */
r->leave_dirty = 0;
/* Raw compression. */
r->raw_compression =
__rec_raw_compression_config(session, page, salvage);
r->raw_destination.flags = WT_ITEM_ALIGNED;
/* Track overflow items. */
r->ovfl_items = 0;
/* Track empty values. */
r->all_empty_value = 1;
r->any_empty_value = 0;
/* The list of cached, skipped updates. */
r->skip_next = 0;
/*
* Dictionary compression only writes repeated values once. We grow
* the dictionary as necessary, always using the largest size we've
* seen.
*
* Reset the dictionary.
*
* Sanity check the size: 100 slots is the smallest dictionary we use.
*/
if (btree->dictionary != 0 && btree->dictionary > r->dictionary_slots)
WT_RET(__rec_dictionary_init(session,
r, btree->dictionary < 100 ? 100 : btree->dictionary));
__rec_dictionary_reset(r);
/*
* Suffix compression shortens internal page keys by discarding trailing
* bytes that aren't necessary for tree navigation (a sketch follows
* this function). We don't do suffix compression if there is a custom
* collator because we don't know what bytes a custom collator might
* use. Some custom collators (for example, a collator implementing
* reverse ordering of strings) won't have any problem with suffix
* compression: if there's ever a reason to
* implement suffix compression for custom collators, we can add a
* setting to the collator, configured when the collator is added, that
* turns on suffix compression.
*
* The raw compression routines don't even consider suffix compression,
* but it doesn't hurt to confirm that.
*/
r->key_sfx_compress_conf = 0;
if (btree->collator == NULL &&
btree->internal_key_truncate && !r->raw_compression)
r->key_sfx_compress_conf = 1;
/*
* Prefix compression discards repeated prefix bytes from row-store leaf
* page keys.
*/
r->key_pfx_compress_conf = 0;
if (btree->prefix_compression && page->type == WT_PAGE_ROW_LEAF)
r->key_pfx_compress_conf = 1;
r->salvage = salvage;
/* Save the page's write generation before reading the page. */
WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
/*
* Running transactions may update the page after we write it, so
* this is the highest ID we can be confident we will see.
*/
r->skipped_txn = S2C(session)->txn_global.last_running;
return (0);
}
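/*
 * Illustrative sketch, not part of this file's logic: with
 * byte-comparable keys, suffix compression (configured above) keeps
 * only the shortest prefix of a promoted internal page key that still
 * sorts strictly greater than the last key written to the previous
 * chunk. All variable names here are hypothetical:
 *
 *	size_t cnt, len;
 *	const uint8_t *pa, *pb;
 *
 *	len = WT_MIN(last_size, next_size);
 *	for (cnt = 0, pa = last_key, pb = next_key;
 *	    cnt < len && *pa == *pb;
 *	    ++cnt, ++pa, ++pb)
 *		;
 *	promoted_size = cnt + 1;
 *
 * The first byte where the keys differ (plus everything before it) is
 * enough for tree navigation; the remaining bytes of next_key are
 * discarded.
 */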
/*
* __rec_destroy --
* Clean up the reconciliation structure.
*/
static void
__rec_destroy(WT_SESSION_IMPL *session, void *reconcilep)
{
WT_RECONCILE *r;
if ((r = *(WT_RECONCILE **)reconcilep) == NULL)
return;
*(WT_RECONCILE **)reconcilep = NULL;
__wt_buf_free(session, &r->dsk);
__wt_free(session, r->raw_entries);
__wt_free(session, r->raw_offsets);
__wt_free(session, r->raw_recnos);
__wt_buf_free(session, &r->raw_destination);
__rec_bnd_cleanup(session, r, 1);
__wt_free(session, r->skip);
__wt_buf_free(session, &r->k.buf);
__wt_buf_free(session, &r->v.buf);
__wt_buf_free(session, &r->_cur);
__wt_buf_free(session, &r->_last);
__rec_dictionary_free(session, r);
__wt_free(session, r);
}
/*
* __rec_destroy_session --
* Clean up the reconciliation structure, session version.
*/
static int
__rec_destroy_session(WT_SESSION_IMPL *session)
{
__rec_destroy(session, &session->reconcile);
return (0);
}
/*
* __rec_bnd_cleanup --
* Clean up the boundary structure information.
*/
static void
__rec_bnd_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r, int destroy)
{
WT_BOUNDARY *bnd;
uint32_t i, last_used;
if (r->bnd == NULL)
return;
/*
* Free the boundary structures' memory. In the case of normal cleanup,
* discard any memory we won't reuse in the next reconciliation; in the
* case of destruction, discard everything.
*
* During some big-page evictions we have seen boundary arrays that have
* millions of elements. That should not be a normal event, but if the
* memory is associated with a random session, it won't be discarded
* until the session is closed. If there are more than 10,000 boundary
* structure elements, destroy the boundary array and we'll start over.
*/
if (destroy || r->bnd_entries > 10 * 1000) {
for (bnd = r->bnd, i = 0; i < r->bnd_entries; ++bnd, ++i) {
__wt_free(session, bnd->addr.addr);
__wt_free(session, bnd->dsk);
__wt_free(session, bnd->skip);
__wt_buf_free(session, &bnd->key);
}
__wt_free(session, r->bnd);
r->bnd_next = 0;
r->bnd_entries = r->bnd_allocated = 0;
} else {
/*
* The boundary-next field points to the next boundary structure
* we were going to use, but there's no requirement that the value
* be incremented before reconciliation updates the structure it
* points to; that is, there's no guarantee elements of the next
* boundary structure are still unchanged. Be defensive: clean
* up the "next" structure as well as the ones we know we used.
*/
last_used = r->bnd_next;
if (last_used < r->bnd_entries)
++last_used;
for (bnd = r->bnd, i = 0; i < last_used; ++bnd, ++i) {
__wt_free(session, bnd->addr.addr);
__wt_free(session, bnd->dsk);
__wt_free(session, bnd->skip);
}
}
}
/*
* __rec_skip_update_save --
* Save a skipped WT_UPDATE list for later restoration.
*/
static int
__rec_skip_update_save(
WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip)
{
WT_RET(__wt_realloc_def(
session, &r->skip_allocated, r->skip_next + 1, &r->skip));
r->skip[r->skip_next].ins = ins;
r->skip[r->skip_next].rip = rip;
++r->skip_next;
return (0);
}
/*
* __rec_skip_update_move --
* Move a skipped WT_UPDATE list from the per-page cache to a specific
* block's list.
*/
static int
__rec_skip_update_move(
WT_SESSION_IMPL *session, WT_BOUNDARY *bnd, WT_UPD_SKIPPED *skip)
{
WT_RET(__wt_realloc_def(
session, &bnd->skip_allocated, bnd->skip_next + 1, &bnd->skip));
bnd->skip[bnd->skip_next] = *skip;
++bnd->skip_next;
skip->ins = NULL;
skip->rip = NULL;
return (0);
}
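/*
 * Illustrative sketch, not part of this file's logic: once split
 * boundaries are fixed, the per-page list built by
 * __rec_skip_update_save is drained into the boundary covering each
 * entry. The boundary-membership test here is hypothetical, the real
 * code matches on key or record-number ranges:
 *
 *	uint32_t i;
 *
 *	for (i = 0; i < r->skip_next; ++i)
 *		if (skip_in_boundary(bnd, &r->skip[i]))
 *			WT_RET(__rec_skip_update_move(
 *			    session, bnd, &r->skip[i]));
 */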
/*
* __rec_txn_read --
* Return the first visible update in a list (or NULL if none are visible),
* set a flag if any updates were skipped, track the maximum transaction ID on
* the page.
*/
static inline int
__rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r,
WT_INSERT *ins, WT_ROW *rip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp)
{
WT_DECL_RET;
WT_ITEM ovfl;
WT_PAGE *page;
WT_UPDATE *upd, *upd_list, *upd_ovfl;
size_t notused;
uint64_t max_txn, min_txn, txnid;
int skipped;
*updp = NULL;
page = r->page;
/*
* If we're called with a WT_INSERT reference, use its WT_UPDATE
* list; otherwise, use the on-page row-store WT_UPDATE list.
*/
upd_list = ins == NULL ? WT_ROW_UPDATE(page, rip) : ins->upd;
skipped = 0;
for (max_txn = WT_TXN_NONE, min_txn = UINT64_MAX, upd = upd_list;
upd != NULL; upd = upd->next) {
if ((txnid = upd->txnid) == WT_TXN_ABORTED)
continue;
/* Track the largest/smallest transaction IDs on the list. */
if (WT_TXNID_LT(max_txn, txnid))
max_txn = txnid;
if (WT_TXNID_LT(txnid, min_txn))
min_txn = txnid;
if (WT_TXNID_LT(txnid, r->skipped_txn) &&
!__wt_txn_visible_all(session, txnid))
r->skipped_txn = txnid;
/*
* Record whether any updates were skipped on the way to finding
* the first visible update.
*
* If updates were skipped before the one being written, future
* reads without intervening modifications to the page could
* see a different value; if no updates were skipped, the page
* can safely be marked clean and does not need to be
* reconciled until modified again.
*/
if (*updp == NULL) {
if (__wt_txn_visible(session, txnid))
*updp = upd;
else
skipped = 1;
}
}
/*
* Track the maximum transaction ID in the page. We store this in the
* page at the end of reconciliation if no updates are skipped; it's
* used to avoid evicting from memory clean pages that still have
* changes required to satisfy a snapshot read.
*/
if (WT_TXNID_LT(r->max_txn, max_txn))
r->max_txn = max_txn;
/*
* If all updates are globally visible and no updates were skipped, the
* page can be marked clean and we're done, regardless of whether we're
* evicting or checkpointing.
*
* The oldest transaction ID may have moved while we were scanning the
* page, so it is possible to skip an update but then find that by the
* end of the scan, all updates are stable.
*/
if (__wt_txn_visible_all(session, max_txn) && !skipped)
return (0);
/*
* If some updates are not globally visible, or were skipped, the page
* cannot be marked clean.
*/
r->leave_dirty = 1;
/* If we're not evicting, we're done, we know what we'll write. */
if (!F_ISSET(r, WT_EVICTING))
return (0);
/* In some cases, there had better not be any updates we can't write. */
if (F_ISSET(r, WT_SKIP_UPDATE_ERR))
WT_PANIC_RET(session, EINVAL,
"reconciliation illegally skipped an update");
/*
* If evicting and we aren't able to save/restore the not-yet-visible
* updates, the page can't be evicted.
*/
if (!F_ISSET(r, WT_SKIP_UPDATE_RESTORE))
return (EBUSY);
/*
* Evicting a page with not-yet-visible updates: save and restore the
* list of updates on a newly instantiated page.
*
* The order of the updates on the list matters so we can't move only
* the unresolved updates, we have to move the entire update list.
*
* Clear the returned update so our caller ignores the key/value pair
* in the case of an insert/append entry (everything we need is in the
* update list), and otherwise writes the original on-page key/value
* pair to which the update list applies.
*/
*updp = NULL;
/*
* Handle the case where we don't want to write an original on-page value
* item to disk because it's been updated or removed.
*
* Here's the deal: an overflow value was updated or removed and its
* backing blocks freed. If any transaction in the system might still
* read the value, a copy was cached in page reconciliation tracking
* memory, and the page cell set to WT_CELL_VALUE_OVFL_RM. Eviction
* then chose the page and we're splitting it up in order to push parts
* of it out of memory.
*
* We could write the original on-page value item to disk... if we had
* a copy. The cache may not have a copy (a globally visible update
* would have kept a value from ever being cached), or an update that
* subsequent became globally visible could cause a cached value to be
* discarded. Either way, once there's a globally visible update, we
* may not have the value.
*
* Fortunately, if there's a globally visible update we don't care about
* the original version, so we simply ignore it; no transaction can ever
* try to read it. If there isn't a globally visible update, there had
* better be a cached value.
*
* In the latter case, we could write the value out to disk, but (1) we
* are planning on re-instantiating this page in memory, it isn't going
* to disk, and (2) the value item is eventually going to be discarded,
* that seems like a waste of a write. Instead, find the cached value
* and append it to the update list we're saving for later restoration.
*/
if (vpack != NULL && vpack->raw == WT_CELL_VALUE_OVFL_RM &&
!__wt_txn_visible_all(session, min_txn)) {
if ((ret = __wt_ovfl_txnc_search(
page, vpack->data, vpack->size, &ovfl)) != 0)
WT_PANIC_RET(session, ret,
"cached overflow item discarded early");
/*
* Create an update structure with an impossibly low transaction
* ID and append it to the update list we're about to save.