/
txn_inline.h
1801 lines (1570 loc) · 64.9 KB
/
txn_inline.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*-
* Copyright (c) 2014-present MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
* See the file LICENSE for redistribution information.
*/
/*
 * __wt_txn_context_prepare_check --
 *     Fail the calling operation with EINVAL when the session's transaction is prepared, unless
 *     the transaction carries the flag that disables this API check. Returns 0 otherwise.
 */
static inline int
__wt_txn_context_prepare_check(WT_SESSION_IMPL *session)
{
    /* Only a prepared transaction that has not opted out of the check is rejected. */
    if (F_ISSET(session->txn, WT_TXN_PREPARE) &&
      !F_ISSET(session->txn, WT_TXN_PREPARE_IGNORE_API_CHECK))
        WT_RET_MSG(session, EINVAL, "not permitted in a prepared transaction");

    return (0);
}
/*
 * __wt_txn_context_check --
 *     Verify the session's transaction state matches what the caller needs: a running transaction
 *     when requires_txn is true, no running transaction when it is false. A mismatch is reported
 *     with EINVAL; a match returns 0.
 */
static inline int
__wt_txn_context_check(WT_SESSION_IMPL *session, bool requires_txn)
{
    bool running;

    running = F_ISSET(session->txn, WT_TXN_RUNNING) != 0;

    if (running != requires_txn) {
        if (requires_txn)
            WT_RET_MSG(session, EINVAL, "only permitted in a running transaction");
        WT_RET_MSG(session, EINVAL, "not permitted in a running transaction");
    }

    return (0);
}
/*
 * __wt_txn_err_set --
 *     Record an error against the session's transaction. Has no effect for the error codes that
 *     don't fail a transaction or when no transaction is running; otherwise the transaction is
 *     flagged with WT_TXN_ERROR, and the system panics if the transaction was already prepared.
 */
static inline void
__wt_txn_err_set(WT_SESSION_IMPL *session, int ret)
{
    WT_TXN *txn;

    txn = session->txn;

    /*
     * Nothing to record for the standard errors that leave the transaction usable, or (less
     * commonly) when there is no running transaction.
     */
    if (ret == WT_NOTFOUND || ret == WT_DUPLICATE_KEY || ret == WT_PREPARE_CONFLICT ||
      !F_ISSET(txn, WT_TXN_RUNNING))
        return;

    /* From here on the transaction must be rolled back. */
    F_SET(txn, WT_TXN_ERROR);

    if (!F_ISSET(txn, WT_TXN_PREPARE))
        return;

    /*
     * A prepared transaction can neither ignore the error nor be rolled back, so the only option
     * left is to fail the system.
     */
    WT_IGNORE_RET(__wt_panic(session, ret,
      "transactional error logged after transaction was prepared, failing the system"));
}
/*
 * __wt_txn_op_set_recno --
 *     Store the given record number in the most recent operation recorded by the session's
 *     transaction. Nothing is stored for checkpoint sessions or for operations on the history
 *     store or metadata trees.
 */
static inline void
__wt_txn_op_set_recno(WT_SESSION_IMPL *session, uint64_t recno)
{
    WT_TXN *txn;
    WT_TXN_OP *last_op;

    txn = session->txn;

    WT_ASSERT(session, txn->mod_count > 0 && recno != WT_RECNO_OOB);
    last_op = &txn->mod[txn->mod_count - 1];

    if (!WT_SESSION_IS_CHECKPOINT(session) && !WT_IS_HS(last_op->btree->dhandle) &&
      !WT_IS_METADATA(last_op->btree->dhandle)) {
        WT_ASSERT(session,
          last_op->type == WT_TXN_OP_BASIC_COL || last_op->type == WT_TXN_OP_INMEM_COL);

        /*
         * Keep the recno with the operation so the update can be located again if it is evicted
         * to the history store. Only prepared updates can be evicted, but it isn't known yet
         * whether this transaction will be prepared, so the recno is saved for every operation.
         */
        last_op->u.op_col.recno = recno;
    }
}
/*
 * __wt_txn_op_set_key --
 *     Copy the given key into the most recent operation recorded by the session's transaction.
 *     Returns 0 without copying for checkpoint sessions and for operations on the history store
 *     or metadata trees; otherwise returns the result of the buffer copy.
 */
static inline int
__wt_txn_op_set_key(WT_SESSION_IMPL *session, const WT_ITEM *key)
{
    WT_TXN *txn;
    WT_TXN_OP *last_op;

    txn = session->txn;

    WT_ASSERT(session, txn->mod_count > 0 && key->data != NULL);
    last_op = &txn->mod[txn->mod_count - 1];

    /* Checkpoint sessions don't save keys. */
    if (WT_SESSION_IS_CHECKPOINT(session))
        return (0);

    /* Neither do operations on the history store or the metadata. */
    if (WT_IS_HS(last_op->btree->dhandle) || WT_IS_METADATA(last_op->btree->dhandle))
        return (0);

    WT_ASSERT(
      session, last_op->type == WT_TXN_OP_BASIC_ROW || last_op->type == WT_TXN_OP_INMEM_ROW);

    /*
     * Keep a copy of the key with the operation so the update can be located again if it is
     * evicted to the history store. Only prepared updates can be evicted, but it isn't known yet
     * whether this transaction will be prepared, so the key is saved for every operation.
     */
    return (__wt_buf_set(session, &last_op->u.op_row.key, key->data, key->size));
}
/*
 * __txn_apply_prepare_state_update --
 *     Change the prepared state of an update using the timestamps of the session's transaction.
 *     When commit is true the update is resolved with the transaction's commit and durable
 *     timestamps; when it is false the update is marked as prepare-in-progress with the
 *     transaction's prepare timestamp and no durable timestamp.
 */
static inline void
__txn_apply_prepare_state_update(WT_SESSION_IMPL *session, WT_UPDATE *upd, bool commit)
{
WT_TXN *txn;
txn = session->txn;
if (commit) {
/*
 * In case of a prepared transaction, the order of modification of the prepare timestamp to
 * commit timestamp in the update chain will not affect the data visibility, a reader will
 * encounter a prepared update resulting in prepare conflict.
 *
 * As updating timestamp might not be an atomic operation, we will manage using state: the
 * update is marked locked before the timestamps are rewritten and only published as resolved
 * afterwards. The update visibility check in this file yields and retries while it observes
 * the locked state.
 */
upd->prepare_state = WT_PREPARE_LOCKED;
/* Make the locked state visible before either timestamp store below. */
WT_WRITE_BARRIER();
upd->start_ts = txn->commit_timestamp;
upd->durable_ts = txn->durable_timestamp;
/* Publish the final state only after both timestamps have been written. */
WT_PUBLISH(upd->prepare_state, WT_PREPARE_RESOLVED);
} else {
/* Set prepare timestamp. */
upd->start_ts = txn->prepare_timestamp;
/*
 * By default durable timestamp is assigned with 0 which is same as WT_TS_NONE. Assign it
 * with WT_TS_NONE explicitly so that a change to the macro's value cannot cause a problem.
 */
upd->durable_ts = WT_TS_NONE;
WT_PUBLISH(upd->prepare_state, WT_PREPARE_INPROGRESS);
}
}
/*
 * __txn_apply_prepare_state_page_del --
 *     Change the prepared state of a page-deleted structure using the timestamps of the session's
 *     transaction: resolved with the commit and durable timestamps when commit is true, otherwise
 *     prepare-in-progress with the prepare timestamp and no durable timestamp.
 */
static inline void
__txn_apply_prepare_state_page_del(WT_SESSION_IMPL *session, WT_PAGE_DELETED *page_del, bool commit)
{
    WT_TXN *txn;

    txn = session->txn;

    if (!commit) {
        /* Entering the prepared state: record the prepare timestamp. */
        page_del->timestamp = txn->prepare_timestamp;

        /*
         * The durable timestamp defaults to 0, which equals WT_TS_NONE today. Store WT_TS_NONE
         * explicitly so a change to the macro's value can't cause a problem.
         */
        page_del->durable_timestamp = WT_TS_NONE;
        WT_PUBLISH(page_del->prepare_state, WT_PREPARE_INPROGRESS);
        return;
    }

    /*
     * Resolving the prepare. The page-deleted structure is only examined during tree walks; when
     * it is prepared the leaf page is instantiated and its keys are checked instead, so readers
     * seeing a partially updated structure is not a concern and no locked state is needed.
     */
    page_del->timestamp = txn->commit_timestamp;
    page_del->durable_timestamp = txn->durable_timestamp;
    WT_PUBLISH(page_del->prepare_state, WT_PREPARE_RESOLVED);
}
/*
 * __txn_next_op --
 *     Append a new, zeroed operation to the session transaction's modification array and return
 *     it through opp. A transaction ID is allocated first if needed, the operation is tied to the
 *     session's current btree, and the data handle's session_inuse counter is incremented. On
 *     failure *opp is left NULL and the error is returned.
 */
static inline int
__txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp)
{
    WT_TXN *txn;
    WT_TXN_OP *slot;

    txn = session->txn;
    *opp = NULL;

    /* An update is about to happen: the transaction needs an ID before it is recorded. */
    WT_RET(__wt_txn_id_check(session));
    WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_ID));

    /* Grow the array if necessary, then claim the next slot. */
    WT_RET(__wt_realloc_def(session, &txn->mod_alloc, txn->mod_count + 1, &txn->mod));
    slot = txn->mod + txn->mod_count;
    ++txn->mod_count;

    WT_CLEAR(*slot);
    slot->btree = S2BT(session);
    (void)__wt_atomic_addi32(&session->dhandle->session_inuse, 1);

    *opp = slot;
    return (0);
}
/*
 * __wt_txn_unmodify --
 *     Remove the most recently recorded operation from the session transaction's modification
 *     array and free it. Threads racing to make updates may discard the last referenced WT_UPDATE
 *     while the transaction is still active; this undoes the corresponding record. Does nothing
 *     if the transaction has no ID.
 */
static inline void
__wt_txn_unmodify(WT_SESSION_IMPL *session)
{
    WT_TXN *txn;

    txn = session->txn;

    if (!F_ISSET(txn, WT_TXN_HAS_ID))
        return;

    WT_ASSERT(session, txn->mod_count > 0);

    /* After the decrement, mod_count indexes the entry being discarded. */
    --txn->mod_count;
    __wt_txn_op_free(session, &txn->mod[txn->mod_count]);
}
/*
 * __wt_txn_op_delete_apply_prepare_state --
 *     For a page-delete operation, apply the prepare state and timestamps (prepare or commit,
 *     selected by the commit argument) to the ref's page-deleted structure and to every update
 *     in the instantiated page's update list. The ref is locked for the duration and restored to
 *     its previous state on return.
 */
static inline void
__wt_txn_op_delete_apply_prepare_state(WT_SESSION_IMPL *session, WT_REF *ref, bool commit)
{
    WT_PAGE_DELETED *page_del;
    WT_UPDATE **updp;
    uint8_t prior_state;

    /* Hold the ref locked so page instantiation can't race with these changes. */
    WT_REF_LOCK(session, ref, &prior_state);

    /*
     * The state and timestamps live in the page-deleted structure for truncates and in the update
     * list once the page has been instantiated; any page-deleted structure on the ref has to be
     * updated as well.
     *
     * There are only two possibilities. If the ref was WT_REF_DELETED, page_del cannot be NULL
     * yet, because an uncommitted operation cannot have become globally visible (in the sense
     * used for truncation, where prepared and uncommitted transactions are not visible).
     *
     * Otherwise an uncommitted delete is being handled, so the page was deleted at some point and
     * the tree can't be readonly: the page must have been instantiated, the state must be
     * WT_REF_MEM, and mod->inst_updates should hold an update list (a NULL list is tolerated
     * just in case). A page_del structure may or may not still be present, depending on whether
     * the page has been reconciled since it was deleted and instantiated.
     */
    if (prior_state != WT_REF_DELETED) {
        WT_ASSERT(session, prior_state == WT_REF_MEM);
        WT_ASSERT(session, ref->page != NULL && ref->page->modify != NULL);

        /* The update list is a NULL-terminated array of pointers. */
        updp = ref->page->modify->inst_updates;
        if (updp != NULL)
            while (*updp != NULL) {
                __txn_apply_prepare_state_update(session, *updp, commit);
                ++updp;
            }
    }

    page_del = ref->page_del;
    if (page_del != NULL)
        __txn_apply_prepare_state_page_del(session, page_del, commit);

    WT_REF_UNLOCK(ref, prior_state);
}
/*
 * __wt_txn_op_delete_commit_apply_timestamps --
 *     For a page-delete operation being committed, copy the transaction's commit and durable
 *     timestamps into every update in the instantiated page's update list, and into the ref's
 *     page-deleted structure if it has no timestamp yet. The ref is locked for the duration and
 *     restored to its previous state on return.
 */
static inline void
__wt_txn_op_delete_commit_apply_timestamps(WT_SESSION_IMPL *session, WT_REF *ref)
{
    WT_PAGE_DELETED *page_del;
    WT_TXN *txn;
    WT_UPDATE **updp;
    uint8_t prior_state;

    txn = session->txn;

    /* Hold the ref locked so page instantiation can't race with these changes. */
    WT_REF_LOCK(session, ref, &prior_state);

    /*
     * Timestamps live in the page-deleted structure for truncates and in the updates once the
     * page has been instantiated; any page-deleted structure on the ref has to be updated as
     * well. Both the commit and the durable timestamp are set.
     *
     * There are only two possibilities. If the ref was WT_REF_DELETED, page_del cannot be NULL
     * yet, because an uncommitted operation cannot have become globally visible (in the sense
     * used for truncation, where prepared and uncommitted transactions are not visible).
     *
     * Otherwise an uncommitted delete is being handled, so the page was deleted at some point and
     * the tree can't be readonly: the page must have been instantiated, the state must be
     * WT_REF_MEM, and mod->inst_updates should hold an update list (a NULL list is tolerated
     * just in case). A page_del structure may or may not still be present, depending on whether
     * the page has been reconciled since it was deleted and instantiated.
     */
    updp = NULL;
    if (prior_state != WT_REF_DELETED) {
        WT_ASSERT(session, prior_state == WT_REF_MEM);
        WT_ASSERT(session, ref->page != NULL && ref->page->modify != NULL);
        updp = ref->page->modify->inst_updates;
    }

    /* Walk the NULL-terminated update list, if there is one. */
    for (; updp != NULL && *updp != NULL; ++updp) {
        (*updp)->start_ts = txn->commit_timestamp;
        (*updp)->durable_ts = txn->durable_timestamp;
    }

    /* Only fill in a page-deleted structure that doesn't carry a timestamp already. */
    page_del = ref->page_del;
    if (page_del != NULL && page_del->timestamp == WT_TS_NONE) {
        page_del->timestamp = txn->commit_timestamp;
        page_del->durable_timestamp = txn->durable_timestamp;
    }

    WT_REF_UNLOCK(ref, prior_state);
}
/*
 * __wt_txn_op_set_timestamp --
 *     Apply the transaction's commit timestamp to an operation where appropriate. Nothing is done
 *     when the transaction has no commit timestamp or the operation's btree is logged. For a
 *     prepared transaction the operation is resolved unconditionally; otherwise timestamps are
 *     only written where none has been set.
 */
static inline void
__wt_txn_op_set_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op)
{
    WT_TXN *txn;
    WT_UPDATE *upd;
    bool prepared;

    txn = session->txn;

    /*
     * Updates without a commit time and logged objects don't have timestamps, and only the most
     * recently committed data matches files on disk.
     */
    if (!F_ISSET(txn, WT_TXN_HAS_TS_COMMIT) || F_ISSET(op->btree, WT_BTREE_LOGGED))
        return;

    /*
     * A prepared transaction with a commit timestamp can only be seen as part of a transaction
     * commit call.
     */
    prepared = F_ISSET(txn, WT_TXN_PREPARE) != 0;

    /* Truncates keep their timestamps on the ref rather than in a single update. */
    if (op->type == WT_TXN_OP_REF_DELETE) {
        if (prepared)
            __wt_txn_op_delete_apply_prepare_state(session, op->u.ref, true);
        else
            __wt_txn_op_delete_commit_apply_timestamps(session, op->u.ref);
        return;
    }

    /* Every other operation carries its timestamps in the update. */
    upd = op->u.op_upd;
    if (prepared)
        /* Resolve the prepared update into a committed one. */
        __txn_apply_prepare_state_update(session, upd, true);
    else if (upd->start_ts == WT_TS_NONE) {
        /* Both the commit and the durable timestamp need to be set. */
        upd->start_ts = txn->commit_timestamp;
        upd->durable_ts = txn->durable_timestamp;
    }
}
/*
 * __wt_txn_modify --
 *     Record a WT_UPDATE as modified by the session's transaction: a new operation is appended
 *     for it, the update is stamped with the transaction's ID, and any commit timestamp is
 *     applied. Read-only transactions are rejected (ENOTSUP when ignore_prepare is set,
 *     WT_ROLLBACK otherwise).
 */
static inline int
__wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
    WT_TXN *txn;
    WT_TXN_OP *op;

    txn = session->txn;

    if (F_ISSET(txn, WT_TXN_READONLY)) {
        if (!F_ISSET(txn, WT_TXN_IGNORE_PREPARE))
            WT_RET_MSG(session, WT_ROLLBACK, "Attempt to update in a read-only transaction");
        WT_RET_MSG(
          session, ENOTSUP, "Transactions with ignore_prepare=true cannot perform updates");
    }

    WT_RET(__txn_next_op(session, &op));

    /* The operation type depends on the store type and on whether logging is in-memory. */
    if (op->btree->type == BTREE_ROW) {
        if (F_ISSET(session, WT_SESSION_LOGGING_INMEM))
            op->type = WT_TXN_OP_INMEM_ROW;
        else
            op->type = WT_TXN_OP_BASIC_ROW;
    } else {
        if (F_ISSET(session, WT_SESSION_LOGGING_INMEM))
            op->type = WT_TXN_OP_INMEM_COL;
        else
            op->type = WT_TXN_OP_BASIC_COL;
    }
    op->u.op_upd = upd;

    /* The history store bypasses transactions, so this must never be called on it. */
    WT_ASSERT(session, !WT_IS_HS((S2BT(session))->dhandle));

    upd->txnid = txn->id;
    __wt_txn_op_set_timestamp(session, op);

    return (0);
}
/*
 * __wt_txn_modify_page_delete --
 *     Record a page truncated by the session's transaction: a ref-delete operation is appended,
 *     the ref's page-deleted structure is stamped with the transaction's ID, any commit timestamp
 *     is applied, and the operation is logged when logging applies. If logging fails the recorded
 *     operation is removed again and the error returned.
 */
static inline int
__wt_txn_modify_page_delete(WT_SESSION_IMPL *session, WT_REF *ref)
{
    WT_DECL_RET;
    WT_TXN_OP *op;

    WT_RET(__txn_next_op(session, &op));
    op->u.ref = ref;
    op->type = WT_TXN_OP_REF_DELETE;

    /*
     * Touching the WT_PAGE_DELETED structure here is safe: the caller holds the WT_REF locked and
     * has only just allocated the structure in order to fill it in.
     */
    ref->page_del->txnid = session->txn->id;
    __wt_txn_op_set_timestamp(session, op);

    if (!__wt_log_op(session))
        return (0);

    WT_ERR(__wt_txn_log_op(session, NULL));
    return (0);

err:
    /* Undo the operation recorded above. */
    __wt_txn_unmodify(session);
    return (ret);
}
/*
 * __wt_txn_oldest_id --
 *     Return the oldest transaction ID that must be retained for the session's current tree: the
 *     metadata's pinned ID for the metadata, otherwise the global oldest ID capped by whichever
 *     pinned ID applies (the running checkpoint's, or the recovered checkpoint snapshot's minimum
 *     during recovery).
 */
static inline uint64_t
__wt_txn_oldest_id(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_TXN_GLOBAL *txn_global;
    uint64_t oldest_id, pinned_id;

    conn = S2C(session);
    txn_global = &conn->txn_global;

    /* The metadata is tracked specially because of optimizations for checkpoints. */
    if (session->dhandle != NULL && WT_IS_METADATA(session->dhandle))
        return (txn_global->metadata_pinned);

    /*
     * Work from local copies in case the IDs change during the check. The order of the reads
     * matters: the oldest ID is read first, so that if a checkpoint is starting and its pinned ID
     * has to be considered, the result is the minimum of the two, which is what is wanted.
     */
    WT_ORDERED_READ(oldest_id, txn_global->oldest_id);

    if (F_ISSET(conn, WT_CONN_RECOVERING) && session->dhandle != NULL &&
      !F_ISSET(S2BT(session), WT_BTREE_LOGGED))
        /*
         * During recovery, keep changes needed by the recovered checkpoint snapshot until recovery
         * finishes. Logged tables are excluded as part of RTS, so they don't take this path.
         */
        pinned_id = conn->recovery_ckpt_snap_min;
    else
        /*
         * Checkpoint transactions often fall behind ordinary application threads: while a
         * checkpoint is active, keep changes until it finishes.
         */
        pinned_id = txn_global->checkpoint_txn_shared.pinned_id;

    /* The pinned ID only matters when it is set and not newer than the oldest ID. */
    if (pinned_id != WT_TXN_NONE && !WT_TXNID_LT(oldest_id, pinned_id))
        return (pinned_id);
    return (oldest_id);
}
/*
 * __wt_txn_pinned_timestamp --
 *     Get the first timestamp that has to be kept for the current tree. The result is stored in
 *     *pinned_tsp: WT_TS_NONE when no pinned timestamp has been set, the version cursor pinned
 *     timestamp while any version cursor is open, and otherwise the smaller of the global pinned
 *     timestamp and the checkpoint timestamp (when a checkpoint timestamp is set).
 */
static inline void
__wt_txn_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *pinned_tsp)
{
    WT_TXN_GLOBAL *txn_global;
    wt_timestamp_t checkpoint_ts, pinned_ts;

    *pinned_tsp = WT_TS_NONE;

    txn_global = &S2C(session)->txn_global;

    /* There is no need to go further if no pinned timestamp has been set yet. */
    if (!txn_global->has_pinned_timestamp)
        return;

    /* If a version cursor is open, use the pinned timestamp saved for version cursors. */
    if (S2C(session)->version_cursor_count > 0) {
        *pinned_tsp = txn_global->version_cursor_pinned_timestamp;
        return;
    }

    *pinned_tsp = pinned_ts = txn_global->pinned_timestamp;

    /*
     * The checkpoint timestamp has to be read after the pinned timestamp; the barrier enforces
     * that order. Reading an earlier checkpoint timestamp than intended would result in more data
     * being pinned. If a checkpoint is starting and its timestamp has to be used, the result is
     * the minimum of it and the pinned timestamp, which is what we want.
     *
     * NOTE(review): the original rationale also requires the read to follow a read of the
     * checkpoint generation; no such read happens in this function -- confirm against callers.
     */
    WT_READ_BARRIER();
    checkpoint_ts = txn_global->checkpoint_timestamp;

    /* Compare against the named "no timestamp" sentinel rather than a literal zero. */
    if (checkpoint_ts != WT_TS_NONE && checkpoint_ts < pinned_ts)
        *pinned_tsp = checkpoint_ts;
}
/*
 * __txn_visible_all_id --
 *     Decide whether a transaction ID is "globally visible", that is, visible to every session in
 *     the system, including a running checkpoint. For checkpoint-cursor transactions the answer
 *     comes from the transaction's snapshot; otherwise the ID must be older than the oldest ID
 *     that has to be kept.
 */
static inline bool
__txn_visible_all_id(WT_SESSION_IMPL *session, uint64_t id)
{
    WT_TXN *txn;
    uint64_t oldest_id;

    txn = session->txn;

    /* Checkpoint cursor transactions should only read checkpoints, except for metadata. */
    WT_ASSERT(session,
      (session->dhandle != NULL && WT_IS_METADATA(session->dhandle)) ||
        WT_READING_CHECKPOINT(session) == F_ISSET(txn, WT_TXN_IS_CHECKPOINT));

    /* The ordinary case: compare against the oldest ID that must be retained. */
    if (!F_ISSET(txn, WT_TXN_IS_CHECKPOINT)) {
        oldest_id = __wt_txn_oldest_id(session);
        return (WT_TXNID_LT(id, oldest_id));
    }

    /*
     * Reading from a checkpoint: all readers share one snapshot, so an ID is globally visible
     * exactly when it is visible in that snapshot. Something that was not yet globally visible
     * when the checkpoint was taken may therefore become globally visible in the checkpoint; that
     * is expected (as if all old running transactions had exited). The inverse must never
     * happen: it violates basic visibility assumptions and, concretely, can bring stale history
     * store entries back to life and produce wrong answers.
     *
     * The transaction flag is tested rather than WT_READING_CHECKPOINT because reading the
     * metadata while working with a checkpoint cursor borrows the transaction and then uses it on
     * a non-checkpoint tree. That is believed to be fine since the metadata is always
     * read-uncommitted, but the checkpoint-cursor visibility logic should still apply: regular
     * visibility logic with a checkpoint cursor transaction can be logically invalid (something
     * could be globally visible yet specifically invisible) and can compare transaction IDs from
     * different database opens.
     */
    return (__wt_txn_visible_id_snapshot(
      id, txn->snap_min, txn->snap_max, txn->snapshot, txn->snapshot_count));
}
/*
 * __wt_txn_visible_all --
 *     Decide whether a transaction ID and timestamp are globally visible or obsolete. Global
 *     visibility is judged against the oldest possible readers in the system: if every possible
 *     reader could always see it, it is globally visible. For obsolete checks, callers should
 *     generally pass the durable timestamp: it is never older than the commit time, and content
 *     has to be retained until both commit and durable times are obsolete. Using the commit time
 *     instead could let a transaction be made obsolete before a checkpoint includes it, which
 *     leads to checkpoint correctness bugs.
 */
static inline bool
__wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id, wt_timestamp_t timestamp)
{
    wt_timestamp_t oldest_ts;

    /*
     * At shutdown the transactional system has stopped and only eviction matters: treat
     * everything as visible.
     */
    if (F_ISSET(S2C(session), WT_CONN_CLOSING))
        return (true);

    /* The transaction ID is checked first. */
    if (!__txn_visible_all_id(session, id))
        return (false);

    /* Without a timestamp there is nothing further to check. */
    if (timestamp == WT_TS_NONE)
        return (true);

    /* Checkpoint cursor transactions should only read checkpoints, except for metadata. */
    WT_ASSERT(session,
      (session->dhandle != NULL && WT_IS_METADATA(session->dhandle)) ||
        WT_READING_CHECKPOINT(session) == F_ISSET(session->txn, WT_TXN_IS_CHECKPOINT));

    /* When reading a checkpoint, use the checkpoint's state rather than the current state. */
    if (F_ISSET(session->txn, WT_TXN_IS_CHECKPOINT))
        oldest_ts = session->txn->checkpoint_oldest_timestamp;
    else
        __wt_txn_pinned_timestamp(session, &oldest_ts);

    /* If no oldest timestamp is available, updates have to stay in cache. */
    return (oldest_ts != WT_TS_NONE && timestamp <= oldest_ts);
}
/*
 * __wt_txn_upd_visible_all --
 *     Report whether an update is visible to all (possible) readers. An update whose prepare
 *     state is locked or in progress never is.
 */
static inline bool
__wt_txn_upd_visible_all(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
    /*
     * This is used to decide when an update is obsolete, so the durable timestamp (which is
     * greater than or equal to the start timestamp) is the one checked.
     */
    return (upd->prepare_state != WT_PREPARE_LOCKED &&
      upd->prepare_state != WT_PREPARE_INPROGRESS &&
      __wt_txn_visible_all(session, upd->txnid, upd->durable_ts));
}
/*
 * __wt_txn_upd_value_visible_all --
 *     Report whether an update value is visible to all (possible) readers. Tombstones are judged
 *     by the stop side of their time window, everything else by the start side.
 */
static inline bool
__wt_txn_upd_value_visible_all(WT_SESSION_IMPL *session, WT_UPDATE_VALUE *upd_value)
{
    WT_ASSERT(session, upd_value->tw.prepare == 0);

    if (upd_value->type == WT_UPDATE_TOMBSTONE)
        return (
          __wt_txn_visible_all(session, upd_value->tw.stop_txn, upd_value->tw.durable_stop_ts));

    return (
      __wt_txn_visible_all(session, upd_value->tw.start_txn, upd_value->tw.durable_start_ts));
}
/*
 * __wt_txn_tw_stop_visible --
 *     Report whether the stop point of a time window is visible to the current transaction. A
 *     window without a stop point, or one that is prepared, has no visible stop.
 */
static inline bool
__wt_txn_tw_stop_visible(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw)
{
    if (!WT_TIME_WINDOW_HAS_STOP(tw) || tw->prepare)
        return (false);

    return (__wt_txn_visible(session, tw->stop_txn, tw->stop_ts, tw->durable_stop_ts));
}
/*
 * __wt_txn_tw_start_visible --
 *     Report whether the start point of a time window is visible to the current transaction.
 */
static inline bool
__wt_txn_tw_start_visible(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw)
{
    /*
     * The prepared flag is only consulted when there is no stop time point, or when the start
     * and stop time points are identical (they come from the same transaction).
     */
    if (!WT_TIME_WINDOW_HAS_STOP(tw) ||
      (tw->start_txn == tw->stop_txn && tw->start_ts == tw->stop_ts &&
        tw->durable_start_ts == tw->durable_stop_ts)) {
        if (tw->prepare)
            return (false);
    }

    return (__wt_txn_visible(session, tw->start_txn, tw->start_ts, tw->durable_start_ts));
}
/*
 * __wt_txn_tw_start_visible_all --
 *     Report whether the start point of a time window is visible to all (possible) readers.
 */
static inline bool
__wt_txn_tw_start_visible_all(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw)
{
    bool check_prepare;

    /*
     * The prepared flag is only consulted when there is no stop time point, or when the start
     * and stop time points are identical (they come from the same transaction).
     */
    check_prepare = !WT_TIME_WINDOW_HAS_STOP(tw) ||
      (tw->start_txn == tw->stop_txn && tw->start_ts == tw->stop_ts &&
        tw->durable_start_ts == tw->durable_stop_ts);
    if (check_prepare && tw->prepare)
        return (false);

    return (__wt_txn_visible_all(session, tw->start_txn, tw->durable_start_ts));
}
/*
 * __wt_txn_tw_stop_visible_all --
 *     Report whether the stop point of a time window is visible to all (possible) readers. A
 *     window without a stop point, or one that is prepared, never qualifies.
 */
static inline bool
__wt_txn_tw_stop_visible_all(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw)
{
    if (WT_TIME_WINDOW_HAS_STOP(tw) && !tw->prepare)
        return (__wt_txn_visible_all(session, tw->stop_txn, tw->durable_stop_ts));

    return (false);
}
/*
 * __wt_txn_visible_id_snapshot --
 *     Report whether a transaction ID is visible with respect to the given snapshot (its minimum
 *     and maximum IDs plus the array of snapshot_count IDs).
 */
static inline bool
__wt_txn_visible_id_snapshot(
  uint64_t id, uint64_t snap_min, uint64_t snap_max, uint64_t *snapshot, uint32_t snapshot_count)
{
    bool concurrent;

    /*
     * WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: an ID is visible unless it belongs to a concurrent
     * transaction, that is, it is visible if it was committed before the snapshot was taken.
     *
     * The checks are ordered deliberately: anything at or beyond the maximum ID seen when the
     * snapshot was taken is invisible, even when the snapshot is empty.
     *
     * Snapshot data:
     *	ids >= snap_max not visible,
     *	ids < snap_min are visible,
     *	everything else is visible unless it is found in the snapshot.
     */
    if (WT_TXNID_LE(snap_max, id))
        return (false);

    if (snapshot_count != 0 && !WT_TXNID_LT(id, snap_min)) {
        WT_BINARY_SEARCH(id, snapshot, snapshot_count, concurrent);
        return (!concurrent);
    }

    return (true);
}
/*
 * __txn_visible_id --
 *     Report whether the current transaction can see changes made under the given transaction
 *     ID.
 */
static inline bool
__txn_visible_id(WT_SESSION_IMPL *session, uint64_t id)
{
    WT_TXN *txn;

    txn = session->txn;

    /* A change with no associated transaction is visible to everybody. */
    if (id == WT_TXN_NONE)
        return (true);

    /* The results of an aborted transaction are visible to nobody. */
    if (id == WT_TXN_ABORTED)
        return (false);

    /*
     * A transaction sees its own changes, and a read-uncommitted transaction sees everyone
     * else's too.
     */
    if (id == txn->id || txn->isolation == WT_ISO_READ_UNCOMMITTED)
        return (true);

    /* Anything else is decided by the snapshot, which must exist by now. */
    WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_SNAPSHOT));
    return (__wt_txn_visible_id_snapshot(
      id, txn->snap_min, txn->snap_max, txn->snapshot, txn->snapshot_count));
}
/*
 * __wt_txn_timestamp_visible --
 *     Report whether the current transaction can see the given commit timestamp (and, when
 *     reading a checkpoint, the given durable timestamp).
 */
static inline bool
__wt_txn_timestamp_visible(
  WT_SESSION_IMPL *session, wt_timestamp_t timestamp, wt_timestamp_t durable_timestamp)
{
    WT_TXN *txn;
    WT_TXN_SHARED *txn_shared;

    txn = session->txn;
    txn_shared = WT_SESSION_TXN_SHARED(session);

    /* Timestamps don't restrict anything unless the flag is set and a timestamp is given. */
    if (!F_ISSET(txn, WT_TXN_SHARED_TS_READ) || timestamp == WT_TS_NONE)
        return (true);

    /* Ordinary readers compare against the session's read timestamp. */
    if (!WT_READING_CHECKPOINT(session))
        return (timestamp <= txn_shared->read_timestamp);

    /*
     * For checkpoint cursors, checking the commit timestamp alone can go wrong when a prepared
     * transaction is committed in parallel with a running checkpoint. Comparing the durable
     * timestamp against the checkpoint's stable timestamp as well avoids returning inconsistent
     * data.
     */
    return ((timestamp <= txn->checkpoint_read_timestamp) &&
      (durable_timestamp <= txn->checkpoint_stable_timestamp));
}
/*
 * __wt_txn_snap_min_visible --
 *     Report whether the given ID/timestamp is visible judged by the current transaction's
 *     snapshot minimum and read timestamp. Only meant for assessing broader visibility from an
 *     aggregated time window; it does not say whether a specific update is visible to the
 *     transaction.
 */
static inline bool
__wt_txn_snap_min_visible(
  WT_SESSION_IMPL *session, uint64_t id, wt_timestamp_t timestamp, wt_timestamp_t durable_timestamp)
{
    /*
     * The ID has to be below the snapshot minimum. After that, a transaction reads its own writes
     * regardless of timestamps, and everything else is subject to the timestamp check.
     */
    return (WT_TXNID_LT(id, session->txn->snap_min) &&
      ((F_ISSET(session->txn, WT_TXN_HAS_ID) && id == session->txn->id) ||
        __wt_txn_timestamp_visible(session, timestamp, durable_timestamp)));
}
/*
 * __wt_txn_visible --
 *     Report whether the current transaction can see the given ID/timestamp combination.
 */
static inline bool
__wt_txn_visible(
  WT_SESSION_IMPL *session, uint64_t id, wt_timestamp_t timestamp, wt_timestamp_t durable_timestamp)
{
    if (__txn_visible_id(session, id)) {
        /* A transaction reads its own writes, whatever their timestamps. */
        if (F_ISSET(session->txn, WT_TXN_HAS_ID) && id == session->txn->id)
            return (true);

        /* Otherwise the timestamps decide. */
        return (__wt_txn_timestamp_visible(session, timestamp, durable_timestamp));
    }

    return (false);
}
/*
 * __wt_txn_upd_visible_type --
 *     Classify how the given update is visible to the current transaction. Returns
 *     WT_VISIBLE_FALSE when the update is not visible, WT_VISIBLE_PREPARE when it is visible but
 *     its prepare state is still in progress, and WT_VISIBLE_TRUE otherwise. The visibility check
 *     is repeated (yielding between attempts) until the update's prepare state is seen to be the
 *     same before and after the check.
 */
static inline WT_VISIBLE_TYPE
__wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
uint8_t prepare_state, previous_state;
bool upd_visible;
/* Every pass after the first, including "continue", goes through __wt_yield(). */
for (;; __wt_yield()) {
/* Prepare state change is in progress, yield and try again. */
WT_ORDERED_READ(prepare_state, upd->prepare_state);
if (prepare_state == WT_PREPARE_LOCKED)
continue;
/*
 * Entries in the history store are always visible: this applies to standard updates that
 * are not aborted when the session's data handle is the history store.
 */
if ((WT_IS_HS(session->dhandle) && upd->txnid != WT_TXN_ABORTED &&
upd->type == WT_UPDATE_STANDARD))
return (WT_VISIBLE_TRUE);
upd_visible = __wt_txn_visible(session, upd->txnid, upd->start_ts, upd->durable_ts);
/*
 * The visibility check is only valid if the update does not change state. If the state does
 * change, recheck visibility: the state is read again and compared with the value read
 * before the check.
 */
previous_state = prepare_state;
WT_ORDERED_READ(prepare_state, upd->prepare_state);
if (previous_state == prepare_state)
break;
/* The state moved underneath us: count the retry before going around again. */
WT_STAT_CONN_INCR(session, prepared_transition_blocked_page);
}
if (!upd_visible)
return (WT_VISIBLE_FALSE);
/* Visible, but the owning transaction's prepare has not been resolved yet. */
if (prepare_state == WT_PREPARE_INPROGRESS)
return (WT_VISIBLE_PREPARE);
return (WT_VISIBLE_TRUE);
}
/*
 * __wt_txn_upd_visible --
 *     Report whether the current transaction can see the given update. Only the fully visible
 *     classification counts; anything else is reported as not visible.
 */
static inline bool
__wt_txn_upd_visible(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
    if (__wt_txn_upd_visible_type(session, upd) != WT_VISIBLE_TRUE)
        return (false);

    return (true);
}
/*
 * __wt_upd_alloc --
 *     Allocate a WT_UPDATE of the given type, copying in the value when there is one. The new
 *     structure is returned through updp and, when sizep is not NULL, its memory size through
 *     sizep. On allocation failure *updp is left NULL and the error is returned.
 */
static inline int
__wt_upd_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value, u_int modify_type, WT_UPDATE **updp,
  size_t *sizep)
{
    WT_UPDATE *upd;
    size_t allocsz; /* Allocation size in bytes. */
    bool has_value;

    *updp = NULL;

    /*
     * The paths leading here are convoluted. Assert that nobody tries to allocate an update when
     * they only mean to insert one they already hold, and that a value is supplied exactly when
     * the update type supports one (reserve and tombstone updates take none).
     */
    WT_ASSERT(session, modify_type != WT_UPDATE_INVALID);
    WT_ASSERT(session,
      (value == NULL) ==
        (modify_type == WT_UPDATE_RESERVE || modify_type == WT_UPDATE_TOMBSTONE));

    /* A missing value and a zero-length value are allocated the same way. */
    has_value = value != NULL && value->size != 0;
    if (has_value)
        allocsz = WT_UPDATE_SIZE + value->size;
    else
        allocsz = WT_UPDATE_SIZE_NOVALUE;

    /*
     * Allocate the structure together with room for the value. The memory comes back cleared,
     * which is the equivalent of setting:
     *	WT_UPDATE.txnid = WT_TXN_NONE;
     *	WT_UPDATE.durable_ts = WT_TS_NONE;
     *	WT_UPDATE.start_ts = WT_TS_NONE;
     *	WT_UPDATE.prepare_state = WT_PREPARE_INIT;
     *	WT_UPDATE.flags = 0;
     */
    WT_RET(__wt_calloc(session, 1, allocsz, &upd));

    /* Copy any value into place. */
    if (has_value) {
        upd->size = WT_STORE_SIZE(value->size);
        memcpy(upd->data, value->data, value->size);
    }
    upd->type = (uint8_t)modify_type;

    *updp = upd;
    if (sizep != NULL)
        *sizep = WT_UPDATE_MEMSIZE(upd);

    return (0);
}
/*
* __wt_upd_alloc_tombstone --
* Allocate a tombstone update.
*/