/*
* Copyright © 2021-2022 John Viega
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Name: mmm.h
* Description: Miniature memory manager: a malloc wrapper to support
* linearization and safe reclamation for my hash tables.
*
* Author: John Viega, john@zork.org
*/
#ifndef __MMM_H__
#define __MMM_H__
#include <hatrack/hatrack_config.h>
#include <hatrack/debug.h>
#include <hatrack/counters.h>
#include <hatrack/hatomic.h>
#include <stdlib.h>
#include <stdbool.h>
#include <pthread.h>
#include <stdalign.h>
/* This type represents a callback to de-allocate sub-objects before
* the final free for an object allocated via MMM.
*
* We use it whenever objects not allocated via mmm also need deferred
* cleanup. For instance, anything using locks on a per-store basis
* will register this cleanup function, and have the handler run
* through and destroy the locks.
*/
typedef void (*mmm_cleanup_func)(void *, void *);
typedef struct mmm_header_st mmm_header_t;
typedef struct mmm_free_tids_st mmm_free_tids_t;
/* We don't want to keep reservation space for threads that don't need
* it, so we issue each thread a thread ID to keep locally, which is an
* index into the global reservations array.
*
* If desired, threads can decide to "give back" their thread IDs, so
* that they can be re-used, for instance, if the thread exits, or
* decides to switch roles to something that won't require it. If you
* never run out of TID space, this will not get used; see
* HATRACK_THREADS_MAX in config.h.
*/
// clang-format off
extern __thread int64_t mmm_mytid;
extern __thread pthread_once_t mmm_inited;
extern _Atomic uint64_t mmm_epoch;
extern uint64_t mmm_reservations[HATRACK_THREADS_MAX];
/* The header data structure. Note that we keep a linked list of
* "retired" records, which is the purpose of the field 'next'. The
* 'data' field is a mere convenience for returning the non-hidden
* part of an allocation.
*
* Note that, when we perform an overwrite, we want to preserve
* ordering based on the original creation time. Since we also want to
* be able to free the old record, we cache that original creation
* time in the new record's create_epoch field.
*
*/
// clang-format off
struct mmm_header_st {
    alignas(16)
    mmm_header_t *next;
    _Atomic uint64_t create_epoch;
    _Atomic uint64_t write_epoch;
    uint64_t retire_epoch;
    mmm_cleanup_func cleanup;
    void *cleanup_aux; // Data needed for cleanup, usually the object
    alignas(16)
    uint8_t data[];
};
struct mmm_free_tids_st {
    mmm_free_tids_t *next;
    uint64_t tid;
};
void mmm_register_thread (void);
void mmm_reset_tids (void);
void mmm_retire (void *);
void mmm_clean_up_before_exit(void);
#ifdef HATRACK_DEBUG
static inline void hatrack_debug_mmm(void *, char *);
#define DEBUG_MMM(x, y) hatrack_debug_mmm((void *)(x), y)
#ifdef HATRACK_MMM_DEBUG
#define DEBUG_MMM_INTERNAL(x, y) DEBUG_MMM(x, y)
#else
#define DEBUG_MMM_INTERNAL(x, y)
#endif
#else
#define DEBUG_MMM(x, y)
#define DEBUG_MMM_INTERNAL(x, y)
#endif
#ifdef HATRACK_MMMALLOC_CTRS
#define HATRACK_MALLOC_CTR() HATRACK_CTR(HATRACK_CTR_MALLOCS)
#define HATRACK_FREE_CTR() HATRACK_CTR(HATRACK_CTR_FREES)
#define HATRACK_RETIRE_UNUSED_CTR() HATRACK_CTR(HATRACK_CTR_RETIRE_UNUSED)
#else
#define HATRACK_MALLOC_CTR()
#define HATRACK_FREE_CTR()
#define HATRACK_RETIRE_UNUSED_CTR()
#endif
// clang-format on
/* This epoch system was inspired by my research into what was out
* there that would be faster and easier to use than hazard
* pointers. In particular, IBR (Interval Based Reclamation) seemed to
* be about what I was looking for at the time, although I've since
* had to adapt it to my needs.
*
* In both algorithms, readers commit to an epoch at the time they
* first enter an "operation". But with my algorithm, writers commit
* to a read epoch when they first start reading the data structure,
* yet don't commit to a "write epoch" until the end of their
* operation. This is important to our ability to have a linear order
* for write operations, because writes may not complete in the order
* they started.
*
* When I had the epiphany that I could fully linearize my hash table
* if the write doesn't "commit" until a specific epoch, and those
* commits are guaranteed to be ordered by epoch, the challenge was to
* do this in a way that's atomic.
*
* To understand issues that arise, consider the typical (pseudo-)code
* for setting an allocation epoch:
*
* write_epoch = atomic_fetch_and_add(epoch, 1) + 1
*
* The extra + 1 here is because a FAA operation returns the original
* value, not the new epoch. The write really should happen on the
* boundary of the old epoch and the new epoch, if there are readers
* looking at that epoch already (since they might have already passed
* by our node in their work).
*
* Actually, that's not strictly true, because we could force every
* access, including reads, to create a new epoch. If we do this, we
* are guaranteed that the write's commitment epoch will always be
* unique, and that any nearby readers will therefore see what they
* should see.
*
* However, if we want to reduce memory barriers on the epoch counter
* (which is desirable), we must consider writes to happen at the END
* of their epoch, since readers that happened to grab the bumped
* epoch before we commit our write could theoretically not see that
* the record was active in their epoch. That's the approach we take
* here, especially considering that, typically, most hash table
* access is done by readers.
*
* Another issue we have with the attempt to commit writes to an epoch
* is that the store into the memory associated with the variable
* write_epoch is NOT atomic (even without the +1). So the committing
* thread could get suspended.
*
* And, while that thread is suspended, a reader that needs
* linearization can come along, trying to read from a later epoch,
* even though our commit hasn't happened yet.
*
* Our solution to this problem is to not allow readers to pass a
* memory record they see where there is no epoch commitment.
* Instead, they must try to help, by also creating a "new" epoch, and
* trying to write it to write_epoch via compare-and-swap. Since all
* readers will do this, and since any CAS failure means some previous
* CAS succeeded, this record will have an epoch before any operation
* tries to process it.
*
* To clarify, for the writer inserting into the table, here's what the
* process looks like:
*
* 1) Allocate the record, with no epoch data set.
*
* 2) Put it in the place where it belongs in a data structure,
* pointing to whatever record it replaces, if it exists.
*
* 3) Bump the epoch counter with fetch-and-add, which returns the
* previous epoch; we add 1 to the result so that our number matches
* the epoch we just created.
*
* 4) Attempt to write the value of the epoch we created into the
* write_epoch field, via compare-and-swap.
*
* 5) If the compare-and-swap fails, no big deal... it means the write
* has a later epoch than the one we were going to give it, but one
* more reflective of when the commit finally occurred. And, there
* is guaranteed linearization before any other thread attempts to read
* the value in this record.
*
* Note that, as I was working this approach out, I realized there is
* a fundamental race condition in the IBR approach as described, which
* makes it inappropriate for any kind of linearization. In the
* simplest version of their algorithm, they have a function
* start_op() meant for every thread to use when starting a series of
* reads OR writes to a data structure. Their pseudo-code for this
* operation is:
*
* start_op(): reservations[tid] = epoch
*
* As with the above code, the entire operation cannot be done
* atomically, so far as I'm aware. That means, the following scenario
* is possible:
*
* 1) Our thread runs start_op(), and reads epoch N.
* 2) Our thread gets SUSPENDED before recording its reservation.
* 3) In epoch N+1, another thread retires a piece of the data structure
* that was alive in epoch N.
* 4) Since no reservations are currently recorded for epoch N, the
* thread deletes the memory it just retired.
* 5) Our thread wakes back up in epoch N+1, and attempts to read data
* that was available in epoch N.
*
* Certainly, in this case, we trust the other thread maintained the
* data structure properly, so that we don't attempt to load a record
* that's been freed. However, it's clear that we cannot even be
* guaranteed that a reader who reserves epoch N will be able to
* access every record that was theoretically alive in epoch N.
*
* For our epoch system, we address this problem by forcing the reader
* to check to make sure the epoch hasn't changed after writing to the
* array (if it hasn't changed, we are safe).
*
* Such a check on its own is not wait-free, because it's possible
* that, every time we go to look at the epoch, a new write has since
* occurred, ad infinitum. We have two options for "enhancing" this
* algorithm to make it wait-free:
*
* 1) We can recognize that this scenario is probably never going to
* cause significant spinning for the thread, and specify that no
* writes are ever allowed under any circumstances, once the epoch
* counter gets to its maximum
* value. The fact that there's a bounded maximum wait time makes
* this algorithm wait-free, even though the bound is exceptionally
* high (2^64).
*
* 2) We can be conservative, and be willing to pay the performance
* penalty to avoid significant spinning in heavy-write
* situations. In such a case, we make a small number of attempts
* (say, 3), and then ask writers to "help" us, with the following
* algorithm:
*
* a) In each item of the "reservations" array, we reserve one
* bit of each thread's reservation field as a "HELP" bit.
*
* When a reader hits the maximum retries, he sets his own
* HELP bit, and atomically bumps a global "help" counter (with
* fetch-and-add).
*
* b) Writers check the global help counter before every
* write. If it's non-zero, then they look at EVERY value in
* the reservation array before commencing any write operation
* (keeping them from creating a new epoch), looking for the
* help bit. If they see the help bit, they attempt to read
* the current epoch, and swap it into the reservation field.
* Then, the thread also checks against the current epoch,
* retrying if the epoch increases. If the swap succeeds, and
* the epoch didn't increase in the process, then the writer
* removes 1 from the global help counter.
*
* Once there's a successful epoch reservation, any writer
* attempting to help continues to proceed through the array,
* until they get to the end of the array, or the help count
* becomes zero (whichever occurs first).
*
* c) The reader alternates between two operations:
*
* i) Attempting to read an epoch from the global epoch counter.
* ii) Reading its reservation bucket (atomically).
*
* If it manages to read an epoch itself, it still has to attempt
* to commit it, and test it as before. If it succeeds in helping
* itself, it also decrements the help counter.
*
* Swaps in this algorithm should always set the "expected" value to
* the "No reservation" value (0), OR'd with the "HELP" bit, which
* would be the most significant bit.
*
* It should be easy to see that this algorithm is also wait-free,
* bounded by the total number of writer threads. In the worst
* case, let's assume there are N writer threads that all start
* writing before thread T asks for help. Any future thread that
* comes through with a second write operation, will NOT bump the
* epoch until it ensures that thread T got a safe epoch. We could
* end up in a situation where all N writer threads are helping T
* get a safe epoch-- in such a case, no thread can possibly be
* incrementing the epoch counter, bounding the work to O(N), thus
* making the algorithm wait-free.
*
* The first algorithm should be more performant in the expected case,
* since checking a "help" counter will generally be a waste. If we
* want to make the second algorithm even less likely to fire, we can
* use a "deferred help" strategy, where writers only help if help is
* still needed when a thread goes through the reclamation process.
*
*
* In the below enumeration:
*
* The first value will be stored in the reservations array when
* threads do not have an epoch they've reserved for reading.
*
* The second is the first value of the epoch counter, which is 1.
* There will never be any data in this epoch (any writer would bump
* the epoch as part of their commit process).
*
* The third value is if we choose to implement the algorithm above to
* help with epoch acquisitions -- it's the value that readers put
* into their reservation to signal for help. This does reduce the
* space for the epoch counter to 63 bits, which is still vastly more
* epochs than any program could ever use in practice (using 1B epochs
* per second, we'd have enough epochs for almost 300 years).
*
* The final value is used when we're looking to free old records. We
* initialize the maximum value seen to this value, then go through
* the reservations array, reducing the maximum value seen as we go
* along. If it's still this value by the time we get to the end,
* then we can delete all our records.
*/
enum64(mmm_enum_t,
       HATRACK_EPOCH_UNRESERVED   = 0xffffffffffffffff,
       HATRACK_EPOCH_FIRST        = 0x0000000000000001,
       HATRACK_F_RESERVATION_HELP = 0x8000000000000000,
       HATRACK_EPOCH_MAX          = 0xffffffffffffffff);
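/* To illustrate the reservation scan described above, here is a
* minimal sketch (NOT the actual implementation, which lives in
* mmm.c) of how a thread might compute the lowest reserved epoch
* before freeing retired records. Everything other than
* mmm_reservations, HATRACK_THREADS_MAX and HATRACK_EPOCH_MAX is a
* hypothetical name used only for illustration:
*
*   static uint64_t
*   example_lowest_reservation(void)
*   {
*       uint64_t lowest = HATRACK_EPOCH_MAX;
*       uint64_t i;
*       for (i = 0; i < HATRACK_THREADS_MAX; i++) {
*           if (mmm_reservations[i] < lowest) {
*               lowest = mmm_reservations[i];
*           }
*       }
*       // Records retired strictly before 'lowest' are safe to free.
*       // If lowest is still HATRACK_EPOCH_MAX, no thread holds a
*       // reservation, so every retired record can be freed.
*       return lowest;
*   }
*/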
/* This is a macro that allows us to access our hidden header. Note
* that, while this is straightforward, I did run into an issue where
* the definition of mmm_header_t would end up getting a different
* layout depending on the module.
*
* The solution was to force alignment of fields as needed in
* mmm_header_t (above).
*/
static inline mmm_header_t *
mmm_get_header(void *ptr)
{
    return (mmm_header_t *)(((uint8_t *)ptr) - sizeof(mmm_header_t));
}
#ifdef HATRACK_DEBUG
/* Conceptually, this might belong in the debug subsystem. However,
* this is a specific debug interface for outputting the epoch info
* associated with an MMM allocation, and as such should live
* independently from the more generic debug mechanism.
*
* As with the debug routines in the debug subsystem, we've sacrificed
* readability for performance, since allocations can be VERY frequent
* in test code.
*
* Specifically, I was originally just using sprintf() and it was
* leading to a dramatic slowdown when debugging was on.
*/
static inline void
hatrack_debug_mmm(void *addr, char *msg)
{
    char buf[HATRACK_DEBUG_MSG_SIZE] = {
        '0',
        'x',
    };
    static const char rest[] = " :) :r , :w , :c( :x0";
    const char *r = &rest[0];
    mmm_header_t *hdr = mmm_get_header(addr);
    char *p;
    uint64_t i;
    uintptr_t n;
    p = buf + (HATRACK_PTR_CHRS + 3 * HATRACK_EPOCH_DEBUG_LEN + sizeof(rest));
    strncpy(p, msg, HATRACK_DEBUG_MSG_SIZE - (p - buf));
    *--p = *r++;
    *--p = *r++;
    *--p = *r++;
    n = hdr->retire_epoch;
    for (i = 0; i < HATRACK_EPOCH_DEBUG_LEN; i++) {
        *--p = __hatrack_hex_conversion_table[n & 0xf];
        n >>= 4;
    }
    *--p = *r++;
    *--p = *r++;
    *--p = *r++;
    *--p = *r++;
    *--p = *r++;
    *--p = *r++;
    n = hdr->write_epoch;
    for (i = 0; i < HATRACK_EPOCH_DEBUG_LEN; i++) {
        *--p = __hatrack_hex_conversion_table[n & 0xf];
        n >>= 4;
    }
    *--p = *r++;
    *--p = *r++;
    *--p = *r++;
    *--p = *r++;
    *--p = *r++;
    n = hdr->create_epoch;
    for (i = 0; i < HATRACK_EPOCH_DEBUG_LEN; i++) {
        *--p = __hatrack_hex_conversion_table[n & 0xf];
        n >>= 4;
    }
    *--p = *r++;
    *--p = *r++;
    *--p = *r++;
    *--p = *r++;
    *--p = *r++;
    *--p = *r++;
    n = (intptr_t)addr;
    for (i = 0; i < HATRACK_PTR_CHRS; i++) {
        *--p = __hatrack_hex_conversion_table[n & 0xf];
        n >>= 4;
    }
    *--p = *r++;
    *--p = *r++;
    hatrack_debug(p);
    return;
}
#endif
/* We stick our read reservation in mmm_reservations[mmm_mytid]. By
* doing this, we are guaranteeing that we will only read data alive
* during or after this epoch, until we remove our reservation.
*
* It does NOT guarantee that we won't read data written after the
* reserved epoch, and does not ensure linearization (instead, use
* mmm_start_linearized_op).
*
* The call to pthread_once() would be better placed in thread
* initialization, but since we want to be as agnostic as
* possible to the threading environment, we'll go ahead and pay the
* (admittedly very small) cost of the implied test with each
* operation.
*/
static inline void
mmm_start_basic_op(void)
{
    pthread_once(&mmm_inited, mmm_register_thread);
    mmm_reservations[mmm_mytid] = atomic_load(&mmm_epoch);
    return;
}
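/* A minimal usage sketch (the caller and its store type are
* hypothetical, not part of this API): a reader brackets its
* traversal with mmm_start_basic_op() and mmm_end_op(), which
* guarantees that records reachable during the operation will not be
* freed out from under it while the reservation is held.
*
*   void
*   example_basic_read(my_store_t *store)
*   {
*       mmm_start_basic_op();
*       // ... read records reachable from 'store'; anything retired
*       // during or after our reserved epoch stays allocated ...
*       mmm_end_op();
*   }
*/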
/* mmm_start_linearized_op() is used to help ensure we can safely recover
* a consistent, full ordering of data objects in the lohat family.
*
* 1) When getting a read epoch that needs to be fully ordered, we
* make a reservation. But, we can't rely on that reservation alone
* for linearization. Particularly, if we're very slow to write our
* reservation to the reservations array, items could get retired
* in the same epoch, and freed, all before we complete our
* reservation (if the epoch advances while we're suspended, for
* instance).
*
* That is, once the reservation is in place, it ensures nothing
* that's still in the table from that epoch or later is
* removed. However, this race condition means that it does not
* ensure that everything EVER alive in that epoch is still available.
*
* Checking the reservation again will give us a useful upper
* bound. We know that nothing retired from that epoch on will be
* freed before we're done with our linearization operation. But,
* there might be a write operation in the current epoch, and that
* operation might actually end up delayed.
*
* However! We have a restriction that there can be at most ONE
* write per epoch. Therefore, we can go ahead and use this epoch
* to linearize. Either we will see the new write, in which case
* our linearization point is conceptually the instant after the
* write (but before any retires), or we won't see the new write,
* in which case our linearization point is effectively the very
* end of the previous epoch, after any retires.
*
* Even if there were retires in this new epoch, it won't matter;
* retires in the same epoch of creation never happen, since the
* writer reserves an epoch equal to or previous to that epoch, and
* creates a new epoch at the end. And, by this point, we will
* have a reservation in the table, so writers in future epochs
* won't free from underneath us either.
*
* When full ordering is necessary, readers should use the function
* mmm_start_linearized_op(), instead of mmm_start_op(). They do
* still use mmm_end_op() to end their operation and give up their
* reservation.
*
* 2) Whenever the reader finds a record w/ no write epoch, it must
* try to help get the record a definitive epoch before proceeding.
* If the stalled thread beats the linearized reader, we will see
* an epoch lower than ours, and will then read the
* record. Otherwise, the record write epoch will be higher than
* our read epoch, and we will not read. As part of our helping, we
* bump the write epoch, as all writes must occur on an epoch
* boundary, with only one write per epoch.
*/
static inline uint64_t
mmm_start_linearized_op(void)
{
    uint64_t read_epoch;
    pthread_once(&mmm_inited, mmm_register_thread);
    mmm_reservations[mmm_mytid] = atomic_load(&mmm_epoch);
    read_epoch = atomic_load(&mmm_epoch);
    HATRACK_YN_CTR_NORET(read_epoch == mmm_reservations[mmm_mytid],
                         HATRACK_CTR_LINEAR_EPOCH_EQ);
    return read_epoch;
}
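/* Sketch of a fully ordered read, per the comment above (the record
* type and traversal are hypothetical). The reader helps commit any
* record that lacks a write epoch, then only consumes records whose
* write epoch is at or before its read epoch:
*
*   void
*   example_linearized_read(my_record_t *rec)
*   {
*       uint64_t read_epoch = mmm_start_linearized_op();
*       if (!mmm_get_write_epoch(rec)) {
*           mmm_help_commit(rec); // make sure the record has an epoch
*       }
*       if (mmm_get_write_epoch(rec) <= read_epoch) {
*           // The record was committed by our linearization point,
*           // so it belongs in the view we return.
*       }
*       mmm_end_op();
*   }
*/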
/* This does two things:
*
* 1) Inserts a compiler barrier to make sure the rest of the operation
* is ordered AFTER everything that came before it.
*
* 2) Removes our reservation, indicating we are no longer performing
* a data structure operation.
*
* Note that the mmm_start* ops do not require a compiler fence,
* because they both use a memory fence.
*/
static inline void
mmm_end_op(void)
{
    atomic_signal_fence(memory_order_seq_cst);
    mmm_reservations[mmm_mytid] = HATRACK_EPOCH_UNRESERVED;
    return;
}
/* Note that the API for allocating via MMM is a little non-intuitive
* for malloc users, partially because it supports a couple of
* different use cases:
*
* 1) Managing epochs for purposes of safe record management (and
* sorting) when linearization is NOT important. Here, we are
* good to set our write epoch at allocation time.
*
* 2) Managing epochs for both safe memory management, as well as full
* linearization. In such a case, we don't want to commit the write
* time until the end of our operation.
*
* mmm_alloc() is the interface for use case #2. It returns allocated
* memory of the specified size, but without any commitment. The
* calling algorithm must call mmm_commit_write(), defined below.
*
* mmm_alloc_committed() is more like traditional malloc in that you
* don't have to do any additional work, once you call it.
*
* I named the functions this way to help myself minimize programmer
* errors. I felt if I swapped the functions (i.e., using an
* mmm_alloc_uncommitted() function), then I was VERY likely to miss
* errors where I forgot to commit, when building the fully linearized
* algorithms, because the records would end up with epochs... just
* ones that are wrong.
*
* The way it's done here, epochs that never get set tend to lead to
* errors quickly, and we can easily add debugging to check the
* condition if need be (though we need to be cognizant of possible
* 'helpers').
*/
static inline void *
mmm_alloc(uint64_t size)
{
    uint64_t actual_size = sizeof(mmm_header_t) + size;
    mmm_header_t *item = (mmm_header_t *)calloc(1, actual_size);
    HATRACK_MALLOC_CTR();
    DEBUG_MMM_INTERNAL(item->data, "mmm_alloc");
    return (void *)item->data;
}
/*
* Note here that atomic_fetch_add() returns the value before the add,
* so we add one to it when storing our write epoch.
*/
static inline void *
mmm_alloc_committed(uint64_t size)
{
    uint64_t actual_size = sizeof(mmm_header_t) + size;
    mmm_header_t *item = (mmm_header_t *)calloc(1, actual_size);
    atomic_store(&item->write_epoch, atomic_fetch_add(&mmm_epoch, 1) + 1);
    HATRACK_MALLOC_CTR();
    DEBUG_MMM_INTERNAL(item->data, "mmm_alloc_committed");
    return (void *)item->data;
}
/* Cleanup handlers get called right before an allocation is freed.
* They're used for sub-objects that aren't allocated via mmm, such as
* mutex objects.
*/
static inline void
mmm_add_cleanup_handler(void *ptr, mmm_cleanup_func handler, void *aux)
{
    mmm_header_t *header = mmm_get_header(ptr);
    header->cleanup = handler;
    header->cleanup_aux = aux;
    return;
}
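/* For example (a hypothetical store that embeds a pthread mutex, per
* the per-store locks mentioned above): register a handler so the
* mutex is destroyed when the store's memory is finally reclaimed.
* The exact arguments mmm passes to the handler are defined in mmm.c;
* this sketch follows the cleanup_aux comment above ("usually the
* object") and passes the store itself as aux.
*
*   typedef struct {
*       pthread_mutex_t lock;
*       // ... buckets, etc. ...
*   } my_locking_store_t;
*
*   static void
*   my_store_cleanup(void *unused, void *aux)
*   {
*       my_locking_store_t *store = (my_locking_store_t *)aux;
*       pthread_mutex_destroy(&store->lock);
*   }
*
*   // After allocating the store via mmm_alloc_committed():
*   //     mmm_add_cleanup_handler(store, my_store_cleanup, store);
*/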
/* After we've called mmm_alloc(), we call this to set the write epoch.
* Note that we do it via CAS, instead of a direct write, for the sake
* of other threads that need to read the epoch information, in case
* our thread stalls.
*
* Those other threads might need epoch information from our record
* after we've placed a record, but before we've finished this call to
* mmm_commit_write(). If a thread will need epoch information from
* us, it will first call mmm_help_commit(), which is similar, but
* first tests to make sure an epoch hasn't been written.
*/
static inline void
mmm_commit_write(void *ptr)
{
    uint64_t cur_epoch;
    uint64_t expected_value = 0;
    mmm_header_t *item = mmm_get_header(ptr);
    cur_epoch = atomic_fetch_add(&mmm_epoch, 1) + 1;
    /* If this CAS operation fails, it can only be because:
     *
     * 1) We stalled in our write commit, some other thread noticed,
     *    and helped us out.
     *
     * 2) We were trying to help, but either the original thread, or
     *    some other helper got there first.
     *
     * Either way, we're safe to ignore the return value.
     */
    LCAS(&item->write_epoch, &expected_value, cur_epoch, HATRACK_CTR_COMMIT);
    DEBUG_MMM_INTERNAL(ptr, "committed");
    return;
}
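/* Putting mmm_alloc() and mmm_commit_write() together, a writer in
* the linearized use case looks roughly like the sketch below. The
* record type and the slot being CAS'd are hypothetical, and we
* assume the C11 atomics used here are available via hatomic.h:
*
*   bool
*   example_insert(_Atomic(my_record_t *) *slot, my_record_t *expected)
*   {
*       my_record_t *rec = (my_record_t *)mmm_alloc(sizeof(my_record_t));
*       // ... fill in rec's payload ...
*       if (!atomic_compare_exchange_strong(slot, &expected, rec)) {
*           // Never visible to any other thread; free immediately.
*           mmm_retire_unused(rec);
*           return false;
*       }
*       mmm_commit_write(rec); // give the record its write epoch
*       return true;
*   }
*/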
/* Similar to mmm_commit_write(), but called by a different thread on
* a pointer where it wants to ensure there's an epoch value. If it
* doesn't see one, it attempts to install one. Whether it succeeds or
* not, one is guaranteed to be there after the compare-and-swap is
* done.
*/
static inline void
mmm_help_commit(void *ptr)
{
    mmm_header_t *item;
    uint64_t found_epoch;
    uint64_t cur_epoch;
    item = mmm_get_header(ptr);
    found_epoch = item->write_epoch;
    if (!found_epoch) {
        cur_epoch = atomic_fetch_add(&mmm_epoch, 1) + 1;
        LCAS(&item->write_epoch,
             &found_epoch,
             cur_epoch,
             HATRACK_CTR_COMMIT_HELPS);
    }
    return;
}
// Call this when we know no other thread ever could have seen the
// data in question, as it can be freed immediately.
static inline void
mmm_retire_unused(void *ptr)
{
    DEBUG_MMM_INTERNAL(ptr, "mmm_retire_unused");
    HATRACK_RETIRE_UNUSED_CTR();
    free(mmm_get_header(ptr));
    return;
}
static inline uint64_t
mmm_get_write_epoch(void *ptr)
{
    return mmm_get_header(ptr)->write_epoch;
}
static inline void
mmm_set_create_epoch(void *ptr, uint64_t epoch)
{
    mmm_get_header(ptr)->create_epoch = epoch;
}
static inline uint64_t
mmm_get_create_epoch(void *ptr)
{
    mmm_header_t *header = mmm_get_header(ptr);
    return header->create_epoch ? header->create_epoch : header->write_epoch;
}
static inline void
mmm_copy_create_epoch(void *dst, void *src)
{
    mmm_set_create_epoch(dst, mmm_get_create_epoch(src));
}
extern __thread mmm_header_t *mmm_retire_list;
// Use this in migration functions to avoid unnecessary scanning of the
// retire list, when we know the epoch won't have changed.
static inline void
mmm_retire_fast(void *ptr)
{
    mmm_header_t *cell;
    cell = mmm_get_header(ptr);
    cell->retire_epoch = atomic_load(&mmm_epoch);
    cell->next = mmm_retire_list;
    mmm_retire_list = cell;
    return;
}
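/* Sketch of the migration pattern this is intended for (the store and
* record types, and their fields, are hypothetical): every old record
* gets retired in the same pass, so re-checking the retire list per
* record is wasted work, and mmm_retire_fast() suffices. The old
* store itself is retired through the normal mmm_retire() path.
*
*   void
*   example_migrate(my_store_t *old_store, uint64_t num_buckets)
*   {
*       uint64_t i;
*       for (i = 0; i < num_buckets; i++) {
*           my_record_t *rec = old_store->buckets[i].record;
*           if (rec) {
*               // ... copy rec into the new store first ...
*               mmm_retire_fast(rec);
*           }
*       }
*       mmm_retire(old_store);
*   }
*/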
#endif