-
-
Notifications
You must be signed in to change notification settings - Fork 131
/
mormot.core.fpcx64mm.pas
3624 lines (3384 loc) · 135 KB
/
mormot.core.fpcx64mm.pas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/// Fast Memory Manager for FPC x86_64
// - this unit is a part of the Open Source Synopse mORMot framework 2,
// licensed under a MPL/GPL/LGPL three license - see LICENSE.md
unit mormot.core.fpcx64mm;
{
*****************************************************************************
A Multi-thread Friendly Memory Manager for FPC written in x86_64 assembly
- targetting Linux (and Windows) multi-threaded Services
- only for FPC on the x86_64 target - use the RTL MM on Delphi or ARM
- based on proven FastMM4 by Pierre le Riche - with tuning and enhancements
- can report detailed statistics (with threads contention and memory leaks)
- three app modes: default GUI app, FPCMM_SERVER or FPCMM_BOOSTER
Usage: include this unit as the very first in your FPC project uses clause
Why another Memory Manager on FPC?
- The built-in heap.inc is well written and cross-platform and cross-CPU,
but its threadvar arena for small blocks tends to consume a lot of memory
on multi-threaded servers, and has suboptimal allocation performance
- C memory managers (glibc, Intel TBB, jemalloc) have a very high RAM
consumption (especially Intel TBB) and do panic/SIG_KILL on any GPF - but
they were reported to scale better on heavy load with cpu core count > 16
even if getmem() is almost twice faster on single thread with fpcx64mm
- Pascal alternatives (FastMM4,ScaleMM2,BrainMM) are Windows+Delphi specific
- Our lockess round-robin of tiny blocks and freemem bin list are unique
algorithms among Memory Managers, and match modern CPUs and workloads
- It was so fun diving into SSE2 x86_64 assembly and Pierre's insight
- Resulting code is still easy to understand and maintain
DISCLAMER: seems stable on Linux and Win64 but feedback is welcome!
*****************************************************************************
}
(*
In practice, write in your main project (.dpr/.lpr) source:
uses
{$I mormot.uses.inc} // may include fpcx64mm or fpclibcmm
sysutils,
mormot.core.base,
...
Then define either FPC_X64MM or FPC_LIBCMM conditional.
If both are set, FPC_64MM will be used on x86_64, and FPC_LIBCMM otherwise.
*)
{ ---- Ready-To-Use Scenarios for Memory Manager Tuning }
{
TL;DR:
1. default settings target GUI/console almost-mono-threaded apps;
2. define FPCMM_SERVER for a multi-threaded service/daemon;
3. try FPCMM_BOOSTER on high-end hardware;
4. try mormot.core.fpclibcmm as POSIX alternative.
}
// target a multi-threaded service on a modern CPU
// - define FPCMM_DEBUG, FPCMM_ASSUMEMULTITHREAD, FPCMM_ERMS
// - currently mormot2tests run with no contention when FPCMM_SERVER is set :)
{.$define FPCMM_SERVER}
// increase settings for more aggressive multi-threaded process
// - tiny blocks will up to 256 bytes (instead of 128 bytes);
// - will enable FPCMM_SMALLNOTWITHMEDIUM to reduce medium sleeps.
{.$define FPCMM_BOOST}
// target high-end CPU when FPCMM_SERVER/FPCMM_BOOST are not enough
// - will use 128 arenas for <= 256B blocks to scale on high number of cores;
// - enable FPCMM_MULTIPLESMALLNOTWITHMEDIUM to reduce small pools locks;
// - enable FPCMM_TINYPERTHREAD to assign threads to the 128 arenas.
{.$define FPCMM_BOOSTER}
{ ---- Fine Grained Memory Manager Tuning }
// includes more detailed information to WriteHeapStatus()
{.$define FPCMM_DEBUG}
// on thread contention, don't spin executing "pause" but directly call Sleep()
// - may help on a single core CPU, or for very specific workloads
{.$define FPCMM_NOPAUSE}
// let FPCMM_DEBUG include SleepCycles information from rdtsc
// and FPCMM_PAUSE call rdtsc for its spinnning loop
// - since rdtsc is emulated so unrealiable on VM, it is disabled by default
{.$define FPCMM_SLEEPTSC}
// checks leaks and write them to the console at process shutdown
// - only basic information will be included: more debugging information (e.g.
// call stack) may be gathered using heaptrc or valgrid
{.$define FPCMM_REPORTMEMORYLEAKS}
// won't check the IsMultiThread global, but assume it is true
// - multi-threaded apps (e.g. a Server Daemon instance) will be faster with it
// - mono-threaded (console/LCL) apps are faster without this conditional
{.$define FPCMM_ASSUMEMULTITHREAD}
// won't use mremap but a regular getmem/move/freemem pattern for large blocks
// - depending on the actual system (e.g. on a VM), mremap may be slower
// - will disable Linux mremap() or Windows following block VirtualQuery/Alloc
{.$define FPCMM_NOMREMAP}
// customize mmap() allocation strategy
{.$define FPCMM_MEDIUM32BIT} // enable MAP_32BIT for AllocMedium() on Linux
{.$define FPCMM_LARGEBIGALIGN} // align large chunks to 21-bit=2MB=PMD_SIZE
// force the tiny/small blocks to be in their own arena, not with medium blocks
// - would use a little more memory, but medium pool is less likely to sleep
// - not defined for FPCMM_SERVER because no performance difference was found
// - defined for FPCMM_BOOST
{.$define FPCMM_SMALLNOTWITHMEDIUM}
// force several tiny/small blocks arenas, not with medium blocks
// - would use a little more memory, but more medium pools could help
// - defined for FPCMM_BOOSTER
{.$define FPCMM_MULTIPLESMALLNOTWITHMEDIUM}
// use the current thread id to identify the arena for a Tiny block GetMem()
// - defined for FPCMM_BOOSTER (requires enough tiny arenas)
// - warning: EXPERIMENTAL Linux and Win64 ONLY, due to very low-level asm trick
{.$define FPCMM_TINYPERTHREAD}
// use "rep movsb/stosd" ERMS for blocks > 256 bytes instead of SSE2 "movaps"
// - ERMS is available since Ivy Bridge, and we use "movaps" for smallest blocks
// (to not slow down older CPUs), so it is safe to enable this on FPCMM_SERVER
{.$define FPCMM_ERMS}
// try "cmp" before "lock cmpxchg" for old processors with huge lock penalty
{.$define FPCMM_CMPBEFORELOCK}
// will export libc-like functions, and not replace the FPC MM
// - e.g. to use this unit as a stand-alone C memory allocator
{.$define FPCMM_STANDALONE}
// this whole unit will compile as void
// - may be defined e.g. when compiled as Design-Time Lazarus package
{.$define FPCMM_DISABLE}
interface
{$ifdef FPC}
// cut-down version of mormot.defines.inc to make this unit standalone
{$mode Delphi}
{$inline on}
{$R-} // disable Range checking
{$S-} // disable Stack checking
{$W-} // disable stack frame generation
{$Q-} // disable overflow checking
{$B-} // expect short circuit boolean
{$ifdef CPUX64}
{$define FPCX64MM} // this unit is for FPC + x86_64 only
{$asmmode Intel}
{$endif CPUX64}
{$ifdef OLDLINUXKERNEL}
{$define FPCMM_NOMREMAP}
{$endif OLDLINUXKERNEL}
{$ifdef FPCMM_BOOSTER}
{$define FPCMM_BOOST}
{$define FPCMM_MULTIPLESMALLNOTWITHMEDIUM}
{$define FPCMM_TINYPERTHREAD}
{$endif FPCMM_BOOSTER}
{$ifdef FPCMM_BOOST}
{$define FPCMM_SERVER}
{$define FPCMM_SMALLNOTWITHMEDIUM}
{$define FPCMM_LARGEBIGALIGN} // bigger blocks implies less reallocation
{$endif FPCMM_BOOST}
{$ifdef FPCMM_SERVER}
{$define FPCMM_DEBUG}
{$define FPCMM_ASSUMEMULTITHREAD}
{$define FPCMM_ERMS}
{$endif FPCMM_SERVER}
{$ifdef FPCMM_BOOSTER}
{$undef FPCMM_DEBUG} // when performance matters more than stats
{$endif FPCMM_BOOSTER}
{$endif FPC}
{$ifdef FPCMM_DISABLE}
{$undef FPCX64MM} // e.g. when compiled as Design-Time Lazarus package
{$endif FPCMM_DISABLE}
{$ifdef FPCX64MM}
// this unit is available only for FPC + X86_64 CPU
// other targets would compile as a void unit
type
/// Arena (middle/large) heap information as returned by CurrentHeapStatus
TMMStatusArena = record
/// how many bytes are currently reserved (mmap) to the Operating System
CurrentBytes: PtrUInt;
/// how many bytes have been reserved (mmap) to the Operating System
CumulativeBytes: PtrUInt;
{$ifdef FPCMM_DEBUG}
/// maximum bytes count reserved (mmap) to the Operating System
PeakBytes: PtrUInt;
/// how many VirtualAlloc/mmap calls to the Operating System did occur
CumulativeAlloc: PtrUInt;
/// how many VirtualFree/munmap calls to the Operating System did occur
CumulativeFree: PtrUInt;
{$endif FPCMM_DEBUG}
/// how many times this Arena did wait from been unlocked by another thread
SleepCount: PtrUInt;
end;
/// heap information as returned by CurrentHeapStatus
TMMStatus = record
/// how many tiny/small memory blocks (<=2600 bytes) are currently allocated
SmallBlocks: PtrUInt;
/// how many bytes of tiny/small memory blocks are currently allocated
// - this size is included in Medium.CurrentBytes value, even if
// FPCMM_SMALLNOTWITHMEDIUM has been defined
SmallBlocksSize: PtrUInt;
/// information about blocks up to 256KB (tiny, small and medium)
// - includes also the memory needed for tiny/small blocks
// - is shared by both small & medium pools even if FPCMM_SMALLNOTWITHMEDIUM
Medium: TMMStatusArena;
/// information about large blocks > 256KB
// - those blocks are directly handled by the Operating System
Large: TMMStatusArena;
{$ifdef FPCMM_DEBUG}
{$ifdef FPCMM_SLEEPTSC}
/// how much rdtsc cycles were spent within SwitchToThread/NanoSleep API
// - we rdtsc since it is an indicative but very fast way of timing on
// direct hardware
// - warning: on virtual machines, the rdtsc opcode is usually emulated so
// these SleepCycles number are non indicative anymore
SleepCycles: PtrUInt;
{$endif FPCMM_SLEEPTSC}
{$endif FPCMM_DEBUG}
/// how many times the Operating System Sleep/NanoSleep API was called
// - should be as small as possible - 0 is perfect
SleepCount: PtrUInt;
/// how many times Getmem() did block and wait for a tiny/small block
// - see also GetSmallBlockContention() for more detailed information
// - by design, our Freemem() can't block thanks to its lock-less free list
SmallGetmemSleepCount: PtrUInt;
end;
PMMStatus = ^TMMStatus;
/// allocate a new memory buffer
// - as FPC default heap, _Getmem(0) returns _Getmem(1)
function _GetMem(size: PtrUInt): pointer;
/// allocate a new zeroed memory buffer
function _AllocMem(Size: PtrUInt): pointer;
/// release a memory buffer
// - returns the allocated size of the supplied pointer (as FPC default heap)
function _FreeMem(P: pointer): PtrUInt;
/// change the size of a memory buffer
// - won't move any data if in-place reallocation is possible
// - as FPC default heap, _ReallocMem(P=nil,Size) maps P := _getmem(Size) and
// _ReallocMem(P,0) maps _Freemem(P)
function _ReallocMem(var P: pointer; Size: PtrUInt): pointer;
/// retrieve the allocated size of a memory buffer
// - equal or greater to the size supplied to _GetMem(), due to MM granularity
function _MemSize(P: pointer): PtrUInt; inline;
/// retrieve high-level statistics about the current memory manager state
// - see also GetSmallBlockContention for detailed small blocks information
// - standard GetHeapStatus and GetFPCHeapStatus gives less accurate information
// (only CurrHeapSize and MaxHeapSize are set), since we don't track "free" heap
// bytes: I can't figure how "free" memory is relevant nowadays - on 21th century
// Operating Systems, memory is virtual, and reserved/mapped by the OS but
// physically hosted in the HW RAM chips only when written the first time -
// GetHeapStatus information made sense on MSDOS with fixed 640KB of RAM
// - note that FPC GetHeapStatus and GetFPCHeapStatus is only about the
// current thread (irrelevant for sure) whereas CurrentHeapStatus is global
function CurrentHeapStatus: TMMStatus;
{$ifdef FPCMM_STANDALONE}
/// should be called before using any memory function
procedure InitializeMemoryManager;
/// should be called to finalize this memory manager process and release all RAM
procedure FreeAllMemory;
{$undef FPCMM_DEBUG} // excluded FPC-specific debugging
/// IsMultiThread global variable is not correct outside of the FPC RTL
{$define FPCMM_ASSUMEMULTITHREAD}
/// not supported to reduce dependencies and console writing
{$undef FPCMM_REPORTMEMORYLEAKS}
{$else}
type
/// one GetSmallBlockContention info about unexpected multi-thread waiting
TSmallBlockContention = packed record
/// how many times a small block Getmem() has been waiting for unlock
GetmemSleepCount: PtrUInt;
/// the small block size on which Getmem() has been blocked
GetmemBlockSize: PtrUInt;
/// not used in GetSmallBlockContention() context - reserved for future use
Reserved: PtrUInt;
end;
/// small blocks detailed information as returned GetSmallBlockContention
TSmallBlockContentionDynArray = array of TSmallBlockContention;
/// one GetSmallBlockStatus information
TSmallBlockStatus = packed record
/// how many times a memory block of this size has been allocated
Total: PtrUInt;
/// how many memory blocks of this size are currently allocated
Current: PtrUInt;
/// the standard size of the small memory block
BlockSize: PtrUInt;
end;
/// small blocks detailed information as returned GetSmallBlockStatus
TSmallBlockStatusDynArray = array of TSmallBlockStatus;
/// sort order of detailed information as returned GetSmallBlockStatus
TSmallBlockOrderBy = (
obTotal,
obCurrent,
obBlockSize);
/// retrieve the use counts of allocated small blocks
// - returns maxcount biggest results, sorted by "orderby" field occurrence
function GetSmallBlockStatus(maxcount: integer = 10;
orderby: TSmallBlockOrderBy = obTotal; count: PPtrUInt = nil; bytes: PPtrUInt = nil;
small: PCardinal = nil; tiny: PCardinal = nil): TSmallBlockStatusDynArray;
/// retrieve all small blocks which suffered from blocking during multi-thread
// - returns maxcount biggest results, sorted by SleepCount Occurrence
function GetSmallBlockContention(
maxcount: integer = 10): TSmallBlockContentionDynArray;
/// convenient debugging function into the console
// - if smallblockcontentioncount > 0, includes GetSmallBlockContention() info
// up to the smallblockcontentioncount biggest occurrences
// - see also RetrieveMemoryManagerInfo from mormot.core.log for runtime call
procedure WriteHeapStatus(const context: ShortString = '';
smallblockstatuscount: integer = 8; smallblockcontentioncount: integer = 8;
compilationflags: boolean = false);
/// convenient debugging function into a string
// - if smallblockcontentioncount > 0, includes GetSmallBlockContention() info
// up to the smallblockcontentioncount biggest occurrences
// - see also RetrieveMemoryManagerInfo from mormot.core.log for more details
// - warning: this function is not thread-safe
function GetHeapStatus(const context: ShortString; smallblockstatuscount,
smallblockcontentioncount: integer; compilationflags, onsameline: boolean): string;
const
/// human readable information about how our MM was built
// - similar to WriteHeapStatus(compilationflags=true) output
FPCMM_FLAGS = ' '
{$ifdef FPCMM_BOOSTER} + 'BOOSTER ' {$else}
{$ifdef FPCMM_BOOST} + 'BOOST ' {$else}
{$ifdef FPCMM_SERVER} + 'SERVER ' {$endif}
{$endif FPCMM_BOOST}
{$endif FPCMM_BOOSTER}
{$ifdef FPCMM_ASSUMEMULTITHREAD} + ' assumulthrd' {$endif}
{$ifdef FPCMM_PAUSE} + ' pause' {$endif}
{$ifdef FPCMM_SLEEPTSC} + ' rdtsc' {$endif}
{$ifndef BSD}
{$ifdef FPCMM_NOMREMAP} + ' nomremap' {$endif}
{$endif BSD}
{$ifdef FPCMM_SMALLNOTWITHMEDIUM}+ ' smallpool'
{$ifdef FPCMM_MULTIPLESMALLNOTWITHMEDIUM} + 's' {$endif} {$endif}
{$ifdef FPCMM_TINYPERTHREAD} + ' perthrd' {$endif}
{$ifdef FPCMM_ERMS} + ' erms' {$endif}
{$ifdef FPCMM_DEBUG} + ' debug' {$endif}
{$ifdef FPCMM_REPORTMEMORYLEAKS} + ' repmemleak' {$endif};
{$endif FPCMM_STANDALONE}
{$endif FPCX64MM}
implementation
{
High-level Allocation Strategy Description
--------------------------------------------
The allocator handles the following families of memory blocks:
- TINY <= 128 B (<= 256 B for FPCMM_BOOST)
Round-robin distribution into several arenas, fed from one or several pool(s)
(fair scaling from multi-threaded calls, with no threadvar nor GC involved)
- SMALL <= 2600 B
One arena per block size, fed from one or several pool(s)
- MEDIUM <= 256 KB
Separated pool of bitmap-marked chunks, fed from 1MB of OS mmap/virtualalloc
- LARGE > 256 KB
Directly fed from OS mmap/virtualalloc with mremap when growing
The original FastMM4 was enhanced as such, especially in FPCMM_SERVER mode:
- FPC compatibility, even on POSIX/Linux, also for FPC specific API behavior;
- Memory leaks and thread contention tracked without performance impact;
- Detailed per-block statistics with little performance penalty;
- x86_64 code was refactored and tuned in respect to 2020's hardware;
- Inlined SSE2 movaps loop or ERMS are more efficient that subfunction(s);
- New round-robin thread-friendly arenas of tiny blocks;
- Those arenas can be configured by size, and assigned by thread ID;
- Tiny and small blocks can fed from their own pool(s), not the medium pool;
- Lock-less free lists to reduce tiny/small/medium Freemem thread contention;
- Large blocks logic has been rewritten, especially realloc;
- AllocMedium() and AllocLarge() use MAP_POPULATE to reduce page faults;
- On Linux, mremap is used for efficient realloc of large blocks;
- Largest blocks can grow by 2MB=PMD_SIZE chunks for even faster mremap.
About locking:
- Tiny and Small blocks have their own per-size lock;
- Tiny and Small blocks have per-pool lock when feeding;
- Lock-less free lists reduce tiny/small Getmem/Freemem thread contention;
- Lock-less free lists reduce medium Freemem thread contention;
- Medium and Large blocks have one giant lock over their own pool;
- Medium blocks have an unlocked prefetched memory chunk to reduce contention;
- Large blocks don't lock during mmap/virtualalloc system calls;
- SwitchToThread/FpNanoSleep OS call is done after initial spinning;
- FPCMM_DEBUG / WriteHeapStatus helps identifying the lock contention(s).
}
{$ifdef FPCX64MM}
// this unit is available only for FPC + X86_64 CPU
{$ifndef FPCMM_NOPAUSE}
// on contention problem, execute "pause" opcode and spin retrying the lock
// - defined by default to follow Intel recommendatations from
// https://software.intel.com/content/www/us/en/develop/articles/benefitting-power-and-performance-sleep-loops.html
// - spinning loop is either using constants or rdtsc (if FPCMM_SLEEPTSC is set)
// - on SkylakeX (Intel 7th gen), "pause" opcode went from 10-20 to 140 cycles
// so our constants below will favor those latest CPUs with a longer pause
{$define FPCMM_PAUSE}
{$endif FPCMM_NOPAUSE}
{$ifdef FPCMM_MULTIPLESMALLNOTWITHMEDIUM}
{$define FPCMM_SMALLNOTWITHMEDIUM}
{$endif FPCMM_MULTIPLESMALLNOTWITHMEDIUM}
{ ********* Operating System Specific API Calls }
{$ifdef MSWINDOWS}
// Win64: any assembler function with sub-calls should have a stack frame
// -> nostackframe is defined only on Linux or for functions with no nested call
{$undef NOSFRAME}
const
kernel32 = 'kernel32.dll';
MEM_COMMIT = $1000;
MEM_RESERVE = $2000;
MEM_RELEASE = $8000;
MEM_FREE = $10000;
MEM_TOP_DOWN = $100000;
PAGE_READWRITE = 4;
PAGE_GUARD = $0100;
PAGE_VALID = $00e6; // PAGE_READONLY or PAGE_READWRITE or PAGE_EXECUTE or
// PAGE_EXECUTE_READ or PAGE_EXECUTE_READWRITE or PAGE_EXECUTE_WRITECOPY
type
// VirtualQuery() API result structure
TMemInfo = record
BaseAddress, AllocationBase: pointer;
AllocationProtect: cardinal;
PartitionId: word;
RegionSize: PtrUInt;
State, Protect, MemType: cardinal;
end;
function VirtualAlloc(lpAddress: pointer;
dwSize: PtrUInt; flAllocationType, flProtect: cardinal): pointer;
stdcall; external kernel32 name 'VirtualAlloc';
function VirtualFree(lpAddress: pointer; dwSize: PtrUInt;
dwFreeType: cardinal): LongBool;
stdcall; external kernel32 name 'VirtualFree';
procedure SwitchToThread;
stdcall; external kernel32 name 'SwitchToThread';
function VirtualQuery(lpAddress, lpMemInfo: pointer; dwLength: PtrUInt): PtrUInt;
stdcall; external kernel32 name 'VirtualQuery';
function AllocMedium(Size: PtrInt): pointer; inline;
begin
// bottom-up allocation to reduce fragmentation
result := VirtualAlloc(nil, Size, MEM_COMMIT, PAGE_READWRITE);
end;
function AllocLarge(Size: PtrInt): pointer; inline;
begin
// FastMM4 uses top-down allocation (MEM_TOP_DOWN) of large blocks to "reduce
// fragmentation", but on a 64-bit system I am not sure of this statement, and
// VirtualAlloc() was reported to have a huge slowdown due to this option
// https://randomascii.wordpress.com/2011/08/05/making-virtualalloc-arbitrarily-slower
result := VirtualAlloc(nil, Size, MEM_COMMIT, PAGE_READWRITE);
end;
procedure FreeMediumLarge(ptr: pointer; Size: PtrInt); inline;
begin
VirtualFree(ptr, 0, MEM_RELEASE);
end;
{$ifndef FPCMM_NOMREMAP}
function RemapLarge(addr: pointer; old_len, new_len: size_t): pointer;
var
meminfo: TMemInfo;
next: pointer;
nextsize: PtrUInt;
begin
// old_len and new_len have 64KB granularity, so match Windows page size
nextsize := new_len - old_len;
if PtrInt(nextsize) > 0 then
begin
// try to allocate the memory just after the existing one
FillChar(meminfo, SizeOf(meminfo), 0);
next := addr + old_len;
if (VirtualQuery(next, @meminfo, SizeOf(meminfo)) = SizeOf(meminfo)) and
(meminfo.State = MEM_FREE) and
(meminfo.RegionSize >= nextsize) and // enough space?
// set the address space in two reserve + commit steps for thread safety
(VirtualAlloc(next, nextsize, MEM_RESERVE, PAGE_READWRITE) <> nil) and
(VirtualAlloc(next, nextsize, MEM_COMMIT, PAGE_READWRITE) <> nil) then
begin
result := addr; // in-place realloc: no need to move memory :)
exit;
end;
end;
// we need to use the slower but safe Alloc/Move/Free pattern
result := AllocLarge(new_len);
if new_len > old_len then
new_len := old_len; // handle size up or down
Move(addr^, result^, new_len); // RTL non-volatile asm or our AVX MoveFast()
FreeMediumLarge(addr, old_len);
end;
{$endif FPCMM_NOMREMAP}
// aligning large chunks > 4MB to 2MB units seems always a good idea
{$define FPCMM_LARGEBIGALIGN}
// experimental VirtualQuery detection of object class - use at your own risk
{$define FPCMM_REPORTMEMORYLEAKS_EXPERIMENTAL}
{$else}
uses
{$ifndef DARWIN}
syscall,
{$endif DARWIN}
BaseUnix;
// in practice, SYSV ABI seems to not require a stack frame, as Win64 does, for
// our use case of nested calls with no local stack storage and direct kernel
// syscalls - but since it is clearly undocumented, we set it on LINUX only
// -> appears to work with no problem from our tests: feedback is welcome!
// -> see FPCMM_NOSFRAME conditional to disable it on LINUX
{$ifdef LINUX}
{$define NOSFRAME}
{$else}
{$define OLDLINUXKERNEL} // no Linuxism on BSD
{$undef FPCMM_TINYPERTHREAD} // no inlined pthread_self on BSD
{$endif LINUX}
// on Linux, mremap() on PMD_SIZE=2MB aligned data can make a huge speedup
// see https://lwn.net/Articles/833208 - so FPCMM_LARGEBIGALIGN is always set
{$ifdef LINUX}
{$define FPCMM_LARGEBIGALIGN} // align large chunks to 21-bit = 2MB = PMD_SIZE
{$endif LINUX}
// we directly call the OS Kernel, so this unit doesn't require any libc
const
{$ifdef OLDLINUXKERNEL}
{$undef FPCMM_MEDIUM32BIT}
MAP_POPULATE = 0;
{$else}
/// put the mapping in first 2 GB of memory (31-bit addresses) - 2.4.20, 2.6
MAP_32BIT = $40;
/// populate (prefault) pagetables to avoid page faults later - 2.5.46
MAP_POPULATE = $08000;
{$endif OLDLINUXKERNEL}
// tiny/small/medium blocks mmap() flags
// - MAP_POPULATE is included to enhance performance on single thread app, and
// also on heavily multi-threaded process (but perhaps not with few threads)
// - FPCMM_MEDIUM32BIT allocates as 31-bit pointers, but may be incompatible
// with TOrmTable for data >256KB so requires NOPOINTEROFFSET conditional,
// therefore is not set by default
MAP_MEDIUM = MAP_PRIVATE or MAP_ANONYMOUS or MAP_POPULATE
{$ifdef FPCMM_MEDIUM32BIT} or MAP_32BIT {$endif};
// large blocks mmap() flags
// - no MAP_32BIT since could use the whole 64-bit address space
// - MAP_POPULATE is included on Linux to avoid page faults, with
// no penalty since mmap/mremap are called outside the large blocks lock
MAP_LARGE = MAP_PRIVATE or MAP_ANONYMOUS or MAP_POPULATE;
{$ifdef FPCMM_MEDIUM32BIT}
var
AllocMediumflags: integer = MAP_MEDIUM;
{$else}
AllocMediumflags = MAP_MEDIUM;
{$endif FPCMM_MEDIUM32BIT}
function AllocMedium(Size: PtrInt): pointer;
begin
result := fpmmap(nil, Size, PROT_READ or PROT_WRITE, AllocMediumflags, -1, 0);
if result = MAP_FAILED then
result := nil; // as VirtualAlloc()
{$ifdef FPCMM_MEDIUM32BIT}
if (result <> nil) or
((AllocMediumflags and MAP_32BIT) = 0) then
exit;
// try with no 2GB limit from now on
AllocMediumflags := AllocMediumflags and not MAP_32BIT;
result := AllocMedium(Size); // try with no 2GB limit from now on
{$endif FPCMM_MEDIUM32BIT}
end;
function AllocLarge(Size: PtrInt): pointer; inline;
begin
result := fpmmap(nil, Size, PROT_READ or PROT_WRITE, MAP_LARGE, -1, 0);
if result = MAP_FAILED then
result := nil; // as VirtualAlloc()
end;
procedure FreeMediumLarge(ptr: pointer; Size: PtrInt); inline;
begin
fpmunmap(ptr, Size);
end;
{$ifdef LINUX}
{$ifndef FPCMM_NOMREMAP}
const
syscall_nr_mremap = 25; // valid on x86_64 Linux and Android
MREMAP_MAYMOVE = 1;
function RemapLarge(addr: pointer; old_len, new_len: size_t): pointer;
begin
// let the Linux Kernel mremap() the memory using its TLB magic
result := pointer(do_syscall(syscall_nr_mremap, TSysParam(addr),
TSysParam(old_len), TSysParam(new_len), TSysParam(MREMAP_MAYMOVE)));
if result <> MAP_FAILED then
exit;
// some OS (e.g. Alma Linux 9 with 5.x kernel) seems to fail sometimes :(
// https://github.com/ClickHouse/ClickHouse/issues/52955#issuecomment-1664710083
// -> it should not, because we use the MREMAP_MAYMOVE flag - but anyway...
// -> fallback to safe, simple (and slower) Alloc/Move/Free pattern
result := AllocLarge(new_len);
if result = nil then
exit; // out of memory
if new_len > old_len then
new_len := old_len; // resize down
Move(addr^, result^, new_len); // RTL non-volatile asm or our AVX MoveFast()
FreeMediumLarge(addr, old_len);
end;
{$endif FPCMM_NOMREMAP}
// experimental detection of object class - use at your own risk
{$define FPCMM_REPORTMEMORYLEAKS_EXPERIMENTAL}
// (untested on BSD/DARWIN)
{$else}
{$define FPCMM_NOMREMAP} // mremap is a Linux-specific syscall
{$endif LINUX}
procedure SwitchToThread;
var
t: Ttimespec;
begin
// note: nanosleep() adds a few dozen of microsecs for context switching
t.tv_sec := 0;
t.tv_nsec := 10; // empirically identified on a recent Linux Kernel
fpnanosleep(@t, nil);
end;
function pthread_self: PtrUInt; external;
{$endif MSWINDOWS}
// fallback to safe and simple Alloc/Move/Free pattern
{$ifdef FPCMM_NOMREMAP}
function RemapLarge(addr: pointer; old_len, new_len: size_t): pointer;
begin
result := AllocLarge(new_len);
if new_len > old_len then
new_len := old_len; // resize down
Move(addr^, result^, new_len); // RTL non-volatile asm or our AVX MoveFast()
FreeMediumLarge(addr, old_len);
end;
{$undef FPCMM_LARGEBIGALIGN} // keep 64KB granularity if no mremap()
{$endif FPCMM_NOMREMAP}
{ ********* Some Assembly Helpers }
// low-level conditional to disable nostackframe code on Linux
{$ifdef FPCMM_NOSFRAME}
{$undef NOSFRAME}
{$endif FPCMM_NOSFRAME}
var
HeapStatus: TMMStatus;
{$ifdef FPCMM_DEBUG}
procedure ReleaseCore;
{$ifdef NOSFRAME} nostackframe; {$endif} assembler;
asm
{$ifdef FPCMM_SLEEPTSC}
rdtsc // returns the TSC in EDX:EAX
shl rdx, 32
or rax, rdx
push rax
call SwitchToThread
pop rcx
rdtsc
shl rdx, 32
or rax, rdx
lea rdx, [rip + HeapStatus]
sub rax, rcx
lock add qword ptr [rdx + TMMStatus.SleepCycles], rax
{$else}
call SwitchToThread
lea rdx, [rip + HeapStatus]
{$endif FPCMM_SLEEPTSC}
lock inc qword ptr [rdx + TMMStatus.SleepCount]
end;
{$else}
procedure ReleaseCore;
begin
SwitchToThread;
inc(HeapStatus.SleepCount); // indicative counter
end;
{$endif FPCMM_DEBUG}
procedure NotifyArenaAlloc(var Arena: TMMStatusArena; Size: PtrUInt);
nostackframe; assembler;
asm
{$ifdef FPCMM_DEBUG}
lock add qword ptr [Arena].TMMStatusArena.CurrentBytes, Size
lock add qword ptr [Arena].TMMStatusArena.CumulativeBytes, Size
lock inc qword ptr [Arena].TMMStatusArena.CumulativeAlloc
mov rax, qword ptr [Arena].TMMStatusArena.CurrentBytes
cmp rax, qword ptr [Arena].TMMStatusArena.PeakBytes
jbe @s
mov qword ptr [Arena].TMMStatusArena.PeakBytes, rax
@s: {$else}
add qword ptr [Arena].TMMStatusArena.CurrentBytes, Size
add qword ptr [Arena].TMMStatusArena.CumulativeBytes, Size
{$endif FPCMM_DEBUG}
end;
procedure NotifyMediumLargeFree(var Arena: TMMStatusArena; Size: PtrUInt);
nostackframe; assembler;
asm
neg Size
{$ifdef FPCMM_DEBUG}
lock add qword ptr [Arena].TMMStatusArena.CurrentBytes, Size
lock inc qword ptr [Arena].TMMStatusArena.CumulativeFree
{$else}
add qword ptr [Arena].TMMStatusArena.CurrentBytes, Size
{$endif FPCMM_DEBUG}
end;
{ ********* Constants and Data Structures Definitions }
// during spinning, there is clearly thread contention: in this case, plain
// "cmp" before "lock cmpxchg" is mandatory to leverage the CPU cores
{$define FPCMM_CMPBEFORELOCK_SPIN}
// prepare a Medium arena chunk in TMediumInfo.Prefetch outside of the lock
{$define FPCMM_MEDIUMPREFETCH}
const
// define maximum size of tiny blocks, and the number of arenas
{$ifdef FPCMM_BOOSTER}
NumTinyBlockTypesPO2 = 4; // tiny are <= 256 bytes
NumTinyBlockArenasPO2 = 7; // 128 arenas
{$else}
{$ifdef FPCMM_BOOST}
NumTinyBlockTypesPO2 = 4; // tiny are <= 256 bytes
NumTinyBlockArenasPO2 = 3; // 8 arenas
{$else}
// default (or FPCMM_SERVER) settings
NumTinyBlockTypesPO2 = 3; // multiple arenas for tiny blocks <= 128 bytes
NumTinyBlockArenasPO2 = 3; // 8 round-robin arenas (including Small[])
{$endif FPCMM_BOOST}
{$endif FPCMM_BOOSTER}
NumSmallBlockTypes = 46;
NumSmallBlockTypesUnique = NumSmallBlockTypes - 2; // last 2 are redundant
MaximumSmallBlockSize = 2608;
SmallBlockSizes: array[0..NumSmallBlockTypes - 1] of word = (
16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256,
272, 288, 304, 320, 352, 384, 416, 448, 480, 528, 576, 624, 672, 736, 800,
880, 960, 1056, 1152, 1264, 1376, 1504, 1648, 1808, 1984, 2176, 2384,
MaximumSmallBlockSize, MaximumSmallBlockSize, MaximumSmallBlockSize);
NumTinyBlockTypes = 1 shl NumTinyBlockTypesPO2; // 8 (128B) or 16 (256B)
NumTinyBlockArenas = (1 shl NumTinyBlockArenasPO2) - 1; // -1 = main Small[]
NumSmallInfoBlock = NumSmallBlockTypes + NumTinyBlockArenas * NumTinyBlockTypes;
SmallBlockGranularity = 16;
TargetSmallBlocksPerPool = 48;
MinimumSmallBlocksPerPool = 12;
SmallBlockDownsizeCheckAdder = 64;
SmallBlockUpsizeAdder = 32;
SmallBlockTypePO2 = 6; // SizeOf(TSmallBlockType)=64
MediumBlockPoolSizeMem = 20 * 64 * 1024;
MediumBlockPoolSize = MediumBlockPoolSizeMem - 16;
MediumBlockSizeOffset = 48;
MinimumMediumBlockSize = 11 * 256 + MediumBlockSizeOffset;
MediumBlockBinsPerGroup = 32;
MediumBlockBinGroupCount = 32;
MediumBlockBinCount = MediumBlockBinGroupCount * MediumBlockBinsPerGroup;
MediumBlockGranularity = 256;
MaximumMediumBlockSize =
MinimumMediumBlockSize + (MediumBlockBinCount - 1) * MediumBlockGranularity;
OptimalSmallBlockPoolSizeLowerLimit =
29 * 1024 - MediumBlockGranularity + MediumBlockSizeOffset;
OptimalSmallBlockPoolSizeUpperLimit =
64 * 1024 - MediumBlockGranularity + MediumBlockSizeOffset;
MaximumSmallBlockPoolSize =
OptimalSmallBlockPoolSizeUpperLimit + MinimumMediumBlockSize;
MediumInPlaceDownsizeLimit = MinimumMediumBlockSize div 4;
IsFreeBlockFlag = 1;
IsMediumBlockFlag = 2;
IsSmallBlockPoolInUseFlag = 4;
IsLargeBlockFlag = 4;
PreviousMediumBlockIsFreeFlag = 8;
LargeBlockIsSegmented = 8;
DropSmallFlagsMask = -8;
ExtractSmallFlagsMask = 7;
DropMediumAndLargeFlagsMask = -16;
ExtractMediumAndLargeFlagsMask = 15;
{$ifdef FPCMM_SLEEPTSC}
// pause using rdtsc (30 cycles latency on hardware but emulated on VM)
SpinMediumLockTSC = 10000;
SpinLargeLockTSC = 10000;
{$ifdef FPCMM_PAUSE}
SpinSmallGetmemLockTSC = 1000;
{$endif FPCMM_PAUSE}
{$else}
// pause with constant spinning counts (empirical values from fastmm4-avx)
SpinMediumLockCount = 2500;
SpinLargeLockCount = 5000;
{$ifdef FPCMM_PAUSE}
SpinSmallGetmemLockCount = 500;
{$endif FPCMM_PAUSE}
SpinMediumFreememLockCount = 500;
{$endif FPCMM_SLEEPTSC}
{$ifdef FPCMM_ERMS}
// pre-ERMS expects at least 256 bytes, IvyBridge+ with ERMS is good from 64
// (copy_user_enhanced_fast_string() in recent Linux kernel uses 64)
// see https://stackoverflow.com/a/43837564/458259 for explanations and timing
// -> "movaps" loop is used up to 256 bytes of data: good on all CPUs
// -> "movnt" Move/MoveFast is used for large blocks: always faster than ERMS
ErmsMinSize = 256;
{$endif FPCMM_ERMS}
type
PSmallBlockPoolHeader = ^TSmallBlockPoolHeader;
// information for each small block size - 64 bytes long = CPU cache line
TSmallBlockType = record
Locked: boolean;
AllowedGroupsForBlockPoolBitmap: byte;
BlockSize: Word;
MinimumBlockPoolSize: Word;
OptimalBlockPoolSize: Word;
NextPartiallyFreePool: PSmallBlockPoolHeader;
PreviousPartiallyFreePool: PSmallBlockPoolHeader;
NextSequentialFeedBlockAddress: pointer;
MaxSequentialFeedBlockAddress: pointer;
CurrentSequentialFeedPool: PSmallBlockPoolHeader;
GetmemCount: cardinal;
FreememCount: cardinal;
LockLessFree: pointer;
end;
PSmallBlockType = ^TSmallBlockType;
TSmallBlockTypes = array[0..NumSmallBlockTypes - 1] of TSmallBlockType;
TTinyBlockTypes = array[0..NumTinyBlockTypes - 1] of TSmallBlockType;
TSmallBlockInfo = record
Small: TSmallBlockTypes;
Tiny: array[0..NumTinyBlockArenas - 1] of TTinyBlockTypes;
GetmemLookup: array[0..
(MaximumSmallBlockSize - 1) div SmallBlockGranularity] of byte;
// safe access to IsMultiThread global variable - accessed via GOT sub-call
IsMultiThreadPtr: PBoolean;
{$ifndef FPCMM_TINYPERTHREAD}
TinyCurrentArena: integer;
{$endif FPCMM_TINYPERTHREAD}
GetmemSleepCount: array[0..NumSmallBlockTypesUnique - 1] of cardinal;
{$ifdef FPCMM_MULTIPLESMALLNOTWITHMEDIUM} // PMediumBlockInfo lookup
SmallMediumBlockInfo: array[0..NumSmallInfoBlock - 1] of pointer;
// here because there was no room for a new field in TSmallBlockType
{$endif FPCMM_MULTIPLESMALLNOTWITHMEDIUM}
end;
TSmallBlockPoolHeader = record
BlockType: PSmallBlockType;
NextPartiallyFreePool: PSmallBlockPoolHeader;
PreviousPartiallyFreePool: PSmallBlockPoolHeader;
FirstFreeBlock: pointer;
BlocksInUse: cardinal;
SmallBlockPoolSignature: cardinal;
FirstBlockPoolPointerAndFlags: PtrUInt;
end;
PMediumBlockPoolHeader = ^TMediumBlockPoolHeader;
TMediumBlockPoolHeader = record
PreviousMediumBlockPoolHeader: PMediumBlockPoolHeader;
NextMediumBlockPoolHeader: PMediumBlockPoolHeader;
Reserved1: PtrUInt;
FirstMediumBlockSizeAndFlags: PtrUInt;
end;
PMediumFreeBlock = ^TMediumFreeBlock;
TMediumFreeBlock = record
PreviousFreeBlock: PMediumFreeBlock;
NextFreeBlock: PMediumFreeBlock;
end;
PMediumBlockInfo = ^TMediumBlockInfo;
TMediumBlockInfo = record
Locked: boolean;
{$ifdef FPCMM_MEDIUMPREFETCH}
PrefetchLocked: boolean;
{$endif FPCMM_MEDIUMPREFETCH}
PoolsCircularList: TMediumBlockPoolHeader;
LastSequentiallyFed: pointer;
SequentialFeedBytesLeft: cardinal;
BinGroupBitmap: cardinal;
{$ifdef FPCMM_MEDIUMPREFETCH}
Prefetch: pointer;
{$endif FPCMM_MEDIUMPREFETCH}
{$ifndef FPCMM_ASSUMEMULTITHREAD}
IsMultiThreadPtr: PBoolean; // safe access to IsMultiThread global variable
{$endif FPCMM_ASSUMEMULTITHREAD}
LockLessFree: pointer;
BinBitmaps: array[0..MediumBlockBinGroupCount - 1] of cardinal;
Bins: array[0..MediumBlockBinCount - 1] of TMediumFreeBlock;
end;
PLargeBlockHeader = ^TLargeBlockHeader;
TLargeBlockHeader = record
PreviousLargeBlockHeader: PLargeBlockHeader;
NextLargeBlockHeader: PLargeBlockHeader;
Reserved: PtrUInt;
BlockSizeAndFlags: PtrUInt;
end;
const
BlockHeaderSize = SizeOf(pointer);
SmallBlockPoolHeaderSize = SizeOf(TSmallBlockPoolHeader);
SmallBlockTypeSize = SizeOf(TSmallBlockType);
MediumBlockPoolHeaderSize = SizeOf(TMediumBlockPoolHeader);
LargeBlockHeaderSize = SizeOf(TLargeBlockHeader);
LargeBlockGranularity = 1 shl 16; // 64KB for (smallest) large blocks
{$ifdef FPCMM_LARGEBIGALIGN}
LargeBlockGranularity2 = 1 shl 21; // PMD_SIZE=2MB granularity
LargeBlockGranularity2Size = 2 shl 21; // for size >= 4MB
// on Linux, mremap() on PMD_SIZE=2MB aligned data can make a huge speedup
{$endif FPCMM_LARGEBIGALIGN}
var
SmallBlockInfo: TSmallBlockInfo;
MediumBlockInfo: TMediumBlockInfo;
{$ifdef FPCMM_SMALLNOTWITHMEDIUM}
{$ifdef FPCMM_MULTIPLESMALLNOTWITHMEDIUM}
SmallMediumBlockInfo: array[0.. (NumTinyBlockTypes * 2) - 2] of TMediumBlockInfo;
// -2 to ensure same small block size won't share the same medium block