diff --git a/src/core/mormot.core.crypto.asmx64.inc b/src/core/mormot.core.crypto.asmx64.inc
index 72b8bb04d..5c392d3d4 100644
--- a/src/core/mormot.core.crypto.asmx64.inc
+++ b/src/core/mormot.core.crypto.asmx64.inc
@@ -3243,13 +3243,13 @@ end;
 
 // AesNiEncryptCtrNist32() expects the CTR in lowest 32-bit to never overflow
 procedure AesNiEncryptCtrNist(src, dest: PByte; len: cardinal;
-  ctxt, iv: PAesBlock); inline;
+  ctxt, iv: PHash128Rec); inline;
 var
   ctr, blocks: cardinal;
 begin
-  ctr := bswap32(PCardinal(@iv[12])^);
+  ctr := bswap32(iv.c3);
   repeat
-    blocks := len shr 4;
+    blocks := len shr AesBlockShift;
     inc(ctr, blocks);
     if ctr < blocks then
     begin
@@ -3258,10 +3258,10 @@ begin
       ctr := 0;
     end;
     AesNiEncryptCtrNist32(src, dest, blocks, ctxt, iv); // 32-bit CTR asm
-    PCardinal(@iv[12])^ := bswap32(ctr);
+    iv.c3 := bswap32(ctr);
     if ctr = 0 then
-      CtrNistCarry12(iv); // propagate carry
-    blocks := blocks shl 4;
+      CtrNistCarry12(@iv.b); // propagate carry
+    blocks := blocks shl AesBlockShift;
     inc(src, blocks);
     inc(dest, blocks);
     dec(len, blocks);
@@ -3858,7 +3858,7 @@ var
 begin
   ctr := bswap32(PCardinal(@ctxt.fIV[12])^);
   repeat
-    blocks := len shr 4;
+    blocks := len shr AesBlockShift;
     inc(ctr, blocks);
     if ctr < blocks then
     begin
@@ -3870,7 +3870,7 @@ begin
     PCardinal(@ctxt.fIV[12])^ := bswap32(ctr);
     if ctr = 0 then
       CtrNistCarry12(@ctxt.fIV); // propagate carry
-    blocks := blocks shl 4;
+    blocks := blocks shl AesBlockShift;
     inc(src, blocks);
     inc(dest, blocks);
     dec(len, blocks);
@@ -4413,6 +4413,403 @@ procedure AesNiDecryptCfbCrc256(src, dest, aes: pointer; blocks: PtrUInt);
         {$endif WIN64ABI}
 end;
 
+{$ifdef USEGCMAVX}
+
+// compute the table of hash key powers used by GcmAvxAuth() and GcmAvxGetTag()
+procedure GcmAvxInit(ptab, ks: pointer; rounds: cardinal);
+{$ifdef FPC} nostackframe; assembler; asm {$else} asm .noframe {$endif}
+        // rcx/rdi=ptab, rdx/rsi=ks, r8/rdx=rounds (Win64/SysV)
+        movdqa  xmm15, dqword ptr [rip + @bswapMask] // NOTE(review): xmm11-xmm15 are
+        movdqa  xmm14, dqword ptr [rip + @gcmPoly]   // callee-saved on Win64, not saved
+        movdqu  xmm0, dqword ptr [ks]  // round key 0 = whitened all-zero block
+        movdqu  xmm11, dqword ptr [ks + 10H]  // round keys are read 16 bytes apart
+        aesenc  xmm0, xmm11
+        movdqu  xmm11, dqword ptr [ks + 20H]
+        aesenc  xmm0, xmm11
+        movdqu  xmm11, dqword ptr [ks + 30H]
+        aesenc  xmm0, xmm11
+        movdqu  xmm11, dqword ptr [ks + 40H]
+        aesenc  xmm0, xmm11
+        movdqu  xmm11, dqword ptr [ks + 50H]
+        aesenc  xmm0, xmm11
+        movdqu  xmm11, dqword ptr [ks + 60H]
+        aesenc  xmm0, xmm11
+        movdqu  xmm11, dqword ptr [ks + 70H]
+        aesenc  xmm0, xmm11
+        movdqu  xmm11, dqword ptr [ks + 80H]
+        aesenc  xmm0, xmm11
+        movdqu  xmm11, dqword ptr [ks + 90H]
+        aesenc  xmm0, xmm11
+        movdqu  xmm11, dqword ptr [ks + 0A0H]
+        cmp     rounds, 12  // flags stay valid: movdqu/aesenc do not modify them
+        jc      @last       // rounds < 12
+        // end of AES-128
+        aesenc  xmm0, xmm11
+        movdqu  xmm11, dqword ptr [ks + 0B0H]
+        aesenc  xmm0, xmm11
+        movdqu  xmm11, dqword ptr [ks + 0C0H]
+        jz      @last       // rounds = 12, still from the cmp above
+        // end of AES-192
+        aesenc  xmm0, xmm11
+        movdqu  xmm11, dqword ptr [ks + 0D0H]
+        aesenc  xmm0, xmm11
+        movdqu  xmm11, dqword ptr [ks + 0E0H]
+        // end of AES-256
+@last:  aesenclast xmm0, xmm11  // xmm0 = zero block encrypted with the ks round keys
+        pshufb  xmm0, xmm15     // reverse its 16 bytes
+        pshufd  xmm11, xmm0, 0FFH  // broadcast the top dword
+        movdqu  xmm12, xmm0
+        psrad   xmm11, 31       // all ones if the top bit was set
+        pand    xmm11, xmm14    // -> gcmPoly or zero
+        psrld   xmm12, 31       // carry bit of each dword
+        pslldq  xmm12, 4        // moved up into the next dword
+        pslld   xmm0, 1
+        pxor    xmm0, xmm11
+        pxor    xmm0, xmm12     // xmm0 = 128-bit shl 1, xored with gcmPoly on carry
+        movdqu  dqword ptr [ptab + 0E0H], xmm0  // last table entry
+        pshufd  xmm1, xmm0, 4EH   // swap both qwords
+        pxor    xmm1, xmm0        // low xor high halves, for the middle Karatsuba product
+        movdqu  dqword ptr [ptab + 0F0H], xmm1
+        movdqu  xmm2, xmm0
+        movdqu  xmm3, xmm1
+        mov     al, 7             // 7 remaining entries to compute
+        {$ifdef FPC} align 16 {$else} .align 16 {$endif}
+@init:  movdqu  xmm11, xmm2       // xmm2/xmm3 = previously stored entry
+        movdqu  xmm12, xmm2
+        movdqu  xmm13, xmm3
+        // pclmulqdq xmm11, xmm0, 00H     @ 66 44: 0F 3A 44. D8, 00
+        // pclmulqdq xmm12, xmm0, 11H     @ 66 44: 0F 3A 44. E0, 11
+        // pclmulqdq xmm13, xmm1, 00H     @ 66 44: 0F 3A 44. E9, 00
+        db $66, $44, $0F, $3A, $44, $D8, $00
+        db $66, $44, $0F, $3A, $44, $E0, $11
+        db $66, $44, $0F, $3A, $44, $E9, $00
+        pxor    xmm13, xmm11
+        pxor    xmm13, xmm12      // middle term of the product
+        movdqu  xmm4, xmm13
+        pslldq  xmm4, 8
+        psrldq  xmm13, 8
+        pxor    xmm11, xmm4       // xmm11 = low 128 bits of the product
+        pxor    xmm12, xmm13      // xmm12 = high 128 bits of the product
+        movdqu  xmm2, xmm14       // first reduction step with gcmPoly
+        // pclmulqdq xmm2, xmm11, 01H        @ 66 41: 0F 3A 44. D3, 01
+        db $66, $41, $0F, $3A, $44, $D3, $01
+        pshufd  xmm11, xmm11, 4EH
+        pxor    xmm11, xmm2
+        movdqu  xmm2, xmm14       // second reduction step with gcmPoly
+        // pclmulqdq xmm2, xmm11, 01H        @ 66 41: 0F 3A 44. D3, 01
+        db $66, $41, $0F, $3A, $44, $D3, $01
+        pshufd  xmm11, xmm11, 4EH
+        pxor    xmm2, xmm11
+        pxor    xmm2, xmm12       // xmm2 = previous entry * xmm0, reduced
+        movdqu  dqword ptr [ptab + 0C0H], xmm2
+        pshufd  xmm3, xmm2, 4EH
+        pxor    xmm3, xmm2        // low xor high halves of the new entry
+        movdqu  dqword ptr [ptab + 0D0H], xmm3
+        lea     ptab, [ptab - 20H]  // next entry is stored 20H bytes below
+        dec     al
+        jne     @init
+        ret
+        {$ifdef FPC} align 16 {$else} .align 16 {$endif}
+@bswapMask: // pshufb mask reversing the 16 bytes of a register
+        dq $08090A0B0C0D0E0F
+        dq $0001020304050607
+@gcmPoly:   // constant used by both reduction steps
+        dq $0000000000000001
+        dq $C200000000000000
+end;
+
+// update the GMAC hash with datalen bytes, using 8x interleaved pclmulqdq
+procedure GcmAvxAuth(ptab, data: pointer; datalen: PtrInt; hash: pointer);
+{$ifdef FPC} nostackframe; assembler; asm {$else} asm .noframe {$endif}
+        // rcx/rdi=ptab, rdx/rsi=data, r8/rdx=datalen, r9/rcx=hash (Win64/SysV)
+        movdqu  xmm8, dqword ptr [hash]  // running hash, kept byte-reversed in memory
+        movdqa  xmm15, dqword ptr [rip + @bswapMask] // NOTE(review): xmm6-xmm15 are
+        movdqa  xmm14, dqword ptr [rip + @gcmPoly]   // callee-saved on Win64, not saved
+        test    datalen, datalen
+        jz      @done
+        cmp     datalen, 128
+        jc      @by1             // less than 8 blocks
+        {$ifdef FPC} align 16 {$else} .align 16 {$endif}
+@by8:   sub     datalen, 128     // process 8 blocks per iteration
+        movdqu  xmm0, dqword ptr [data]
+        movdqu  xmm1, dqword ptr [data + 10H]
+        movdqu  xmm2, dqword ptr [data + 20H]
+        movdqu  xmm3, dqword ptr [data + 30H]
+        movdqu  xmm4, dqword ptr [data + 40H]
+        movdqu  xmm5, dqword ptr [data + 50H]
+        movdqu  xmm6, dqword ptr [data + 60H]
+        movdqu  xmm7, dqword ptr [data + 70H]
+        lea     data, [data + 80H]
+        pshufb  xmm0, xmm15      // byte-reverse each input block
+        pshufb  xmm1, xmm15
+        pshufb  xmm2, xmm15
+        pshufb  xmm3, xmm15
+        pshufb  xmm4, xmm15
+        pshufb  xmm5, xmm15
+        pshufb  xmm6, xmm15
+        pshufb  xmm7, xmm15
+        pxor    xmm0, xmm8       // fold the running hash into the first block
+        movdqu  xmm8, dqword ptr [ptab]  // first table entry pairs with the first block
+        movdqu  xmm10, dqword ptr [ptab + 10H]
+        movdqu  xmm9, xmm8
+        pshufd  xmm12, xmm0, 4EH
+        pxor    xmm12, xmm0
+        // pclmulqdq xmm8, xmm0, 00H         66 44: 0F 3A 44. C0, 00
+        // pclmulqdq xmm9, xmm0, 11H         66 44: 0F 3A 44. C8, 11
+        // pclmulqdq xmm10, xmm12, 00H       66 45: 0F 3A 44. D4, 00
+        db $66, $44, $0F, $3A, $44, $C0, $00
+        db $66, $44, $0F, $3A, $44, $C8, $11
+        db $66, $45, $0F, $3A, $44, $D4, $00
+        movdqu  xmm12, dqword ptr [ptab + 20H]  // xmm8/xmm9/xmm10 accumulate low/high/middle
+        movdqu  xmm13, xmm12
+        // pclmulqdq xmm12, xmm1, 00H        66 44: 0F 3A 44. E1, 00
+        db $66, $44, $0F, $3A, $44, $E1, $00
+        pxor    xmm8, xmm12
+        // pclmulqdq xmm13, xmm1, 11H        66 44: 0F 3A 44. E9, 11
+        db $66, $44, $0F, $3A, $44, $E9, $11
+        pxor    xmm9, xmm13
+        pshufd  xmm12, xmm1, 4EH
+        pxor    xmm1, xmm12
+        movdqu  xmm12, dqword ptr [ptab + 30H]
+        // pclmulqdq xmm12, xmm1, 00H        66 44: 0F 3A 44. E1, 00
+        db $66, $44, $0F, $3A, $44, $E1, $00
+        pxor    xmm10, xmm12
+        movdqu  xmm12, dqword ptr [ptab + 40H]
+        movdqu  xmm13, xmm12
+        // pclmulqdq xmm12, xmm2, 00H        66 44: 0F 3A 44. E2, 00
+        db $66, $44, $0F, $3A, $44, $E2, $00
+        pxor    xmm8, xmm12
+        // pclmulqdq xmm13, xmm2, 11H        66 44: 0F 3A 44. EA, 11
+        db $66, $44, $0F, $3A, $44, $EA, $11
+        pxor    xmm9, xmm13
+        pshufd  xmm12, xmm2, 4EH
+        pxor    xmm2, xmm12
+        movdqu  xmm12, dqword ptr [ptab + 50H]
+        // pclmulqdq xmm12, xmm2, 00H        66 44: 0F 3A 44. E2, 00
+        db $66, $44, $0F, $3A, $44, $E2, $00
+        pxor    xmm10, xmm12
+        movdqu  xmm12, dqword ptr [ptab + 60H]
+        movdqu  xmm13, xmm12
+        // pclmulqdq xmm12, xmm3, 00H        66 44: 0F 3A 44. E3, 00
+        db $66, $44, $0F, $3A, $44, $E3, $00
+        pxor    xmm8, xmm12
+        // pclmulqdq xmm13, xmm3, 11H        66 44: 0F 3A 44. EB, 11
+        db $66, $44, $0F, $3A, $44, $EB, $11
+        pxor    xmm9, xmm13
+        pshufd  xmm12, xmm3, 4EH
+        pxor    xmm3, xmm12
+        movdqu  xmm12, dqword ptr [ptab + 70H]
+        // pclmulqdq xmm12, xmm3, 00H        66 44: 0F 3A 44. E3, 00
+        db $66, $44, $0F, $3A, $44, $E3, $00
+        pxor    xmm10, xmm12
+        movdqu  xmm12, dqword ptr [ptab + 80H]
+        movdqu  xmm13, xmm12
+        // pclmulqdq xmm12, xmm4, 00H        66 44: 0F 3A 44. E4, 00
+        db $66, $44, $0F, $3A, $44, $E4, $00
+        pxor    xmm8, xmm12
+        // pclmulqdq xmm13, xmm4, 11H        66 44: 0F 3A 44. EC, 11
+        db $66, $44, $0F, $3A, $44, $EC, $11
+        pxor    xmm9, xmm13
+        pshufd  xmm12, xmm4, 4EH
+        pxor    xmm4, xmm12
+        movdqu  xmm12, dqword ptr [ptab + 90H]
+        // pclmulqdq xmm12, xmm4, 00H        66 44: 0F 3A 44. E4, 00
+        db $66, $44, $0F, $3A, $44, $E4, $00
+        pxor    xmm10, xmm12
+        movdqu  xmm12, dqword ptr [ptab + 0A0H]
+        movdqu  xmm13, xmm12
+        // pclmulqdq xmm12, xmm5, 00H        66 44: 0F 3A 44. E5, 00
+        db $66, $44, $0F, $3A, $44, $E5, $00
+        pxor    xmm8, xmm12
+        // pclmulqdq xmm13, xmm5, 11H        66 44: 0F 3A 44. ED, 11
+        db $66, $44, $0F, $3A, $44, $ED, $11
+        pxor    xmm9, xmm13
+        pshufd  xmm12, xmm5, 4EH
+        pxor    xmm5, xmm12
+        movdqu  xmm12, dqword ptr [ptab + 0B0H]
+        // pclmulqdq xmm12, xmm5, 00H        66 44: 0F 3A 44. E5, 00
+        db $66, $44, $0F, $3A, $44, $E5, $00
+        pxor    xmm10, xmm12
+        movdqu  xmm12, dqword ptr [ptab + 0C0H]
+        movdqu  xmm13, xmm12
+        // pclmulqdq xmm12, xmm6, 00H        66 44: 0F 3A 44. E6, 00
+        db $66, $44, $0F, $3A, $44, $E6, $00
+        pxor    xmm8, xmm12
+        // pclmulqdq xmm13, xmm6, 11H        66 44: 0F 3A 44. EE, 11
+        db $66, $44, $0F, $3A, $44, $EE, $11
+        pxor    xmm9, xmm13
+        pshufd  xmm12, xmm6, 4EH
+        pxor    xmm6, xmm12
+        movdqu  xmm12, dqword ptr [ptab + 0D0H]
+        // pclmulqdq xmm12, xmm6, 00H        66 44: 0F 3A 44. E6, 00
+        db $66, $44, $0F, $3A, $44, $E6, $00
+        pxor    xmm10, xmm12
+        movdqu  xmm12, dqword ptr [ptab + 0E0H]
+        movdqu  xmm13, xmm12
+        // pclmulqdq xmm12, xmm7, 00H        66 44: 0F 3A 44. E7, 00
+        db $66, $44, $0F, $3A, $44, $E7, $00
+        pxor    xmm8, xmm12
+        // pclmulqdq xmm13, xmm7, 11H        66 44: 0F 3A 44. EF, 11
+        db $66, $44, $0F, $3A, $44, $EF, $11
+        pxor    xmm9, xmm13
+        pshufd  xmm12, xmm7, 4EH
+        pxor    xmm7, xmm12
+        movdqu  xmm12, dqword ptr [ptab + 0F0H]
+        // pclmulqdq xmm12, xmm7, 00H        66 44: 0F 3A 44. E7, 00
+        db $66, $44, $0F, $3A, $44, $E7, $00
+        pxor    xmm10, xmm12
+        pxor    xmm10, xmm8      // merge the three accumulators into a 256-bit value
+        pxor    xmm10, xmm9
+        movdqu  xmm11, xmm10
+        psrldq  xmm10, 8
+        pslldq  xmm11, 8
+        pxor    xmm9, xmm10      // xmm9 = high 128 bits
+        pxor    xmm8, xmm11      // xmm8 = low 128 bits
+        movdqu  xmm11, xmm14     // first reduction step with gcmPoly
+        // pclmulqdq xmm11, xmm8, 01H        66 45: 0F 3A 44. D8, 01
+        db $66, $45, $0F, $3A, $44, $D8, $01
+        pshufd  xmm8, xmm8, 4EH
+        pxor    xmm8, xmm11
+        movdqu  xmm11, xmm14     // second reduction step with gcmPoly
+        // pclmulqdq xmm11, xmm8, 01H        66 45: 0F 3A 44. D8, 01
+        db $66, $45, $0F, $3A, $44, $D8, $01
+        pshufd  xmm8, xmm8, 4EH
+        pxor    xmm8, xmm11
+        pxor    xmm8, xmm9       // xmm8 = new running hash
+        cmp     datalen, 128
+        jnc     @by8
+@by1:   movdqu  xmm12, dqword ptr [ptab + 0E0H]  // one block at a time: last table entry
+        movdqu  xmm13, dqword ptr [ptab + 0F0H]
+        cmp     datalen, 16
+        jc      @sml
+        {$ifdef FPC} align 16 {$else} .align 16 {$endif}
+@next:  sub     datalen, 16
+        movdqu  xmm0, dqword ptr [data]
+@s:     pshufb  xmm0, xmm15      // also entered from @ins with the padded last block
+        pxor    xmm0, xmm8
+        movdqu  xmm8, xmm12
+        movdqu  xmm10, xmm13
+        movdqu  xmm9, xmm12
+        pshufd  xmm11, xmm0, 4EH
+        pxor    xmm11, xmm0
+        // pclmulqdq xmm8, xmm0, 00H         66 44: 0F 3A 44. C0, 00
+        // pclmulqdq xmm9, xmm0, 11H         66 44: 0F 3A 44. C8, 11
+        // pclmulqdq xmm10, xmm11, 00H       66 45: 0F 3A 44. D3, 00
+        db $66, $44, $0F, $3A, $44, $C0, $00
+        db $66, $44, $0F, $3A, $44, $C8, $11
+        db $66, $45, $0F, $3A, $44, $D3, $00
+        pxor    xmm10, xmm8
+        pxor    xmm10, xmm9
+        movdqu  xmm11, xmm10
+        psrldq  xmm10, 8
+        pslldq  xmm11, 8
+        pxor    xmm9, xmm10
+        pxor    xmm8, xmm11
+        movdqu  xmm11, xmm14
+        // pclmulqdq xmm11, xmm8, 01H        66 45: 0F 3A 44. D8, 01
+        db $66, $45, $0F, $3A, $44, $D8, $01
+        pshufd  xmm8, xmm8, 4EH
+        pxor    xmm8, xmm11
+        movdqu  xmm11, xmm14
+        // pclmulqdq xmm11, xmm8, 01H        66 45: 0F 3A 44. D8, 01
+        db $66, $45, $0F, $3A, $44, $D8, $01
+        pshufd  xmm8, xmm8, 4EH
+        pxor    xmm8, xmm11
+        pxor    xmm8, xmm9
+        lea     data, [data + 10H]
+        cmp     datalen, 16
+        jnc     @next
+@sml:   test    datalen, datalen  // 1..15 trailing bytes left?
+        jz      @done
+        pxor    xmm0, xmm0        // last block is padded with zeros
+        lea     data, [data + datalen - 1]  // points to the last input byte
+        {$ifdef FPC} align 8 {$else} .align 8 {$endif}
+@ins:   pslldq  xmm0, 1           // insert the bytes from last to first
+        pinsrb  xmm0, byte ptr [data], 00H
+        dec     data
+        dec     datalen
+        jnz     @ins
+        jmp     @s                // datalen = 0 here, so @s ends in @done
+        {$ifdef FPC} align 16 {$else} .align 16 {$endif}
+@bswapMask: // pshufb mask reversing the 16 bytes of a register
+        dq $08090A0B0C0D0E0F
+        dq $0001020304050607
+@gcmPoly:   // constant used by both reduction steps
+        dq $0000000000000001
+        dq $C200000000000000
+@done:  movdqu  dqword ptr [hash], xmm8  // store back the running hash
+end;
+
+procedure GcmAvxGetTag(ptab, mask, hash: pointer; plen, dlen: PtrInt);
+{$ifdef FPC} nostackframe; assembler; asm {$else} asm .noframe {$endif}
+        // final tag: hash := bswap((hash xor bit lengths) * [ptab+0E0H]) xor mask
+        {$ifdef WIN64ABI}
+        mov     rax, qword ptr [rsp + $28]  // dlen not passed as register
+        push    rsi
+        push    rdi
+        mov     rdi, ptab    // rcx
+        mov     rsi, mask    // rdx
+        mov     rdx, hash    // r8
+        mov     rcx, plen    // r9
+        {$else}
+        mov     rax, r8      // SysV: rdi=ptab, rsi=mask, rdx=hash, rcx=plen, r8=dlen
+        {$endif WIN64ABI}
+        shl     rcx, 3       // plen in bits
+        movdqu  xmm1, dqword ptr [rdx] // only xmm0-xmm5 used: volatile on Win64 + SysV
+        movdqa  xmm5, dqword ptr [rip + @gcmPoly]
+        shl     rax, 3       // dlen in bits
+        movq    xmm0, rcx
+        pinsrq  xmm0, rax, 1 // xmm0 = plen bits (low qword), dlen bits (high qword)
+        pxor    xmm0, xmm1   // fold the running hash into the lengths block
+        movdqu  xmm1, dqword ptr [rdi + 0E0H]
+        movdqu  xmm3, dqword ptr [rdi + 0F0H]
+        movdqu  xmm2, xmm1
+        // pclmulqdq xmm1, xmm0, 00H         66: 0F 3A 44. C8, 00
+        // pclmulqdq xmm2, xmm0, 11H         66: 0F 3A 44. D0, 11
+        db $66, $0F, $3A, $44, $C8, $00
+        db $66, $0F, $3A, $44, $D0, $11
+        pshufd  xmm4, xmm0, 4EH
+        pxor    xmm4, xmm0
+        // pclmulqdq xmm3, xmm4, 00H         66: 0F 3A 44. DC, 00
+        db $66, $0F, $3A, $44, $DC, $00
+        movdqa  xmm0, dqword ptr [rip + @bswapMask] // xmm0 is free from now on
+        pxor    xmm3, xmm1
+        pxor    xmm3, xmm2   // middle term of the product
+        movdqu  xmm4, xmm3
+        psrldq  xmm3, 8
+        pslldq  xmm4, 8
+        pxor    xmm2, xmm3   // xmm2 = high 128 bits
+        pxor    xmm1, xmm4   // xmm1 = low 128 bits
+        movdqu  xmm4, xmm5   // first reduction step with gcmPoly
+        // pclmulqdq xmm4, xmm1, 01H         66: 0F 3A 44. E1, 01
+        db $66, $0F, $3A, $44, $E1, $01
+        pshufd  xmm1, xmm1, 4EH
+        pxor    xmm1, xmm4
+        movdqu  xmm4, xmm5   // second reduction step with gcmPoly
+        // pclmulqdq xmm4, xmm1, 01H         66: 0F 3A 44. E1, 01
+        db $66, $0F, $3A, $44, $E1, $01
+        pshufd  xmm1, xmm1, 4EH
+        pxor    xmm1, xmm4
+        pxor    xmm1, xmm2
+        movdqu  xmm4, dqword ptr [rsi] // mask is loaded before hash is overwritten
+        pshufb  xmm1, xmm0   // back to memory byte order
+        pxor    xmm1, xmm4
+        movdqu  dqword ptr [rdx], xmm1
+        {$ifdef WIN64ABI}
+        pop    rdi
+        pop    rsi
+        {$endif WIN64ABI}
+        ret
+        {$ifdef FPC} align 16 {$else} .align 16 {$endif}
+@bswapMask: // pshufb mask reversing the 16 bytes of a register
+        dq $08090A0B0C0D0E0F
+        dq $0001020304050607
+@gcmPoly:   // constant used by both reduction steps
+        dq $0000000000000001
+        dq $C200000000000000
+end;
+
+{$endif USEGCMAVX}
 
 // compute a := a * b in GF(2^128) using pclmulqdq on WestMere CPUs
 // - three times faster than the pascal version using lookup tables
@@ -4494,6 +4891,10 @@ procedure gf_mul_pclmulqdq(a, b: pointer);
         pxor    xmm0, xmm2
         pshufb  xmm0, xmm10
         movups  dqword ptr [a], xmm0
+        {$ifdef WIN64ABI}
+        pop    rdi
+        pop    rsi
+        {$endif WIN64ABI}
         ret
 {$ifdef FPC} align 16 {$else} .align 16 {$endif}
 @swap:  db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
diff --git a/src/core/mormot.core.crypto.openssl.pas b/src/core/mormot.core.crypto.openssl.pas
index 6f72c8a96..a6cc0a137 100644
--- a/src/core/mormot.core.crypto.openssl.pas
+++ b/src/core/mormot.core.crypto.openssl.pas
@@ -14,7 +14,7 @@
   *****************************************************************************
 
   TL;DR: on x86_64, our mormot.core.crypto.pas asm is stand-alone and faster
-  than OpenSSL for most algorithms, but AES-GCM.
+  than OpenSSL for most algorithms, except AES-GCM (OpenSSL 1.8 vs our 1.5 GB/s).
 
 }
 
@@ -170,11 +170,11 @@   TAesCtrNistOsl = class(TAesAbstractOsl)
   /// OpenSSL AES-GCM cypher/uncypher
   // - implements AEAD (authenticated-encryption with associated-data) process
   // via MacSetNonce/MacEncrypt or AesGcmAad/AesGcmFinal methods
-  // - OpenSSL is faster than our TAesGcm class which is not interleaved:
-  // $  mormot aes-128-gcm in 14.42ms i.e. 173274/s or 368.7 MB/s
-  // $  mormot aes-256-gcm in 16.98ms i.e. 147206/s or 313.2 MB/s
+  // - OpenSSL is faster than our TAesGcm class, but not so much:
   // $  openssl aes-128-gcm in 2.86ms i.e. 874125/s or 1.8 GB/s
   // $  openssl aes-256-gcm in 3.43ms i.e. 727590/s or 1.5 GB/s
+  // $  mormot aes-128-gcm in 3.45ms i.e. 722752/s or 1.5 GB/s
+  // $  mormot aes-256-gcm in 4.11ms i.e. 607385/s or 1.2 GB/s
   TAesGcmOsl = class(TAesGcmAbstract)
   protected
     fAes: TAesOsl;
diff --git a/src/core/mormot.core.crypto.pas b/src/core/mormot.core.crypto.pas
index 59c1089d3..5dcb639db 100644
--- a/src/core/mormot.core.crypto.pas
+++ b/src/core/mormot.core.crypto.pas
@@ -53,7 +53,8 @@   ESynCrypto = class(ESynException);
   {$ifdef HASAESNI}
     {$define USEAESNI}
     {$define USEAESNI64}
-    {$define USECLMUL} // gf_mul_pclmulqdq() requires some complex opcodes
+    {$define USECLMUL}  // gf_mul_pclmulqdq() requires some complex opcodes
+    {$define USEGCMAVX} // 8x interleaved aesni + pclmulqdq asm for AES-GCM
   {$endif HASAESNI}
   {$ifdef OSWINDOWS}
     {$define CRC32C_X64} // external crc32_iscsi_01 for win64/lin64
@@ -273,7 +274,7 @@ procedure RawSha512Compress(var Hash; Data: pointer);
   TAesGcmEngine = object
   private
     /// standard AES encryption context
-    actx: TAes;
+    aes: TAes;
     /// ghash value of the Authentication Data
     aad_ghv: TAesBlock;
     /// ghash value of the Ciphertext
@@ -289,7 +290,7 @@ procedure RawSha512Compress(var Hash; Data: pointer);
     /// current 0..15 position in encryption block
     blen: byte;
     /// the state of this context
-    flags: set of (flagFinalComputed, flagFlushed, flagCLMUL);
+    flags: set of (flagFinalComputed, flagFlushed, flagCLMUL, flagAVX);
     /// 4KB lookup table for fast Galois Finite Field multiplication
     // - is defined as last field of the object for better code generation
     gf_t4k: array[byte] of THash128Rec;
@@ -904,8 +905,8 @@   TAesOfbCrc = class(TAesSymCrc)
   // $  mormot aes-128-ctr in 1.99ms i.e. 1254390/s or 2.6 GB/s
   // $  mormot aes-256-ctr in 2.64ms i.e. 945179/s or 1.9 GB/s
   // - could be used as an alternative to AES-GCM, even if OpenSSL is available:
-  // $  mormot aes-128-gcm in 14.42ms i.e. 173274/s or 368.7 MB/s
-  // $  mormot aes-256-gcm in 16.98ms i.e. 147206/s or 313.2 MB/s
+  // $  mormot aes-128-gcm in 3.45ms i.e. 722752/s or 1.5 GB/s
+  // $  mormot aes-256-gcm in 4.11ms i.e. 607385/s or 1.2 GB/s
   // $  openssl aes-128-gcm in 2.86ms i.e. 874125/s or 1.8 GB/s
   // $  openssl aes-256-gcm in 3.43ms i.e. 727590/s or 1.5 GB/s
   // - on i386, numbers are lower, because they are not interleaved:
@@ -984,20 +985,21 @@   TAesGcmAbstractClass = class of TAesGcmAbstract;
   // via MacSetNonce/MacEncrypt or AesGcmAad/AesGcmFinal methods
   // - will use AES-NI and CLMUL hardware instructions, if available
   // - expect IV to be set before process, or IVAtBeginning=true
-  // - by design, AES-GCM doesn't expect any MAC to be supplied before processing
-  // - OpenSSL is faster than our TAesGcm class which is not interleaved:
-  // $  mormot aes-128-gcm in 14.42ms i.e. 173274/s or 368.7 MB/s
-  // $  mormot aes-256-gcm in 16.98ms i.e. 147206/s or 313.2 MB/s
+  // - by design, AES-GCM doesn't expect any Nonce to be supplied before processing
+  // - our TAesGcm class is 8x interleaved for both GMAC and AES-CTR
+  // $  mormot aes-128-gcm in 3.45ms i.e. 722752/s or 1.5 GB/s
+  // $  mormot aes-256-gcm in 4.11ms i.e. 607385/s or 1.2 GB/s
+  // - OpenSSL is faster since it performs GMAC and AES-CTR in a single pass
   // $  openssl aes-128-gcm in 2.86ms i.e. 874125/s or 1.8 GB/s
   // $  openssl aes-256-gcm in 3.43ms i.e. 727590/s or 1.5 GB/s
-  // - on i386, numbers are similar:
-  // $ mormot aes-128-gcm in 15.86ms i.e. 157609/s or 335.4 MB/s
-  // $ mormot aes-256-gcm in 18.23ms i.e. 137083/s or 291.7 MB/s
-  // $ openssl aes-128-gcm in 5.49ms i.e. 455290/s or 0.9 GB/s
-  // $ openssl aes-256-gcm in 6.11ms i.e. 408630/s or 869.6 MB/s
+  // - on i386, numbers are much lower, since it lacks CLMUL and interleaved asm
+  // $  mormot aes-128-gcm in 15.86ms i.e. 157609/s or 335.4 MB/s
+  // $  mormot aes-256-gcm in 18.23ms i.e. 137083/s or 291.7 MB/s
+  // $  openssl aes-128-gcm in 5.49ms i.e. 455290/s or 0.9 GB/s
+  // $  openssl aes-256-gcm in 6.11ms i.e. 408630/s or 869.6 MB/s
   TAesGcm = class(TAesGcmAbstract)
   protected
-    fAes: TAesGcmEngine;
+    fGcm: TAesGcmEngine;
     function AesGcmInit: boolean; override; // from fKey/fKeySize
     procedure AesGcmDone; override;
     procedure AesGcmReset; override; // from fIV/CTR_POS
@@ -3327,7 +3329,7 @@ procedure TAes.DoBlocksCtr(iv: PAesBlock; src, dst: pointer;
 begin
   {$ifdef USEAESNI64}
   if aesNi in TAesContext(Context).Flags then
-    AesNiEncryptCtrNist(src, dst, blockcount shl 4, @Context, iv)
+    AesNiEncryptCtrNist(src, dst, blockcount shl 4, @Context, pointer(iv))
   else
   {$endif USEAESNI64}
     DoBlocksCtrPas(iv, src, dst, blockcount, TAesContext(Context));
@@ -3578,6 +3580,8 @@ procedure GCM_IncCtr(var x: TAesBlock);
 procedure TAesGcmEngine.internal_crypt(ptp, ctp: PByte; ILen: PtrUInt);
 var
   b_pos: PtrUInt;
+  {$ifdef USEAESNI64} ctr, {$endif USEAESNI64}
+  blocks: cardinal;
 begin
   b_pos := blen;
   inc(blen, ILen);
@@ -3588,30 +3592,46 @@ procedure TAesGcmEngine.internal_crypt(ptp, ctp: PByte; ILen: PtrUInt);
     while (ILen > 0) and
           (b_pos < SizeOf(TAesBlock)) do
     begin
-      ctp^ := ptp^ xor TAesContext(actx).buf[b_pos];
+      ctp^ := ptp^ xor TAesContext(aes).buf[b_pos];
       inc(b_pos);
       inc(ptp);
       inc(ctp);
       dec(ILen);
     end;
-  while ILen >= SizeOf(TAesBlock) do
-  begin
-    GCM_IncCtr(TAesContext(actx).iv.b);
-    actx.Encrypt(TAesContext(actx).iv.b, TAesContext(actx).buf); // maybe AES-NI
-    XorBlock16(pointer(ptp), pointer(ctp), @TAesContext(actx).buf);
-    inc(PAesBlock(ptp));
-    inc(PAesBlock(ctp));
-    dec(ILen, SizeOf(TAesBlock));
-  end;
+  blocks := ILen shr AesBlockShift;
+  if blocks <> 0 then
+    {$ifdef USEAESNI64}
+    if aesNi in TAesContext(aes).Flags then
+    begin
+      // AES-GCM has a 32-bit counter -> don't use 128-bit AesNiEncryptCtrNist()
+      ctr := bswap32(TAesContext(aes).iv.c3) + blocks;
+      GCM_IncCtr(TAesContext(aes).iv.b); // should be done before
+      AesNiEncryptCtrNist32(ptp, ctp, blocks, @aes, @TAesContext(aes).iv);
+      TAesContext(aes).iv.c3 := bswap32(ctr);
+      blocks := blocks shl AesBlockShift;
+      inc(ptp, blocks);
+      inc(ctp, blocks);
+      ILen := Ilen and AesBlockMod;
+    end
+    else
+    {$endif USEAESNI64}
+    repeat
+      GCM_IncCtr(TAesContext(aes).iv.b);
+      aes.Encrypt(TAesContext(aes).iv.b, TAesContext(aes).buf); // maybe AES-NI
+      XorBlock16(pointer(ptp), pointer(ctp), @TAesContext(aes).buf);
+      inc(PAesBlock(ptp));
+      inc(PAesBlock(ctp));
+      dec(ILen, SizeOf(TAesBlock));
+    until ILen < SizeOf(TAesBlock);
   while ILen > 0 do
   begin
     if b_pos = SizeOf(TAesBlock) then
     begin
-      GCM_IncCtr(TAesContext(actx).iv.b);
-      actx.Encrypt(TAesContext(actx).iv.b, TAesContext(actx).buf);
+      GCM_IncCtr(TAesContext(aes).iv.b);
+      aes.Encrypt(TAesContext(aes).iv.b, TAesContext(aes).buf);
       b_pos := 0;
     end;
-    ctp^ := TAesContext(actx).buf[b_pos] xor ptp^;
+    ctp^ := TAesContext(aes).buf[b_pos] xor ptp^;
     inc(b_pos);
     inc(ptp);
     inc(ctp);
@@ -3661,10 +3681,10 @@ procedure TAesGcmEngine.internal_auth(ctp: PByte; ILen: PtrUInt;
 function TAesGcmEngine.Init(const Key; KeyBits: PtrInt): boolean;
 begin
   FillcharFast(self,SizeOf(self), 0);
-  result := actx.EncryptInit(Key, KeyBits);
+  result := aes.EncryptInit(Key, KeyBits);
   if not result then
     exit;
-  actx.Encrypt(ghash_h, ghash_h);
+  aes.Encrypt(ghash_h, ghash_h);
   {$ifdef USECLMUL}
   if cfCLMUL in CpuFeatures then
     include(flags, flagCLMUL)
@@ -3689,48 +3709,44 @@ function TAesGcmEngine.Reset(pIV: pointer; IV_len: PtrInt): boolean;
   if IV_len = CTR_POS then
   begin
     // Initialization Vector size matches perfect size of 12 bytes
-    MoveFast(pIV^, TAesContext(actx).iv, CTR_POS);
-    TAesContext(actx).iv.c3 := $01000000;
+    MoveSmall(pIV, @TAesContext(aes).iv, CTR_POS);
+    TAesContext(aes).iv.c3 := $01000000;
   end
   else
   begin
     // Initialization Vector is otherwise computed from GHASH(IV,H)
     n_pos := IV_len;
-    FillZero(TAesContext(actx).iv.b);
+    FillZero(TAesContext(aes).iv.b);
     while n_pos >= SizeOf(TAesBlock) do
     begin
-      XorBlock16(@TAesContext(actx).iv, pIV);
+      XorBlock16(@TAesContext(aes).iv, pIV);
       inc(PAesBlock(pIV));
       dec(n_pos, SizeOf(TAesBlock));
-      gf_mul_h(self, TAesContext(actx).iv.b); // maybe CLMUL
+      gf_mul_h(self, TAesContext(aes).iv.b); // maybe CLMUL
     end;
     if n_pos > 0 then
     begin
       for i := 0 to n_pos - 1 do
-        TAesContext(actx).iv.b[i] := TAesContext(actx).iv.b[i] xor PAesBlock(pIV)^[i];
-      gf_mul_h(self, TAesContext(actx).iv.b); // maybe CLMUL
+        TAesContext(aes).iv.b[i] := TAesContext(aes).iv.b[i] xor PAesBlock(pIV)^[i];
+      gf_mul_h(self, TAesContext(aes).iv.b); // maybe CLMUL
     end;
     n_pos := IV_len shl 3;
     i := 15;
     while n_pos > 0 do
     begin
-      TAesContext(actx).iv.b[i] := TAesContext(actx).iv.b[i] xor byte(n_pos);
+      TAesContext(aes).iv.b[i] := TAesContext(aes).iv.b[i] xor byte(n_pos);
       n_pos := n_pos shr 8;
       dec(i);
     end;
-    gf_mul_h(self, TAesContext(actx).iv.b); // maybe CLMUL
+    gf_mul_h(self, TAesContext(aes).iv.b); // maybe CLMUL
   end;
   // reset internal state and counters
-  y0_val := TAesContext(actx).iv.c3;
+  y0_val := TAesContext(aes).iv.c3;
   FillZero(aad_ghv);
   FillZero(txt_ghv);
   aad_cnt.V := 0;
   atx_cnt.V := 0;
-  flags := [];
-  {$ifdef USECLMUL}
-  if cfCLMUL in CpuFeatures then
-    include(flags, flagCLMUL);
-  {$endif USECLMUL}
+  flags := flags - [flagFinalComputed, flagFlushed];
   result := true;
 end;
 
@@ -3746,16 +3762,19 @@ function TAesGcmEngine.Encrypt(ptp, ctp: Pointer; ILen: PtrInt): boolean;
       exit;
     end;
     if (ILen and AesBlockMod = 0) and
+       {$ifdef USEAESNI64} // faster with 8x interleaved internal_crypt()
+       not (aesNi in TAesContext(aes).Flags) and
+       {$endif USEAESNI64}
        (blen = 0) then
     begin
       inc(atx_cnt.V, ILen);
       ILen := ILen shr AesBlockShift;
       repeat
         // single-pass loop optimized e.g. for PKCS7 padding
-        {%H-}GCM_IncCtr(TAesContext(actx).iv.b);
-        TAesContext(actx).DoBlock(actx, TAesContext(actx).iv,
-          TAesContext(actx).buf); // buf=AES(iv) maybe AES-NI
-        XorBlock16(ptp, ctp, @TAesContext(actx).buf);
+        {%H-}GCM_IncCtr(TAesContext(aes).iv.b);
+        TAesContext(aes).DoBlock(aes, TAesContext(aes).iv,
+          TAesContext(aes).buf); // buf=AES(iv) maybe AES-NI
+        XorBlock16(ptp, ctp, @TAesContext(aes).buf);
         gf_mul_h(self, txt_ghv);  // maybe CLMUL
         XorBlock16(@txt_ghv, ctp);
         inc(PAesBlock(ptp));
@@ -3786,6 +3805,9 @@ function TAesGcmEngine.Decrypt(ctp, ptp: Pointer; ILen: PtrInt;
        (flagFinalComputed in flags) then
       exit;
     if (ILen and AesBlockMod = 0) and
+       {$ifdef USEAESNI64} // faster with 8x interleaved internal_crypt()
+       not (aesNi in TAesContext(aes).Flags) and
+       {$endif USEAESNI64}
        (blen = 0) then
     begin
       inc(atx_cnt.V, ILen);
@@ -3794,9 +3816,9 @@ function TAesGcmEngine.Decrypt(ctp, ptp: Pointer; ILen: PtrInt;
         // single-pass loop optimized e.g. for PKCS7 padding
         gf_mul_h(self, txt_ghv); // maybe CLMUL
         XorBlock16(@txt_ghv, ctp);
-        GCM_IncCtr(TAesContext(actx).iv.b);
-        actx.Encrypt(TAesContext(actx).iv.b, TAesContext(actx).buf); // maybe AES-NI
-        XorBlock16(ctp, ptp, @TAesContext(actx).buf);
+        GCM_IncCtr(TAesContext(aes).iv.b);
+        aes.Encrypt(TAesContext(aes).iv.b, TAesContext(aes).buf); // maybe AES-NI
+        XorBlock16(ctp, ptp, @TAesContext(aes).buf);
         inc(PAesBlock(ptp));
         inc(PAesBlock(ctp));
         dec(ILen);
@@ -3819,7 +3841,7 @@ function TAesGcmEngine.Decrypt(ctp, ptp: Pointer; ILen: PtrInt;
       begin
         Final(tag, {anddone=}false);
         if not IsEqual(tag, ptag^, tlen) then
-          // check authentication before encryption
+          // check authentication before decryption
           exit;
       end;
       internal_crypt(ctp, ptp, iLen);
@@ -3877,9 +3899,9 @@ function TAesGcmEngine.Final(out tag: TAesBlock; andDone: boolean): boolean;
     XorBlock16(@aad_ghv, @tbuf);
     gf_mul_h(self, aad_ghv); // maybe CLMUL
     // compute E(K,Y0)
-    tbuf := TAesContext(actx).iv.b;
+    tbuf := TAesContext(aes).iv.b;
     TWA4(tbuf)[3] := y0_val;
-    actx.Encrypt(tbuf);
+    aes.Encrypt(tbuf);
     // GMAC = GHASH(H, AAD, ctp) xor E(K,Y0)
     XorBlock16(@aad_ghv, @tag, @tbuf);
     if andDone then
@@ -3897,7 +3919,7 @@ procedure TAesGcmEngine.Done;
 begin
   if flagFlushed in flags then
     exit;
-  actx.Done;
+  aes.Done;
   include(flags, flagFlushed);
 end;
 
@@ -3908,7 +3930,7 @@ function TAesGcmEngine.FullEncryptAndAuthenticate(const Key; KeyBits: PtrInt;
   result := Init(Key, KeyBits) and
             Reset(pIV, IV_len) and
             Add_AAD(pAAD, aLen) and
-            Encrypt(ptp, ctp,pLen) and
+            Encrypt(ptp, ctp, pLen) and
             Final(tag);
   Done;
 end;
@@ -5200,6 +5222,23 @@ function TAesGcmAbstract.MacCheckError(Encrypted: pointer; Count: cardinal): boo
 
 { TAesGcm }
 
+function TAesGcm.AesGcmInit: boolean; // key setup; false if EncryptInit/Init failed
+begin
+  {$ifdef USEGCMAVX}
+  if (cfCLMUL in CpuFeatures) and
+     (cfAESNI in CpuFeatures) then // the asm path requires both CPU features
+  begin
+    // 8x interleaved aesni + pclmulqdq x86_64 asm
+    include(fGcm.flags, flagAVX); // NOTE(review): stays set even if EncryptInit fails
+    result := fGcm.aes.EncryptInit(fKey, fKeySize);
+    if result then // key is set: prepare the GMAC process for the asm
+      GcmAvxInit(@fGcm.gf_t4k, @fGcm.aes, TAesContext(fGcm.aes).Rounds);
+    exit; // skip the regular TAesGcmEngine initialization below
+  end;
+  {$endif USEGCMAVX}
+  result := fGcm.Init(fKey, fKeySize); // regular TAesGcmEngine path
+end;
+
 function TAesGcm.Clone: TAesAbstract;
 begin
   result := NewInstance as TAesGcm;
@@ -5207,56 +5246,120 @@ function TAesGcm.Clone: TAesAbstract;
   result.fKeySize := fKeySize;
   result.fKeySizeBytes := fKeySizeBytes;
   result.fAlgoMode := mGcm;
-  TAesGcm(result).fAes := fAes; // reuse the very same TAesGcmEngine memory
-end;
-
-function TAesGcm.AesGcmInit: boolean;
-begin
-  result := fAes.Init(fKey, fKeySize);
+  {$ifdef USEGCMAVX}
+  if flagAVX in fGcm.flags then
+  begin
+    TAesGcm(result).fGcm.aes := fGcm.aes;
+    TAesGcm(result).fGcm.flags := fGcm.flags;
+    MoveFast(fGcm.gf_t4k, TAesGcm(result).fGcm.gf_t4k, 256);
+  end
+  else
+  {$endif USEGCMAVX}
+    TAesGcm(result).fGcm := fGcm; // reuse the very same TAesGcmEngine memory
 end;
 
 procedure TAesGcm.AesGcmDone;
 begin
-  fAes.Done;
+  {$ifdef USEGCMAVX}
+  if flagAVX in fGcm.flags then
+    fGcm.aes.Done // asm path: call Done on the inner AES context only
+  else
+  {$endif USEGCMAVX}
+    fGcm.Done; // regular path: calls aes.Done once, guarded by flagFlushed
 end;
 
 procedure TAesGcm.AesGcmReset;
 begin
-  fAes.Reset(@fIV, CTR_POS);
+  fGcm.Reset(@fIV, CTR_POS); // shared with USEGCMAVX: presumably a CTR_POS-long IV needs no GHASH in Reset - TODO confirm
 end;
 
 function TAesGcm.AesGcmProcess(BufIn, BufOut: pointer; Count: cardinal): boolean;
+{$ifdef USEGCMAVX}
+var
+  blocks, ctr, onepass: cardinal; // blocks in pass, counter after pass, bytes in pass
+{$endif USEGCMAVX}
 begin
-  if fStarted = stEnc then
-    result := fAes.Encrypt(BufIn, BufOut, Count)
+  {$ifdef USEGCMAVX}
+  if flagAVX in fGcm.flags then
+  begin
+    result := true; // the asm path never returns false: bad input raises below
+    if Count and AesBlockMod <> 0 then // only whole AES blocks are accepted here
+      raise ESynCrypto.CreateUtf8('%.Encrypt/Decrypt should use PKCS7', [self]);
+    inc(fGcm.atx_cnt.V, Count); // total text bytes, given to GcmAvxGetTag in AesGcmFinal
+    repeat
+      // regroup GMAC + AES-CTR per 1MB chunks to fit in CPU cache
+      onepass := 1 shl 20;
+      if Count < onepass then
+        onepass := Count; // last (or only) pass handles the remaining bytes
+      // GMAC done before decryption
+      if fStarted = stDec then
+        GcmAvxAuth(@fGcm.gf_t4k, BufIn, onepass, @fGcm.txt_ghv);
+      // AES-CTR with 32-bit counter
+      blocks := onepass shr AesBlockShift;
+      ctr := bswap32(TAesContext(fGcm.aes).iv.c3) + blocks; // value to store after this pass
+      GCM_IncCtr(TAesContext(fGcm.aes).iv.b); // pre-increment the counter block before the asm call
+      AesNiEncryptCtrNist32(BufIn, BufOut, blocks, @fGcm.aes, @TAesContext(fGcm.aes).iv);
+      TAesContext(fGcm.aes).iv.c3 := bswap32(ctr); // overwrite c3 with the precomputed counter
+      // GMAC done after encryption
+      if fStarted = stEnc then
+        GcmAvxAuth(@fGcm.gf_t4k, BufOut, onepass, @fGcm.txt_ghv);
+      dec(Count, onepass);
+      if Count = 0 then
+        exit; // NOTE(review): Count = 0 on entry still runs one empty pass before exiting
+      inc(PByte(BufIn), onepass);
+      inc(PByte(BufOut), onepass);
+    until false; // the only way out of this loop is the exit above
+  end
   else
-    result := fAes.Decrypt(BufIn, BufOut, Count);
+  {$endif USEGCMAVX}
+    if fStarted = stEnc then // regular path: delegate to TAesGcmEngine
+      result := fGcm.Encrypt(BufIn, BufOut, Count)
+    else
+      result := fGcm.Decrypt(BufIn, BufOut, Count);
 end;
 
 procedure TAesGcm.AesGcmAad(Buf: pointer; Len: integer);
 begin
-  fAes.Add_AAD(Buf, Len);
+  {$ifdef USEGCMAVX}
+  if flagAVX in fGcm.flags then
+  begin
+    inc(fGcm.aad_cnt.V, Len); // total AAD bytes, given to GcmAvxGetTag in AesGcmFinal
+    GcmAvxAuth(@fGcm.gf_t4k, Buf, Len, @fGcm.txt_ghv); // same txt_ghv accumulator as AesGcmProcess, for both AAD and text
+  end
+  else
+  {$endif USEGCMAVX}
+    fGcm.Add_AAD(Buf, Len); // regular path: delegate to TAesGcmEngine
 end;
 
 function TAesGcm.AesGcmFinal(var tag: TAesBlock): boolean;
 var
-  decoded: TAesBlock;
+  decoded: THash128Rec; // computed tag, accessed through its .b and .c3 fields
 begin
+  result := false;
+  if fStarted = stNone then // no pending encryption/decryption: return false
+    exit;
+  {$ifdef USEGCMAVX}
+  if flagAVX in fGcm.flags then
+  begin
+    decoded := TAesContext(fGcm.aes).iv; // start from the current counter block
+    decoded.c3 := fGcm.y0_val; // restore initial counter
+    fGcm.aes.Encrypt(decoded.b); // decoded = E(K,Y0), as in TAesGcmEngine.Final
+    GcmAvxGetTag(@fGcm.gf_t4k, @decoded, @fGcm.txt_ghv, fGcm.atx_cnt.V, fGcm.aad_cnt.V);
+    decoded.b := fGcm.txt_ghv; // the resulting tag is read back from txt_ghv
+  end
+  else
+  {$endif USEGCMAVX}
+    fGcm.Final(decoded.b, {andDone=}false); // regular path: tag from TAesGcmEngine.Final
   case fStarted of
     stEnc:
       begin
-        fAes.Final(tag, {andDone=}false);
+        tag := decoded.b; // encryption: return the computed tag to the caller
         result := true;
       end;
     stDec:
-      begin
-        fAes.Final(decoded, {andDone=}false);
-        result := IsEqual(decoded, tag);
-      end;
-  else
-    result := false;
+      result := IsEqual(decoded.b, tag); // decryption: compare with the supplied tag
   end;
-  fStarted := stNone; // allow reuse of this fAes instance
+  fStarted := stNone; // allow reuse of this fGcm instance
 end;
 
 
@@ -5432,7 +5535,6 @@ function AesAlgoNameDecode(AesAlgoName: PUtf8Char;
   i: integer;
   tab: PByteArray;
 begin
-  // this code is very efficient
   result := false;
   if PCardinal(AesAlgoName)^ and $ffdfdfdf <>
       ord('A') + ord('E') shl 8 + ord('S') shl 16 + ord('-') shl 24 then
@@ -5449,9 +5551,9 @@ function AesAlgoNameDecode(AesAlgoName: PUtf8Char;
   end;
   tab := @NormToUpperAnsi7Byte;
   i := IntegerScanIndex(pointer(AESMODESTXT4), succ(ord(high(TAesMode))),
-    cardinal(tab[ord(AesAlgoName[8])]) +
-    cardinal(tab[ord(AesAlgoName[9])]) shl 8 +
-    cardinal(tab[ord(AesAlgoName[10])]) shl 16);
+         cardinal(tab[ord(AesAlgoName[8])]) +
+         cardinal(tab[ord(AesAlgoName[9])]) shl 8 +
+         cardinal(tab[ord(AesAlgoName[10])]) shl 16);
   if i < 0 then
     exit;
   Mode := TAesMode(i);
diff --git a/test/test.core.crypto.pas b/test/test.core.crypto.pas
index a9264fca8..d4ca6f5cf 100644
--- a/test/test.core.crypto.pas
+++ b/test/test.core.crypto.pas
@@ -1114,7 +1114,6 @@ procedure TTestCoreCrypto._AES;
   mac: TAesMac256;
   mac1, mac2: THash256;
   one, two, encdec: TAesAbstract;
-  PC: PAnsiChar;
   noaesni, gcm, aead: boolean;
   Timer: array[boolean] of TPrecisionTimer;
   ValuesCrypted, ValuesOrig: array[0..6] of RawByteString;
@@ -1170,7 +1169,8 @@ procedure TTestCoreCrypto._AES;
             FillRandom(@tag1, 4);
             Check(TAesGcmAbstract(one).AesGcmFinal(tag1));
             // writeln(one.classname, ks, ' ', AesBlockToShortString(tag1));
-            CheckEqual(AesBlockToString(tag1), TEST_AES_TAG[k], 'TEST_AES_TAG');
+            CheckEqual(AesBlockToString(tag1), TEST_AES_TAG[k],
+              FormatUtf8('TEST_AES_TAG %', [ks]));
           end;
           one.IV := iv.b;
           if aead then
@@ -1541,6 +1541,12 @@ procedure TTestCoreCrypto._AES_GCM;
   n: integer;
 begin
   key := PAesBlock(@hex32)^;
+  FillZero(buf);
+  FillZero(tag);
+  check(ctxt.FullEncryptAndAuthenticate(key, 128, @hex32, 12, nil, 0,
+    @buf, @buf, SizeOf(buf), tag));
+  CheckEqual(CardinalToHex(crc32c(0, @buf, SizeOf(buf))), 'AC3DDD17');
+  CheckEqual(Md5DigestToString(tag), '0332c40f9926bd3cdadf33148912c672');
   for n := 1 to 32 do
   begin
     Check(ctxt.Init(key, 128));
@@ -1629,6 +1635,7 @@ procedure TTestCoreCrypto.Catalog;
   c: TAesAbstract;
   key: THash256;
 begin
+  FillZero(key);
   for k := 0 to 2 do
     for m := low(m) to high(m) do
     begin
@@ -1638,7 +1645,7 @@ procedure TTestCoreCrypto.Catalog;
       CheckUtf8(AesAlgoNameDecode(n, k2) = TAesFast[m], n);
       UpperCaseSelf(n);
       CheckUtf8(AesAlgoNameDecode(n, k2) = TAesFast[m], n);
-      c := TAesFast[m].Create(k, 128 + k * 64);
+      c := TAesFast[m].Create(key, 128 + k * 64);
       try
         Check(c.AlgoMode = m);
         Check(IdemPropName(c.AlgoName, pointer(n), length(n)));