diff --git a/src/core/mormot.core.crypto.asmx64.inc b/src/core/mormot.core.crypto.asmx64.inc index 72b8bb04d..5c392d3d4 100644 --- a/src/core/mormot.core.crypto.asmx64.inc +++ b/src/core/mormot.core.crypto.asmx64.inc @@ -3243,13 +3243,13 @@ end; // AesNiEncryptCtrNist32() expects the CTR in lowest 32-bit to never overflow procedure AesNiEncryptCtrNist(src, dest: PByte; len: cardinal; - ctxt, iv: PAesBlock); inline; + ctxt, iv: PHash128Rec); inline; var ctr, blocks: cardinal; begin - ctr := bswap32(PCardinal(@iv[12])^); + ctr := bswap32(iv.c3); repeat - blocks := len shr 4; + blocks := len shr AesBlockShift; inc(ctr, blocks); if ctr < blocks then begin @@ -3258,10 +3258,10 @@ begin ctr := 0; end; AesNiEncryptCtrNist32(src, dest, blocks, ctxt, iv); // 32-bit CTR asm - PCardinal(@iv[12])^ := bswap32(ctr); + iv.c3 := bswap32(ctr); if ctr = 0 then - CtrNistCarry12(iv); // propagate carry - blocks := blocks shl 4; + CtrNistCarry12(@iv.b); // propagate carry + blocks := blocks shl AesBlockShift; inc(src, blocks); inc(dest, blocks); dec(len, blocks); @@ -3858,7 +3858,7 @@ var begin ctr := bswap32(PCardinal(@ctxt.fIV[12])^); repeat - blocks := len shr 4; + blocks := len shr AesBlockShift; inc(ctr, blocks); if ctr < blocks then begin @@ -3870,7 +3870,7 @@ begin PCardinal(@ctxt.fIV[12])^ := bswap32(ctr); if ctr = 0 then CtrNistCarry12(@ctxt.fIV); // propagate carry - blocks := blocks shl 4; + blocks := blocks shl AesBlockShift; inc(src, blocks); inc(dest, blocks); dec(len, blocks); @@ -4413,6 +4413,403 @@ procedure AesNiDecryptCfbCrc256(src, dest, aes: pointer; blocks: PtrUInt); {$endif WIN64ABI} end; +{$ifdef USEGCMAVX} + +// prepare the GMAC process for gcmavx_data() and gcmavx_end() +procedure GcmAvxInit(ptab, ks: pointer; rounds: cardinal); +{$ifdef FPC} nostackframe; assembler; asm {$else} asm .noframe {$endif} + // rcx/rdi=ptab, rdx/rsi=ks, r8/rdx=kslen + movdqa xmm15, dqword ptr [rip + @bswapMask] + movdqa xmm14, dqword ptr [rip + @gcmPoly] + movdqu xmm0, dqword ptr 
[ks] + movdqu xmm11, dqword ptr [ks + 10H] + aesenc xmm0, xmm11 + movdqu xmm11, dqword ptr [ks + 20H] + aesenc xmm0, xmm11 + movdqu xmm11, dqword ptr [ks + 30H] + aesenc xmm0, xmm11 + movdqu xmm11, dqword ptr [ks + 40H] + aesenc xmm0, xmm11 + movdqu xmm11, dqword ptr [ks + 50H] + aesenc xmm0, xmm11 + movdqu xmm11, dqword ptr [ks + 60H] + aesenc xmm0, xmm11 + movdqu xmm11, dqword ptr [ks + 70H] + aesenc xmm0, xmm11 + movdqu xmm11, dqword ptr [ks + 80H] + aesenc xmm0, xmm11 + movdqu xmm11, dqword ptr [ks + 90H] + aesenc xmm0, xmm11 + movdqu xmm11, dqword ptr [ks + 0A0H] + cmp rounds, 12 + jc @last + // end of AES-128 + aesenc xmm0, xmm11 + movdqu xmm11, dqword ptr [ks + 0B0H] + aesenc xmm0, xmm11 + movdqu xmm11, dqword ptr [ks + 0C0H] + jz @last + // end of AES-192 + aesenc xmm0, xmm11 + movdqu xmm11, dqword ptr [ks + 0D0H] + aesenc xmm0, xmm11 + movdqu xmm11, dqword ptr [ks + 0E0H] + // end of AES-256 +@last: aesenclast xmm0, xmm11 + pshufb xmm0, xmm15 + pshufd xmm11, xmm0, 0FFH + movdqu xmm12, xmm0 + psrad xmm11, 31 + pand xmm11, xmm14 + psrld xmm12, 31 + pslldq xmm12, 4 + pslld xmm0, 1 + pxor xmm0, xmm11 + pxor xmm0, xmm12 + movdqu dqword ptr [ptab + 0E0H], xmm0 + pshufd xmm1, xmm0, 4EH + pxor xmm1, xmm0 + movdqu dqword ptr [ptab + 0F0H], xmm1 + movdqu xmm2, xmm0 + movdqu xmm3, xmm1 + mov al, 7 + {$ifdef FPC} align 16 {$else} .align 16 {$endif} +@init: movdqu xmm11, xmm2 + movdqu xmm12, xmm2 + movdqu xmm13, xmm3 + // pclmulqdq xmm11, xmm0, 00H @ 66 44: 0F 3A 44. D8, 00 + // pclmulqdq xmm12, xmm0, 11H @ 66 44: 0F 3A 44. E0, 11 + // pclmulqdq xmm13, xmm1, 00H @ 66 44: 0F 3A 44. E9, 00 + db $66, $44, $0F, $3A, $44, $D8, $00 + db $66, $44, $0F, $3A, $44, $E0, $11 + db $66, $44, $0F, $3A, $44, $E9, $00 + pxor xmm13, xmm11 + pxor xmm13, xmm12 + movdqu xmm4, xmm13 + pslldq xmm4, 8 + psrldq xmm13, 8 + pxor xmm11, xmm4 + pxor xmm12, xmm13 + movdqu xmm2, xmm14 + // pclmulqdq xmm2, xmm11, 01H @ 66 41: 0F 3A 44. 
D3, 01 + db $66, $41, $0F, $3A, $44, $D3, $01 + pshufd xmm11, xmm11, 4EH + pxor xmm11, xmm2 + movdqu xmm2, xmm14 + // pclmulqdq xmm2, xmm11, 01H @ 66 41: 0F 3A 44. D3, 01 + db $66, $41, $0F, $3A, $44, $D3, $01 + pshufd xmm11, xmm11, 4EH + pxor xmm2, xmm11 + pxor xmm2, xmm12 + movdqu dqword ptr [ptab + 0C0H], xmm2 + pshufd xmm3, xmm2, 4EH + pxor xmm3, xmm2 + movdqu dqword ptr [ptab + 0D0H], xmm3 + lea ptab, [ptab - 20H] + dec al + jne @init + ret + {$ifdef FPC} align 16 {$else} .align 16 {$endif} +@bswapMask: + dq $08090A0B0C0D0E0F + dq $0001020304050607 +@gcmPoly: + dq $0000000000000001 + dq $C200000000000000 +end; + +// compute GMAC with 8x interleaved pclmulqdq opcode +procedure GcmAvxAuth(ptab, data: pointer; datalen: PtrInt; hash: pointer); +{$ifdef FPC} nostackframe; assembler; asm {$else} asm .noframe {$endif} + // rdi=ptab, rsi=data, rdx=datalen, rcx=hash + movdqu xmm8, dqword ptr [hash] + movdqa xmm15, dqword ptr [rip + @bswapMask] + movdqa xmm14, dqword ptr [rip + @gcmPoly] + test datalen, datalen + jz @done + cmp datalen, 128 + jc @by1 + {$ifdef FPC} align 16 {$else} .align 16 {$endif} +@by8: sub datalen, 128 + movdqu xmm0, dqword ptr [data] + movdqu xmm1, dqword ptr [data + 10H] + movdqu xmm2, dqword ptr [data + 20H] + movdqu xmm3, dqword ptr [data + 30H] + movdqu xmm4, dqword ptr [data + 40H] + movdqu xmm5, dqword ptr [data + 50H] + movdqu xmm6, dqword ptr [data + 60H] + movdqu xmm7, dqword ptr [data + 70H] + lea data, [data + 80H] + pshufb xmm0, xmm15 + pshufb xmm1, xmm15 + pshufb xmm2, xmm15 + pshufb xmm3, xmm15 + pshufb xmm4, xmm15 + pshufb xmm5, xmm15 + pshufb xmm6, xmm15 + pshufb xmm7, xmm15 + pxor xmm0, xmm8 + movdqu xmm8, dqword ptr [ptab] + movdqu xmm10, dqword ptr [ptab + 10H] + movdqu xmm9, xmm8 + pshufd xmm12, xmm0, 4EH + pxor xmm12, xmm0 + // pclmulqdq xmm8, xmm0, 00H 66 44: 0F 3A 44. C0, 00 + // pclmulqdq xmm9, xmm0, 11H 66 44: 0F 3A 44. C8, 11 + // pclmulqdq xmm10, xmm12, 00H 66 45: 0F 3A 44. 
D4, 00 + db $66, $44, $0F, $3A, $44, $C0, $00 + db $66, $44, $0F, $3A, $44, $C8, $11 + db $66, $45, $0F, $3A, $44, $D4, $00 + movdqu xmm12, dqword ptr [ptab + 20H] + movdqu xmm13, xmm12 + // pclmulqdq xmm12, xmm1, 00H 66 44: 0F 3A 44. E1, 00 + db $66, $44, $0F, $3A, $44, $E1, $00 + pxor xmm8, xmm12 + // pclmulqdq xmm13, xmm1, 11H 66 44: 0F 3A 44. E9, 11 + db $66, $44, $0F, $3A, $44, $E9, $11 + pxor xmm9, xmm13 + pshufd xmm12, xmm1, 4EH + pxor xmm1, xmm12 + movdqu xmm12, dqword ptr [ptab + 30H] + // pclmulqdq xmm12, xmm1, 00H 66 44: 0F 3A 44. E1, 00 + db $66, $44, $0F, $3A, $44, $E1, $00 + pxor xmm10, xmm12 + movdqu xmm12, dqword ptr [ptab + 40H] + movdqu xmm13, xmm12 + // pclmulqdq xmm12, xmm2, 00H 66 44: 0F 3A 44. E2, 00 + db $66, $44, $0F, $3A, $44, $E2, $00 + pxor xmm8, xmm12 + // pclmulqdq xmm13, xmm2, 11H 66 44: 0F 3A 44. EA, 11 + db $66, $44, $0F, $3A, $44, $EA, $11 + pxor xmm9, xmm13 + pshufd xmm12, xmm2, 4EH + pxor xmm2, xmm12 + movdqu xmm12, dqword ptr [ptab + 50H] + // pclmulqdq xmm12, xmm2, 00H 66 44: 0F 3A 44. E2, 00 + db $66, $44, $0F, $3A, $44, $E2, $00 + pxor xmm10, xmm12 + movdqu xmm12, dqword ptr [ptab + 60H] + movdqu xmm13, xmm12 + // pclmulqdq xmm12, xmm3, 00H 66 44: 0F 3A 44. E3, 00 + db $66, $44, $0F, $3A, $44, $E3, $00 + pxor xmm8, xmm12 + // pclmulqdq xmm13, xmm3, 11H 66 44: 0F 3A 44. EB, 11 + db $66, $44, $0F, $3A, $44, $EB, $11 + pxor xmm9, xmm13 + pshufd xmm12, xmm3, 4EH + pxor xmm3, xmm12 + movdqu xmm12, dqword ptr [ptab + 70H] + // pclmulqdq xmm12, xmm3, 00H 66 44: 0F 3A 44. E3, 00 + db $66, $44, $0F, $3A, $44, $E3, $00 + pxor xmm10, xmm12 + movdqu xmm12, dqword ptr [ptab + 80H] + movdqu xmm13, xmm12 + // pclmulqdq xmm12, xmm4, 00H 66 44: 0F 3A 44. E4, 00 + db $66, $44, $0F, $3A, $44, $E4, $00 + pxor xmm8, xmm12 + // pclmulqdq xmm13, xmm4, 11H 66 44: 0F 3A 44. 
EC, 11 + db $66, $44, $0F, $3A, $44, $EC, $11 + pxor xmm9, xmm13 + pshufd xmm12, xmm4, 4EH + pxor xmm4, xmm12 + movdqu xmm12, dqword ptr [ptab + 90H] + // pclmulqdq xmm12, xmm4, 00H 66 44: 0F 3A 44. E4, 00 + db $66, $44, $0F, $3A, $44, $E4, $00 + pxor xmm10, xmm12 + movdqu xmm12, dqword ptr [ptab + 0A0H] + movdqu xmm13, xmm12 + // pclmulqdq xmm12, xmm5, 00H 66 44: 0F 3A 44. E5, 00 + db $66, $44, $0F, $3A, $44, $E5, $00 + pxor xmm8, xmm12 + // pclmulqdq xmm13, xmm5, 11H 66 44: 0F 3A 44. ED, 11 + db $66, $44, $0F, $3A, $44, $ED, $11 + pxor xmm9, xmm13 + pshufd xmm12, xmm5, 4EH + pxor xmm5, xmm12 + movdqu xmm12, dqword ptr [ptab + 0B0H] + // pclmulqdq xmm12, xmm5, 00H 66 44: 0F 3A 44. E5, 00 + db $66, $44, $0F, $3A, $44, $E5, $00 + pxor xmm10, xmm12 + movdqu xmm12, dqword ptr [ptab + 0C0H] + movdqu xmm13, xmm12 + // pclmulqdq xmm12, xmm6, 00H 66 44: 0F 3A 44. E6, 00 + db $66, $44, $0F, $3A, $44, $E6, $00 + pxor xmm8, xmm12 + // pclmulqdq xmm13, xmm6, 11H 66 44: 0F 3A 44. EE, 11 + db $66, $44, $0F, $3A, $44, $EE, $11 + pxor xmm9, xmm13 + pshufd xmm12, xmm6, 4EH + pxor xmm6, xmm12 + movdqu xmm12, dqword ptr [ptab + 0D0H] + // pclmulqdq xmm12, xmm6, 00H 66 44: 0F 3A 44. E6, 00 + db $66, $44, $0F, $3A, $44, $E6, $00 + pxor xmm10, xmm12 + movdqu xmm12, dqword ptr [ptab + 0E0H] + movdqu xmm13, xmm12 + // pclmulqdq xmm12, xmm7, 00H 66 44: 0F 3A 44. E7, 00 + db $66, $44, $0F, $3A, $44, $E7, $00 + pxor xmm8, xmm12 + // pclmulqdq xmm13, xmm7, 11H 66 44: 0F 3A 44. EF, 11 + db $66, $44, $0F, $3A, $44, $EF, $11 + pxor xmm9, xmm13 + pshufd xmm12, xmm7, 4EH + pxor xmm7, xmm12 + movdqu xmm12, dqword ptr [ptab + 0F0H] + // pclmulqdq xmm12, xmm7, 00H 66 44: 0F 3A 44. E7, 00 + db $66, $44, $0F, $3A, $44, $E7, $00 + pxor xmm10, xmm12 + pxor xmm10, xmm8 + pxor xmm10, xmm9 + movdqu xmm11, xmm10 + psrldq xmm10, 8 + pslldq xmm11, 8 + pxor xmm9, xmm10 + pxor xmm8, xmm11 + movdqu xmm11, xmm14 + // pclmulqdq xmm11, xmm8, 01H 66 45: 0F 3A 44. 
D8, 01 + db $66, $45, $0F, $3A, $44, $D8, $01 + pshufd xmm8, xmm8, 4EH + pxor xmm8, xmm11 + movdqu xmm11, xmm14 + // pclmulqdq xmm11, xmm8, 01H 66 45: 0F 3A 44. D8, 01 + db $66, $45, $0F, $3A, $44, $D8, $01 + pshufd xmm8, xmm8, 4EH + pxor xmm8, xmm11 + pxor xmm8, xmm9 + cmp datalen, 128 + jnc @by8 +@by1: movdqu xmm12, dqword ptr [ptab + 0E0H] + movdqu xmm13, dqword ptr [ptab + 0F0H] + cmp datalen, 16 + jc @sml + {$ifdef FPC} align 16 {$else} .align 16 {$endif} +@next: sub datalen, 16 + movdqu xmm0, dqword ptr [data] +@s: pshufb xmm0, xmm15 + pxor xmm0, xmm8 + movdqu xmm8, xmm12 + movdqu xmm10, xmm13 + movdqu xmm9, xmm12 + pshufd xmm11, xmm0, 4EH + pxor xmm11, xmm0 + // pclmulqdq xmm8, xmm0, 00H 66 44: 0F 3A 44. C0, 00 + // pclmulqdq xmm9, xmm0, 11H 66 44: 0F 3A 44. C8, 11 + // pclmulqdq xmm10, xmm11, 00H 66 45: 0F 3A 44. D3, 00 + db $66, $44, $0F, $3A, $44, $C0, $00 + db $66, $44, $0F, $3A, $44, $C8, $11 + db $66, $45, $0F, $3A, $44, $D3, $00 + pxor xmm10, xmm8 + pxor xmm10, xmm9 + movdqu xmm11, xmm10 + psrldq xmm10, 8 + pslldq xmm11, 8 + pxor xmm9, xmm10 + pxor xmm8, xmm11 + movdqu xmm11, xmm14 + // pclmulqdq xmm11, xmm8, 01H 66 45: 0F 3A 44. D8, 01 + db $66, $45, $0F, $3A, $44, $D8, $01 + pshufd xmm8, xmm8, 4EH + pxor xmm8, xmm11 + movdqu xmm11, xmm14 + // pclmulqdq xmm11, xmm8, 01H 66 45: 0F 3A 44. 
D8, 01 + db $66, $45, $0F, $3A, $44, $D8, $01 + pshufd xmm8, xmm8, 4EH + pxor xmm8, xmm11 + pxor xmm8, xmm9 + lea data, [data + 10H] + cmp datalen, 16 + jnc @next +@sml: test datalen, datalen + jz @done + pxor xmm0, xmm0 + lea data, [data + datalen - 1] + {$ifdef FPC} align 8 {$else} .align 8 {$endif} +@ins: pslldq xmm0, 1 + pinsrb xmm0, byte ptr [data], 00H + dec data + dec datalen + jnz @ins + jmp @s + {$ifdef FPC} align 16 {$else} .align 16 {$endif} +@bswapMask: + dq $08090A0B0C0D0E0F + dq $0001020304050607 +@gcmPoly: + dq $0000000000000001 + dq $C200000000000000 +@done: movdqu dqword ptr [hash], xmm8 +end; + +procedure GcmAvxGetTag(ptab, mask, hash: pointer; plen, dlen: PtrInt); +{$ifdef FPC} nostackframe; assembler; asm {$else} asm .noframe {$endif} + // rdi=ptab, rsi=mask, rdx=hash, rcx=plen, r8=dlen + {$ifdef WIN64ABI} + mov rax, qword ptr [rsp + $28] // dlen not passed as register + push rsi + push rdi + mov rdi, ptab // rcx + mov rsi, mask // rdx + mov rdx, hash // r8 + mov rcx, plen // r9 + {$else} + mov rax, r8 + {$endif WIN64ABI} + shl rcx, 3 + movdqu xmm8, dqword ptr [rdx] + movdqu xmm13, dqword ptr [rsi] + movdqa xmm15, dqword ptr [rip + @bswapMask] + movdqa xmm14, dqword ptr [rip + @gcmPoly] + shl rax, 3 + movq xmm0, rcx + pinsrq xmm0, rax, 1 + pxor xmm0, xmm8 + movdqu xmm8, dqword ptr [rdi + 0E0H] + movdqu xmm10, dqword ptr [rdi + 0F0H] + movdqu xmm9, xmm8 + // pclmulqdq xmm8, xmm0, 00H 66 44: 0F 3A 44. C0, 00 + // pclmulqdq xmm9, xmm0, 11H 66 44: 0F 3A 44. C8, 11 + db $66, $44, $0F, $3A, $44, $C0, $00 + db $66, $44, $0F, $3A, $44, $C8, $11 + pshufd xmm11, xmm0, 4EH + pxor xmm11, xmm0 + // pclmulqdq xmm10, xmm11, 00H 66 45: 0F 3A 44. D3, 00 + db $66, $45, $0F, $3A, $44, $D3, $00 + pxor xmm10, xmm8 + pxor xmm10, xmm9 + movdqu xmm11, xmm10 + psrldq xmm10, 8 + pslldq xmm11, 8 + pxor xmm9, xmm10 + pxor xmm8, xmm11 + movdqu xmm11, xmm14 + // pclmulqdq xmm11, xmm8, 01H 66 45: 0F 3A 44. 
D8, 01 + db $66, $45, $0F, $3A, $44, $D8, $01 + pshufd xmm8, xmm8, 4EH + pxor xmm8, xmm11 + movdqu xmm11, xmm14 + // pclmulqdq xmm11, xmm8, 01H 66 45: 0F 3A 44. D8, 01 + db $66, $45, $0F, $3A, $44, $D8, $01 + pshufd xmm8, xmm8, 4EH + pxor xmm8, xmm11 + pxor xmm8, xmm9 + pshufb xmm8, xmm15 + pxor xmm8, xmm13 + movdqu dqword ptr [rdx], xmm8 + {$ifdef WIN64ABI} + pop rdi + pop rsi + {$endif WIN64ABI} + ret + {$ifdef FPC} align 16 {$else} .align 16 {$endif} +@bswapMask: + dq $08090A0B0C0D0E0F + dq $0001020304050607 +@gcmPoly: + dq $0000000000000001 + dq $C200000000000000 +end; + +{$endif USEGCMAVX} // compute a := a * b in GF(2^128) using pclmulqdq on WestMere CPUs // - three times faster than the pascal version using lookup tables @@ -4494,6 +4891,10 @@ procedure gf_mul_pclmulqdq(a, b: pointer); pxor xmm0, xmm2 pshufb xmm0, xmm10 movups dqword ptr [a], xmm0 + {$ifdef WIN64ABI} + pop rdi + pop rsi + {$endif WIN64ABI} ret {$ifdef FPC} align 16 {$else} .align 16 {$endif} @swap: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 diff --git a/src/core/mormot.core.crypto.openssl.pas b/src/core/mormot.core.crypto.openssl.pas index 6f72c8a96..a6cc0a137 100644 --- a/src/core/mormot.core.crypto.openssl.pas +++ b/src/core/mormot.core.crypto.openssl.pas @@ -14,7 +14,7 @@ ***************************************************************************** TL;DR: on x86_64, our mormot.core.crypto.pas asm is stand-alone and faster - than OpenSSL for most algorithms, but AES-GCM. + than OpenSSL for most algorithms, but AES-GCM (1.8 vs 1.5 GB/s). } @@ -170,11 +170,11 @@ TAesCtrNistOsl = class(TAesAbstractOsl) /// OpenSSL AES-GCM cypher/uncypher // - implements AEAD (authenticated-encryption with associated-data) process // via MacSetNonce/MacEncrypt or AesGcmAad/AesGcmFinal methods - // - OpenSSL is faster than our TAesGcm class which is not interleaved: - // $ mormot aes-128-gcm in 14.42ms i.e. 173274/s or 368.7 MB/s - // $ mormot aes-256-gcm in 16.98ms i.e. 
147206/s or 313.2 MB/s + // - OpenSSL is faster than our TAesGcm class, but not by much: // $ openssl aes-128-gcm in 2.86ms i.e. 874125/s or 1.8 GB/s // $ openssl aes-256-gcm in 3.43ms i.e. 727590/s or 1.5 GB/s + // $ mormot aes-128-gcm in 3.45ms i.e. 722752/s or 1.5 GB/s + // $ mormot aes-256-gcm in 4.11ms i.e. 607385/s or 1.2 GB/s TAesGcmOsl = class(TAesGcmAbstract) protected fAes: TAesOsl; diff --git a/src/core/mormot.core.crypto.pas b/src/core/mormot.core.crypto.pas index 59c1089d3..5dcb639db 100644 --- a/src/core/mormot.core.crypto.pas +++ b/src/core/mormot.core.crypto.pas @@ -53,7 +53,8 @@ ESynCrypto = class(ESynException); {$ifdef HASAESNI} {$define USEAESNI} {$define USEAESNI64} - {$define USECLMUL} // gf_mul_pclmulqdq() requires some complex opcodes + {$define USECLMUL} // gf_mul_pclmulqdq() requires some complex opcodes + {$define USEGCMAVX} // 8x interleaved aesni + pclmulqdq asm for AES-GCM {$endif HASAESNI} {$ifdef OSWINDOWS} {$define CRC32C_X64} // external crc32_iscsi_01 for win64/lin64 @@ -273,7 +274,7 @@ procedure RawSha512Compress(var Hash; Data: pointer); TAesGcmEngine = object private /// standard AES encryption context - actx: TAes; + aes: TAes; /// ghash value of the Authentication Data aad_ghv: TAesBlock; /// ghash value of the Ciphertext @@ -289,7 +290,7 @@ procedure RawSha512Compress(var Hash; Data: pointer); /// current 0..15 position in encryption block blen: byte; /// the state of this context - flags: set of (flagFinalComputed, flagFlushed, flagCLMUL); + flags: set of (flagFinalComputed, flagFlushed, flagCLMUL, flagAVX); /// 4KB lookup table for fast Galois Finite Field multiplication // - is defined as last field of the object for better code generation gf_t4k: array[byte] of THash128Rec; @@ -904,8 +905,8 @@ TAesOfbCrc = class(TAesSymCrc) // $ mormot aes-128-ctr in 1.99ms i.e. 1254390/s or 2.6 GB/s // $ mormot aes-256-ctr in 2.64ms i.e. 
945179/s or 1.9 GB/s // - could be used as an alternative to AES-GCM, even if OpenSSL is available: - // $ mormot aes-128-gcm in 14.42ms i.e. 173274/s or 368.7 MB/s - // $ mormot aes-256-gcm in 16.98ms i.e. 147206/s or 313.2 MB/s + // $ mormot aes-128-gcm in 3.45ms i.e. 722752/s or 1.5 GB/s + // $ mormot aes-256-gcm in 4.11ms i.e. 607385/s or 1.2 GB/s // $ openssl aes-128-gcm in 2.86ms i.e. 874125/s or 1.8 GB/s // $ openssl aes-256-gcm in 3.43ms i.e. 727590/s or 1.5 GB/s // - on i386, numbers are lower, because they are not interleaved: @@ -984,20 +985,21 @@ TAesGcmAbstractClass = class of TAesGcmAbstract; // via MacSetNonce/MacEncrypt or AesGcmAad/AesGcmFinal methods // - will use AES-NI and CLMUL hardware instructions, if available // - expect IV to be set before process, or IVAtBeginning=true - // - by design, AES-GCM doesn't expect any MAC to be supplied before processing - // - OpenSSL is faster than our TAesGcm class which is not interleaved: - // $ mormot aes-128-gcm in 14.42ms i.e. 173274/s or 368.7 MB/s - // $ mormot aes-256-gcm in 16.98ms i.e. 147206/s or 313.2 MB/s + // - by design, AES-GCM doesn't expect any Nonce to be supplied before processing + // - our TAesGcm class is 8x interleaved for both GMAC and AES-CTR + // $ mormot aes-128-gcm in 3.45ms i.e. 722752/s or 1.5 GB/s + // $ mormot aes-256-gcm in 4.11ms i.e. 607385/s or 1.2 GB/s + // - OpenSSL is faster since it performs GMAC and AES-CTR in a single pass // $ openssl aes-128-gcm in 2.86ms i.e. 874125/s or 1.8 GB/s // $ openssl aes-256-gcm in 3.43ms i.e. 727590/s or 1.5 GB/s - // - on i386, numbers are similar: - // $ mormot aes-128-gcm in 15.86ms i.e. 157609/s or 335.4 MB/s - // $ mormot aes-256-gcm in 18.23ms i.e. 137083/s or 291.7 MB/s - // $ openssl aes-128-gcm in 5.49ms i.e. 455290/s or 0.9 GB/s - // $ openssl aes-256-gcm in 6.11ms i.e. 408630/s or 869.6 MB/s + // - on i386, numbers are much lower, since the i386 code lacks CLMUL and interleaved asm + // $ mormot aes-128-gcm in 15.86ms i.e. 
157609/s or 335.4 MB/s + // $ mormot aes-256-gcm in 18.23ms i.e. 137083/s or 291.7 MB/s + // $ openssl aes-128-gcm in 5.49ms i.e. 455290/s or 0.9 GB/s + // $ openssl aes-256-gcm in 6.11ms i.e. 408630/s or 869.6 MB/s TAesGcm = class(TAesGcmAbstract) protected - fAes: TAesGcmEngine; + fGcm: TAesGcmEngine; function AesGcmInit: boolean; override; // from fKey/fKeySize procedure AesGcmDone; override; procedure AesGcmReset; override; // from fIV/CTR_POS @@ -3327,7 +3329,7 @@ procedure TAes.DoBlocksCtr(iv: PAesBlock; src, dst: pointer; begin {$ifdef USEAESNI64} if aesNi in TAesContext(Context).Flags then - AesNiEncryptCtrNist(src, dst, blockcount shl 4, @Context, iv) + AesNiEncryptCtrNist(src, dst, blockcount shl 4, @Context, pointer(iv)) else {$endif USEAESNI64} DoBlocksCtrPas(iv, src, dst, blockcount, TAesContext(Context)); @@ -3578,6 +3580,8 @@ procedure GCM_IncCtr(var x: TAesBlock); procedure TAesGcmEngine.internal_crypt(ptp, ctp: PByte; ILen: PtrUInt); var b_pos: PtrUInt; + {$ifdef USEAESNI64} ctr, {$endif USEAESNI64} + blocks: cardinal; begin b_pos := blen; inc(blen, ILen); @@ -3588,30 +3592,46 @@ procedure TAesGcmEngine.internal_crypt(ptp, ctp: PByte; ILen: PtrUInt); while (ILen > 0) and (b_pos < SizeOf(TAesBlock)) do begin - ctp^ := ptp^ xor TAesContext(actx).buf[b_pos]; + ctp^ := ptp^ xor TAesContext(aes).buf[b_pos]; inc(b_pos); inc(ptp); inc(ctp); dec(ILen); end; - while ILen >= SizeOf(TAesBlock) do - begin - GCM_IncCtr(TAesContext(actx).iv.b); - actx.Encrypt(TAesContext(actx).iv.b, TAesContext(actx).buf); // maybe AES-NI - XorBlock16(pointer(ptp), pointer(ctp), @TAesContext(actx).buf); - inc(PAesBlock(ptp)); - inc(PAesBlock(ctp)); - dec(ILen, SizeOf(TAesBlock)); - end; + blocks := ILen shr AesBlockShift; + if blocks <> 0 then + {$ifdef USEAESNI64} + if aesNi in TAesContext(aes).Flags then + begin + // AES-GCM has a 32-bit counter -> don't use 128-bit AesNiEncryptCtrNist() + ctr := bswap32(TAesContext(aes).iv.c3) + blocks; + GCM_IncCtr(TAesContext(aes).iv.b); // 
should be done before + AesNiEncryptCtrNist32(ptp, ctp, blocks, @aes, @TAesContext(aes).iv); + TAesContext(aes).iv.c3 := bswap32(ctr); + blocks := blocks shl AesBlockShift; + inc(ptp, blocks); + inc(ctp, blocks); + ILen := Ilen and AesBlockMod; + end + else + {$endif USEAESNI64} + repeat + GCM_IncCtr(TAesContext(aes).iv.b); + aes.Encrypt(TAesContext(aes).iv.b, TAesContext(aes).buf); // maybe AES-NI + XorBlock16(pointer(ptp), pointer(ctp), @TAesContext(aes).buf); + inc(PAesBlock(ptp)); + inc(PAesBlock(ctp)); + dec(ILen, SizeOf(TAesBlock)); + until ILen < SizeOf(TAesBlock); while ILen > 0 do begin if b_pos = SizeOf(TAesBlock) then begin - GCM_IncCtr(TAesContext(actx).iv.b); - actx.Encrypt(TAesContext(actx).iv.b, TAesContext(actx).buf); + GCM_IncCtr(TAesContext(aes).iv.b); + aes.Encrypt(TAesContext(aes).iv.b, TAesContext(aes).buf); b_pos := 0; end; - ctp^ := TAesContext(actx).buf[b_pos] xor ptp^; + ctp^ := TAesContext(aes).buf[b_pos] xor ptp^; inc(b_pos); inc(ptp); inc(ctp); @@ -3661,10 +3681,10 @@ procedure TAesGcmEngine.internal_auth(ctp: PByte; ILen: PtrUInt; function TAesGcmEngine.Init(const Key; KeyBits: PtrInt): boolean; begin FillcharFast(self,SizeOf(self), 0); - result := actx.EncryptInit(Key, KeyBits); + result := aes.EncryptInit(Key, KeyBits); if not result then exit; - actx.Encrypt(ghash_h, ghash_h); + aes.Encrypt(ghash_h, ghash_h); {$ifdef USECLMUL} if cfCLMUL in CpuFeatures then include(flags, flagCLMUL) @@ -3689,48 +3709,44 @@ function TAesGcmEngine.Reset(pIV: pointer; IV_len: PtrInt): boolean; if IV_len = CTR_POS then begin // Initialization Vector size matches perfect size of 12 bytes - MoveFast(pIV^, TAesContext(actx).iv, CTR_POS); - TAesContext(actx).iv.c3 := $01000000; + MoveSmall(pIV, @TAesContext(aes).iv, CTR_POS); + TAesContext(aes).iv.c3 := $01000000; end else begin // Initialization Vector is otherwise computed from GHASH(IV,H) n_pos := IV_len; - FillZero(TAesContext(actx).iv.b); + FillZero(TAesContext(aes).iv.b); while n_pos >= 
SizeOf(TAesBlock) do begin - XorBlock16(@TAesContext(actx).iv, pIV); + XorBlock16(@TAesContext(aes).iv, pIV); inc(PAesBlock(pIV)); dec(n_pos, SizeOf(TAesBlock)); - gf_mul_h(self, TAesContext(actx).iv.b); // maybe CLMUL + gf_mul_h(self, TAesContext(aes).iv.b); // maybe CLMUL end; if n_pos > 0 then begin for i := 0 to n_pos - 1 do - TAesContext(actx).iv.b[i] := TAesContext(actx).iv.b[i] xor PAesBlock(pIV)^[i]; - gf_mul_h(self, TAesContext(actx).iv.b); // maybe CLMUL + TAesContext(aes).iv.b[i] := TAesContext(aes).iv.b[i] xor PAesBlock(pIV)^[i]; + gf_mul_h(self, TAesContext(aes).iv.b); // maybe CLMUL end; n_pos := IV_len shl 3; i := 15; while n_pos > 0 do begin - TAesContext(actx).iv.b[i] := TAesContext(actx).iv.b[i] xor byte(n_pos); + TAesContext(aes).iv.b[i] := TAesContext(aes).iv.b[i] xor byte(n_pos); n_pos := n_pos shr 8; dec(i); end; - gf_mul_h(self, TAesContext(actx).iv.b); // maybe CLMUL + gf_mul_h(self, TAesContext(aes).iv.b); // maybe CLMUL end; // reset internal state and counters - y0_val := TAesContext(actx).iv.c3; + y0_val := TAesContext(aes).iv.c3; FillZero(aad_ghv); FillZero(txt_ghv); aad_cnt.V := 0; atx_cnt.V := 0; - flags := []; - {$ifdef USECLMUL} - if cfCLMUL in CpuFeatures then - include(flags, flagCLMUL); - {$endif USECLMUL} + flags := flags - [flagFinalComputed, flagFlushed]; result := true; end; @@ -3746,16 +3762,19 @@ function TAesGcmEngine.Encrypt(ptp, ctp: Pointer; ILen: PtrInt): boolean; exit; end; if (ILen and AesBlockMod = 0) and + {$ifdef USEAESNI64} // faster with 8x interleaved internal_crypt() + not (aesNi in TAesContext(aes).Flags) and + {$endif USEAESNI64} (blen = 0) then begin inc(atx_cnt.V, ILen); ILen := ILen shr AesBlockShift; repeat // single-pass loop optimized e.g. 
for PKCS7 padding - {%H-}GCM_IncCtr(TAesContext(actx).iv.b); - TAesContext(actx).DoBlock(actx, TAesContext(actx).iv, - TAesContext(actx).buf); // buf=AES(iv) maybe AES-NI - XorBlock16(ptp, ctp, @TAesContext(actx).buf); + {%H-}GCM_IncCtr(TAesContext(aes).iv.b); + TAesContext(aes).DoBlock(aes, TAesContext(aes).iv, + TAesContext(aes).buf); // buf=AES(iv) maybe AES-NI + XorBlock16(ptp, ctp, @TAesContext(aes).buf); gf_mul_h(self, txt_ghv); // maybe CLMUL XorBlock16(@txt_ghv, ctp); inc(PAesBlock(ptp)); @@ -3786,6 +3805,9 @@ function TAesGcmEngine.Decrypt(ctp, ptp: Pointer; ILen: PtrInt; (flagFinalComputed in flags) then exit; if (ILen and AesBlockMod = 0) and + {$ifdef USEAESNI64} // faster with 8x interleaved internal_crypt() + not (aesNi in TAesContext(aes).Flags) and + {$endif USEAESNI64} (blen = 0) then begin inc(atx_cnt.V, ILen); @@ -3794,9 +3816,9 @@ function TAesGcmEngine.Decrypt(ctp, ptp: Pointer; ILen: PtrInt; // single-pass loop optimized e.g. for PKCS7 padding gf_mul_h(self, txt_ghv); // maybe CLMUL XorBlock16(@txt_ghv, ctp); - GCM_IncCtr(TAesContext(actx).iv.b); - actx.Encrypt(TAesContext(actx).iv.b, TAesContext(actx).buf); // maybe AES-NI - XorBlock16(ctp, ptp, @TAesContext(actx).buf); + GCM_IncCtr(TAesContext(aes).iv.b); + aes.Encrypt(TAesContext(aes).iv.b, TAesContext(aes).buf); // maybe AES-NI + XorBlock16(ctp, ptp, @TAesContext(aes).buf); inc(PAesBlock(ptp)); inc(PAesBlock(ctp)); dec(ILen); @@ -3819,7 +3841,7 @@ function TAesGcmEngine.Decrypt(ctp, ptp: Pointer; ILen: PtrInt; begin Final(tag, {anddone=}false); if not IsEqual(tag, ptag^, tlen) then - // check authentication before encryption + // check authentication before decryption exit; end; internal_crypt(ctp, ptp, iLen); @@ -3877,9 +3899,9 @@ function TAesGcmEngine.Final(out tag: TAesBlock; andDone: boolean): boolean; XorBlock16(@aad_ghv, @tbuf); gf_mul_h(self, aad_ghv); // maybe CLMUL // compute E(K,Y0) - tbuf := TAesContext(actx).iv.b; + tbuf := TAesContext(aes).iv.b; TWA4(tbuf)[3] := y0_val; - 
actx.Encrypt(tbuf); + aes.Encrypt(tbuf); // GMAC = GHASH(H, AAD, ctp) xor E(K,Y0) XorBlock16(@aad_ghv, @tag, @tbuf); if andDone then @@ -3897,7 +3919,7 @@ procedure TAesGcmEngine.Done; begin if flagFlushed in flags then exit; - actx.Done; + aes.Done; include(flags, flagFlushed); end; @@ -3908,7 +3930,7 @@ function TAesGcmEngine.FullEncryptAndAuthenticate(const Key; KeyBits: PtrInt; result := Init(Key, KeyBits) and Reset(pIV, IV_len) and Add_AAD(pAAD, aLen) and - Encrypt(ptp, ctp,pLen) and + Encrypt(ptp, ctp, pLen) and Final(tag); Done; end; @@ -5200,6 +5222,23 @@ function TAesGcmAbstract.MacCheckError(Encrypted: pointer; Count: cardinal): boo { TAesGcm } +function TAesGcm.AesGcmInit: boolean; +begin + {$ifdef USEGCMAVX} + if (cfCLMUL in CpuFeatures) and + (cfAESNI in CpuFeatures) then + begin + // 8x interleaved aesni + pclmulqdq x86_64 asm + include(fGcm.flags, flagAVX); + result := fGcm.aes.EncryptInit(fKey, fKeySize); + if result then + GcmAvxInit(@fGcm.gf_t4k, @fGcm.aes, TAesContext(fGcm.aes).Rounds); + exit; + end; + {$endif USEGCMAVX} + result := fGcm.Init(fKey, fKeySize); +end; + function TAesGcm.Clone: TAesAbstract; begin result := NewInstance as TAesGcm; @@ -5207,56 +5246,120 @@ function TAesGcm.Clone: TAesAbstract; result.fKeySize := fKeySize; result.fKeySizeBytes := fKeySizeBytes; result.fAlgoMode := mGcm; - TAesGcm(result).fAes := fAes; // reuse the very same TAesGcmEngine memory -end; - -function TAesGcm.AesGcmInit: boolean; -begin - result := fAes.Init(fKey, fKeySize); + {$ifdef USEGCMAVX} + if flagAVX in fGcm.flags then + begin + TAesGcm(result).fGcm.aes := fGcm.aes; + TAesGcm(result).fGcm.flags := fGcm.flags; + MoveFast(fGcm.gf_t4k, TAesGcm(result).fGcm.gf_t4k, 256); + end + else + {$endif USEGCMAVX} + TAesGcm(result).fGcm := fGcm; // reuse the very same TAesGcmEngine memory end; procedure TAesGcm.AesGcmDone; begin - fAes.Done; + {$ifdef USEGCMAVX} + if flagAVX in fGcm.flags then + fGcm.aes.Done + else + {$endif USEGCMAVX} + fGcm.Done; end; 
procedure TAesGcm.AesGcmReset; begin - fAes.Reset(@fIV, CTR_POS); + fGcm.Reset(@fIV, CTR_POS); // reused for USEGCMAVX since CTR_POS computes nothing end; function TAesGcm.AesGcmProcess(BufIn, BufOut: pointer; Count: cardinal): boolean; +{$ifdef USEGCMAVX} +var + blocks, ctr, onepass: cardinal; +{$endif USEGCMAVX} begin - if fStarted = stEnc then - result := fAes.Encrypt(BufIn, BufOut, Count) + {$ifdef USEGCMAVX} + if flagAVX in fGcm.flags then + begin + result := true; + if Count and AesBlockMod <> 0 then + raise ESynCrypto.CreateUtf8('%.Encrypt/Decrypt should use PKCS7', [self]); + inc(fGcm.atx_cnt.V, Count); + repeat + // regroup GMAC + AES-CTR per 1MB chunks to fit in CPU cache + onepass := 1 shl 20; + if Count < onepass then + onepass := Count; + // GMAC done before decryption + if fStarted = stDec then + GcmAvxAuth(@fGcm.gf_t4k, BufIn, onepass, @fGcm.txt_ghv); + // AES-CTR with 32-bit counter + blocks := onepass shr AesBlockShift; + ctr := bswap32(TAesContext(fGcm.aes).iv.c3) + blocks; + GCM_IncCtr(TAesContext(fGcm.aes).iv.b); // should be done before + AesNiEncryptCtrNist32(BufIn, BufOut, blocks, @fGcm.aes, @TAesContext(fGcm.aes).iv); + TAesContext(fGcm.aes).iv.c3 := bswap32(ctr); + // GMAC done after encryption + if fStarted = stEnc then + GcmAvxAuth(@fGcm.gf_t4k, BufOut, onepass, @fGcm.txt_ghv); + dec(Count, onepass); + if Count = 0 then + exit; + inc(PByte(BufIn), onepass); + inc(PByte(BufOut), onepass); + until false; + end else - result := fAes.Decrypt(BufIn, BufOut, Count); + {$endif USEGCMAVX} + if fStarted = stEnc then + result := fGcm.Encrypt(BufIn, BufOut, Count) + else + result := fGcm.Decrypt(BufIn, BufOut, Count); end; procedure TAesGcm.AesGcmAad(Buf: pointer; Len: integer); begin - fAes.Add_AAD(Buf, Len); + {$ifdef USEGCMAVX} + if flagAVX in fGcm.flags then + begin + inc(fGcm.aad_cnt.V, Len); + GcmAvxAuth(@fGcm.gf_t4k, Buf, Len, @fGcm.txt_ghv); // use txt_ghv for both + end + else + {$endif USEGCMAVX} + fGcm.Add_AAD(Buf, Len); end; function 
TAesGcm.AesGcmFinal(var tag: TAesBlock): boolean; var - decoded: TAesBlock; + decoded: THash128Rec; begin + result := false; + if fStarted = stNone then + exit; + {$ifdef USEGCMAVX} + if flagAVX in fGcm.flags then + begin + decoded := TAesContext(fGcm.aes).iv; + decoded.c3 := fGcm.y0_val; // restore initial counter + fGcm.aes.Encrypt(decoded.b); + GcmAvxGetTag(@fGcm.gf_t4k, @decoded, @fGcm.txt_ghv, fGcm.atx_cnt.V, fGcm.aad_cnt.V); + decoded.b := fGcm.txt_ghv; + end + else + {$endif USEGCMAVX} + fGcm.Final(decoded.b, {andDone=}false); case fStarted of stEnc: begin - fAes.Final(tag, {andDone=}false); + tag := decoded.b; result := true; end; stDec: - begin - fAes.Final(decoded, {andDone=}false); - result := IsEqual(decoded, tag); - end; - else - result := false; + result := IsEqual(decoded.b, tag); end; - fStarted := stNone; // allow reuse of this fAes instance + fStarted := stNone; // allow reuse of this fGcm instance end; @@ -5432,7 +5535,6 @@ function AesAlgoNameDecode(AesAlgoName: PUtf8Char; i: integer; tab: PByteArray; begin - // this code is very efficient result := false; if PCardinal(AesAlgoName)^ and $ffdfdfdf <> ord('A') + ord('E') shl 8 + ord('S') shl 16 + ord('-') shl 24 then @@ -5449,9 +5551,9 @@ function AesAlgoNameDecode(AesAlgoName: PUtf8Char; end; tab := @NormToUpperAnsi7Byte; i := IntegerScanIndex(pointer(AESMODESTXT4), succ(ord(high(TAesMode))), - cardinal(tab[ord(AesAlgoName[8])]) + - cardinal(tab[ord(AesAlgoName[9])]) shl 8 + - cardinal(tab[ord(AesAlgoName[10])]) shl 16); + cardinal(tab[ord(AesAlgoName[8])]) + + cardinal(tab[ord(AesAlgoName[9])]) shl 8 + + cardinal(tab[ord(AesAlgoName[10])]) shl 16); if i < 0 then exit; Mode := TAesMode(i); diff --git a/test/test.core.crypto.pas b/test/test.core.crypto.pas index a9264fca8..d4ca6f5cf 100644 --- a/test/test.core.crypto.pas +++ b/test/test.core.crypto.pas @@ -1114,7 +1114,6 @@ procedure TTestCoreCrypto._AES; mac: TAesMac256; mac1, mac2: THash256; one, two, encdec: TAesAbstract; - PC: PAnsiChar; 
noaesni, gcm, aead: boolean; Timer: array[boolean] of TPrecisionTimer; ValuesCrypted, ValuesOrig: array[0..6] of RawByteString; @@ -1170,7 +1169,8 @@ procedure TTestCoreCrypto._AES; FillRandom(@tag1, 4); Check(TAesGcmAbstract(one).AesGcmFinal(tag1)); // writeln(one.classname, ks, ' ', AesBlockToShortString(tag1)); - CheckEqual(AesBlockToString(tag1), TEST_AES_TAG[k], 'TEST_AES_TAG'); + CheckEqual(AesBlockToString(tag1), TEST_AES_TAG[k], + FormatUtf8('TEST_AES_TAG %', [ks])); end; one.IV := iv.b; if aead then @@ -1541,6 +1541,12 @@ procedure TTestCoreCrypto._AES_GCM; n: integer; begin key := PAesBlock(@hex32)^; + FillZero(buf); + FillZero(tag); + check(ctxt.FullEncryptAndAuthenticate(key, 128, @hex32, 12, nil, 0, + @buf, @buf, SizeOf(buf), tag)); + CheckEqual(CardinalToHex(crc32c(0, @buf, SizeOf(buf))), 'AC3DDD17'); + CheckEqual(Md5DigestToString(tag), '0332c40f9926bd3cdadf33148912c672'); for n := 1 to 32 do begin Check(ctxt.Init(key, 128)); @@ -1629,6 +1635,7 @@ procedure TTestCoreCrypto.Catalog; c: TAesAbstract; key: THash256; begin + FillZero(key); for k := 0 to 2 do for m := low(m) to high(m) do begin @@ -1638,7 +1645,7 @@ procedure TTestCoreCrypto.Catalog; CheckUtf8(AesAlgoNameDecode(n, k2) = TAesFast[m], n); UpperCaseSelf(n); CheckUtf8(AesAlgoNameDecode(n, k2) = TAesFast[m], n); - c := TAesFast[m].Create(k, 128 + k * 64); + c := TAesFast[m].Create(key, 128 + k * 64); try Check(c.AlgoMode = m); Check(IdemPropName(c.AlgoName, pointer(n), length(n)));