Permalink
Browse files

huge performance enhancement on SynLZ by new optimized x64 asm

  • Loading branch information...
Arnaud Bouchez
Arnaud Bouchez committed Aug 10, 2017
1 parent 22735bb commit 042ee0102e240f1c6871bf1e86bfb61b5cba2fca
Showing with 284 additions and 37 deletions.
  1. +279 −32 SynLZ.pas
  2. +4 −4 SynSelfTests.pas
  3. +1 −1 SynopseCommit.inc
View
311 SynLZ.pas
@@ -226,7 +226,12 @@ function SynLZdecompress1pas(src: PAnsiChar; size: integer; dst: PAnsiChar): int
function SynLZdecompress1partial(src: PAnsiChar; size: integer; dst: PAnsiChar;
maxDst: integer): integer;
{$ifdef PUREPASCAL}
{$ifdef CPUINTEL}
/// optimized x86/x64 asm version of the 1st compression algorithm
function SynLZcompress1(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
/// optimized x86/x64 asm version of the 1st compression algorithm
function SynLZdecompress1(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
{$else}
var
/// fastest available SynLZ compression (using 1st algorithm)
SynLZCompress1: function(
@@ -235,20 +240,7 @@ function SynLZdecompress1partial(src: PAnsiChar; size: integer; dst: PAnsiChar;
/// fastest available SynLZ decompression (using 1st algorithm)
SynLZDecompress1: function(
src: PAnsiChar; size: integer; dst: PAnsiChar): integer = SynLZDecompress1pas;
{$else}
/// optimized x86 asm version of the 1st compression algorithm
function SynLZcompress1asm(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
/// optimized x86 asm version of the 1st compression algorithm
function SynLZdecompress1asm(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
/// fastest available SynLZ compression (using x86 asm on 1st algorithm)
function SynLZcompress1(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
/// fastest available SynLZ decompression (using x86 asm on 1st algorithm)
function SynLZdecompress1(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
{$endif PUREPASCAL}
{$endif CPUINTEL}
/// 2nd compression algorithm optimizing pattern copy
// - this algorithm is a bit smaller, but slower, so the 1st method is preferred
@@ -290,14 +282,10 @@ function SynLZdecompressdestlen(in_p: PAnsiChar): integer;
result := (result and $7fff) or (integer(PWord(in_p)^) shl 15);
end;
{$ifndef PUREPASCAL}
{$ifdef CPUINTEL}
// using direct x86 jmp also circumvents Internal Error C11715 for Delphi 5
function SynLZcompress1(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
asm
jmp SynLzCompress1asm
end;
function SynLZcompress1asm(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
{$ifdef CPUX86}
asm
push ebp
push ebx
@@ -444,7 +432,7 @@ function SynLZcompress1asm(src: PAnsiChar; size: integer; dst: PAnsiChar): integ
jbe @@0892
@@0900: cmp esi, [esp+0CH]
jnc @@0903
@@0901: mov al, [esi]
@@0901: mov al, [esi]
mov [edi], al
inc esi
inc edi
@@ -465,8 +453,158 @@ function SynLZcompress1asm(src: PAnsiChar; size: integer; dst: PAnsiChar): integ
pop esi
pop ebx
pop ebp
{$else CPUX86}
var off: array[0..4095] of PAnsiChar;  // per-hash last source position (nil = unused)
    cache: array[0..4095] of cardinal; // per-hash dword last read at that position
    // both tables together use 32KB+16KB=48KB on stack
// x64 implementation of the 1st SynLZ compression algorithm
// - in: src, size, dst as declared; out: rax = number of bytes written to dst
// - register roles: rbx=src r8=dst r15=dst_beg r9=src_end r10=src_endmatch
//   r11=CWpoint (current control word) ecx=CWbit rax=h (hash) r14=o (candidate)
// - no call is made from this code; rbx/rdi/rsi/r12-r15 are saved and restored
asm // rcx=src, edx=size, r8=dest
        {$ifndef win64} // Linux 64-bit ABI
        mov     r8, rdx
        mov     rdx, rsi
        mov     rcx, rdi
        {$endif win64}
        // size is a 32-bit integer: the upper half of rdx is not guaranteed to
        // be zero by the calling convention, but rdx is used as a 64-bit offset
        mov     edx, edx
        push    rbx
        push    rdi
        push    rsi
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r15, r8 // r8=dest r15=dst_beg
        mov     rbx, rcx // rbx=src
        // store the uncompressed size as 2 bytes (<32768) or 4 bytes (bit 15 set)
        cmp     edx, 32768
        jc      @03
        mov     eax, edx
        and     eax, 7FFFH
        or      eax, 8000H
        mov     word ptr [r8], ax
        mov     eax, edx
        shr     eax, 15
        mov     word ptr [r8+2H], ax
        add     r8, 4
        jmp     @05
@03:    mov     word ptr [r8], dx
        test    edx, edx
        jnz     @04
        mov     r15d, 2 // size=0: only the 2 bytes header was written
        jmp     @19
        nop     // never executed: follows an unconditional jmp
@04:    add     r8, 2
@05:    lea     r9, [rdx+rbx] // r9=src_end
        lea     r10, [r9-0BH] // r10=src_endmatch
        mov     ecx, 1 // ecx=CWBits
        mov     r11, r8 // r11=CWpoint
        mov     dword ptr [r8], 0
        add     r8, 4
        // zero the whole off[] table, 32 bytes per iteration, from its top
        // (rax=32736 covers bytes 32736..32767) down to its base (rax=0 covers
        // bytes 0..31) - nothing outside off[0..4095] is written
        pxor    xmm0, xmm0
        mov     eax, 32768-32
@06:    movdqu  dqword ptr [off+rax], xmm0
        movdqu  dqword ptr [off+rax+16], xmm0
        sub     eax, 32
        jae     @06 // loop until the subtraction borrows (rax was 0)
        cmp     rbx, r10
        ja      @15
        // main loop: hash the next 4 source bytes and look for a previous match
@07:    mov     edx, dword ptr [rbx]
        mov     rax, rdx
        mov     r12, rdx
        shr     rax, 12
        xor     rax, rdx
        and     rax, 0FFFH // rax=h
        mov     r14, qword ptr [off+rax*8] // r14=o
        mov     edx, dword ptr [cache+rax*4]
        mov     qword ptr [off+rax*8], rbx
        mov     dword ptr [cache+rax*4], r12d
        xor     rdx, r12
        test    r14, r14
        lea     rdi, [r9-1] // lea keeps the flags set by test
        je      @12 // no previous position for this hash: literal
        and     rdx, 0FFFFFFH
        jne     @12 // lowest 3 bytes differ: literal
        mov     rdx, rbx
        sub     rdx, r14
        cmp     rdx, 2
        jbe     @12 // candidate is too close to src: literal
        or      dword ptr[r11], ecx // flag this item as a match in the control word
        add     rbx, 2
        add     r14, 2
        mov     esi, 1 // rsi=t (match length counter)
        sub     rdi, rbx // rdi=tmax=src_end-src-1
        cmp     rdi, 271
        jc      @09
        mov     edi, 271 // tmax is capped to 255+16
        jmp     @09
        // compare up to 4 bytes per iteration, loading the candidate as a dword
@08:    inc     rsi
@09:    mov     edx, dword ptr [r14+rsi]
        cmp     dl, byte ptr [rbx+rsi]
        jnz     @10
        cmp     rsi, rdi
        jge     @10
        inc     rsi
        cmp     dh, byte ptr [rbx+rsi]
        jnz     @10
        shr     edx, 16
        cmp     rsi, rdi
        jge     @10
        inc     rsi
        cmp     dl, byte ptr [rbx+rsi]
        jnz     @10
        cmp     rsi, rdi
        jge     @10
        inc     rsi
        cmp     dh, byte ptr [rbx+rsi]
        jnz     @10
        cmp     rsi, rdi
        jc      @08
        // emit the match: hash in the upper 12 bits, length in the lower 4 bits,
        // or lower 4 bits = 0 followed by one extra byte holding length-16
@10:    add     rbx, rsi
        shl     rax, 4
        cmp     rsi, 15
        ja      @11
        or      rax, rsi
        mov     word ptr [r8], ax
        add     r8, 2
        jmp     @13
@11:    sub     rsi, 16
        mov     word ptr [r8], ax
        mov     byte ptr [r8+2H], sil
        add     r8, 3
        jmp     @13
        // emit one literal byte
@12:    mov     al, byte ptr [rbx]
        mov     byte ptr [r8], al
        add     rbx, 1
        add     r8, 1
        // move to the next control bit; after 32 items start a new control word
@13:    add     ecx, ecx
        jnz     @14
        mov     r11, r8
        mov     [r8], ecx // ecx=0 here
        add     r8, 4
        add     ecx, 1
@14:    cmp     rbx, r10
        jbe     @07
        // trailing bytes after src_endmatch are always stored as literals
@15:    cmp     rbx, r9
        jnc     @18
@16:    mov     al, byte ptr [rbx]
        mov     byte ptr [r8], al
        add     rbx, 1
        add     r8, 1
        add     ecx, ecx
        jnz     @17
        mov     [r8], ecx // new zeroed control word (r11 is not needed any more)
        add     r8, 4
        add     ecx, 1
@17:    cmp     rbx, r9
        jc      @16
@18:    sub     r8, r15 // result=dst-dst_beg
        mov     r15, r8
@19:    mov     rax, r15
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rsi
        pop     rdi
        pop     rbx
{$endif CPUX86}
end;
{$endif PUREPASCAL}
{$endif CPUINTEL}
function SynLZcompress1pas(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
var dst_beg, // initial dst value
@@ -669,14 +807,10 @@ function SynLZdecompress1b(src: PAnsiChar; size: integer; dst: PAnsiChar): integ
// assert(result=dst-dst_beg);
end;
{$ifndef PUREPASCAL}
{$ifdef CPUINTEL}
// using direct x86 jmp also circumvents Internal Error C11715 for Delphi 5
function SynLZdecompress1(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
asm
jmp SynLZDecompress1asm
end;
function SynLZdecompress1asm(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
{$ifdef CPUX86}
asm
push ebp
push ebx
@@ -760,7 +894,7 @@ function SynLZdecompress1asm(src: PAnsiChar; size: integer; dst: PAnsiChar): int
cmp edx, 32 // inlined optimized move()
ja @large
sub edx, 8
jg @9_32
jg @9_32
mov ecx, [eax]
mov eax, [eax+4] // always copy 8 bytes for 0..8
mov [esi], ecx // safe since src_endmatch := src_end-(6+5)
@@ -789,7 +923,7 @@ function SynLZdecompress1asm(src: PAnsiChar; size: integer; dst: PAnsiChar): int
neg edx
and esi, -8
lea edx, [edx+esi+8]
pop esi
pop esi
@lrgnxt:fild qword ptr[eax+edx]
fistp qword ptr[esi+edx]
add edx, 8
@@ -830,8 +964,121 @@ function SynLZdecompress1asm(src: PAnsiChar; size: integer; dst: PAnsiChar): int
pop esi
pop ebx
pop ebp
{$else CPUX86}
var off: array[0..4095] of PAnsiChar; // use 32KB of stack space
// x64 implementation of the 1st SynLZ decompression algorithm
// - in: src, size, dst as declared; out: eax = uncompressed length read from
//   the 2 or 4 bytes header of src
// - register roles: rcx=src r9=src_end r8=dst r10=last_hashed edi=CW
//   r13d=CWbit r11=t (match length) r14=o (match source)
// - off[] is not cleared: entries are written while decoding (NOTE(review):
//   a match referencing a hash not written yet would read an uninitialised
//   pointer - presumably only trusted/validated input is expected here)
// - no call is made from this code; rbx/rsi/rdi/r12-r14 are saved and restored
asm // rcx=src, edx=size, r8=dest
        {$ifndef win64} // Linux 64-bit ABI
        mov     r8, rdx
        mov     rdx, rsi
        mov     rcx, rdi
        {$endif win64}
        // size is a 32-bit integer: the upper half of rdx is not guaranteed to
        // be zero by the calling convention, but rdx is used as a 64-bit offset
        mov     edx, edx
        push    rbx
        push    rsi
        push    rdi
        push    r12
        push    r13
        push    r14
        movzx   eax, word ptr [rcx] // rcx=src eax=result
        lea     r9, [rdx+rcx] // r9=src_end
        test    eax, eax
        je      @35 // stored length is 0: nothing to decode
        add     rcx, 2
        mov     r10d, eax
        and     r10d, 8000H
        jz      @21
        // bit 15 set: the length continues in the next 16-bit word
        movzx   ebx, word ptr [rcx]
        shl     ebx, 15
        mov     r10d, eax
        and     r10d, 7FFFH
        or      r10d, ebx
        mov     eax, r10d
        add     rcx, 2
@21:    lea     r10, [r8-1H] // r10=last_hashed r8=dest
        // load the next 32-bit control word
@22:    mov     edi, dword ptr [rcx] // edi=CW
        add     rcx, 4
        mov     r13d, 1 // r13d=CWBit
        cmp     rcx, r9
        jnc     @35
        // test the current control bit: 0=literal byte, 1=match
@23:    mov     ebx, r13d
        and     ebx, edi
        jnz     @25
        mov     bl, byte ptr [rcx]
        mov     byte ptr [r8], bl
        add     rcx, 1
        lea     rbx, [r8-2H] // rbx=dst-3 once dst is incremented below
        add     r8, 1
        cmp     rcx, r9
        jnc     @35
        cmp     rbx, r10
        jbe     @24
        // last_hashed<dst-3: hash the next already decoded position
        add     r10, 1
        mov     esi, dword ptr [r10]
        mov     rbx, rsi
        shr     esi, 12
        xor     ebx, esi
        and     ebx, 0FFFH
        mov     qword ptr [off+rbx*8], r10
@24:    shl     r13d, 1
        jnz     @23
        jmp     @22 // 32 items processed: fetch a new control word
        // match: 12 bits of hash + 4 bits of length (0 = length in next byte)
@25:    movzx   r11, word ptr [rcx] // r11=t
        add     rcx, 2
        mov     ebx, r11d // ebx=h
        shr     ebx, 4
        and     r11, 0FH
        lea     r11, [r11+2H] // lea keeps the flags set by and
        jnz     @26
        movzx   r11, byte ptr [rcx]
        add     rcx, 1
        lea     r11, [r11+12H]
@26:    mov     r14, qword ptr [off+rbx*8] // r14=o
        mov     rbx, r8
        xor     rsi, rsi
        mov     r12, r11
        sub     rbx, r14
        cmp     rbx, r11
        jnc     @28
        // dst-o<t: source and destination overlap, so copy byte per byte
@27:    mov     bl, byte ptr [r14+rsi]
        mov     byte ptr [r8+rsi], bl
        inc     rsi
        dec     r12
        jnz     @27
        jmp     @31
        // no overlap: copy by 8 bytes chunks, plus one trailing chunk which
        // may write up to 8 bytes beyond dst+t
@28:    shr     r12, 3
        jz      @30
@29:    mov     rbx, qword ptr [r14+rsi]
        mov     qword ptr [r8+rsi], rbx
        add     rsi, 8
        dec     r12
        jnz     @29
@30:    mov     rbx, qword ptr [r14+rsi]
        mov     qword ptr [r8+rsi], rbx
        // stop as soon as src reached OR passed src_end, as the two other
        // checks above do - an equality test would miss an overrun
@31:    cmp     rcx, r9
        jc      @33
        jmp     @35
        // hash every position up to dst before advancing dst over the match
@32:    add     r10, 1
        mov     ebx, dword ptr [r10]
        mov     rsi, rbx
        shr     ebx, 12
        xor     esi, ebx
        and     esi, 0FFFH
        mov     qword ptr [off+rsi*8], r10
@33:    cmp     r10, r8
        jc      @32
        add     r8, r11
        lea     r10, [r8-1H]
        shl     r13d, 1
        jnz     @23
        jmp     @22
@35:    pop     r14
        pop     r13
        pop     r12
        pop     rdi
        pop     rsi
        pop     rbx
{$endif CPUX86}
end;
{$endif PUREPASCAL}
{$endif CPUINTEL}
function SynLZdecompress1pas(src: PAnsiChar; size: integer; dst: PAnsiChar): integer;
var last_hashed: PAnsiChar; // initial src and dst value
Oops, something went wrong.

0 comments on commit 042ee01

Please sign in to comment.