Skip to content
Permalink
Browse files

minor update - no functional change

  • Loading branch information
Arnaud Bouchez
Arnaud Bouchez committed Feb 13, 2020
1 parent 9b58faa commit a24ada5959df9cc82df35b7c73749fb4601ef83a
Showing with 7 additions and 11 deletions.
  1. +6 −10 SynCommons.pas
  2. +1 −1 SynopseCommit.inc
jl @favxr
@favxe: vmovups [r10], ymm1 // last 32
vmovups [r9], ymm2 // first 32
// https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties
vzeroupper
ret
align 16
movups oword ptr[dst], xmm0 // first unaligned 1..16 bytes
add dst, 16
and dst, -16
movups oword ptr[dst], xmm0 // unaligned 17..32 bytes
movaps oword ptr[dst], xmm0 // aligned 17..32 bytes
vinsertf128 ymm0,ymm0,xmm0,1
add dst, 16
and dst, -32 // dst is 32-bytes aligned
add rcx, 4
dec r8
jmp @align
// simd process of 128 bytes per loop iteration
// avx process of 128 bytes (32 indexes) per loop iteration
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@s: sub r8, 32
vmovdqa ymm1, [rcx]
add rcx, 128
cmp r8, 32
jae @s
// https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties
vzeroupper
test r8, r8
jnz @1
ret
jmp @2
// trailing indexes
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
@1: dec r8
add rcx, 4
dec r8
jmp @align
// simd process of 64 bytes = 4 x 128-bit quad per loop iteration
// SSE2 process of 64 bytes (16 indexes) per loop iteration
{$ifdef FPC} align 16 {$else} .align 16 {$endif}
@s: sub r8, 16
movdqa xmm1, dqword [rcx] // quad load
add rcx, 64
cmp r8, 16
jae @s
test r8, r8
jnz @1
ret
jmp @2
// trailing indexes
{$ifdef FPC} align 8 {$else} .align 8 {$endif}
@1: dec r8
@@ -1 +1 @@
'1.18.5478'
'1.18.5479'

0 comments on commit a24ada5

Please sign in to comment.
You can't perform that action at this time.