x86: adapt SSSE3 wiener filter to SSE2
Also slightly optimized the 32-bit SSSE3 version, most notably by removing
an XMM store/load.

---------------------
x86_64:
------------------------------------------
wiener_chroma_8bpc_c: 193155.1
wiener_chroma_8bpc_sse2: 48973.4
wiener_chroma_8bpc_ssse3: 31486.3
---------------------
wiener_luma_8bpc_c: 192787.5
wiener_luma_8bpc_sse2: 48674.9
wiener_luma_8bpc_ssse3: 30446.3
------------------------------------------

---------------------
x86_32:
------------------------------------------
wiener_chroma_8bpc_c: 309861.0
wiener_chroma_8bpc_sse2: 52345.9
wiener_chroma_8bpc_ssse3: 32983.2
---------------------
wiener_luma_8bpc_c: 317909.1
wiener_luma_8bpc_sse2: 52522.1
wiener_luma_8bpc_ssse3: 33323.1
------------------------------------------
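The port works by wrapping the two SSSE3-only instructions the filter relies
on, palignr and pmaddubsw, in macros (PALIGNR and PMADDUBSW below) that fall
back to plain SSE2 sequences. A rough sketch of the idea in two-operand SSE2
(register choices here are illustrative, not the ones the macros use):

; palignr xmm0, xmm1, 5 without SSSE3: result = {xmm0:xmm1} >> 5 bytes
movdqa    xmm7, xmm1   ; copy the low half
psrldq    xmm7, 5      ; drop its bottom 5 bytes
movdqa    xmm6, xmm0   ; copy the high half
pslldq    xmm6, 11     ; move its bottom 5 bytes to the top (16-5 = 11)
por       xmm7, xmm6   ; xmm7 = concatenated, realigned result

; pmaddubsw xmm0, xmm2 without SSSE3: zero-extend the unsigned bytes to
; words and use pmaddwd, with the coefficients pre-widened to words
pxor      xmm5, xmm5
movdqa    xmm4, xmm0
punpcklbw xmm0, xmm5   ; low 8 bytes as words
punpckhbw xmm4, xmm5   ; high 8 bytes as words
pmaddwd   xmm0, xmm2   ; multiply-add adjacent word pairs into dwords
pmaddwd   xmm4, xmm2
packssdw  xmm0, xmm4   ; saturate back down to 8+8 words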
Victorien Le Couviour--Tuffet authored and barrbrain committed Oct 30, 2019
1 parent f4dbc65 commit 1144224
1 changed file: src/x86/looprestoration_ssse3.asm (195 additions, 69 deletions)
@@ -43,8 +43,6 @@ pb_15: times 16 db 15
pb_0_1: times 8 db 0, 1
pb_6_7: times 8 db 6, 7
pb_14_15: times 8 db 14, 15
pb_0_1_2_3: times 4 db 0, 1, 2, 3
pb_4_5_6_7: times 4 db 4, 5, 6, 7
pw_1: times 8 dw 1
pw_16: times 8 dw 16
pw_128: times 8 dw 128
@@ -97,58 +95,101 @@ SECTION .text
%define PIC_sym(sym) (sym)
%endif

%macro PALIGNR 4 ; dst, src1, src2, shift
%if cpuflag(ssse3)
palignr %1, %2, %3, %4
%else
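; SSE2 has no palignr: build dst = {src1:src2} >> shift from two byte
; shifts OR'd together, using the register numbered one above dst as
; scratch for the shifted src1 half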
%assign %%i regnumof%+%1 + 1
%define %%tmp m %+ %%i
psrldq %1, %3, %4
pslldq %%tmp, %2, 16-%4
por %1, %%tmp
%endif
%endmacro

%macro PMADDUBSW 5 ; dst, src, zero, tmp, reset_zero
%if cpuflag(ssse3)
pmaddubsw %1, %2
%else
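; SSE2 has no pmaddubsw: zero-extend the bytes in dst to words and use
; pmaddwd against pre-widened word coefficients in src, then repack;
; reset_zero == 1 (re)initializes the zero register %3, %4 is a temporary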
%if %5 == 1
pxor %3, %3
%endif
punpckhbw %4, %1, %3
punpcklbw %1, %3
pmaddwd %4, %2
pmaddwd %1, %2
packssdw %1, %4
%endif
%endmacro

;;;;;;;;;;;;;;;;;;;;;;
;; wiener ;;
;;;;;;;;;;;;;;;;;;;;;;

INIT_XMM ssse3
%macro WIENER_H 0
%if ARCH_X86_64
cglobal wiener_filter_h, 5, 15, 16, dst, left, src, stride, fh, w, h, edge
mov edged, edgem
movifnidn wd, wm
mov hd, hm
movq m15, [fhq]
pshufb m12, m15, [pb_6_7]
pshufb m13, m15, [pb_4]
pshufb m14, m15, [pb_2]
pshufb m15, m15, [pb_0]
mova m11, [pw_2048]
mova m10, [pw_16380]
lea r11, [pb_right_ext_mask]

DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
%else
cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
mov wd, edgem
mov [esp+12], wd
cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge
mov r5, edgem
mov [esp+12], r5
mov wd, wm
mov hd, hm
SETUP_PIC hd
movq m0, [fhq]
pshufb m3, m0, [PIC_sym(pb_6_7)]
pshufb m2, m0, [PIC_sym(pb_4)]
pshufb m1, m0, [PIC_sym(pb_2)]
pshufb m0, m0, [PIC_sym(pb_0)]
%define m15 m0
%define m14 m1
%define m13 m2
%define m12 m3
%endif

DEFINE_ARGS dst, left, src, stride, x, w, h, edge
movq m15, [fhq]
%if cpuflag(ssse3)
pshufb m12, m15, [PIC_sym(pb_6_7)]
pshufb m13, m15, [PIC_sym(pb_4)]
pshufb m14, m15, [PIC_sym(pb_2)]
pshufb m15, m15, [PIC_sym(pb_0)]
%else
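; SSE2: keep {fh[6],fh[7]} as a single word tap in m12 (it is only used
; with pmullw), and sign-extend the byte taps fh[0]/fh[2]/fh[4] to words
; in m15/m14/m13 for the pmaddwd-based PMADDUBSW fallback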
pshuflw m12, m15, q3333
punpcklbw m15, m15
pshufhw m13, m15, q0000
pshuflw m14, m15, q2222
pshuflw m15, m15, q0000
punpcklqdq m12, m12
punpckhqdq m13, m13
punpcklqdq m14, m14
punpcklqdq m15, m15
psraw m13, 8
psraw m14, 8
psraw m15, 8
%endif

%define srcptrq srcq
%define dstptrq dstq
%define hd dword [esp]
%define edged dword [esp+12]
%define xlimd dword [esp+16]
%if ARCH_X86_64
mova m11, [pw_2048]
mova m10, [pw_16380]
lea r11, [pb_right_ext_mask]

DEFINE_ARGS dst, left, src, stride, x, w, h, edge, srcptr, dstptr, xlim
%else
%define m10 [PIC_sym(pw_16380)]
%define m11 [PIC_sym(pw_2048)]
%define m12 [esp+0x14]
%define m13 [esp+0x24]
%define m14 [esp+0x34]
%define m15 [esp+0x44]

mova m15, m0
mova m14, m1
mova m13, m2
mova m12, m3
mova m13, m2
mova m14, m1
mova m15, m0

DEFINE_ARGS dst, left, src, stride, x, w, h, edge
%define srcptrq srcq
%define dstptrq dstq
%define hd dword [esp+ 0]
%define edged dword [esp+12]
%define xlimd dword [esp+16]
%endif

; if (edge & has_right) align_w_to_16
@@ -196,7 +237,16 @@ cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
jmp .left_load_done
.emu_left:
movd m0, [srcq]
%if cpuflag(ssse3)
pshufb m0, [PIC_sym(pb_14x0_1_2)]
%else
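; SSE2 equivalent of the pb_14x0_1_2 shuffle: broadcast byte 0 across
; the register, then OR bytes 1-2 back into the top two lanes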
pslldq m1, m0, 13
punpcklbw m0, m0
pshuflw m0, m0, q0000
punpcklqdq m0, m0
psrldq m0, 2
por m0, m1
%endif

; load right edge pixels
.left_load_done:
@@ -208,19 +258,39 @@ cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge

; for very small images (w=[1-2]), edge-extend the original cache,
; ugly, but only runs in very odd cases
%if cpuflag(ssse3)
add wd, wd
%if ARCH_X86_64
pshufb m0, [r11-pb_right_ext_mask+pb_0_to_15_min_n+wq*8-16]
%else
pshufb m0, [PIC_sym(pb_0_to_15_min_n)+wq*8-16]
%endif
shr wd, 1
%else
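; SSE2 equivalent of the pb_0_to_15_min_n lookup: keep the valid low
; bytes and patch the top lanes with edge pixels replicated into place
; via psllq by w*16 bits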
shl wd, 4
pcmpeqd m2, m2
movd m3, wd
psrldq m2, 2
punpckhbw m1, m0, m0
pshufhw m1, m1, q1122
psllq m1, m3
pand m0, m2
pandn m2, m1
por m0, m2
shr wd, 4
%endif

; main x loop, mostly this starts in .main_load
.splat_right:
; no need to load new pixels, just extend them from the (possibly previously
; extended) previous load into m0
%if cpuflag(ssse3)
pshufb m1, m0, [PIC_sym(pb_15)]
%else
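; SSE2 broadcast of byte 15: duplicate the high bytes, then replicate
; the top word and qword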
punpckhbw m1, m0, m0
pshufhw m1, m1, q3333
punpckhqdq m1, m1
%endif
jmp .main_loop
.load_and_splat:
; load new pixels and extend edge for right-most
@@ -235,7 +305,13 @@ cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
add PIC_reg, xd
%endif
movd m3, [srcptrq+2+xq]
%if cpuflag(ssse3)
pshufb m3, [PIC_sym(pb_0)]
%else
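; SSE2 broadcast of byte 0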
punpcklbw m3, m3
pshuflw m3, m3, q0000
punpcklqdq m3, m3
%endif
pand m1, m2
pxor m2, [PIC_sym(pb_right_ext_mask)]
pand m3, m2
@@ -246,58 +322,98 @@ cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
; load subsequent line
movu m1, [srcptrq+3]
.main_loop:
palignr m2, m1, m0, 10
palignr m3, m1, m0, 11
palignr m4, m1, m0, 12
palignr m5, m1, m0, 13
palignr m6, m1, m0, 14
palignr m7, m1, m0, 15
%if ARCH_X86_64
PALIGNR m2, m1, m0, 10
PALIGNR m3, m1, m0, 11
PALIGNR m4, m1, m0, 12
PALIGNR m5, m1, m0, 13
PALIGNR m6, m1, m0, 14
PALIGNR m7, m1, m0, 15

%if ARCH_X86_32
mova [esp+0x54], m1
%define m8 m1
%endif
punpcklbw m0, m2, m1
punpckhbw m2, m1
punpcklbw m8, m3, m7
punpckhbw m3, m7
punpcklbw m7, m4, m6
punpckhbw m4, m6
pmaddubsw m0, m15
pmaddubsw m2, m15
pmaddubsw m8, m14
pmaddubsw m3, m14
pmaddubsw m7, m13
pmaddubsw m4, m13
PMADDUBSW m0, m15, m6, m9, 1
PMADDUBSW m2, m15, m6, m9, 0
PMADDUBSW m8, m14, m6, m9, 0
PMADDUBSW m3, m14, m6, m9, 0
PMADDUBSW m7, m13, m6, m9, 0
PMADDUBSW m4, m13, m6, m9, 0
paddw m0, m8
paddw m2, m3
pxor m3, m3
punpcklbw m6, m5, m3
punpckhbw m5, m3
psllw m8, m6, 7
psllw m3, m5, 7
%if cpuflag(ssse3)
pxor m6, m6
%endif
punpcklbw m3, m5, m6
punpckhbw m5, m6
psllw m8, m3, 7
psllw m6, m5, 7
psubw m8, m10
psubw m3, m10
pmullw m6, m12
psubw m6, m10
pmullw m3, m12
pmullw m5, m12
paddw m0, m7
paddw m2, m4
paddw m0, m6
paddw m0, m3
paddw m2, m5
paddsw m0, m8
paddsw m2, m3
paddsw m2, m6
psraw m0, 3
psraw m2, 3
paddw m0, m11
paddw m2, m11
mova [dstptrq+ 0], m0
mova [dstptrq+16], m2

%if ARCH_X86_64
mova m0, m1
%else
mova m0, [esp+0x54]
PALIGNR m2, m1, m0, 10
punpcklbw m3, m2, m1
punpckhbw m2, m1
PMADDUBSW m3, m15, m4, m5, 1
PMADDUBSW m2, m15, m4, m5, 0
PALIGNR m4, m1, m0, 11
PALIGNR m5, m1, m0, 15
punpcklbw m6, m4, m5
punpckhbw m4, m5
PMADDUBSW m6, m14, m5, m7, 1
PMADDUBSW m4, m14, m5, m7, 0
paddw m3, m6
paddw m2, m4
PALIGNR m4, m1, m0, 12
PALIGNR m5, m1, m0, 14
punpcklbw m6, m4, m5
punpckhbw m4, m5
PMADDUBSW m6, m13, m5, m7, 1
PMADDUBSW m4, m13, m5, m7, 0
paddw m3, m6
paddw m2, m4
PALIGNR m6, m1, m0, 13
%if cpuflag(ssse3)
pxor m5, m5
%endif
punpcklbw m4, m6, m5
punpckhbw m6, m5
psllw m5, m4, 7
psllw m7, m6, 7
psubw m5, m10
psubw m7, m10
pmullw m4, m12
pmullw m6, m12
paddw m3, m4
paddw m2, m6
paddsw m3, m5
paddsw m2, m7
psraw m3, 3
psraw m2, 3
paddw m3, m11
paddw m2, m11
mova [dstptrq+ 0], m3
mova [dstptrq+16], m2
%endif

mova m0, m1
add srcptrq, 16
add dstptrq, 32
sub xd, 16
@@ -317,18 +433,19 @@ cglobal wiener_filter_h, 5, 7, 8, -100, dst, left, src, stride, fh, w, h, edge
dec hd
jg .loop
RET
%endmacro

%macro WIENER_V 0
%if ARCH_X86_64
cglobal wiener_filter_v, 4, 10, 16, dst, stride, mid, w, h, fv, edge
mov edged, edgem
movifnidn fvq, fvmp
movifnidn hd, hm
movq m15, [fvq]
pshufb m14, m15, [pb_4_5_6_7]
pshufb m15, m15, [pb_0_1_2_3]
pshufd m14, m15, q1111
pshufd m15, m15, q0000
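; pshufd works on SSE2 as well as SSSE3, making the pb_0_1_2_3 and
; pb_4_5_6_7 pshufb constants unnecessary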
paddw m14, [pw_0_128]
movd m12, [pd_1024]
pshufd m12, m12, 0
mova m12, [pd_1024]

DEFINE_ARGS dst, stride, mid, w, h, y, edge, ylim, mptr, dstptr

@@ -351,8 +468,8 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
SETUP_PIC edged

movq m0, [fvq]
pshufb m1, m0, [PIC_sym(pb_4_5_6_7)]
pshufb m0, m0, [PIC_sym(pb_0_1_2_3)]
pshufd m1, m0, q1111
pshufd m0, m0, q0000
paddw m1, [PIC_sym(pw_0_128)]
mova [esp+0x50], m0
mova [esp+0x40], m1
@@ -504,6 +621,15 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
sub wd, 8
jg .loop_x
RET
%endmacro

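; instantiate an SSE2 and an SSSE3 version of each filter from the
; shared macro bodies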
INIT_XMM sse2
WIENER_H
WIENER_V

INIT_XMM ssse3
WIENER_H
WIENER_V

;;;;;;;;;;;;;;;;;;;;;;;;;;
;; self-guided ;;
