diff --git a/src/x86/itx.asm b/src/x86/itx.asm index 4d687b08..dc07b74a 100644 --- a/src/x86/itx.asm +++ b/src/x86/itx.asm @@ -47,9 +47,11 @@ pw_m%2_%1: dw -%2, %1 pw_3803_1321: dw 3803, 1321 pw_m1321_2482: dw -1321, 2482 pw_2482_3344: dw 2482, 3344 +pw_m3344_3344: dw -3344, 3344 pw_m3803_3344: dw -3803, 3344 pw_m3803_m6688: dw -3803, -6688 -%define pw_3344x8 iadst4_dconly2b +COEF_PAIR 2896, 2896 +pw_2896_m2896: dw 2896, -2896 pw_5: times 2 dw 5 pw_2048: times 2 dw 2048 @@ -464,13 +466,15 @@ ALIGN function_align %macro IADST4_1D_PACKED 0 punpcklwd m2, m1, m0 punpckhwd m3, m1, m0 - psubw m0, m1 - punpckhqdq m1, m1 - paddw m1, m0 ; in0 - in2 + in3 + vpbroadcastd m5, [o(pw_m3344_3344)] vpbroadcastd m0, [o(pw_3803_1321)] vpbroadcastd m4, [o(pw_m1321_2482)] + pmaddwd m1, m5, m2 ; 3344*in3 - 3344*in2 + psrld m5, 16 pmaddwd m0, m2 pmaddwd m2, m4 + pmaddwd m5, m3 ; 3344*in0 + paddd m1, m5 ; 3344*in0 - 3344*in2 + 3344*in3 vpbroadcastd m4, [o(pw_2482_3344)] vpbroadcastd m5, [o(pw_m3803_3344)] pmaddwd m4, m3 @@ -478,19 +482,16 @@ ALIGN function_align paddd m4, m0 ; 1321*in0 + 3344*in1 + 3803*in2 + 2482*in3 vpbroadcastd m0, [o(pw_m3803_m6688)] pmaddwd m3, m0 - vpbroadcastd m0, [o(pw_3344x8)] - pmulhrsw m1, m0 ; out2 ____ vpbroadcastd m0, [o(pd_2048)] paddd m2, m0 + paddd m1, m0 paddd m0, m4 paddd m5, m2 ; 2482*in0 + 3344*in1 - 1321*in2 - 3803*in3 paddd m2, m4 paddd m2, m3 - psrad m0, 12 - psrad m5, 12 - psrad m2, 12 + REPX {psrad x, 12}, m1, m2, m0, m5 packssdw m0, m5 ; out0 out1 - packssdw m2, m2 ; out3 out3 + packssdw m1, m2 ; out2 out3 %endmacro INV_TXFM_4X4_FN dct, dct, 0 @@ -524,14 +525,13 @@ cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] call .main - punpckhwd m3, m0, m2 + punpckhwd m3, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 jmp tx2q .pass2: call .main - vpblendd m1, m1, m2, 0x0c ; out2 out3 .end: pxor m2, m2 mova [cq+16*0], m2 @@ -552,14 +552,13 @@ cglobal iflipadst_4x4_internal, 0, 5, 6, dst, 
stride, c, eob, tx2 mova m0, [cq+16*0] mova m1, [cq+16*1] call m(iadst_4x4_internal).main - punpcklwd m1, m0 - punpckhwd m2, m0 - punpcklwd m0, m2, m1 - punpckhwd m1, m2, m1 + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 + punpckhwd m1, m2 jmp tx2q .pass2: call m(iadst_4x4_internal).main - vpblendd m1, m1, m2, 0x0c ; out2 out3 .end: pxor m2, m2 mova [cq+16*0], m2 @@ -710,12 +709,55 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 paddsw m1, m5 ; out3 out2 %endmacro -%macro IADST8_1D_PACKED 0 +%macro IADST8_1D_PACKED 1 ; pass vpbroadcastd m6, [o(pd_2048)] punpckhwd m0, m4, m3 ; 0 7 punpckhwd m1, m5, m2 ; 2 5 punpcklwd m2, m5 ; 4 3 punpcklwd m3, m4 ; 6 1 +%if %1 == 1 + ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076, 3 ; t1a t0a + ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612, 2 ; t2a t3a + ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598, 3 ; t5a t4a + ITX_MUL2X_PACK 3, 4, 5, 6, 3920, 1189, 2 ; t6a t7a + psubsw m4, m0, m2 ; t5 t4 + paddsw m0, m2 ; t1 t0 + psubsw m5, m1, m3 ; t6 t7 + paddsw m1, m3 ; t2 t3 + ITX_MUL2X_PACK 4, 2, 3, 6, 1567, 3784, 3 ; t5a t4a + ITX_MUL2X_PACK 5, 2, 3, 6, 3784, 1567, 2 ; t7a t6a +%if mmsize > 16 + vbroadcasti128 m2, [o(deint_shuf)] +%else + mova m2, [o(deint_shuf)] +%endif + pshuflw m1, m1, q2301 + pshufhw m1, m1, q2301 + psubsw m3, m0, m1 ; t3 t2 + paddsw m0, m1 ; -out7 out0 + psubsw m1, m4, m5 ; t7 t6 + paddsw m4, m5 ; out6 -out1 + pshufb m0, m2 + pshufb m4, m2 + vpbroadcastd m5, [o(pw_m2896_2896)] + pmaddwd m2, m5, m3 + pmaddwd m5, m1 + paddd m2, m6 + paddd m5, m6 + psrad m2, 12 + psrad m5, 12 + packssdw m2, m5 ; out4 -out5 + vpbroadcastd m5, [o(pw_2896_2896)] + pmaddwd m3, m5 + pmaddwd m1, m5 + paddd m3, m6 + paddd m1, m6 + psrad m3, 12 + psrad m1, 12 + packssdw m1, m3 ; out2 -out3 + punpcklqdq m3, m4, m0 ; out6 -out7 + punpckhqdq m0, m4 ; out0 -out1 +%else ITX_MUL2X_PACK 0, 4, 5, 6, 401, 4076 ; t0a t1a ITX_MUL2X_PACK 1, 4, 5, 6, 1931, 3612 ; t2a t3a ITX_MUL2X_PACK 2, 4, 5, 6, 3166, 2598 ; t4a t5a @@ -743,6 +785,7 @@ cglobal 
iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 pmulhrsw m2, m5 ; out4 -out5 pshufd m1, m1, q1032 pmulhrsw m1, m5 ; out2 -out3 +%endif %endmacro INIT_YMM avx2 @@ -790,7 +833,7 @@ cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 pmulhrsw m0, m2 pmulhrsw m1, m2 call m(iadst_8x4_internal).main - punpckhwd m3, m0, m2 + punpckhwd m3, m0, m1 punpcklwd m0, m1 punpckhwd m1, m0, m3 punpcklwd m0, m3 @@ -800,7 +843,7 @@ cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vextracti128 xm3, m1, 1 pshufd xm4, xm0, q1032 pshufd xm5, xm1, q1032 - call .main + call .main_pass2 vpbroadcastd m4, [o(pw_2048)] vinserti128 m0, m0, xm2, 1 vinserti128 m1, m1, xm3, 1 @@ -822,8 +865,12 @@ cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 WRITE_4X8 0, 1 RET ALIGN function_align -.main: - WRAP_XMM IADST8_1D_PACKED +.main_pass1: + WRAP_XMM IADST8_1D_PACKED 1 + ret +ALIGN function_align +.main_pass2: + WRAP_XMM IADST8_1D_PACKED 2 ret INV_TXFM_4X8_FN flipadst, dct, 0 @@ -839,7 +886,7 @@ cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 pmulhrsw m1, m2 call m(iadst_8x4_internal).main punpcklwd m3, m1, m0 - punpckhwd m1, m2, m0 + punpckhwd m1, m0 punpcklwd m0, m1, m3 punpckhwd m1, m3 jmp tx2q @@ -848,7 +895,7 @@ cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vextracti128 xm3, m1, 1 pshufd xm4, xm0, q1032 pshufd xm5, xm1, q1032 - call m(iadst_4x8_internal).main + call m(iadst_4x8_internal).main_pass2 vpbroadcastd m5, [o(pw_2048)] vinserti128 m3, m3, xm1, 1 vinserti128 m2, m2, xm0, 1 @@ -1099,8 +1146,13 @@ cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 jmp tx2q .pass2: call .main - pshufd m1, m1, q1032 + vpbroadcastd m5, [o(pw_2896x8)] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 vpbroadcastd m5, [o(pw_2048)] + pshufd m1, m1, q1032 vpblendd m4, m1, m0, 0x33 vpblendd m0, m0, m2, 0x33 vpblendd m2, m2, m3, 0x33 @@ -1176,7 +1228,6 
@@ ALIGN function_align vinserti128 m0, m0, xm2, 1 ; t1 t0 t9a t8a vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14 vinserti128 m4, m4, xm1, 1 ; t4a t5a t12 t13 - vpbroadcastd m5, [o(pw_2896x8)] pshufd m2, m2, q1032 ; t6a t7a t14 t15 psubsw m1, m0, m3 ; t3a t2a t11 t10 paddsw m0, m3 ; -out15 out0 out14 -out1 @@ -1184,10 +1235,21 @@ ALIGN function_align psubsw m4, m2 ; t6 t7 t14a t15a shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a vpblendd m4, m4, m1, 0x33 ; t3a t7 t11 t15a - paddsw m1, m2, m4 - psubsw m2, m4 - pmulhrsw m1, m5 ; -out7 out4 out6 -out5 - pmulhrsw m2, m5 ; out8 -out11 -out9 out10 + ret +ALIGN function_align +.main_pass1_end: + vpbroadcastd m5, [o(pw_m2896_2896)] + vpbroadcastd m6, [o(pw_2896_2896)] + punpcklwd m1, m4, m2 + punpckhwd m4, m2 + pmaddwd m2, m5, m4 + pmaddwd m4, m6 + pmaddwd m5, m1 + pmaddwd m1, m6 + REPX {paddd x, m8}, m5, m1, m2, m4 + REPX {psrad x, 12}, m5, m2, m1, m4 + packssdw m2, m5 ; -out11 out8 out10 -out9 + packssdw m1, m4 ; -out7 out4 out6 -out5 ret INV_TXFM_4X16_FN flipadst, dct, 0 @@ -1214,8 +1276,13 @@ cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 jmp tx2q .pass2: call m(iadst_4x16_internal).main - pshufd m1, m1, q1032 + vpbroadcastd m5, [o(pw_2896x8)] + paddsw m1, m2, m4 + psubsw m2, m4 + pmulhrsw m1, m5 ; -out7 out4 out6 -out5 + pmulhrsw m2, m5 ; out8 -out11 -out9 out10 vpbroadcastd m6, [o(pw_2048)] + pshufd m1, m1, q1032 vpblendd m4, m0, m2, 0x33 vpblendd m0, m0, m1, 0xcc vpblendd m1, m1, m3, 0xcc @@ -1381,7 +1448,7 @@ cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 pmulhrsw xm2, xm0, [cq+16*2] pmulhrsw xm4, xm0 pmulhrsw xm5, xm0 - call m(iadst_4x8_internal).main + call m(iadst_4x8_internal).main_pass1 vinserti128 m0, m0, xm2, 1 vinserti128 m1, m1, xm3, 1 punpckhwd m2, m0, m1 @@ -1393,7 +1460,6 @@ cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 jmp tx2q .pass2: call .main - vpblendd m1, m1, m2, 0xcc .end: vpermq m0, m0, q3120 vpermq m1, m1, q3120 @@ -1427,7 +1493,7 @@ cglobal 
iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 pmulhrsw xm2, xm0, [cq+16*2] pmulhrsw xm4, xm0 pmulhrsw xm5, xm0 - call m(iadst_4x8_internal).main + call m(iadst_4x8_internal).main_pass1 vinserti128 m3, m3, xm1, 1 vinserti128 m2, m2, xm0, 1 punpckhwd m1, m3, m2 @@ -1439,7 +1505,7 @@ cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 jmp tx2q .pass2: call m(iadst_8x4_internal).main - vpblendd m2, m2, m1, 0x33 + mova m2, m1 vpermq m1, m0, q2031 vpermq m0, m2, q2031 jmp m(iadst_8x4_internal).end2 @@ -1580,7 +1646,7 @@ cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m5, [cq+32*1], q1302 ; 3 2 vpermq m2, [cq+32*2], q3120 ; 4 5 - call .main + call .main_pass1 vpbroadcastd m5, [o(pw_16384)] punpcklwd m4, m0, m1 punpckhwd m0, m1 @@ -1604,7 +1670,7 @@ cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 .pass2: pshufd m4, m0, q1032 pshufd m5, m1, q1032 - call .main + call .main_pass2 vpbroadcastd m5, [o(pw_2048)] vpbroadcastd xm4, [o(pw_4096)] psubw m4, m5 ; lower half = 2048, upper half = -2048 @@ -1629,8 +1695,12 @@ cglobal iadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 WRITE_8X4 2, 3, 4, 5 RET ALIGN function_align -.main: - IADST8_1D_PACKED +.main_pass1: + IADST8_1D_PACKED 1 + ret +ALIGN function_align +.main_pass2: + IADST8_1D_PACKED 2 ret INV_TXFM_8X8_FN flipadst, dct @@ -1643,7 +1713,7 @@ cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m3, [cq+32*3], q3120 ; 6 7 vpermq m5, [cq+32*1], q1302 ; 3 2 vpermq m2, [cq+32*2], q3120 ; 4 5 - call m(iadst_8x8_internal).main + call m(iadst_8x8_internal).main_pass1 vpbroadcastd m5, [o(pw_16384)] punpckhwd m4, m3, m2 punpcklwd m3, m2 @@ -1667,7 +1737,7 @@ cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 .pass2: pshufd m4, m0, q1032 pshufd m5, m1, q1032 - call m(iadst_8x8_internal).main + call m(iadst_8x8_internal).main_pass2 vpbroadcastd m4, [o(pw_2048)] vpbroadcastd xm5, [o(pw_4096)] psubw m4, m5 
; lower half = -2048, upper half = 2048 @@ -1867,6 +1937,7 @@ INV_TXFM_8X16_FN adst, identity cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 ITX_8X16_LOAD_COEFS call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass1_end vpbroadcastd m10, [o(pw_16384)] pslld m9, m10, 17 psubw m10, m9 ; 16384, -16384 @@ -1874,6 +1945,7 @@ cglobal iadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 ALIGN function_align .pass2: call .main + call .main_pass2_end vpbroadcastd m9, [o(pw_2048)] vpbroadcastd xm8, [o(pw_4096)] psubw m8, m9 @@ -1930,38 +2002,72 @@ ALIGN function_align paddsw m4, m6 ; t8a t9a vpbroadcastd m11, [o(pw_m3784_1567)] vpbroadcastd m12, [o(pw_1567_3784)] - ITX_MUL2X_PACK 3, 6, _, 10, 11, 12, 4 ; t4a t5a + ITX_MUL2X_PACK 3, 6, _, 10, 12, 11, 6 ; t5a t4a psubw m6, m9, m11 ; pw_3784_m1567 - ITX_MUL2X_PACK 8, 12, _, 10, 12, 6, 4 ; t6a t7a + ITX_MUL2X_PACK 8, 6, _, 10, 6, 12, 6 ; t7a t6a vpbroadcastd m11, [o(pw_m1567_3784)] vpbroadcastd m12, [o(pw_3784_1567)] - ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 4 ; t15 t14 + ITX_MUL2X_PACK 2, 6, _, 10, 11, 12, 6 ; t15 t14 psubw m6, m9, m11 ; pw_1567_m3784 - ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 4 ; t13 t12 - vbroadcasti128 m11, [o(deint_shuf)] - vpbroadcastd m12, [o(pw_2896x8)] - psubsw m6, m0, m1 ; t3a t2a + ITX_MUL2X_PACK 5, 12, _, 10, 12, 6, 6 ; t13 t12 + vbroadcasti128 m12, [o(deint_shuf)] + paddsw m6, m4, m7 ; -out1 out14 + psubsw m4, m7 ; t10 t11 + psubsw m11, m3, m8 ; t7 t6 + paddsw m8, m3 ; out12 -out3 + psubsw m3, m0, m1 ; t3a t2a paddsw m0, m1 ; -out15 out0 paddsw m1, m2, m5 ; -out13 out2 psubsw m5, m2 ; t15a t14a - paddsw m2, m4, m7 ; -out1 out14 - psubsw m4, m7 ; t10 t11 - psubsw m7, m3, m8 ; t6 t7 - paddsw m8, m3 ; -out3 out12 - REPX {pshufb x, m11}, m6, m4, m0, m2 - vpblendd m3, m6, m4, 0xcc ; t3a t11 - shufps m6, m6, m4, q1032 ; t2a t10 - vpblendd m4, m5, m7, 0xcc ; t15a t7 - shufps m5, m5, m7, q1032 ; t14a t6 - shufps m7, m2, m0, q1032 ; out14 -out15 - vpblendd m0, m0, m2, 0x33 ; 
-out1 out0 - paddsw m2, m5, m4 ; -out5 out4 - psubsw m5, m4 ; out10 -out11 - psubsw m4, m6, m3 ; out8 -out9 - paddsw m3, m6 ; -out7 out6 - shufps m6, m8, m1, q1032 ; out12 -out13 - vpblendd m1, m1, m8, 0x33 ; -out3 out2 - REPX {pmulhrsw x, m12}, m2, m3, m4, m5 + pshufb m0, m12 + pshufb m6, m12 + pshufb m8, m12 + pshufb m1, m12 + shufps m7, m6, m0, q1032 ; out14 -out15 + vpblendd m0, m6, 0x33 ; -out1 out0 + punpcklqdq m6, m8, m1 ; out12 -out13 + punpckhqdq m1, m8, m1 ; -out3 out2 + ret +ALIGN function_align +.main_pass1_end: + vpbroadcastd m8, [o(pw_m2896_2896)] + vpbroadcastd m12, [o(pw_2896_2896)] + pmaddwd m9, m8, m11 ; -out11 + pmaddwd m2, m12, m5 ; -out5 + pmaddwd m5, m8 ; out10 + pmaddwd m11, m12 ; out4 + REPX {paddd x, m10}, m9, m5, m2, m11 + REPX {psrad x, 12 }, m9, m5, m2, m11 + packssdw m5, m9 ; out10 -out11 + packssdw m2, m11 ; -out5 out4 + pmaddwd m11, m8, m3 ; out8 + vpbroadcastd m8, [o(pw_2896_m2896)] + pmaddwd m3, m12 ; -out7 + pmaddwd m8, m4 ; -out9 + pmaddwd m4, m12 ; out6 + REPX {paddd x, m10}, m11, m3, m8, m4 + REPX {psrad x, 12 }, m11, m3, m8, m4 + packssdw m3, m4 ; -out7 out6 + packssdw m4, m11, m8 ; out8 -out9 + vpbroadcastd m10, [o(pw_16384)] + pxor m9, m9 + ret +ALIGN function_align +.main_pass2_end: + vpbroadcastd m8, [o(pw_2896x8)] + pshufb m2, m11, m12 + pshufb m5, m12 + pshufb m3, m12 + pshufb m4, m12 + punpcklqdq m11, m5, m2 ; t15a t7 + punpckhqdq m5, m2 ; t14a t6 + shufps m2, m3, m4, q1032 ; t2a t10 + vpblendd m3, m4, 0xcc ; t3a t11 + psubsw m4, m2, m3 ; out8 -out9 + paddsw m3, m2 ; -out7 out6 + paddsw m2, m5, m11 ; -out5 out4 + psubsw m5, m11 ; out10 -out11 + REPX {pmulhrsw x, m8}, m2, m3, m4, m5 ret INV_TXFM_8X16_FN flipadst, dct @@ -1972,6 +2078,7 @@ INV_TXFM_8X16_FN flipadst, identity cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 ITX_8X16_LOAD_COEFS call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass1_end vpbroadcastd m9, [o(pw_16384)] pslld m10, m9, 17 psubw m10, m9 ; -16384, 16384 @@ 
-1990,6 +2097,7 @@ cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 jmp m(idct_8x16_internal).pass1_end2 .pass2: call m(iadst_8x16_internal).main + call m(iadst_8x16_internal).main_pass2_end vpbroadcastd m8, [o(pw_2048)] vpbroadcastd xm9, [o(pw_4096)] psubw m8, m9 @@ -2232,7 +2340,7 @@ cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 vpermq m1, [cq+32*1], q1230 vpermq m2, [cq+32*2], q2103 call m(iadst_4x16_internal).main2 - pshufd m2, m2, q1032 + call m(iadst_4x16_internal).main_pass1_end punpcklwd m4, m3, m1 punpcklwd m5, m2, m0 punpckhwd m0, m1 @@ -2276,20 +2384,26 @@ cglobal iadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 RET ALIGN function_align .main: + vpbroadcastd m6, [o(pw_m3344_3344)] vpbroadcastd m7, [o(pw_3803_1321)] vpbroadcastd m8, [o(pw_m1321_2482)] vpbroadcastd m9, [o(pw_2482_3344)] punpcklwd m4, m2, m0 ; in2 in0 l - psubw m6, m0, m2 punpckhwd m2, m0 ; in2 in0 h - paddw m6, m3 ; t2 + psrld m5, m6, 16 + pmaddwd m10, m6, m4 ; t2:02 l + pmaddwd m6, m2 ; t2:02 h pmaddwd m0, m7, m4 ; t0:02 l pmaddwd m7, m2 ; t0:02 h pmaddwd m4, m8 ; t1:02 l pmaddwd m8, m2 ; t1:02 h punpckhwd m2, m3, m1 ; in3 in1 h punpcklwd m3, m1 ; in3 in1 l + pmaddwd m1, m5, m2 ; t2:3 h + pmaddwd m5, m3 ; t2:3 l + paddd m6, m1 vpbroadcastd m1, [o(pd_2048)] + paddd m10, m5 pmaddwd m5, m9, m3 pmaddwd m9, m2 paddd m0, m1 @@ -2299,6 +2413,8 @@ ALIGN function_align vpbroadcastd m9, [o(pw_m3803_3344)] pmaddwd m5, m9, m2 pmaddwd m9, m3 + paddd m10, m1 ; t2 + 2048 l + paddd m6, m1 ; t2 + 2048 h paddd m5, m1 ; t1:13 + 2048 h paddd m1, m9 ; t1:13 + 2048 l vpbroadcastd m9, [o(pw_m3803_m6688)] @@ -2310,12 +2426,11 @@ ALIGN function_align paddd m4, m0 paddd m2, m8 ; t0 + t1 - t3 + 2048 h paddd m3, m4 ; t0 + t1 - t3 + 2048 l - REPX {psrad x, 12}, m0, m7, m5, m1, m2, m3 + REPX {psrad x, 12}, m10, m6, m0, m7, m5, m1, m2, m3 packssdw m0, m7 packssdw m1, m5 packssdw m3, m2 - vpbroadcastd m2, [o(pw_3344x8)] - pmulhrsw m2, m6 + packssdw m2, m10, m6 ret 
INV_TXFM_16X4_FN flipadst, dct @@ -2329,7 +2444,7 @@ cglobal iflipadst_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 vpermq m1, [cq+32*1], q1230 vpermq m2, [cq+32*2], q2103 call m(iadst_4x16_internal).main2 - pshufd m2, m2, q1032 + call m(iadst_4x16_internal).main_pass1_end punpckhwd m4, m3, m2 punpckhwd m5, m1, m0 punpcklwd m0, m2 @@ -2552,7 +2667,7 @@ INV_TXFM_16X8_FN adst, identity cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 ITX_16X8_LOAD_COEFS 1302 call m(iadst_8x16_internal).main2 - vpbroadcastd m10, [o(pw_16384)] + call m(iadst_8x16_internal).main_pass1_end psubw m11, m9, m10 punpcklwd m8, m0, m2 punpckhwd m0, m2 @@ -2567,7 +2682,7 @@ cglobal iadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 ALIGN function_align .pass2: call .main - vpbroadcastd m9, [o(pw_2048)] + call .main_pass2_end pxor m8, m8 psubw m8, m9 REPX {pmulhrsw x, m9}, m0, m2, m4, m6 @@ -2591,13 +2706,41 @@ ALIGN function_align ITX_MULSUB_2W 4, 8, 3, 9, 10, 3784, 1567 ; t6a, t7a psubsw m9, m6, m8 ; t7 paddsw m6, m8 ; out6 - vpbroadcastd m8, [o(pw_2896x8)] psubsw m3, m7, m5 ; t3 paddsw m7, m5 ; -out7 psubsw m5, m0, m2 ; t2 paddsw m0, m2 ; out0 psubsw m2, m1, m4 ; t6 paddsw m1, m4 ; -out1 + ret +ALIGN function_align +.main_pass1_end: + vpbroadcastd m11, [o(pw_m2896_2896)] + vpbroadcastd m12, [o(pw_2896_2896)] + punpckhwd m4, m3, m5 + punpcklwd m3, m5 + pmaddwd m5, m11, m4 + pmaddwd m4, m12 + pmaddwd m8, m11, m3 + pmaddwd m3, m12 + REPX {paddd x, m10}, m5, m4, m8, m3 + REPX {psrad x, 12 }, m5, m8, m4, m3 + packssdw m3, m4 ; -out3 + packssdw m4, m8, m5 ; out4 + punpcklwd m5, m9, m2 + punpckhwd m9, m2 + pmaddwd m2, m12, m5 + pmaddwd m5, m11 + pmaddwd m12, m9 + pmaddwd m11, m9 + REPX {paddd x, m10}, m2, m5, m12, m11 + REPX {psrad x, 12 }, m2, m12, m5, m11 + packssdw m2, m12 ; out2 + packssdw m5, m11 ; -out5 + ret +ALIGN function_align +.main_pass2_end: + vpbroadcastd m8, [o(pw_2896x8)] psubsw m4, m5, m3 paddsw m3, m5 psubsw m5, m2, m9 @@ -2606,6 +2749,7 @@ ALIGN 
function_align pmulhrsw m3, m8 ; -out3 pmulhrsw m4, m8 ; out4 pmulhrsw m5, m8 ; -out5 + vpbroadcastd m9, [o(pw_2048)] ret INV_TXFM_16X8_FN flipadst, dct @@ -2616,7 +2760,7 @@ INV_TXFM_16X8_FN flipadst, identity cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 ITX_16X8_LOAD_COEFS 1302 call m(iadst_8x16_internal).main2 - vpbroadcastd m10, [o(pw_16384)] + call m(iadst_8x16_internal).main_pass1_end psubw m9, m10 punpcklwd m8, m6, m4 punpckhwd m6, m4 @@ -2655,7 +2799,7 @@ cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 jmp tx2q .pass2: call m(iadst_16x8_internal).main - vpbroadcastd m9, [o(pw_2048)] + call m(iadst_16x8_internal).main_pass2_end pxor m8, m8 psubw m8, m9 pmulhrsw m10, m7, m8 @@ -2986,8 +3130,12 @@ INV_TXFM_16X16_FN adst, flipadst cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 ITX_16X16_LOAD_COEFS call .main - vpbroadcastd m1, [o(pw_8192)] - REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 + call .main_pass1_end + pmulhrsw m0, m1, [cq+32*0] + pmulhrsw m2, m1, [cq+32*1] + REPX {pmulhrsw x, m1}, m4, m6, m8, m10 + pmulhrsw m12, m1, [cq+32*2] + pmulhrsw m14, m1, [cq+32*3] vextracti128 [rsp+16*5], m8, 1 mova [rsp+16*1], xm8 pxor m8, m8 @@ -2996,7 +3144,7 @@ cglobal iadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 ALIGN function_align .pass2: call .main - vpbroadcastd m1, [o(pw_2048)] + call .main_pass2_end REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14 mova [rsp+32*0], m6 pxor m6, m6 @@ -3081,6 +3229,62 @@ ALIGN function_align paddsw m0, m12 ; out0 paddsw m12, m8, m5 ; out12 psubsw m8, m5 ; t7 + ret +ALIGN function_align +.main_pass1_end: + mova [cq+32*0], m0 + mova [cq+32*1], m2 + mova [cq+32*2], m12 + mova [cq+32*3], m14 + vpbroadcastd m14, [pw_m2896_2896] + vpbroadcastd m12, [pw_2896_2896] + vpbroadcastd m2, [pd_2048] + punpcklwd m5, m11, m10 + punpckhwd m11, m10 + pmaddwd m10, m14, m5 + pmaddwd m0, m14, m11 + pmaddwd m5, m12 + pmaddwd m11, m12 + REPX {paddd x, m2}, 
m10, m0, m5, m11 + REPX {psrad x, 12}, m10, m0, m5, m11 + packssdw m10, m0 ; out10 + packssdw m5, m11 ; -out5 + punpcklwd m11, m8, m4 + punpckhwd m8, m4 + pmaddwd m4, m12, m11 + pmaddwd m0, m12, m8 + pmaddwd m11, m14 + pmaddwd m8, m14 + REPX {paddd x, m2}, m4, m0, m11, m8 + REPX {psrad x, 12}, m4, m0, m11, m8 + packssdw m4, m0 ; out4 + packssdw m11, m8 ; -out11 + punpcklwd m8, m9, m7 + punpckhwd m9, m7 + pmaddwd m7, m12, m8 + pmaddwd m0, m12, m9 + pmaddwd m8, m14 + pmaddwd m9, m14 + REPX {paddd x, m2}, m7, m0, m8, m9 + REPX {psrad x, 12}, m7, m0, m8, m9 + packssdw m7, m0 ; -out7 + packssdw m8, m9 ; out8 + punpckhwd m0, m6, m1 + punpcklwd m6, m1 + pmaddwd m1, m14, m0 + pmaddwd m9, m14, m6 + pmaddwd m0, m12 + pmaddwd m6, m12 + REPX {paddd x, m2}, m1, m9, m0, m6 + REPX {psrad x, 12}, m1, m9, m0, m6 + packssdw m9, m1 ; -out9 + packssdw m6, m0 ; out6 + vpbroadcastd m1, [o(pw_8192)] + ret +ALIGN function_align +.main_pass2_end: + ; In pass 2 we're going to clip to pixels afterwards anyway, so clipping to + ; 16-bit here will produce the same result as using 32-bit intermediates.
paddsw m5, m10, m11 ; -out5 psubsw m10, m11 ; out10 psubsw m11, m4, m8 ; -out11 @@ -3091,6 +3295,7 @@ ALIGN function_align paddsw m6, m1 ; out6 vpbroadcastd m1, [o(pw_2896x8)] REPX {pmulhrsw x, m1}, m4, m5, m6, m7, m8, m9, m10, m11 + vpbroadcastd m1, [o(pw_2048)] ret INV_TXFM_16X16_FN flipadst, dct @@ -3100,16 +3305,16 @@ INV_TXFM_16X16_FN flipadst, flipadst cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 ITX_16X16_LOAD_COEFS call m(iadst_16x16_internal).main - vpbroadcastd m1, [o(pw_8192)] + call m(iadst_16x16_internal).main_pass1_end pmulhrsw m6, m1 + pmulhrsw m2, m1, m8 mova [rsp+32*2], m6 pmulhrsw m6, m1, m4 pmulhrsw m4, m1, m10 - pmulhrsw m10, m1, m12 - pmulhrsw m12, m1, m2 - pmulhrsw m2, m1, m8 - pmulhrsw m8, m1, m14 - pmulhrsw m14, m1, m0 + pmulhrsw m8, m1, [cq+32*3] + pmulhrsw m10, m1, [cq+32*2] + pmulhrsw m12, m1, [cq+32*1] + pmulhrsw m14, m1, [cq+32*0] pxor m0, m0 psubw m0, m1 REPX {pmulhrsw x, m0}, m3, m5, m7, m11, m15 @@ -3136,7 +3341,7 @@ cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 jmp m(idct_16x16_internal).pass1_end3 .pass2: call m(iadst_16x16_internal).main - vpbroadcastd m1, [o(pw_2048)] + call m(iadst_16x16_internal).main_pass2_end pmulhrsw m0, m1 pmulhrsw m8, m1 mova [rsp+32*0], m0 diff --git a/src/x86/itx_ssse3.asm b/src/x86/itx_ssse3.asm index a0fbb9d4..5c360d92 100644 --- a/src/x86/itx_ssse3.asm +++ b/src/x86/itx_ssse3.asm @@ -43,8 +43,11 @@ pw_1321_3803: times 4 dw 1321, 3803 pw_2482_m1321: times 4 dw 2482, -1321 pw_3344_2482: times 4 dw 3344, 2482 pw_3344_m3803: times 4 dw 3344, -3803 +pw_3344_m3344: times 4 dw 3344, -3344 +pw_0_3344 times 4 dw 0, 3344 pw_m6688_m3803: times 4 dw -6688, -3803 +COEF_PAIR 2896, 2896 COEF_PAIR 1567, 3784 COEF_PAIR 799, 4017 COEF_PAIR 3406, 2276 @@ -126,7 +129,6 @@ pw_2675x8: times 8 dw 2675*8 pw_4085x8: times 8 dw 4085*8 pw_m301x8: times 8 dw -301*8 - iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424 iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 
10568 iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856 @@ -200,7 +202,6 @@ SECTION .text ret %endmacro - ; flags: 1 = swap, 2: coef_regs %macro ITX_MUL2X_PACK 5-6 0 ; dst/src, tmp[1], rnd, coef[1-2], flags %if %6 & 2 @@ -239,35 +240,6 @@ SECTION .text paddsw m0, m2 ;high: out1 ;low: out0 %endmacro - -%macro IADST4_1D_PACKED 0 - punpcklwd m2, m0, m1 ;unpacked in0 in2 - punpckhwd m3, m0, m1 ;unpacked in1 in3 - psubw m0, m1 - punpckhqdq m1, m1 ; - paddw m1, m0 ;low: in0 - in2 + in3 - - pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 - pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 - pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 - pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3 - paddd m4, m0 ;t0 + t3 - pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 - pmulhrsw m1, [o(pw_3344x8)] ;low: out2 - mova m0, [o(pd_2048)] - paddd m2, m0 - paddd m0, m4 ;t0 + t3 + 2048 - paddd m5, m2 ;t1 + t3 + 2048 - paddd m2, m4 - paddd m2, m3 ;t0 + t1 - t3 + 2048 - - psrad m0, 12 ;out0 - psrad m5, 12 ;out1 - psrad m2, 12 ;out3 - packssdw m0, m5 ;high: out1 ;low: out0 - packssdw m2, m2 ;high: out3 ;low: out3 -%endmacro - %macro INV_TXFM_FN 5+ ; type1, type2, fast_thresh, size, xmm/stack cglobal inv_txfm_add_%1_%2_%4, 4, 6, %5, dst, stride, coeff, eob, tx2 %undef cmp @@ -392,15 +364,14 @@ cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] call .main - punpckhwd m3, m0, m2 + punpckhwd m2, m0, m1 punpcklwd m0, m1 - punpckhwd m1, m0, m3 ;high: in3 ;low :in2 - punpcklwd m0, m3 ;high: in1 ;low: in0 + punpckhwd m1, m0, m2 ;high: in3 ;low :in2 + punpcklwd m0, m2 ;high: in1 ;low: in0 jmp tx2q .pass2: call .main - punpcklqdq m1, m2 ;out2 out3 .end: pxor m2, m2 @@ -412,7 +383,28 @@ cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ALIGN function_align .main: - IADST4_1D_PACKED + punpcklwd m2, m0, m1 ;unpacked in0 in2 + punpckhwd m0, m1 ;unpacked 
in1 in3 + mova m3, m0 + pmaddwd m1, m2, [o(pw_3344_m3344)];3344 * in0 - 3344 * in2 + pmaddwd m0, [o(pw_0_3344)] ;3344 * in3 + paddd m1, m0 ;t2 + pmaddwd m0, m2, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 + pmaddwd m2, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 + pmaddwd m4, m3, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 + pmaddwd m5, m3, [o(pw_3344_m3803)];3344 * in1 - 3803 * in3 + paddd m4, m0 ;t0 + t3 + pmaddwd m3, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 + mova m0, [o(pd_2048)] + paddd m1, m0 ;t2 + 2048 + paddd m2, m0 + paddd m0, m4 ;t0 + t3 + 2048 + paddd m5, m2 ;t1 + t3 + 2048 + paddd m2, m4 + paddd m2, m3 ;t0 + t1 - t3 + 2048 + REPX {psrad x, 12}, m1, m0, m5, m2 + packssdw m0, m5 ;high: out1 ;low: out0 + packssdw m1, m2 ;high: out3 ;low: out2 ret INV_TXFM_4X4_FN flipadst, dct, 0 @@ -424,16 +416,14 @@ cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] mova m1, [coeffq+16*1] call m(iadst_4x4_internal).main - punpcklwd m1, m0 - punpckhwd m2, m0 - punpcklwd m0, m2, m1 ;high: in3 ;low :in2 - punpckhwd m2, m1 ;high: in1 ;low: in0 - mova m1, m2 + punpcklwd m2, m1, m0 + punpckhwd m1, m0 + punpcklwd m0, m1, m2 ;high: in3 ;low :in2 + punpckhwd m1, m2 ;high: in1 ;low: in0 jmp tx2q .pass2: call m(iadst_4x4_internal).main - punpcklqdq m1, m2 ;out2 out3 .end: pxor m2, m2 @@ -584,99 +574,6 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff mova m%4, m%5 %endmacro -%macro IADST4_1D 0 - mova m4, m2 - psubw m2, m0, m4 - paddw m2, m3 ;low: in0 - in2 + in3 - - punpckhwd m6, m0, m4 ;unpacked in0 in2 - punpckhwd m7, m1, m3 ;unpacked in1 in3 - punpcklwd m0, m4 ;unpacked in0 in2 - punpcklwd m1, m3 ;unpacked in1 in3 - - pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 - pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 - pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 - pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 - paddd m3, m4 ;t0 + t3 - - pmaddwd m1, [o(pw_m6688_m3803)] ;-2
* 3344 * in1 - 3803 * in3 - pmulhrsw m2, [o(pw_3344x8)] ;out2 - mova m4, [o(pd_2048)] - paddd m0, m4 - paddd m4, m3 ;t0 + t3 + 2048 - paddd m5, m0 ;t1 + t3 + 2048 - paddd m3, m0 - paddd m3, m1 ;t0 + t1 - t3 + 2048 - - psrad m4, 12 ;out0 - psrad m5, 12 ;out1 - psrad m3, 12 ;out3 - packssdw m0, m4, m5 ;low: out0 high: out1 - - pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 - pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 - pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 - pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 - paddd m1, m4 ;t0 + t3 - pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 - - mova m4, [o(pd_2048)] - paddd m6, m4 - paddd m4, m1 ;t0 + t3 + 2048 - paddd m5, m6 ;t1 + t3 + 2048 - paddd m1, m6 - paddd m1, m7 ;t0 + t1 - t3 + 2048 - - psrad m4, 12 ;out0 - psrad m5, 12 ;out1 - psrad m1, 12 ;out3 - packssdw m3, m1 ;out3 - packssdw m4, m5 ;low: out0 high: out1 - - punpckhqdq m1, m0, m4 ;out1 - punpcklqdq m0, m4 ;out0 -%endmacro - -%macro IADST8_1D_PACKED 0 - mova m6, [o(pd_2048)] - punpckhwd m4, m3, m0 ;unpacked in7 in0 - punpckhwd m5, m2, m1 ;unpacked in5 in2 - punpcklwd m1, m2 ;unpacked in3 in4 - punpcklwd m0, m3 ;unpacked in1 in6 - ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a - ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a - ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a - ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a - - psubsw m3, m4, m1 ;low: t4 high: t5 - paddsw m4, m1 ;low: t0 high: t1 - psubsw m2, m5, m0 ;low: t6 high: t7 - paddsw m5, m0 ;low: t2 high: t3 - - shufps m1, m3, m2, q1032 - punpckhwd m2, m1 - punpcklwd m3, m1 - ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a - ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a - - psubsw m1, m4, m5 ;low: t2 high: t3 - paddsw m4, m5 ;low: out0 high: -out7 - psubsw m5, m3, m2 ;low: t7 high: t6 - paddsw m3, m2 ;low: out6 high: -out1 - shufps m0, m4, m3, q3210 ;low: out0 high: -out1 - shufps m3, m4, 
q3210 ;low: out6 high: -out7 - - shufps m4, m1, m5, q1032 ;low: t3 high: t7 - shufps m1, m5, q3210 ;low: t2 high: t6 - mova m5, [o(pw_2896x8)] - psubsw m2, m1, m4 ;low: t2-t3 high: t6-t7 - paddsw m1, m4 ;low: t2+t3 high: t6+t7 - pmulhrsw m2, m5 ;low: out4 high: -out5 - shufps m1, m1, q1032 - pmulhrsw m1, m5 ;low: out2 high: -out3 -%endmacro - %macro WRITE_4X8 4 ;row[1-4] WRITE_4X4 0, 1, 4, 5, 6, %1, %2, %3, %4 lea dstq, [dstq+strideq*4] @@ -838,7 +735,48 @@ cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ALIGN function_align .main: - IADST8_1D_PACKED + mova m6, [o(pd_2048)] + punpckhwd m4, m3, m0 ;unpacked in7 in0 + punpckhwd m5, m2, m1 ;unpacked in5 in2 + punpcklwd m1, m2 ;unpacked in3 in4 + punpcklwd m0, m3 ;unpacked in1 in6 + ITX_MUL2X_PACK 4, 2, 6, 401, 4076 ;low: t0a high: t1a + ITX_MUL2X_PACK 5, 2, 6, 1931, 3612 ;low: t2a high: t3a + ITX_MUL2X_PACK 1, 2, 6, 3166, 2598 ;low: t4a high: t5a + ITX_MUL2X_PACK 0, 2, 6, 3920, 1189 ;low: t6a high: t7a + + psubsw m3, m4, m1 ;low: t4 high: t5 + paddsw m4, m1 ;low: t0 high: t1 + psubsw m2, m5, m0 ;low: t6 high: t7 + paddsw m5, m0 ;low: t2 high: t3 + + shufps m1, m3, m2, q1032 + punpckhwd m2, m1 + punpcklwd m3, m1 + ITX_MUL2X_PACK 3, 0, 6, 1567, 3784, 1 ;low: t5a high: t4a + ITX_MUL2X_PACK 2, 0, 6, 3784, 1567 ;low: t7a high: t6a + + psubsw m1, m4, m5 ;low: t2 high: t3 + paddsw m4, m5 ;low: out0 high: -out7 + psubsw m5, m3, m2 ;low: t7 high: t6 + paddsw m3, m2 ;low: out6 high: -out1 + shufps m0, m4, m3, q3210 ;low: out0 high: -out1 + shufps m3, m4, q3210 ;low: out6 high: -out7 + + mova m2, [o(pw_2896_m2896)] + mova m7, [o(pw_2896_2896)] + shufps m4, m1, m5, q1032 ;low: t3 high: t7 + shufps m1, m5, q3210 ;low: t2 high: t6 + punpcklwd m5, m1, m4 + punpckhwd m1, m4 + pmaddwd m4, m2, m1 ;-out5 + pmaddwd m2, m5 ; out4 + pmaddwd m1, m7 ; out2 + pmaddwd m5, m7 ;-out3 + REPX {paddd x, m6}, m4, m2, m1, m5 + REPX {psrad x, 12}, m4, m2, m1, m5 + packssdw m1, m5 ;low: out2 high: -out3 + packssdw m2, m4 ;low: out4 
high: -out5 ret INV_TXFM_4X8_FN flipadst, dct, 0 @@ -1109,7 +1047,67 @@ cglobal iadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ALIGN function_align .main: - IADST4_1D + punpckhwd m6, m0, m2 ;unpacked in0 in2 + punpcklwd m0, m2 ;unpacked in0 in2 + punpckhwd m7, m1, m3 ;unpacked in1 in3 + punpcklwd m1, m3 ;unpacked in1 in3 + + mova m2, [o(pw_3344_m3344)] + mova m4, [o(pw_0_3344)] + pmaddwd m3, m2, m6 ;3344 * in0 - 3344 * in2 + pmaddwd m5, m4, m7 ;3344 * in3 + pmaddwd m2, m0 + pmaddwd m4, m1 + paddd m3, m5 + paddd m2, m4 + mova m4, [o(pd_2048)] + paddd m3, m4 ;t2 + 2048 + paddd m2, m4 + psrad m3, 12 + psrad m2, 12 + packssdw m2, m3 ;out2 + + pmaddwd m4, m0, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 + pmaddwd m0, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 + pmaddwd m3, m1, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 + pmaddwd m5, m1, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 + paddd m3, m4 ;t0 + t3 + + pmaddwd m1, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 + mova m4, [o(pd_2048)] + paddd m0, m4 + paddd m4, m3 ;t0 + t3 + 2048 + paddd m5, m0 ;t1 + t3 + 2048 + paddd m3, m0 + paddd m3, m1 ;t0 + t1 - t3 + 2048 + + psrad m4, 12 ;out0 + psrad m5, 12 ;out1 + psrad m3, 12 ;out3 + packssdw m0, m4, m5 ;low: out0 high: out1 + + pmaddwd m4, m6, [o(pw_1321_3803)] ;1321 * in0 + 3803 * in2 + pmaddwd m6, [o(pw_2482_m1321)] ;2482 * in0 - 1321 * in2 + pmaddwd m1, m7, [o(pw_3344_2482)] ;3344 * in1 + 2482 * in3 + pmaddwd m5, m7, [o(pw_3344_m3803)] ;3344 * in1 - 3803 * in3 + paddd m1, m4 ;t0 + t3 + pmaddwd m7, [o(pw_m6688_m3803)] ;-2 * 3344 * in1 - 3803 * in3 + + mova m4, [o(pd_2048)] + paddd m6, m4 + paddd m4, m1 ;t0 + t3 + 2048 + paddd m5, m6 ;t1 + t3 + 2048 + paddd m1, m6 + paddd m1, m7 ;t0 + t1 - t3 + 2048 + + psrad m4, 12 ;out0 + psrad m5, 12 ;out1 + psrad m1, 12 ;out3 + packssdw m3, m1 ;out3 + packssdw m4, m5 ;low: out0 high: out1 + + punpckhqdq m1, m0, m4 ;out1 + punpcklqdq m0, m4 ;out0 ret INV_TXFM_8X4_FN flipadst, dct @@ -1423,6 +1421,7 @@ cglobal 
iadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 .pass1: call .main + call .main_pass1_end .pass1_end: mova m7, [o(pw_16384)] @@ -1441,6 +1440,7 @@ ALIGN function_align .pass2_main: call .main + call .main_pass2_end .end: mova m7, [o(pw_2048)] @@ -1491,6 +1491,53 @@ ALIGN function_align psubsw m5, m6 ;t6 paddsw m6, m2, m7 ;out6 psubsw m2, m7 ;t7 + ret +ALIGN function_align +.main_pass1_end: + mova [rsp+gprsize*2+16*1], m1 + mova [rsp+gprsize*2+16*2], m6 + punpckhwd m1, m4, m3 + punpcklwd m4, m3 + punpckhwd m7, m5, m2 + punpcklwd m5, m2 + mova m2, [o(pw_2896_2896)] + mova m6, [o(pd_2048)] + pmaddwd m3, m2, m7 + pmaddwd m2, m5 + paddd m3, m6 + paddd m2, m6 + psrad m3, 12 + psrad m2, 12 + packssdw m2, m3 ;out2 + mova m3, [o(pw_2896_m2896)] + pmaddwd m7, m3 + pmaddwd m5, m3 + paddd m7, m6 + paddd m5, m6 + psrad m7, 12 + psrad m5, 12 + packssdw m5, m7 ;-out5 + mova m3, [o(pw_2896_2896)] + pmaddwd m7, m3, m1 + pmaddwd m3, m4 + paddd m7, m6 + paddd m3, m6 + psrad m7, 12 + psrad m3, 12 + packssdw m3, m7 ;-out3 + mova m7, [o(pw_2896_m2896)] + pmaddwd m1, m7 + pmaddwd m4, m7 + paddd m1, m6 + paddd m4, m6 + psrad m1, 12 + psrad m4, 12 + packssdw m4, m1 ;out4 + mova m1, [rsp+gprsize*2+16*1] + mova m6, [rsp+gprsize*2+16*2] + ret +ALIGN function_align +.main_pass2_end: paddsw m7, m4, m3 ;t2 + t3 psubsw m4, m3 ;t2 - t3 paddsw m3, m5, m2 ;t6 + t7 @@ -1513,6 +1560,7 @@ cglobal iflipadst_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 .pass1: call m(iadst_8x8_internal).main + call m(iadst_8x8_internal).main_pass1_end .pass1_end: mova m7, [o(pw_m16384)] @@ -1542,6 +1590,7 @@ ALIGN function_align .pass2_main: call m(iadst_8x8_internal).main + call m(iadst_8x8_internal).main_pass2_end .end: mova m7, [o(pw_2048)] @@ -1753,6 +1802,7 @@ cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 .pass2: call m(iadst_16x4_internal).main + call m(iadst_16x4_internal).main_pass2_end punpcklqdq m6, m5, m4 ;low: -out5 high: -out7 punpckhqdq m4, m5 ;low: out8 high: out10 @@
-1820,6 +1870,7 @@ cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 .pass2: call m(iadst_16x4_internal).main + call m(iadst_16x4_internal).main_pass2_end punpckhqdq m6, m5, m4 ;low: out5 high: out7 punpcklqdq m4, m5 ;low: -out8 high: -out10 @@ -2160,6 +2211,7 @@ INV_TXFM_16X4_FN adst, identity cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_7ROWS coeffq, 16 call .main + call .main_pass1_end punpckhwd m6, m7, m0 ;packed -out11, -out15 punpcklwd m0, m7 ;packed out0, out4 @@ -2193,88 +2245,133 @@ cglobal iadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ALIGN function_align .main: mova [coeffq+16*6], m0 - pshufd m1, m1, q1032 + pshufd m0, m1, q1032 pshufd m2, m2, q1032 - punpckhwd m0, m6, m1 ;packed in13, in2 - punpcklwd m1, m6 ;packed in3, in12 - punpckhwd m6, m5, m2 ;packed in11, in4 + punpckhwd m1, m6, m0 ;packed in13, in2 + punpcklwd m0, m6 ;packed in3, in12 + punpckhwd m7, m5, m2 ;packed in11, in4 punpcklwd m2, m5 ;packed in5, in10 - mova m7, [o(pd_2048)] - ITX_MUL2X_PACK 0, 5, 7, 995, 3973 ;low:t2 high:t3 - ITX_MUL2X_PACK 6, 5, 7, 1751, 3703 ;low:t4 high:t5 - ITX_MUL2X_PACK 2, 5, 7, 3513, 2106 ;low:t10 high:t11 - ITX_MUL2X_PACK 1, 5, 7, 3857, 1380 ;low:t12 high:t13 - psubsw m5, m0, m2 ;low:t10a high:t11a - paddsw m0, m2 ;low:t2a high:t3a - psubsw m2, m6, m1 ;low:t12a high:t13a - paddsw m6, m1 ;low:t4a high:t5a - punpcklqdq m1, m5 - punpckhwd m1, m5 ;packed t10a, t11a + mova m6, [o(pd_2048)] + ITX_MUL2X_PACK 1, 5, 6, 995, 3973 ;low:t2 high:t3 + ITX_MUL2X_PACK 7, 5, 6, 1751, 3703 ;low:t4 high:t5 + ITX_MUL2X_PACK 2, 5, 6, 3513, 2106 ;low:t10 high:t11 + ITX_MUL2X_PACK 0, 5, 6, 3857, 1380 ;low:t12 high:t13 + psubsw m5, m1, m2 ;low:t10a high:t11a + paddsw m1, m2 ;low:t2a high:t3a + psubsw m2, m7, m0 ;low:t12a high:t13a + paddsw m7, m0 ;low:t4a high:t5a + punpcklqdq m0, m5 + punpckhwd m0, m5 ;packed t10a, t11a punpcklqdq m5, m2 punpckhwd m2, m5 ;packed t13a, t12a - ITX_MUL2X_PACK 1, 5, 7, 3406, 2276 ;low:t10 
high:t11 - ITX_MUL2X_PACK 2, 5, 7, 4017, 799, 1 ;low:t12 high:t13 - mova [coeffq+16*4], m0 - mova [coeffq+16*5], m6 - mova m0, [coeffq+16*6] - mova m6, [coeffq+16*7] - pshufd m0, m0, q1032 + ITX_MUL2X_PACK 0, 5, 6, 3406, 2276 ;low:t10 high:t11 + ITX_MUL2X_PACK 2, 5, 6, 4017, 799, 1 ;low:t12 high:t13 + mova [coeffq+16*4], m1 + mova [coeffq+16*5], m7 + mova m1, [coeffq+16*6] + mova m7, [coeffq+16*7] + pshufd m1, m1, q1032 pshufd m3, m3, q1032 - punpckhwd m5, m6, m0 ;packed in15, in0 - punpcklwd m0, m6 ;packed in1, in14 - punpckhwd m6, m4, m3 ;packed in9, in6 + punpckhwd m5, m7, m1 ;packed in15, in0 + punpcklwd m1, m7 ;packed in1, in14 + punpckhwd m7, m4, m3 ;packed in9, in6 punpcklwd m3, m4 ;packed in7, in8 - ITX_MUL2X_PACK 5, 4, 7, 201, 4091 ;low:t0 high:t1 - ITX_MUL2X_PACK 6, 4, 7, 2440, 3290 ;low:t6 high:t7 - ITX_MUL2X_PACK 3, 4, 7, 3035, 2751 ;low:t8 high:t9 - ITX_MUL2X_PACK 0, 4, 7, 4052, 601 ;low:t14 high:t15 + ITX_MUL2X_PACK 5, 4, 6, 201, 4091 ;low:t0 high:t1 + ITX_MUL2X_PACK 7, 4, 6, 2440, 3290 ;low:t6 high:t7 + ITX_MUL2X_PACK 3, 4, 6, 3035, 2751 ;low:t8 high:t9 + ITX_MUL2X_PACK 1, 4, 6, 4052, 601 ;low:t14 high:t15 psubsw m4, m5, m3 ;low:t8a high:t9a paddsw m5, m3 ;low:t0a high:t1a - psubsw m3, m6, m0 ;low:t14a high:t15a - paddsw m6, m0 ;low:t6a high:t7a - punpcklqdq m0, m4 - punpckhwd m0, m4 ;packed t8a, t9a + psubsw m3, m7, m1 ;low:t14a high:t15a + paddsw m7, m1 ;low:t6a high:t7a + punpcklqdq m1, m4 + punpckhwd m1, m4 ;packed t8a, t9a punpcklqdq m4, m3 punpckhwd m3, m4 ;packed t15a, t14a - ITX_MUL2X_PACK 0, 4, 7, 799, 4017 ;low:t8 high:t9 - ITX_MUL2X_PACK 3, 4, 7, 2276, 3406, 1 ;low:t14 high:t15 - psubsw m4, m0, m2 ;low:t12a high:t13a - paddsw m0, m2 ;low:t8a high:t9a - psubsw m2, m1, m3 ;low:t14a high:t15a - paddsw m1, m3 ;low:t10a high:t11a - punpcklqdq m3, m4 - punpckhwd m3, m4 ;packed t12a, t13a - punpcklqdq m4, m2 - punpckhwd m2, m4 ;packed t15a, t14a - ITX_MUL2X_PACK 3, 4, 7, 1567, 3784 ;low:t12 high:t13 - ITX_MUL2X_PACK 2, 4, 7, 3784, 1567, 1 
;low:t14 high:t15 - psubsw m4, m0, m1 ;low:t10 high:t11 - paddsw m0, m1 ;low:-out1 high:out14 + ITX_MUL2X_PACK 1, 4, 6, 799, 4017 ;low:t8 high:t9 + ITX_MUL2X_PACK 3, 4, 6, 2276, 3406, 1 ;low:t14 high:t15 + paddsw m4, m1, m2 ;low:t8a high:t9a + psubsw m1, m2 ;low:t12a high:t13a + psubsw m2, m0, m3 ;low:t14a high:t15a + paddsw m0, m3 ;low:t10a high:t11a + punpcklqdq m3, m1 + punpckhwd m3, m1 ;packed t12a, t13a + punpcklqdq m1, m2 + punpckhwd m2, m1 ;packed t15a, t14a + ITX_MUL2X_PACK 3, 1, 6, 1567, 3784 ;low:t12 high:t13 + ITX_MUL2X_PACK 2, 1, 6, 3784, 1567, 1 ;low:t14 high:t15 psubsw m1, m3, m2 ;low:t14a high:t15a paddsw m3, m2 ;low:out2 high:-out13 - punpckhqdq m2, m4, m1 ;low:t11 high:t15a - punpcklqdq m4, m1 ;low:t10 high:t14a - psubsw m1, m4, m2 - paddsw m2, m4 + psubsw m2, m4, m0 ;low:t10 high:t11 + paddsw m0, m4 ;low:-out1 high:out14 mova [coeffq+16*6], m0 mova [coeffq+16*7], m3 mova m0, [coeffq+16*4] mova m3, [coeffq+16*5] psubsw m4, m5, m3 ;low:t4 high:t5 paddsw m5, m3 ;low:t0 high:t1 - psubsw m3, m0 ,m6 ;low:t6 high:t7 - paddsw m0, m6 ;low:t2 high:t3 - punpcklqdq m6, m4 - punpckhwd m6, m4 ;packed t4, t5 + psubsw m3, m0, m7 ;low:t6 high:t7 + paddsw m0, m7 ;low:t2 high:t3 + punpcklqdq m7, m4 + punpckhwd m7, m4 ;packed t4, t5 punpcklqdq m4, m3 punpckhwd m3, m4 ;packed t7, t6 - ITX_MUL2X_PACK 6, 4, 7, 1567, 3784 ;low:t4a high:t5a - ITX_MUL2X_PACK 3, 4, 7, 3784, 1567, 1 ;low:t6a high:t7a + ITX_MUL2X_PACK 7, 4, 6, 1567, 3784 ;low:t4a high:t5a + ITX_MUL2X_PACK 3, 4, 6, 3784, 1567, 1 ;low:t6a high:t7a psubsw m4, m5, m0 ;low:t2a high:t3a paddsw m0, m5 ;low:out0 high:-out15 - psubsw m5, m6, m3 ;low:t6 high:t7 - paddsw m3, m6 ;low:-out3 high:out12 + psubsw m5, m7, m3 ;low:t6 high:t7 + paddsw m3, m7 ;low:-out3 high:out12 + ret +ALIGN function_align +.main_pass1_end: + mova m7, [o(deint_shuf1)] + mova [coeffq+16*4], m0 + mova [coeffq+16*5], m3 + mova m0, [o(pw_2896_m2896)] + mova m3, [o(pw_2896_2896)] + pshufb m1, m7 ;t14a t15a + pshufb m2, m7 ;t10 t11 + pshufb m4, m7
;t2a t3a + pshufb m5, m7 ;t6 t7 + pmaddwd m7, m0, m2 + pmaddwd m2, m3 + paddd m7, m6 + paddd m2, m6 + psrad m7, 12 + psrad m2, 12 + packssdw m2, m7 ;low:out6 high:-out9 + pmaddwd m7, m0, m4 + pmaddwd m4, m3 + paddd m7, m6 + paddd m4, m6 + psrad m7, 12 + psrad m4, 12 + packssdw m4, m7 ;low:-out7 high:out8 + pmaddwd m7, m3, m5 + pmaddwd m5, m0 + paddd m7, m6 + paddd m5, m6 + psrad m7, 12 + psrad m5, 12 + packssdw m7, m5 ;low:out4 high:-out11 + pmaddwd m5, m3, m1 + pmaddwd m1, m0 + paddd m5, m6 + paddd m1, m6 + psrad m5, 12 + psrad m1, 12 + packssdw m5, m1 ;low:-out5 high:out10 + mova m0, [coeffq+16*4] + mova m3, [coeffq+16*5] + ret +ALIGN function_align +.main_pass2_end: mova m7, [o(pw_2896x8)] + punpckhqdq m6, m2, m1 ;low:t11 high:t15a + punpcklqdq m2, m1 ;low:t10 high:t14a + psubsw m1, m2, m6 + paddsw m2, m6 punpckhqdq m6, m4, m5 ;low:t3a high:t7 punpcklqdq m4, m5 ;low:t2a high:t6 psubsw m5, m4, m6 @@ -2298,6 +2395,7 @@ INV_TXFM_16X4_FN flipadst, identity cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_7ROWS coeffq, 16 call m(iadst_16x4_internal).main + call m(iadst_16x4_internal).main_pass1_end punpcklwd m6, m7, m0 ;packed out11, out15 punpckhwd m0, m7 ;packed -out0, -out4 @@ -2360,7 +2458,7 @@ cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 %endmacro %macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 8x16, 8, 16*12 + INV_TXFM_FN %1, %2, %3, 8x16, 8, 16*16 %ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklwd m0, m0 @@ -2548,6 +2646,7 @@ cglobal iadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m7, [coeffq+16*11] call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass2_end mov r3, dstq lea dstq, [dstq+strideq*8] @@ -2599,6 +2698,7 @@ cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m7, [coeffq+16*11] call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass2_end jmp m(iflipadst_8x8_internal).end .end: 
@@ -2652,7 +2752,7 @@ cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 %macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 16x8, 8, 16*12 + INV_TXFM_FN %1, %2, %3, 16x8, 8, 16*16 %ifidn %1_%2, dct_dct movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] @@ -2893,6 +2993,7 @@ cglobal iadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 pmulhrsw m7, [coeffq+16*13] call .main + call .main_pass1_end mov r3, tx2q lea tx2q, [o(m(iadst_16x8_internal).pass1_end)] jmp m(iadst_8x8_internal).pass1_end @@ -2998,23 +3099,15 @@ ALIGN function_align mova [rsp+gprsize*2+16*6], m3 ;-out3 psubsw m3, m0, m4 ;t7 paddsw m0, m4 ;out12 - mova m7, [o(pw_2896x8)] - psubsw m4, m2, m3 - paddsw m2, m3 + mova [rsp+gprsize*2+16*12], m3 mova m3, [rsp+gprsize*2+16*7] ;t3 - pmulhrsw m4, m7 ;-out11 - pmulhrsw m2, m7 ;out4 - mova [rsp+gprsize*2+16*7], m2 ;out4 + mova [rsp+gprsize*2+16* 7], m2 ;out4 psubsw m2, m5, m3 ;t3a paddsw m5, m3 ;-out15 - psubsw m3, m1, m2 - paddsw m1, m2 + mova [rsp+gprsize*2+16*11], m2 mova m2, [rsp+gprsize*2+32*5] ;t15 - pmulhrsw m3, m7 ;out8 - pmulhrsw m1, m7 ;-out7 - mova [rsp+gprsize*2+32*5 ], m1 ;-out7 + mova [rsp+gprsize*2+16*10], m1 ;-out7 mova m1, [rsp+gprsize*2+16*0] ;t11 - mova [rsp+gprsize*2+16*11], m3 ;out8 mova [rsp+gprsize*2+16*0 ], m5 ;-out15 mova m3, [rsp+gprsize*2+16*1] ;t10 mova [rsp+gprsize*2+16*1 ], m4 ;-out11 @@ -3044,26 +3137,106 @@ ALIGN function_align paddsw m2, m6 ;-out1 paddsw m6, m4, m1 ;out14 psubsw m4, m1 ;t11 - psubsw m1, m3, m4 - paddsw m3, m4 - pmulhrsw m1, m7 ;-out9 - pmulhrsw m3, m7 ;out6 - mova [rsp+gprsize*2+16*4], m2 ;-out1 + mova [rsp+gprsize*2+16*14], m4 + mova [rsp+gprsize*2+16* 4], m2 ;-out1 mova m4, [rsp+gprsize*2+16*8] ;t14 mova m2, [rsp+gprsize*2+16*9] ;t15 - mova [rsp+gprsize*2+16*9], m3 ;out6 + mova [rsp+gprsize*2+16* 9], m3 ;out6 psubsw m3, m0, m4 ;t14a paddsw m0, m4 ;out2 psubsw m4, m5, m2 ;t15a paddsw m5, m2 ;-out13 + mova [rsp+gprsize*2+16* 5], m0 ;out2 + ret +ALIGN 
function_align +.main_pass1_end: + mova m0, [rsp+gprsize*2+16*14] + mova [rsp+gprsize*2+16*14], m5 + mova [rsp+gprsize*2+16*15], m6 + mova m5, [o(pw_2896_2896)] + mova m6, [o(pw_2896_m2896)] + mova m7, [o(pd_2048)] + punpcklwd m2, m3, m4 + punpckhwd m3, m4 + pmaddwd m4, m5, m2 + pmaddwd m2, m6 + pmaddwd m1, m5, m3 + pmaddwd m3, m6 + REPX {paddd x, m7}, m4, m2, m1, m3 + REPX {psrad x, 12}, m4, m1, m2, m3 + packssdw m4, m1 ;-out5 + packssdw m2, m3 ;out10 + mova [rsp+gprsize*2+16* 8], m4 + mova m3, [rsp+gprsize*2+16* 9] + punpcklwd m1, m3, m0 + punpckhwd m3, m0 + pmaddwd m0, m5, m1 + pmaddwd m1, m6 + pmaddwd m4, m5, m3 + pmaddwd m3, m6 + REPX {paddd x, m7}, m0, m1, m4, m3 + REPX {psrad x, 12}, m0, m4, m1, m3 + packssdw m0, m4 ;out6 + packssdw m1, m3 ;-out9 + mova [rsp+gprsize*2+16* 9], m0 + mova m0, [rsp+gprsize*2+16* 7] + mova m4, [rsp+gprsize*2+16*12] + punpcklwd m3, m0, m4 + punpckhwd m0, m4 + pmaddwd m4, m5, m3 + pmaddwd m3, m6 + pmaddwd m5, m0 + pmaddwd m0, m6 + REPX {paddd x, m7}, m4, m3, m5, m0 + REPX {psrad x, 12}, m4, m5, m3, m0 + packssdw m4, m5 ;out4 + packssdw m3, m0 ;-out11 + mova [rsp+gprsize*2+16* 7], m4 + mova m4, [rsp+gprsize*2+16*10] + mova m5, [rsp+gprsize*2+16*11] + punpcklwd m0, m4, m5 + punpckhwd m4, m5 + pmaddwd m5, m0, [o(pw_2896_2896)] + pmaddwd m0, m6 + pmaddwd m6, m4 + pmaddwd m4, [o(pw_2896_2896)] + REPX {paddd x, m7}, m5, m0, m6, m4 + REPX {psrad x, 12}, m0, m6, m5, m4 + packssdw m0, m6 ;out8 + packssdw m5, m4 ;-out7 + mova [rsp+gprsize*2+16*10], m5 + mova m4, [rsp+gprsize*2+16* 2] ;out12 + mova m5, [rsp+gprsize*2+16*14] ;-out13 + mova m6, [rsp+gprsize*2+16*15] ;out14 + ret +ALIGN function_align +.main_pass2_end: + mova m7, [o(pw_2896x8)] + mova m1, [rsp+gprsize*2+16* 9] + mova m2, [rsp+gprsize*2+16*14] + paddsw m0, m1, m2 + psubsw m1, m2 + pmulhrsw m0, m7 ;out6 + pmulhrsw m1, m7 ;-out9 + mova [rsp+gprsize*2+16* 9], m0 psubsw m2, m3, m4 paddsw m3, m4 - mova [rsp+gprsize*2+16*5], m0 ;out2 - pmulhrsw m3, m7 ;-out5 pmulhrsw m2, m7 ;out10 - 
mova [rsp+gprsize*2+16*8], m3 ;-out5 - mova m0, [rsp+gprsize*2+16*11] ;out8 - mova m3, [rsp+gprsize*2+16*1 ] ;-out11 + pmulhrsw m3, m7 ;-out5 + mova [rsp+gprsize*2+16* 8], m3 + mova m3, [rsp+gprsize*2+16* 7] + mova m4, [rsp+gprsize*2+16*12] + paddsw m0, m3, m4 + psubsw m3, m4 + pmulhrsw m0, m7 ;out4 + pmulhrsw m3, m7 ;-out11 + mova [rsp+gprsize*2+16* 7], m0 + mova m0, [rsp+gprsize*2+16*10] + paddsw m4, m0, [rsp+gprsize*2+16*11] + psubsw m0, [rsp+gprsize*2+16*11] + pmulhrsw m4, m7 ;-out7 + pmulhrsw m0, m7 ;out8 + mova [rsp+gprsize*2+16*10], m4 mova m4, [rsp+gprsize*2+16*2 ] ;out12 ret @@ -3100,6 +3273,7 @@ cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 pmulhrsw m7, [coeffq+16*13] call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass1_end mova m7, [rsp+gprsize+16*0] SAVE_8ROWS coeffq+16*0, 32 @@ -3184,7 +3358,7 @@ cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 %macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 16x16, 8, 16*12 + INV_TXFM_FN %1, %2, %3, 16x16, 8, 16*16 %ifidn %1_%2, dct_dct movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] @@ -3423,6 +3597,7 @@ INV_TXFM_16X16_FN adst, flipadst cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ITX_16X16_ADST_LOAD_ODD_COEFS call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass1_end mov r3, tx2q lea tx2q, [o(m(iadst_16x16_internal).pass1_end)] @@ -3441,6 +3616,7 @@ cglobal iadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 SAVE_8ROWS coeffq+16*1, 32 ITX_16X16_ADST_LOAD_EVEN_COEFS call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass1_end lea tx2q, [o(m(iadst_16x16_internal).pass1_end2)] mova m7, [o(pw_8192)] @@ -3496,6 +3672,7 @@ INV_TXFM_16X16_FN flipadst, flipadst cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ITX_16X16_ADST_LOAD_ODD_COEFS call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass1_end mov r3, tx2q 
lea tx2q, [o(m(iflipadst_16x16_internal).pass1_end)] @@ -3514,6 +3691,7 @@ cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 SAVE_8ROWS coeffq+16*17, 32 ITX_16X16_ADST_LOAD_EVEN_COEFS call m(iadst_16x8_internal).main + call m(iadst_16x8_internal).main_pass1_end mova m7, [rsp+gprsize+16*0] SAVE_8ROWS coeffq+16*0, 32