|
| 1 | +/* |
| 2 | + * Copyright (c) 2015 Stupéflix |
| 3 | + * |
| 4 | + * This file is part of FFmpeg. |
| 5 | + * |
| 6 | + * FFmpeg is free software; you can redistribute it and/or |
| 7 | + * modify it under the terms of the GNU Lesser General Public |
| 8 | + * License as published by the Free Software Foundation; either |
| 9 | + * version 2.1 of the License, or (at your option) any later version. |
| 10 | + * |
| 11 | + * FFmpeg is distributed in the hope that it will be useful, |
| 12 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 14 | + * Lesser General Public License for more details. |
| 15 | + * |
| 16 | + * You should have received a copy of the GNU Lesser General Public |
| 17 | + * License along with FFmpeg; if not, write to the Free Software |
| 18 | + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 19 | + */ |
| 20 | + |
| 21 | +#include "libavutil/arm/asm.S" |
| 22 | + |
@ Pre-compute the chroma contribution of every colour channel for 8 pixels.
@
@   half_u, half_v : D registers, each holding four signed 16-bit chroma
@                    samples (already centred by the caller)
@   d1             : coefficient lanes [0] = v2r, [1] = u2g, [2] = v2g, [3] = u2b
@
@ Every chroma sample is duplicated so that it covers two horizontally
@ adjacent pixels. Results are 32-bit lanes, "left" = pixels 0-3 and
@ "right" = pixels 4-7:
@   q8  / q9  : red   term
@   q10 / q11 : green term
@   q12 / q13 : blue  term
@ Clobbers q1 and q2.
.macro compute_premult half_u half_v
    vmov                d4, \half_v                                    @ d4 = V1 V2 V3 V4
    vmov                d5, \half_v                                    @ d5 = V1 V2 V3 V4
    vmov                d2, \half_u                                    @ d2 = U1 U2 U3 U4
    vmov                d3, \half_u                                    @ d3 = U1 U2 U3 U4

    vzip.16             d4, d5                                         @ q2 = V1V1V2V2 V3V3V4V4
    vzip.16             d2, d3                                         @ q1 = U1U1U2U2 U3U3U4U4

    @ red: V * v2r
    vmull.s16           q8,  d4, d1[0]                                 @ left
    vmull.s16           q9,  d5, d1[0]                                 @ right

    @ blue: U * u2b
    vmull.s16           q12, d2, d1[3]                                 @ left
    vmull.s16           q13, d3, d1[3]                                 @ right

    @ green: U * u2g + V * v2g
    vmull.s16           q10, d2, d1[1]                                 @ left,  U part
    vmull.s16           q11, d3, d1[1]                                 @ right, U part
    vmlal.s16           q10, d4, d1[2]                                 @ left,  accumulate V part
    vmlal.s16           q11, d5, d1[2]                                 @ right, accumulate V part
.endm
| 41 | + |
@ Produce one 8-bit colour component for 8 pixels.
@
@   dst_comp   : D register receiving the eight unsigned 8-bit results
@   pre1, pre2 : Q registers with the 32-bit chroma term (pixels 0-3 / 4-7)
@   q1, q2     : 32-bit luma term (pixels 0-3 / 4-7), read only
@
@ Adds luma and chroma, applies a rounding right shift by 13 with unsigned
@ saturation to 16 bit, then saturates again down to 8 bit.
@ Clobbers q3, q4 and q5.
.macro compute_color dst_comp pre1 pre2
    vadd.s32            q3, q1, \pre1                                  @ pixels 0-3: luma + chroma
    vqrshrun.s32        d10, q3, #13                                   @ d10 = sat_u16((q3 + (1<<12)) >> 13)
    vadd.s32            q4, q2, \pre2                                  @ pixels 4-7: luma + chroma
    vqrshrun.s32        d11, q4, #13                                   @ d11 = sat_u16((q4 + (1<<12)) >> 13)
    vqmovn.u16          \dst_comp, q5                                  @ narrow 16 bit -> 8 bit with saturation
.endm
| 49 | + |
@ Fill four D registers with the red, green, blue and filler/alpha bytes of
@ 8 pixels.
@
@   a     : D register receiving red    (chroma term in q8  / q9)
@   b     : D register receiving green  (chroma term in q10 / q11)
@   c     : D register receiving blue   (chroma term in q12 / q13)
@   d     : D register receiving the constant byte \alpha
@   alpha : 8-bit immediate
@
@ Reads the luma term from q1/q2 through compute_color, which also
@ clobbers q3-q5.
.macro compute_rgbx a b c d alpha
    vmov.u8             \d, #\alpha                                    @ x / alpha
    compute_color       \c, q12, q13                                   @ blue
    compute_color       \b, q10, q11                                   @ green
    compute_color       \a, q8,  q9                                    @ red
.endm
| 56 | + |
@ Convert 8 luma samples (combined with the chroma terms left in q8-q13 by
@ compute_premult) into 8 packed 4-byte pixels and store them at [dst].
@
@   dst    : core register holding the output pointer; post-incremented by
@            the 32 bytes written
@   half_y : D register with eight unsigned 8-bit luma samples
@   rgb    : byte layout in memory, one of xrgb, rgbx, xbgr, bgrx
@   alpha  : immediate stored in the x byte
@
@ Inputs: r9 = y_offset, d0 = y_coeff replicated in every 16-bit lane.
@ Clobbers q1-q7.
.macro compute_half_line dst half_y rgb alpha
    vmovl.u8            q7, \half_y                                    @ widen 8 px of Y to 16 bit
    vdup.16             q5, r9                                         @ q5 = y_offset in every lane
    vsub.s16            q7, q7, q5                                     @ q7 = srcY - y_offset
    vmull.s16           q1, d14, d0                                    @ q1 = (srcY - y_offset) * y_coeff (left)
    vmull.s16           q2, d15, d0                                    @ q2 = (srcY - y_offset) * y_coeff (right)

    @ vst4 below interleaves d12, d13, d14, d15 as bytes 0, 1, 2, 3 of each
    @ pixel, so every layout hands compute_rgbx the registers matching the
    @ byte positions of r, g, b and x.
.ifc \rgb,xrgb
    compute_rgbx        d13, d14, d15, d12, \alpha                     @ memory: x r g b
.endif

.ifc \rgb,rgbx
    compute_rgbx        d12, d13, d14, d15, \alpha                     @ memory: r g b x
.endif

.ifc \rgb,xbgr
    compute_rgbx        d15, d14, d13, d12, \alpha                     @ memory: x b g r
.endif

.ifc \rgb,bgrx
    compute_rgbx        d14, d13, d12, d15, \alpha                     @ memory: b g r x
.endif

    vst4.u8             {d12, d13, d14, d15}, [\dst]!                  @ store 8 px, dst += 32
.endm
| 82 | + |
@ Emit the exported function ff_<ifmt>_to_<ofmt>_neon.
@
@   ifmt  : nv12 or anything else (treated as nv21); selects which byte of
@           each interleaved chroma pair is U and which is V
@   ofmt  : output format name, only used to build the symbol name
@   rgb   : byte layout forwarded to compute_half_line
@   alpha : immediate forwarded to compute_half_line
@
@ Register arguments : r0 = width, r1 = height, r2 = dst, r3 = linesize
@ Stack arguments    : srcY, linesizeY, srcC, linesizeC, table, y_offset,
@                      y_coeff (in that order)
@
@ Two luma rows sharing one chroma row are converted per outer iteration,
@ 16 pixels per inner iteration.
@ NOTE(review): there is no tail handling, so callers presumably guarantee
@ a width that is a multiple of 16 and an even height - confirm.
.macro declare_func ifmt ofmt rgb alpha
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
    push                {r4-r12, lr}                                   @ 10 core registers = 40 bytes
    vpush               {q4-q7}                                        @ d8-d15            = 64 bytes
    @ stack arguments therefore start at sp + 40 + 64 = sp + 104
    ldr                 r4, [sp, #104]                                 @ r4  = srcY
    ldr                 r5, [sp, #108]                                 @ r5  = linesizeY
    ldr                 r6, [sp, #112]                                 @ r6  = srcC
    ldr                 r7, [sp, #116]                                 @ r7  = linesizeC
    ldr                 r8, [sp, #120]                                 @ r8  = table
    ldr                 r9, [sp, #124]                                 @ r9  = y_offset
    ldr                 r10,[sp, #128]                                 @ r10 = y_coeff
    vdup.16             d0, r10                                        @ d0  = y_coeff
    vld1.16             {d1}, [r8]                                     @ d1  = *table
    add                 r11, r2, r3                                    @ r11 = dst + linesize (dst2)
    add                 r12, r4, r5                                    @ r12 = srcY + linesizeY (srcY2)
    lsl                 r3, r3, #1
    lsl                 r5, r5, #1
    lsl                 r8, r0, #2
    sub                 r3, r3, r8                                     @ r3 = linesize  * 2 - width * 4 (padding)
    sub                 r5, r5, r0                                     @ r5 = linesizeY * 2 - width     (paddingY)
    sub                 r7, r7, r0                                     @ r7 = linesizeC     - width     (paddingC)
1:
    mov                 r8, r0                                         @ r8 = width (pixels left in this row pair)
2:
    pld                 [r6, #64*3]
    pld                 [r4, #64*3]
    pld                 [r12, #64*3]

    vld2.8              {d2, d3}, [r6]!                                @ q1: de-interleaved chroma line
    vmov.i8             d10, #128
.ifc \ifmt,nv12
    vsubl.u8            q14, d2, d10                                   @ q14 = U - 128
    vsubl.u8            q15, d3, d10                                   @ q15 = V - 128
.else
    vsubl.u8            q14, d3, d10                                   @ q14 = U - 128
    vsubl.u8            q15, d2, d10                                   @ q15 = V - 128
.endif

    compute_premult     d28, d30                                       @ chroma terms for pixels 0-7

    vld1.8              {q7}, [r4]!                                    @ first line of luma
    vmov                d28, d15                                       @ save right of the first line of luma for later use
    compute_half_line   r2, d14, \rgb, \alpha

    vld1.8              {q7}, [r12]!                                   @ second line of luma
    vmov                d30, d15                                       @ save right of the second line of luma for later use
    compute_half_line   r11, d14, \rgb, \alpha

    compute_premult     d29, d31                                       @ chroma terms for pixels 8-15
    compute_half_line   r2, d28, \rgb, \alpha
    compute_half_line   r11, d30, \rgb, \alpha

    subs                r8, r8, #16                                    @ width -= 16
    bgt                 2b

    add                 r2, r2, r3                                     @ dst   += padding
    add                 r11, r11, r3                                   @ dst2  += padding
    add                 r4, r4, r5                                     @ srcY  += paddingY
    add                 r6, r6, r7                                     @ srcC  += paddingC
    add                 r12, r12, r5                                   @ srcY2 += paddingY

    subs                r1, r1, #2                                     @ height -= 2
    bgt                 1b

    vpop                {q4-q7}
    pop                 {r4-r12, lr}
    bx                  lr                                             @ interworking-safe return in both ARM and Thumb
endfunc
.endm
| 152 | + |
@ Instantiate all eight packed 32-bit output variants for one input format.
@ Each byte layout is emitted twice: once with the filler byte set to 255
@ (a* / *a names) and once with it set to 0 (0* / *0 names).
@
@   ifmt : input format name forwarded to declare_func
.macro declare_rgb_funcs ifmt
    @ filler byte first, then r g b
    declare_func        \ifmt, argb, xrgb, 255
    declare_func        \ifmt, 0rgb, xrgb, 0

    @ r g b, then filler byte
    declare_func        \ifmt, rgba, rgbx, 255
    declare_func        \ifmt, rgb0, rgbx, 0

    @ filler byte first, then b g r
    declare_func        \ifmt, abgr, xbgr, 255
    declare_func        \ifmt, 0bgr, xbgr, 0

    @ b g r, then filler byte
    declare_func        \ifmt, bgra, bgrx, 255
    declare_func        \ifmt, bgr0, bgrx, 0
.endm
| 164 | + |
@ Emit the complete function set for both semi-planar input formats.
.irp ifmt, nv12, nv21
    declare_rgb_funcs   \ifmt
.endr
0 commit comments