Merge pull request #3016 from popcornmix/ffmpeg_dts_up

[ffmpeg] Backport of armv6 and vfp optimisations for DTS
commit eccd7ba36a037b8f3eddb4400f20961ddd46224e (parents: 60965df, 6a8a24e)
popcornmix authored August 03, 2013

Showing 26 changed files with 3,859 additions and 60 deletions.

  1. 4  lib/ffmpeg/libavcodec/arm/Makefile
  2. 12  lib/ffmpeg/libavcodec/arm/dcadsp_init_arm.c
  3. 493  lib/ffmpeg/libavcodec/arm/dcadsp_vfp.S
  4. 17  lib/ffmpeg/libavcodec/arm/fft_init_arm.c
  5. 298  lib/ffmpeg/libavcodec/arm/fft_vfp.S
  6. 14  lib/ffmpeg/libavcodec/arm/fmtconvert_init_arm.c
  7. 200  lib/ffmpeg/libavcodec/arm/fmtconvert_vfp.S
  8. 205  lib/ffmpeg/libavcodec/arm/mdct_vfp.S
  9. 243  lib/ffmpeg/libavcodec/arm/synth_filter_vfp.S
  10. 49  lib/ffmpeg/libavcodec/dcadec.c
  11. 30  lib/ffmpeg/libavcodec/dcadsp.c
  12. 9  lib/ffmpeg/libavcodec/dcadsp.h
  13. 10  lib/ffmpeg/libavcodec/fmtconvert.c
  14. 16  lib/ffmpeg/libavcodec/fmtconvert.h
  15. 311  lib/ffmpeg/patches/0040-ffmpeg-backport-arm-Add-VFP-accelerated-version-of-s.patch
  16. 102  lib/ffmpeg/patches/0041-ffmpeg-backport-arm-Add-VFP-accelerated-version-of-i.patch
  17. 78  lib/ffmpeg/patches/0042-ffmpeg-backport-fmtconvert-Add-a-new-method-int32_to.patch
  18. 90  lib/ffmpeg/patches/0043-ffmpeg-backport-dcadec-Use-int32_to_float_fmul_array.patch
  19. 222  lib/ffmpeg/patches/0044-ffmpeg-backport-arm-Add-VFP-accelerated-version-of-i.patch
  20. 274  lib/ffmpeg/patches/0045-ffmpeg-backport-arm-Add-VFP-accelerated-version-of-i.patch
  21. 58  lib/ffmpeg/patches/0046-ffmpeg-backport-arm-Add-VFP-accelerated-version-of-d.patch
  22. 339  lib/ffmpeg/patches/0047-ffmpeg-backport-arm-Add-VFP-accelerated-version-of-f.patch
  23. 140  lib/ffmpeg/patches/0048-ffmpeg-backport-dcadsp-Add-a-new-method-qmf_32_subba.patch
  24. 551  lib/ffmpeg/patches/0049-ffmpeg-backport-arm-Add-VFP-accelerated-version-of-q.patch
  25. 64  lib/ffmpeg/patches/0050-ffmpeg-backport-arm-Mangle-external-symbols-properly.patch
  26. 90  tools/depends/native/gas-preprocessor-native/gas-preprocessor.pl
4  lib/ffmpeg/libavcodec/arm/Makefile
@@ -58,6 +58,10 @@ ARMV6-OBJS                             += arm/dsputil_init_armv6.o      \
                                           arm/dsputil_armv6.o           \
                                           arm/simple_idct_armv6.o       \
 
+VFP-OBJS-$(CONFIG_DCA_DECODER)         += arm/dcadsp_vfp.o              \
+                                          arm/synth_filter_vfp.o
+VFP-OBJS-$(CONFIG_FFT)                 += arm/fft_vfp.o
+VFP-OBJS-$(CONFIG_MDCT)                += arm/mdct_vfp.o
 VFP-OBJS-$(HAVE_ARMV6)                 += arm/fmtconvert_vfp.o
 
 NEON-OBJS-$(CONFIG_FFT)                += arm/fft_neon.o                \
12  lib/ffmpeg/libavcodec/arm/dcadsp_init_arm.c
@@ -24,6 +24,14 @@
 #include "libavutil/attributes.h"
 #include "libavcodec/dcadsp.h"
 
+void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
+                        int decifactor, float scale);
+void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
+                                SynthFilterContext *synth, FFTContext *imdct,
+                                float synth_buf_ptr[512],
+                                int *synth_buf_offset, float synth_buf2[32],
+                                const float window[512], float *samples_out,
+                                float raXin[32], float scale);
 void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
                          int decifactor, float scale);
 
@@ -31,6 +39,10 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
+    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
+        s->lfe_fir = ff_dca_lfe_fir_vfp;
+        s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
+    }
    if (have_neon(cpu_flags))
         s->lfe_fir = ff_dca_lfe_fir_neon;
 }
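
Note: the init functions in this commit all follow the same runtime-dispatch pattern: probe the CPU once, then overwrite the function pointer from least to most capable unit, so NEON wins over VFP when both are present. The VFP versions are installed only on plain VFP (have_vfp && !have_vfpv3) because they rely on FPSCR short-vector mode, which VFPv3-class cores no longer execute quickly, if at all. A minimal, self-contained C sketch of the idea (the flag names and stubs here are hypothetical, not the libavutil API):

    #include <stdio.h>

    typedef void (*lfe_fir_fn)(void);            /* real signature elided */

    enum { HAVE_VFP = 1, HAVE_VFPV3 = 2, HAVE_NEON = 4 };   /* stand-ins */

    static void lfe_fir_c(void)    { puts("C fallback"); }
    static void lfe_fir_vfp(void)  { puts("VFP short-vector version"); }
    static void lfe_fir_neon(void) { puts("NEON version"); }

    static lfe_fir_fn pick(int cpu_flags)
    {
        lfe_fir_fn f = lfe_fir_c;                /* always-valid default */
        if ((cpu_flags & HAVE_VFP) && !(cpu_flags & HAVE_VFPV3))
            f = lfe_fir_vfp;     /* short vectors only pay off on plain VFP */
        if (cpu_flags & HAVE_NEON)
            f = lfe_fir_neon;    /* most capable unit checked last, so it wins */
        return f;
    }

    int main(void)
    {
        pick(HAVE_VFP)();                            /* VFP short-vector version */
        pick(HAVE_VFP | HAVE_VFPV3)();               /* C fallback */
        pick(HAVE_VFP | HAVE_VFPV3 | HAVE_NEON)();   /* NEON version */
        return 0;
    }
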
493  lib/ffmpeg/libavcodec/arm/dcadsp_vfp.S
@@ -0,0 +1,493 @@
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+POUT          .req    a1
+PIN           .req    a2
+PCOEF         .req    a3
+DECIFACTOR    .req    a4
+OLDFPSCR      .req    a4
+COUNTER       .req    ip
+
+SCALE32       .req    s28  @ use vector of 4 in place of 9th scalar when decifactor=32 / JMAX=8
+SCALE64       .req    s0   @ spare register in scalar bank when decifactor=64 / JMAX=4
+IN0           .req    s4
+IN1           .req    s5
+IN2           .req    s6
+IN3           .req    s7
+IN4           .req    s0
+IN5           .req    s1
+IN6           .req    s2
+IN7           .req    s3
+COEF0         .req    s8   @ coefficient elements
+COEF1         .req    s9
+COEF2         .req    s10
+COEF3         .req    s11
+COEF4         .req    s12
+COEF5         .req    s13
+COEF6         .req    s14
+COEF7         .req    s15
+ACCUM0        .req    s16  @ double-buffered multiply-accumulate results
+ACCUM4        .req    s20
+POST0         .req    s24  @ do long-latency post-multiply in this vector in parallel
+POST1         .req    s25
+POST2         .req    s26
+POST3         .req    s27
+
+
+.macro inner_loop  decifactor, dir, tail, head
+ .ifc "\dir","up"
+  .set X, 0
+  .set Y, 4
+ .else
+  .set X, 4*JMAX*4 - 4
+  .set Y, -4
+ .endif
+ .ifnc "\head",""
+        vldr    COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
+        vldr    COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
+        vldr    COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
+        vldr    COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
+ .endif
+ .ifnc "\tail",""
+        vadd.f  POST0, ACCUM0, ACCUM4   @ vector operation
+ .endif
+ .ifnc "\head",""
+        vmul.f  ACCUM0, COEF0, IN0      @ vector = vector * scalar
+        vldr    COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
+        vldr    COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
+        vldr    COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
+ .endif
+ .ifnc "\tail",""
+        vmul.f  POST0, POST0, SCALE\decifactor  @ vector operation (SCALE may be scalar)
+ .endif
+ .ifnc "\head",""
+        vldr    COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
+   .ifc "\tail",""
+        vmul.f  ACCUM4, COEF4, IN1      @ vector operation
+   .endif
+        vldr    COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
+        vldr    COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
+   .ifnc "\tail",""
+        vmul.f  ACCUM4, COEF4, IN1      @ vector operation
+   .endif
+        vldr    COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
+        vldr    COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
+ .endif
+ .ifnc "\tail",""
+        vstmia  POUT!, {POST0-POST3}
+ .endif
+ .ifnc "\head",""
+        vmla.f  ACCUM0, COEF0, IN2      @ vector = vector * scalar
+        vldr    COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
+        vldr    COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
+        vldr    COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
+        vldr    COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
+        vmla.f  ACCUM4, COEF4, IN3      @ vector = vector * scalar
+  .if \decifactor == 32
+        vldr    COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
+        vldr    COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
+        vldr    COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
+        vldr    COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
+        vmla.f  ACCUM0, COEF0, IN4      @ vector = vector * scalar
+        vldr    COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
+        vldr    COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
+        vldr    COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
+        vldr    COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
+        vmla.f  ACCUM4, COEF4, IN5      @ vector = vector * scalar
+        vldr    COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
+        vldr    COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
+        vldr    COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
+        vldr    COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
+        vmla.f  ACCUM0, COEF0, IN6      @ vector = vector * scalar
+        vldr    COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
+        vldr    COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
+        vldr    COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
+        vldr    COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
+        vmla.f  ACCUM4, COEF4, IN7      @ vector = vector * scalar
+  .endif
+ .endif
+.endm
+
+.macro dca_lfe_fir  decifactor
+ .if \decifactor == 32
+  .set JMAX, 8
+        vpush   {s16-s31}
+        vmov    SCALE32, s0             @ duplicate scalar across vector
+        vldr    IN4, [PIN, #-4*4]
+        vldr    IN5, [PIN, #-5*4]
+        vldr    IN6, [PIN, #-6*4]
+        vldr    IN7, [PIN, #-7*4]
+ .else
+  .set JMAX, 4
+        vpush   {s16-s27}
+ .endif
+
+        mov     COUNTER, #\decifactor/4 - 1
+        inner_loop  \decifactor, up,, head
+1:      add     PCOEF, PCOEF, #4*JMAX*4
+        subs    COUNTER, COUNTER, #1
+        inner_loop  \decifactor, up, tail, head
+        bne     1b
+        inner_loop  \decifactor, up, tail
+
+        mov     COUNTER, #\decifactor/4 - 1
+        inner_loop  \decifactor, down,, head
+1:      sub     PCOEF, PCOEF, #4*JMAX*4
+        subs    COUNTER, COUNTER, #1
+        inner_loop  \decifactor, down, tail, head
+        bne     1b
+        inner_loop  \decifactor, down, tail
+
+ .if \decifactor == 32
+        vpop    {s16-s31}
+ .else
+        vpop    {s16-s27}
+ .endif
+        fmxr    FPSCR, OLDFPSCR
+        bx      lr
+.endm
+
+
+/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
+ *                         int decifactor, float scale)
+ */
+function ff_dca_lfe_fir_vfp, export=1
+        teq     DECIFACTOR, #32
+        fmrx    OLDFPSCR, FPSCR
+        ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
+        fmxr    FPSCR, ip
+NOVFP   vldr    s0, [sp]
+        vldr    IN0, [PIN, #-0*4]
+        vldr    IN1, [PIN, #-1*4]
+        vldr    IN2, [PIN, #-2*4]
+        vldr    IN3, [PIN, #-3*4]
+        beq     32f
+64:     dca_lfe_fir  64
+ .ltorg
+32:     dca_lfe_fir  32
+endfunc
+
+        .unreq  POUT
+        .unreq  PIN
+        .unreq  PCOEF
+        .unreq  DECIFACTOR
+        .unreq  OLDFPSCR
+        .unreq  COUNTER
+
+        .unreq  SCALE32
+        .unreq  SCALE64
+        .unreq  IN0
+        .unreq  IN1
+        .unreq  IN2
+        .unreq  IN3
+        .unreq  IN4
+        .unreq  IN5
+        .unreq  IN6
+        .unreq  IN7
+        .unreq  COEF0
+        .unreq  COEF1
+        .unreq  COEF2
+        .unreq  COEF3
+        .unreq  COEF4
+        .unreq  COEF5
+        .unreq  COEF6
+        .unreq  COEF7
+        .unreq  ACCUM0
+        .unreq  ACCUM4
+        .unreq  POST0
+        .unreq  POST1
+        .unreq  POST2
+        .unreq  POST3
+
+
+IN      .req    a1
+SBACT   .req    a2
+OLDFPSCR .req   a3
+IMDCT   .req    a4
+WINDOW  .req    v1
+OUT     .req    v2
+BUF     .req    v3
+SCALEINT .req   v4 @ only used in softfp case
+COUNT   .req    v5
+
+SCALE   .req    s0
+
+/* Stack layout differs in softfp and hardfp cases:
+ *
+ * hardfp
+ *      fp -> 6 arg words saved by caller
+ *            a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
+ *            s16-s23 on entry
+ *            align 16
+ *     buf -> 8*32*4 bytes buffer
+ *            s0 on entry
+ *      sp -> 3 arg words for callee
+ *
+ * softfp
+ *      fp -> 7 arg words saved by caller
+ *            a4,v1-v5,fp,lr on entry
+ *            s16-s23 on entry
+ *            align 16
+ *     buf -> 8*32*4 bytes buffer
+ *      sp -> 4 arg words for callee
+ */
+
+/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
+ *                                 SynthFilterContext *synth, FFTContext *imdct,
+ *                                 float (*synth_buf_ptr)[512],
+ *                                 int *synth_buf_offset, float (*synth_buf2)[32],
+ *                                 const float (*window)[512], float *samples_out,
+ *                                 float (*raXin)[32], float scale);
+ */
+function ff_dca_qmf_32_subbands_vfp, export=1
+VFP     push    {a3-a4,v1-v3,v5,fp,lr}
+NOVFP   push    {a4,v1-v5,fp,lr}
+        add     fp, sp, #8*4
+        vpush   {s16-s23}
+        @ The buffer pointed at by raXin isn't big enough for us to do a
+        @ complete matrix transposition as we want to, so allocate an
+        @ alternative buffer from the stack. Align to 4 words for speed.
+        sub     BUF, sp, #8*32*4
+        bic     BUF, BUF, #15
+        mov     sp, BUF
+        ldr     lr, =0x03330000     @ RunFast mode, short vectors of length 4, stride 2
+        fmrx    OLDFPSCR, FPSCR
+        fmxr    FPSCR, lr
+        @ COUNT is used to count down 2 things at once:
+        @ bits 0-4 are the number of word pairs remaining in the output row
+        @ bits 5-31 are the number of words to copy (with possible negation)
+        @   from the source matrix before we start zeroing the remainder
+        mov     COUNT, #(-4 << 5) + 16
+        adds    COUNT, COUNT, SBACT, lsl #5
+        bmi     2f
+1:
+        vldr    s8,  [IN, #(0*8+0)*4]
+        vldr    s10, [IN, #(0*8+1)*4]
+        vldr    s12, [IN, #(0*8+2)*4]
+        vldr    s14, [IN, #(0*8+3)*4]
+        vldr    s16, [IN, #(0*8+4)*4]
+        vldr    s18, [IN, #(0*8+5)*4]
+        vldr    s20, [IN, #(0*8+6)*4]
+        vldr    s22, [IN, #(0*8+7)*4]
+        vneg.f  s8, s8
+        vldr    s9,  [IN, #(1*8+0)*4]
+        vldr    s11, [IN, #(1*8+1)*4]
+        vldr    s13, [IN, #(1*8+2)*4]
+        vldr    s15, [IN, #(1*8+3)*4]
+        vneg.f  s16, s16
+        vldr    s17, [IN, #(1*8+4)*4]
+        vldr    s19, [IN, #(1*8+5)*4]
+        vldr    s21, [IN, #(1*8+6)*4]
+        vldr    s23, [IN, #(1*8+7)*4]
+        vstr    d4,  [BUF, #(0*32+0)*4]
+        vstr    d5,  [BUF, #(1*32+0)*4]
+        vstr    d6,  [BUF, #(2*32+0)*4]
+        vstr    d7,  [BUF, #(3*32+0)*4]
+        vstr    d8,  [BUF, #(4*32+0)*4]
+        vstr    d9,  [BUF, #(5*32+0)*4]
+        vstr    d10, [BUF, #(6*32+0)*4]
+        vstr    d11, [BUF, #(7*32+0)*4]
+        vldr    s9,  [IN, #(3*8+0)*4]
+        vldr    s11, [IN, #(3*8+1)*4]
+        vldr    s13, [IN, #(3*8+2)*4]
+        vldr    s15, [IN, #(3*8+3)*4]
+        vldr    s17, [IN, #(3*8+4)*4]
+        vldr    s19, [IN, #(3*8+5)*4]
+        vldr    s21, [IN, #(3*8+6)*4]
+        vldr    s23, [IN, #(3*8+7)*4]
+        vneg.f  s9, s9
+        vldr    s8,  [IN, #(2*8+0)*4]
+        vldr    s10, [IN, #(2*8+1)*4]
+        vldr    s12, [IN, #(2*8+2)*4]
+        vldr    s14, [IN, #(2*8+3)*4]
+        vneg.f  s17, s17
+        vldr    s16, [IN, #(2*8+4)*4]
+        vldr    s18, [IN, #(2*8+5)*4]
+        vldr    s20, [IN, #(2*8+6)*4]
+        vldr    s22, [IN, #(2*8+7)*4]
+        vstr    d4,  [BUF, #(0*32+2)*4]
+        vstr    d5,  [BUF, #(1*32+2)*4]
+        vstr    d6,  [BUF, #(2*32+2)*4]
+        vstr    d7,  [BUF, #(3*32+2)*4]
+        vstr    d8,  [BUF, #(4*32+2)*4]
+        vstr    d9,  [BUF, #(5*32+2)*4]
+        vstr    d10, [BUF, #(6*32+2)*4]
+        vstr    d11, [BUF, #(7*32+2)*4]
+        add     IN, IN, #4*8*4
+        add     BUF, BUF, #4*4
+        subs    COUNT, COUNT, #(4 << 5) + 2
+        bpl     1b
+2:      @ Now deal with trailing < 4 samples
+        adds    COUNT, COUNT, #3 << 5
+        bmi     4f  @ sb_act was a multiple of 4
+        bics    lr, COUNT, #0x1F
+        bne     3f
+        @ sb_act was n*4+1
+        vldr    s8,  [IN, #(0*8+0)*4]
+        vldr    s10, [IN, #(0*8+1)*4]
+        vldr    s12, [IN, #(0*8+2)*4]
+        vldr    s14, [IN, #(0*8+3)*4]
+        vldr    s16, [IN, #(0*8+4)*4]
+        vldr    s18, [IN, #(0*8+5)*4]
+        vldr    s20, [IN, #(0*8+6)*4]
+        vldr    s22, [IN, #(0*8+7)*4]
+        vneg.f  s8, s8
+        vldr    s9,  zero
+        vldr    s11, zero
+        vldr    s13, zero
+        vldr    s15, zero
+        vneg.f  s16, s16
+        vldr    s17, zero
+        vldr    s19, zero
+        vldr    s21, zero
+        vldr    s23, zero
+        vstr    d4,  [BUF, #(0*32+0)*4]
+        vstr    d5,  [BUF, #(1*32+0)*4]
+        vstr    d6,  [BUF, #(2*32+0)*4]
+        vstr    d7,  [BUF, #(3*32+0)*4]
+        vstr    d8,  [BUF, #(4*32+0)*4]
+        vstr    d9,  [BUF, #(5*32+0)*4]
+        vstr    d10, [BUF, #(6*32+0)*4]
+        vstr    d11, [BUF, #(7*32+0)*4]
+        add     BUF, BUF, #2*4
+        sub     COUNT, COUNT, #1
+        b       4f
+3:      @ sb_act was n*4+2 or n*4+3, so do the first 2
+        vldr    s8,  [IN, #(0*8+0)*4]
+        vldr    s10, [IN, #(0*8+1)*4]
+        vldr    s12, [IN, #(0*8+2)*4]
+        vldr    s14, [IN, #(0*8+3)*4]
+        vldr    s16, [IN, #(0*8+4)*4]
+        vldr    s18, [IN, #(0*8+5)*4]
+        vldr    s20, [IN, #(0*8+6)*4]
+        vldr    s22, [IN, #(0*8+7)*4]
+        vneg.f  s8, s8
+        vldr    s9,  [IN, #(1*8+0)*4]
+        vldr    s11, [IN, #(1*8+1)*4]
+        vldr    s13, [IN, #(1*8+2)*4]
+        vldr    s15, [IN, #(1*8+3)*4]
+        vneg.f  s16, s16
+        vldr    s17, [IN, #(1*8+4)*4]
+        vldr    s19, [IN, #(1*8+5)*4]
+        vldr    s21, [IN, #(1*8+6)*4]
+        vldr    s23, [IN, #(1*8+7)*4]
+        vstr    d4,  [BUF, #(0*32+0)*4]
+        vstr    d5,  [BUF, #(1*32+0)*4]
+        vstr    d6,  [BUF, #(2*32+0)*4]
+        vstr    d7,  [BUF, #(3*32+0)*4]
+        vstr    d8,  [BUF, #(4*32+0)*4]
+        vstr    d9,  [BUF, #(5*32+0)*4]
+        vstr    d10, [BUF, #(6*32+0)*4]
+        vstr    d11, [BUF, #(7*32+0)*4]
+        add     BUF, BUF, #2*4
+        sub     COUNT, COUNT, #(2 << 5) + 1
+        bics    lr, COUNT, #0x1F
+        bne     4f
+        @ sb_act was n*4+3
+        vldr    s8,  [IN, #(2*8+0)*4]
+        vldr    s10, [IN, #(2*8+1)*4]
+        vldr    s12, [IN, #(2*8+2)*4]
+        vldr    s14, [IN, #(2*8+3)*4]
+        vldr    s16, [IN, #(2*8+4)*4]
+        vldr    s18, [IN, #(2*8+5)*4]
+        vldr    s20, [IN, #(2*8+6)*4]
+        vldr    s22, [IN, #(2*8+7)*4]
+        vldr    s9,  zero
+        vldr    s11, zero
+        vldr    s13, zero
+        vldr    s15, zero
+        vldr    s17, zero
+        vldr    s19, zero
+        vldr    s21, zero
+        vldr    s23, zero
+        vstr    d4,  [BUF, #(0*32+0)*4]
+        vstr    d5,  [BUF, #(1*32+0)*4]
+        vstr    d6,  [BUF, #(2*32+0)*4]
+        vstr    d7,  [BUF, #(3*32+0)*4]
+        vstr    d8,  [BUF, #(4*32+0)*4]
+        vstr    d9,  [BUF, #(5*32+0)*4]
+        vstr    d10, [BUF, #(6*32+0)*4]
+        vstr    d11, [BUF, #(7*32+0)*4]
+        add     BUF, BUF, #2*4
+        sub     COUNT, COUNT, #1
+4:      @ Now fill the remainder with 0
+        vldr    s8, zero
+        vldr    s9, zero
+        ands    COUNT, COUNT, #0x1F
+        beq     6f
+5:      vstr    d4, [BUF, #(0*32+0)*4]
+        vstr    d4, [BUF, #(1*32+0)*4]
+        vstr    d4, [BUF, #(2*32+0)*4]
+        vstr    d4, [BUF, #(3*32+0)*4]
+        vstr    d4, [BUF, #(4*32+0)*4]
+        vstr    d4, [BUF, #(5*32+0)*4]
+        vstr    d4, [BUF, #(6*32+0)*4]
+        vstr    d4, [BUF, #(7*32+0)*4]
+        add     BUF, BUF, #2*4
+        subs    COUNT, COUNT, #1
+        bne     5b
+6:
+        fmxr    FPSCR, OLDFPSCR
+        ldr     WINDOW, [fp, #3*4]
+        ldr     OUT, [fp, #4*4]
+        sub     BUF, BUF, #32*4
+NOVFP   ldr     SCALEINT, [fp, #6*4]
+        mov     COUNT, #8
+VFP     vpush   {SCALE}
+VFP     sub     sp, sp, #3*4
+NOVFP   sub     sp, sp, #4*4
+7:
+VFP     ldr     a1, [fp, #-7*4]     @ imdct
+NOVFP   ldr     a1, [fp, #-8*4]
+        ldmia   fp, {a2-a4}
+VFP     stmia   sp, {WINDOW, OUT, BUF}
+NOVFP   stmia   sp, {WINDOW, OUT, BUF, SCALEINT}
+VFP     vldr    SCALE, [sp, #3*4]
+        bl      X(ff_synth_filter_float_vfp)
+        add     OUT, OUT, #32*4
+        add     BUF, BUF, #32*4
+        subs    COUNT, COUNT, #1
+        bne     7b
+
+A       sub     sp, fp, #(8+8)*4
+T       sub     fp, fp, #(8+8)*4
+T       mov     sp, fp
+        vpop    {s16-s23}
+VFP     pop     {a3-a4,v1-v3,v5,fp,pc}
+NOVFP   pop     {a4,v1-v5,fp,pc}
+endfunc
+
+        .unreq  IN
+        .unreq  SBACT
+        .unreq  OLDFPSCR
+        .unreq  IMDCT
+        .unreq  WINDOW
+        .unreq  OUT
+        .unreq  BUF
+        .unreq  SCALEINT
+        .unreq  COUNT
+
+        .unreq  SCALE
+
+        .align 2
+zero:   .word   0
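
Note: the transposition loop in ff_dca_qmf_32_subbands_vfp above packs two down-counters into the single COUNT register (bits 0-4: word pairs left in the output row; bits 5-31: words left to copy), so one subs updates both and the N flag tests only the upper field. A hedged C sketch of that trick, with a made-up sb_act value:

    #include <stdio.h>

    int main(void)
    {
        int sb_act = 14;                  /* hypothetical active-subband count */
        /* upper field biased by -4 so the final short group falls out of the
           loop; low field starts at 16 word pairs per output row */
        int count = ((sb_act - 4) << 5) + 16;

        while (count >= 0) {              /* like 'bpl 1b': the sign comes from
                                             bits 5-31, since the low field
                                             never borrows past bit 4 when
                                             sb_act <= 32 */
            printf("upper field: %3d   low field: %2d\n", count >> 5, count & 0x1F);
            count -= (4 << 5) + 2;        /* one subtract steps both counters */
        }
        return 0;
    }
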
17  lib/ffmpeg/libavcodec/arm/fft_init_arm.c
@@ -26,12 +26,20 @@
 void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
 
+void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
+
 void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
 
 void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
 
+void ff_synth_filter_float_vfp(FFTContext *imdct,
+                               float *synth_buf_ptr, int *synth_buf_offset,
+                               float synth_buf2[32], const float window[512],
+                               float out[32], const float in[32],
+                               float scale);
+
 void ff_synth_filter_float_neon(FFTContext *imdct,
                                 float *synth_buf_ptr, int *synth_buf_offset,
                                 float synth_buf2[32], const float window[512],
@@ -42,6 +50,13 @@ av_cold void ff_fft_init_arm(FFTContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
+    if (have_vfp(cpu_flags)) {
+#if CONFIG_MDCT
+        if (!have_vfpv3(cpu_flags))
+            s->imdct_half   = ff_imdct_half_vfp;
+#endif
+    }
+
     if (have_neon(cpu_flags)) {
 #if CONFIG_FFT
         s->fft_permute  = ff_fft_permute_neon;
@@ -71,6 +86,8 @@ av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
+    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
+        s->synth_filter_float = ff_synth_filter_float_vfp;
     if (have_neon(cpu_flags))
         s->synth_filter_float = ff_synth_filter_float_neon;
 }
298  lib/ffmpeg/libavcodec/arm/fft_vfp.S
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+@ TODO: * FFTs wider than 16
+@       * dispatch code
+
+function fft4_vfp
+        vldr    d0, [a1, #0*2*4]   @ s0,s1   = z[0]
+        vldr    d4, [a1, #1*2*4]   @ s8,s9   = z[1]
+        vldr    d1, [a1, #2*2*4]   @ s2,s3   = z[2]
+        vldr    d5, [a1, #3*2*4]   @ s10,s11 = z[3]
+        @ stall
+        vadd.f  s12, s0, s8        @ i0
+        vadd.f  s13, s1, s9        @ i1
+        vadd.f  s14, s2, s10       @ i2
+        vadd.f  s15, s3, s11       @ i3
+        vsub.f  s8, s0, s8         @ i4
+        vsub.f  s9, s1, s9         @ i5
+        vsub.f  s10, s2, s10       @ i6
+        vsub.f  s11, s3, s11       @ i7
+        @ stall
+        @ stall
+        vadd.f  s0, s12, s14       @ z[0].re
+        vsub.f  s4, s12, s14       @ z[2].re
+        vadd.f  s1, s13, s15       @ z[0].im
+        vsub.f  s5, s13, s15       @ z[2].im
+        vadd.f  s7, s9, s10        @ z[3].im
+        vsub.f  s3, s9, s10        @ z[1].im
+        vadd.f  s2, s8, s11        @ z[1].re
+        vsub.f  s6, s8, s11        @ z[3].re
+        @ stall
+        @ stall
+        vstr    d0, [a1, #0*2*4]
+        vstr    d2, [a1, #2*2*4]
+        @ stall
+        @ stall
+        vstr    d1, [a1, #1*2*4]
+        vstr    d3, [a1, #3*2*4]
+
+        bx      lr
+endfunc
+
+.macro macro_fft8_head
+        @ FFT4
+        vldr    d4, [a1, #0 * 2*4]
+        vldr    d6, [a1, #1 * 2*4]
+        vldr    d5, [a1, #2 * 2*4]
+        vldr    d7, [a1, #3 * 2*4]
+            @ BF
+            vldr    d12, [a1, #4 * 2*4]
+        vadd.f  s16, s8, s12    @ vector op
+            vldr    d14, [a1, #5 * 2*4]
+            vldr    d13, [a1, #6 * 2*4]
+            vldr    d15, [a1, #7 * 2*4]
+        vsub.f  s20, s8, s12    @ vector op
+        vadd.f  s0, s16, s18
+        vsub.f  s2, s16, s18
+        vadd.f  s1, s17, s19
+        vsub.f  s3, s17, s19
+        vadd.f  s7, s21, s22
+        vsub.f  s5, s21, s22
+        vadd.f  s4, s20, s23
+        vsub.f  s6, s20, s23
+            vsub.f  s20, s24, s28   @ vector op
+        vstr    d0, [a1, #0 * 2*4]  @ transfer s0-s7 to s24-s31 via memory
+        vstr    d1, [a1, #1 * 2*4]
+        vldr    s0, cos1pi4
+            vadd.f  s16, s24, s28   @ vector op
+        vstr    d2, [a1, #2 * 2*4]
+        vstr    d3, [a1, #3 * 2*4]
+        vldr    d12, [a1, #0 * 2*4]
+            @ TRANSFORM
+            vmul.f  s20, s20, s0    @ vector x scalar op
+        vldr    d13, [a1, #1 * 2*4]
+        vldr    d14, [a1, #2 * 2*4]
+        vldr    d15, [a1, #3 * 2*4]
+        @ BUTTERFLIES
+        vadd.f  s0, s18, s16
+        vadd.f  s1, s17, s19
+        vsub.f  s2, s17, s19
+        vsub.f  s3, s18, s16
+            vadd.f  s4, s21, s20
+            vsub.f  s5, s21, s20
+            vadd.f  s6, s22, s23
+            vsub.f  s7, s22, s23
+        vadd.f  s8, s0, s24         @ vector op
+        vstr    d0, [a1, #0 * 2*4]  @ transfer s0-s3 to s12-s15 via memory
+        vstr    d1, [a1, #1 * 2*4]
+        vldr    d6, [a1, #0 * 2*4]
+        vldr    d7, [a1, #1 * 2*4]
+            vadd.f  s1, s5, s6
+            vadd.f  s0, s7, s4
+            vsub.f  s2, s5, s6
+            vsub.f  s3, s7, s4
+        vsub.f  s12, s24, s12       @ vector op
+            vsub.f  s5, s29, s1
+            vsub.f  s4, s28, s0
+            vsub.f  s6, s30, s2
+            vsub.f  s7, s31, s3
+            vadd.f  s16, s0, s28    @ vector op
+        vstr    d6, [a1, #4 * 2*4]
+        vstr    d7, [a1, #6 * 2*4]
+        vstr    d4, [a1, #0 * 2*4]
+        vstr    d5, [a1, #2 * 2*4]
+             vstr    d2, [a1, #5 * 2*4]
+             vstr    d3, [a1, #7 * 2*4]
+.endm
+
+.macro macro_fft8_tail
+             vstr    d8, [a1, #1 * 2*4]
+             vstr    d9, [a1, #3 * 2*4]
+.endm
+
+function fft8_vfp
+        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
+        fmrx    a2, FPSCR
+        fmxr    FPSCR, a3
+        vpush   {s16-s31}
+
+        macro_fft8_head
+        macro_fft8_tail
+
+        vpop    {s16-s31}
+        fmxr    FPSCR, a2
+        bx      lr
+endfunc
+
+.align 3
+cos1pi4:    @ cos(1*pi/4) = sqrt(2)/2
+        .float  0.707106769084930419921875
+cos1pi8:    @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
+        .float  0.92387950420379638671875
+cos3pi8:    @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
+        .float  0.3826834261417388916015625
+
+function ff_fft16_vfp, export=1
+        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
+        fmrx    a2, FPSCR
+        fmxr    FPSCR, a3
+        vpush   {s16-s31}
+
+        macro_fft8_head
+        @ FFT4(z+8)
+        vldr    d10, [a1, #8 * 2*4]
+        vldr    d12, [a1, #9 * 2*4]
+        vldr    d11, [a1, #10 * 2*4]
+        vldr    d13, [a1, #11 * 2*4]
+        macro_fft8_tail
+        vadd.f  s16, s20, s24   @ vector op
+            @ FFT4(z+12)
+            vldr    d4, [a1, #12 * 2*4]
+            vldr    d6, [a1, #13 * 2*4]
+            vldr    d5, [a1, #14 * 2*4]
+        vsub.f  s20, s20, s24   @ vector op
+            vldr    d7, [a1, #15 * 2*4]
+        vadd.f  s0, s16, s18
+        vsub.f  s4, s16, s18
+        vadd.f  s1, s17, s19
+        vsub.f  s5, s17, s19
+        vadd.f  s7, s21, s22
+        vsub.f  s3, s21, s22
+        vadd.f  s2, s20, s23
+        vsub.f  s6, s20, s23
+            vadd.f  s16, s8, s12    @ vector op
+        vstr    d0, [a1, #8 * 2*4]
+        vstr    d2, [a1, #10 * 2*4]
+        vstr    d1, [a1, #9 * 2*4]
+            vsub.f  s20, s8, s12
+        vstr    d3, [a1, #11 * 2*4]
+        @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
+        vldr    d12, [a1, #10 * 2*4]
+            vadd.f  s0, s16, s18
+            vadd.f  s1, s17, s19
+            vsub.f  s6, s16, s18
+            vsub.f  s7, s17, s19
+            vsub.f  s3, s21, s22
+            vadd.f  s2, s20, s23
+            vadd.f  s5, s21, s22
+            vsub.f  s4, s20, s23
+            vstr    d0, [a1, #12 * 2*4]
+        vmov    s0, s6
+          @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
+          vldr    d6, [a1, #9 * 2*4]
+            vstr    d1, [a1, #13 * 2*4]
+        vldr    d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8
+            vstr    d2, [a1, #15 * 2*4]
+          vldr    d7, [a1, #13 * 2*4]
+        vadd.f  s4, s25, s24
+        vsub.f  s5, s25, s24
+        vsub.f  s6, s0, s7
+        vadd.f  s7, s0, s7
+          vmul.f  s20, s12, s3  @ vector op
+            @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
+            vldr    d4, [a1, #11 * 2*4]
+            vldr    d5, [a1, #15 * 2*4]
+            vldr    s1, cos3pi8
+        vmul.f  s24, s4, s2     @ vector * scalar op
+          vmul.f  s28, s12, s1  @ vector * scalar op
+            vmul.f  s12, s8, s1 @ vector * scalar op
+          vadd.f  s4, s20, s29
+          vsub.f  s5, s21, s28
+          vsub.f  s6, s22, s31
+          vadd.f  s7, s23, s30
+            vmul.f  s8, s8, s3  @ vector * scalar op
+          vldr    d8, [a1, #1 * 2*4]
+          vldr    d9, [a1, #5 * 2*4]
+            vldr    d10, [a1, #3 * 2*4]
+            vldr    d11, [a1, #7 * 2*4]
+        vldr    d14, [a1, #2 * 2*4]
+          vadd.f  s0, s6, s4
+          vadd.f  s1, s5, s7
+          vsub.f  s2, s5, s7
+          vsub.f  s3, s6, s4
+            vadd.f  s4, s12, s9
+            vsub.f  s5, s13, s8
+            vsub.f  s6, s14, s11
+            vadd.f  s7, s15, s10
+          vadd.f  s12, s0, s16  @ vector op
+          vstr    d0, [a1, #1 * 2*4]
+          vstr    d1, [a1, #5 * 2*4]
+          vldr    d4, [a1, #1 * 2*4]
+          vldr    d5, [a1, #5 * 2*4]
+            vadd.f  s0, s6, s4
+            vadd.f  s1, s5, s7
+            vsub.f  s2, s5, s7
+            vsub.f  s3, s6, s4
+          vsub.f  s8, s16, s8   @ vector op
+          vstr    d6, [a1, #1 * 2*4]
+          vstr    d7, [a1, #5 * 2*4]
+        vldr    d15, [a1, #6 * 2*4]
+            vsub.f  s4, s20, s0
+            vsub.f  s5, s21, s1
+            vsub.f  s6, s22, s2
+            vsub.f  s7, s23, s3
+            vadd.f  s20, s0, s20    @ vector op
+          vstr    d4, [a1, #9 * 2*4]
+              @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
+              vldr    d6, [a1, #8 * 2*4]
+          vstr    d5, [a1, #13 * 2*4]
+              vldr    d7, [a1, #12 * 2*4]
+          vstr    d2, [a1, #11 * 2*4]
+              vldr    d8, [a1, #0 * 2*4]
+          vstr    d3, [a1, #15 * 2*4]
+              vldr    d9, [a1, #4 * 2*4]
+        vadd.f  s0, s26, s24
+        vadd.f  s1, s25, s27
+        vsub.f  s2, s25, s27
+        vsub.f  s3, s26, s24
+              vadd.f  s4, s14, s12
+              vadd.f  s5, s13, s15
+              vsub.f  s6, s13, s15
+              vsub.f  s7, s14, s12
+        vadd.f  s8, s0, s28 @ vector op
+        vstr    d0, [a1, #3 * 2*4]
+        vstr    d1, [a1, #7 * 2*4]
+        vldr    d6, [a1, #3 * 2*4]
+        vldr    d7, [a1, #7 * 2*4]
+              vsub.f  s0, s16, s4
+              vsub.f  s1, s17, s5
+              vsub.f  s2, s18, s6
+              vsub.f  s3, s19, s7
+        vsub.f  s12, s28, s12       @ vector op
+              vadd.f  s16, s4, s16  @ vector op
+            vstr    d10, [a1, #3 * 2*4]
+            vstr    d11, [a1, #7 * 2*4]
+        vstr    d4, [a1, #2 * 2*4]
+        vstr    d5, [a1, #6 * 2*4]
+              vstr    d0, [a1, #8 * 2*4]
+              vstr    d1, [a1, #12 * 2*4]
+        vstr    d6, [a1, #10 * 2*4]
+        vstr    d7, [a1, #14 * 2*4]
+              vstr    d8, [a1, #0 * 2*4]
+              vstr    d9, [a1, #4 * 2*4]
+
+        vpop    {s16-s31}
+        fmxr    FPSCR, a2
+        bx      lr
+endfunc
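
Note: fft4_vfp above is a 4-point decimation-in-time FFT on bit-reversal-permuted input, with the multiply-free twiddle by -j folded into add/subtract swaps. A scalar C rendering of the same i0..i7 dataflow (a sketch for checking the arithmetic, not libavcodec code):

    #include <stdio.h>

    typedef struct { float re, im; } cpx;

    static void fft4(cpx *z)              /* z holds bit-reversed input */
    {
        float i0 = z[0].re + z[1].re, i1 = z[0].im + z[1].im;
        float i2 = z[2].re + z[3].re, i3 = z[2].im + z[3].im;
        float i4 = z[0].re - z[1].re, i5 = z[0].im - z[1].im;
        float i6 = z[2].re - z[3].re, i7 = z[2].im - z[3].im;

        z[0].re = i0 + i2;  z[0].im = i1 + i3;
        z[2].re = i0 - i2;  z[2].im = i1 - i3;
        z[1].re = i4 + i7;  z[1].im = i5 - i6;   /* -j twiddle = swap + sign */
        z[3].re = i4 - i7;  z[3].im = i5 + i6;
    }

    int main(void)
    {
        /* time samples [1,2,3,4]; bit reversal for N=4 swaps elements 1 and 2 */
        cpx z[4] = { {1, 0}, {3, 0}, {2, 0}, {4, 0} };
        fft4(z);
        for (int k = 0; k < 4; k++)       /* expect 10, -2+2j, -2, -2-2j */
            printf("X[%d] = %4.1f %+4.1fj\n", k, z[k].re, z[k].im);
        return 0;
    }
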
14  lib/ffmpeg/libavcodec/arm/fmtconvert_init_arm.c
@@ -28,6 +28,12 @@
 void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
                                         float mul, int len);
 
+void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src,
+                                       float mul, int len);
+void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst,
+                                       const int32_t *src, const float *mul,
+                                       int len);
+
 void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
 void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
 
@@ -38,6 +44,14 @@ av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx
     int cpu_flags = av_get_cpu_flags();
 
     if (have_vfp(cpu_flags) && have_armv6(cpu_flags)) {
+        if (!have_vfpv3(cpu_flags)) {
+            // These functions don't use anything armv6 specific in themselves,
+            // but ff_float_to_int16_vfp which is in the same assembly source
+            // file does, thus the whole file requires armv6 to be built.
+            c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_vfp;
+            c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_vfp;
+        }
+
         c->float_to_int16 = ff_float_to_int16_vfp;
     }
 
200  lib/ffmpeg/libavcodec/arm/fmtconvert_vfp.S
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ * Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
  *
  * This file is part of FFmpeg.
  *
@@ -76,3 +77,202 @@ function ff_float_to_int16_vfp, export=1
         vpop            {d8-d11}
         pop             {r4-r8,pc}
 endfunc
+
+/**
+ * ARM VFP optimised int32 to float conversion.
+ * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
+ * (16 bytes alignment is best for BCM2835), little-endian.
+ */
+@ void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst, const int32_t *src, const float *mul, int len)
+function ff_int32_to_float_fmul_array8_vfp, export=1
+        push    {lr}
+        ldr     a1, [sp, #4]
+        subs    lr, a1, #3*8
+        bcc     50f                        @ too short to pipeline
+        @ Now need to find (len / 8) % 3. The approximation
+        @ x / 24 = (x * 0xAB) >> 12
+        @ is good for x < 4096, which is true for both AC3 and DCA.
+        mov     a1, #0xAB
+        ldr     ip, =0x03070000            @ RunFast mode, short vectors of length 8, stride 1
+        mul     a1, lr, a1
+        vpush   {s16-s31}
+        mov     a1, a1, lsr #12
+        add     a1, a1, a1, lsl #1
+        rsb     a1, a1, lr, lsr #3
+        cmp     a1, #1
+        fmrx    a1, FPSCR
+        fmxr    FPSCR, ip
+        beq     11f
+        blo     10f
+        @ Array is (2 + multiple of 3) x 8 floats long
+        @ drop through...
+        vldmia          a3!, {s16-s23}
+        vldmia          a4!, {s2,s3}
+        vldmia          a3!, {s24-s31}
+        vcvt.f32.s32    s16, s16
+        vcvt.f32.s32    s17, s17
+        vcvt.f32.s32    s18, s18
+        vcvt.f32.s32    s19, s19
+        vcvt.f32.s32    s20, s20
+        vcvt.f32.s32    s21, s21
+        vcvt.f32.s32    s22, s22
+        vcvt.f32.s32    s23, s23
+        vmul.f32        s16, s16, s2
+        @ drop through...
+3:
+        vldmia          a3!, {s8-s15}
+        vldmia          a4!, {s1}
+        vcvt.f32.s32    s24, s24
+        vcvt.f32.s32    s25, s25
+        vcvt.f32.s32    s26, s26
+        vcvt.f32.s32    s27, s27
+        vcvt.f32.s32    s28, s28
+        vcvt.f32.s32    s29, s29
+        vcvt.f32.s32    s30, s30
+        vcvt.f32.s32    s31, s31
+        vmul.f32        s24, s24, s3
+        vstmia          a2!, {s16-s19}
+        vstmia          a2!, {s20-s23}
+2:
+        vldmia          a3!, {s16-s23}
+        vldmia          a4!, {s2}
+        vcvt.f32.s32    s8, s8
+        vcvt.f32.s32    s9, s9
+        vcvt.f32.s32    s10, s10
+        vcvt.f32.s32    s11, s11
+        vcvt.f32.s32    s12, s12
+        vcvt.f32.s32    s13, s13
+        vcvt.f32.s32    s14, s14
+        vcvt.f32.s32    s15, s15
+        vmul.f32        s8, s8, s1
+        vstmia          a2!, {s24-s27}
+        vstmia          a2!, {s28-s31}
+1:
+        vldmia          a3!, {s24-s31}
+        vldmia          a4!, {s3}
+        vcvt.f32.s32    s16, s16
+        vcvt.f32.s32    s17, s17
+        vcvt.f32.s32    s18, s18
+        vcvt.f32.s32    s19, s19
+        vcvt.f32.s32    s20, s20
+        vcvt.f32.s32    s21, s21
+        vcvt.f32.s32    s22, s22
+        vcvt.f32.s32    s23, s23
+        vmul.f32        s16, s16, s2
+        vstmia          a2!, {s8-s11}
+        vstmia          a2!, {s12-s15}
+
+        subs            lr, lr, #8*3
+        bpl             3b
+
+        vcvt.f32.s32    s24, s24
+        vcvt.f32.s32    s25, s25
+        vcvt.f32.s32    s26, s26
+        vcvt.f32.s32    s27, s27
+        vcvt.f32.s32    s28, s28
+        vcvt.f32.s32    s29, s29
+        vcvt.f32.s32    s30, s30
+        vcvt.f32.s32    s31, s31
+        vmul.f32        s24, s24, s3
+        vstmia          a2!, {s16-s19}
+        vstmia          a2!, {s20-s23}
+        vstmia          a2!, {s24-s27}
+        vstmia          a2!, {s28-s31}
+
+        fmxr    FPSCR, a1
+        vpop    {s16-s31}
+        pop     {pc}
+
+10:     @ Array is (multiple of 3) x 8 floats long
+        vldmia          a3!, {s8-s15}
+        vldmia          a4!, {s1,s2}
+        vldmia          a3!, {s16-s23}
+        vcvt.f32.s32    s8, s8
+        vcvt.f32.s32    s9, s9
+        vcvt.f32.s32    s10, s10
+        vcvt.f32.s32    s11, s11
+        vcvt.f32.s32    s12, s12
+        vcvt.f32.s32    s13, s13
+        vcvt.f32.s32    s14, s14
+        vcvt.f32.s32    s15, s15
+        vmul.f32        s8, s8, s1
+        b               1b
+
+11:     @ Array is (1 + multiple of 3) x 8 floats long
+        vldmia          a3!, {s24-s31}
+        vldmia          a4!, {s3}
+        vldmia          a3!, {s8-s15}
+        vldmia          a4!, {s1}
+        vcvt.f32.s32    s24, s24
+        vcvt.f32.s32    s25, s25
+        vcvt.f32.s32    s26, s26
+        vcvt.f32.s32    s27, s27
+        vcvt.f32.s32    s28, s28
+        vcvt.f32.s32    s29, s29
+        vcvt.f32.s32    s30, s30
+        vcvt.f32.s32    s31, s31
+        vmul.f32        s24, s24, s3
+        b               2b
+
+50:
+        ldr     lr, =0x03070000         @ RunFast mode, short vectors of length 8, stride 1
+        fmrx    ip, FPSCR
+        fmxr    FPSCR, lr
+51:
+        vldmia          a3!, {s8-s15}
+        vldmia          a4!, {s0}
+        vcvt.f32.s32    s8, s8
+        vcvt.f32.s32    s9, s9
+        vcvt.f32.s32    s10, s10
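
Note: the (len / 8) % 3 computation near the top of ff_int32_to_float_fmul_array8_vfp above leans on the reciprocal-multiply identity x / 24 == (x * 0xAB) >> 12. That identity is not exact for arbitrary x (x = 527 already fails), but it is exact for every multiple of 8 below 4096, which is all the routine ever feeds it, since len is documented to be a multiple of 8. A small host-side check in C:

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned x;
        for (x = 0; x < 4096; x += 8)             /* the domain the routine uses */
            assert((x * 0xAB) >> 12 == x / 24);
        assert((527u * 0xAB) >> 12 != 527u / 24); /* fails off the 8-aligned grid */
        puts("x/24 == (x*0xAB)>>12 holds for all multiples of 8 below 4096");
        return 0;
    }
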