-
-
Notifications
You must be signed in to change notification settings - Fork 6.3k
/
0057-Fixes-neon-usage-under-iOS5.patch
307 lines (292 loc) · 10.2 KB
/
0057-Fixes-neon-usage-under-iOS5.patch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
diff --git a/lib/ffmpeg/libavcodec/arm/fft_neon.S b/lib/ffmpeg/libavcodec/arm/fft_neon.S
index 1db7abd..6390065 100644
--- a/lib/ffmpeg/libavcodec/arm/fft_neon.S
+++ b/lib/ffmpeg/libavcodec/arm/fft_neon.S
@@ -101,8 +101,12 @@ function fft8_neon
bx lr
endfunc
+ .align 4
+pmmp: .float +1.0, -1.0, -1.0, +1.0
+mppm: .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
+
function fft16_neon
- movrel r1, mppm
+ adr r1, mppm
vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3}
pld [r0, #32]
vld1.32 {d2-d3}, [r1,:128]
@@ -144,12 +148,16 @@ function fft16_neon
vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14}
vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6}
vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a}
- movrel r2, X(ff_cos_16)
+ ldr r2, _neon_label
+ ldr r3, L$diff1
+ add r2, r3
+local_label1:
+ ldr r2, [pc, r2]
vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8}
vrev64.32 d1, d1
vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a}
vrev64.32 d3, d3
- movrel r3, pmmp
+ adr r3, pmmp
vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8}
vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a}
vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9}
@@ -214,7 +222,7 @@ function fft_pass_neon
add r2, r2, r0 @ &z[o2]
add r3, r3, r0 @ &z[o3]
vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
- movrel r12, pmmp
+ adr r12, pmmp
vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
add r5, r5, r1 @ wim
vld1.32 {d6-d7}, [r12,:128] @ pmmp
@@ -279,6 +287,11 @@ function fft_pass_neon
pop {r4-r6,pc}
endfunc
+.set L$offs16, 0
+.macro setTabOffs n, n2
+.set L$offs\n, L$offs\n2 + 4
+.endm
+
.macro def_fft n, n2, n4
.align 6
function fft\n\()_neon
@@ -291,10 +304,15 @@ function fft\n\()_neon
bl fft\n4\()_neon
mov r0, r4
pop {r4, lr}
- movrel r1, X(ff_cos_\n)
+ ldr r1, _neon_label
+ add r1, #L$diff\n
+local_label\n:
+ ldr r1, [pc, r1]
mov r2, #\n4/2
b fft_pass_neon
endfunc
+setTabOffs \n, \n2
+.set L$diff\n, _neon_label - local_label\n + L$offs\n - 8
.endm
def_fft 32, 16, 8
@@ -310,10 +328,14 @@ endfunc
def_fft 32768, 16384, 8192
def_fft 65536, 32768, 16384
+.set L$diffTab, fft_tab_neon_offs - local_label_tab - 8
function ff_fft_calc_neon, export=1
ldr r2, [r0]
sub r2, r2, #2
- movrel r3, fft_tab_neon
+ ldr r3, fft_tab_neon_offs
+ add r3, #L$diffTab
+local_label_tab:
+ add r3, pc
ldr r3, [r3, r2, lsl #2]
mov r0, r1
bx r3
@@ -349,9 +371,22 @@ function ff_fft_permute_neon, export=1
pop {r4,pc}
endfunc
- .section .rodata
+
+.global _neon_label
+_neon_label:
+.word _neon_cos_tab - .
+
+L$diff1:
+.word _neon_label - local_label1 - 8
+
+fft_tab_neon_offs:
+.word _fft_tab_neon - .
+
+
+.section .rodata
+
.align 4
-fft_tab_neon:
+_fft_tab_neon:
.word fft4_neon
.word fft8_neon
.word fft16_neon
@@ -367,8 +402,20 @@ fft_tab_neon:
.word fft16384_neon
.word fft32768_neon
.word fft65536_neon
-ELF .size fft_tab_neon, . - fft_tab_neon
-
- .align 4
-pmmp: .float +1.0, -1.0, -1.0, +1.0
-mppm: .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
+ELF .size _fft_tab_neon, . - _fft_tab_neon
+
+ .align 4
+_neon_cos_tab:
+ .word X(ff_cos_16)
+ .word X(ff_cos_32)
+ .word X(ff_cos_64)
+ .word X(ff_cos_128)
+ .word X(ff_cos_256)
+ .word X(ff_cos_512)
+ .word X(ff_cos_1024)
+ .word X(ff_cos_2048)
+ .word X(ff_cos_4096)
+ .word X(ff_cos_8192)
+ .word X(ff_cos_16384)
+ .word X(ff_cos_32768)
+ .word X(ff_cos_65536)
diff --git a/lib/ffmpeg/libavcodec/arm/h264idct_neon.S b/lib/ffmpeg/libavcodec/arm/h264idct_neon.S
index 6b6a669..8111975 100644
--- a/lib/ffmpeg/libavcodec/arm/h264idct_neon.S
+++ b/lib/ffmpeg/libavcodec/arm/h264idct_neon.S
@@ -97,7 +97,7 @@ function ff_h264_idct_add16_neon, export=1
mov r1, r2
mov r2, r3
ldr r6, [sp, #24]
- movrel r7, scan8
+ adr r7, scan8
mov ip, #16
1: ldrb r8, [r7], #1
ldr r0, [r5], #4
@@ -117,6 +117,16 @@ function ff_h264_idct_add16_neon, export=1
pop {r4-r8,pc}
endfunc
+ .align
+scan8: .byte 4+1*8, 5+1*8, 4+2*8, 5+2*8
+ .byte 6+1*8, 7+1*8, 6+2*8, 7+2*8
+ .byte 4+3*8, 5+3*8, 4+4*8, 5+4*8
+ .byte 6+3*8, 7+3*8, 6+4*8, 7+4*8
+ .byte 1+1*8, 2+1*8
+ .byte 1+2*8, 2+2*8
+ .byte 1+4*8, 2+4*8
+ .byte 1+5*8, 2+5*8
+
function ff_h264_idct_add16intra_neon, export=1
push {r4-r8,lr}
mov r4, r0
@@ -124,7 +134,7 @@ function ff_h264_idct_add16intra_neon, export=1
mov r1, r2
mov r2, r3
ldr r6, [sp, #24]
- movrel r7, scan8
+ adr r7, scan8
mov ip, #16
1: ldrb r8, [r7], #1
ldr r0, [r5], #4
@@ -149,7 +159,7 @@ function ff_h264_idct_add8_neon, export=1
add r1, r2, #16*32
mov r2, r3
ldr r6, [sp, #32]
- movrel r7, scan8+16
+ adr r7, scan8+16
mov ip, #7
1: ldrb r8, [r7], #1
ldr r0, [r5], #4
@@ -353,7 +363,7 @@ function ff_h264_idct8_add4_neon, export=1
mov r1, r2
mov r2, r3
ldr r6, [sp, #24]
- movrel r7, scan8
+ adr r7, scan8
mov r12, #16
1: ldrb r8, [r7], #4
ldr r0, [r5], #16
@@ -372,13 +382,3 @@ function ff_h264_idct8_add4_neon, export=1
bne 1b
pop {r4-r8,pc}
endfunc
-
- .section .rodata
-scan8: .byte 4+1*8, 5+1*8, 4+2*8, 5+2*8
- .byte 6+1*8, 7+1*8, 6+2*8, 7+2*8
- .byte 4+3*8, 5+3*8, 4+4*8, 5+4*8
- .byte 6+3*8, 7+3*8, 6+4*8, 7+4*8
- .byte 1+1*8, 2+1*8
- .byte 1+2*8, 2+2*8
- .byte 1+4*8, 2+4*8
- .byte 1+5*8, 2+5*8
diff --git a/lib/ffmpeg/libavcodec/arm/h264pred_neon.S b/lib/ffmpeg/libavcodec/arm/h264pred_neon.S
index 63c96ee..357c10a 100644
--- a/lib/ffmpeg/libavcodec/arm/h264pred_neon.S
+++ b/lib/ffmpeg/libavcodec/arm/h264pred_neon.S
@@ -123,7 +123,7 @@ function ff_pred16x16_plane_neon, export=1
vaddl.u8 q8, d2, d3
vsubl.u8 q2, d2, d0
vsubl.u8 q3, d3, d1
- movrel r3, p16weight
+ adr r3, p16weight
vld1.8 {q0}, [r3,:128]
vmul.s16 q2, q2, q0
vmul.s16 q3, q3, q0
@@ -166,7 +166,6 @@ function ff_pred16x16_plane_neon, export=1
bx lr
endfunc
- .section .rodata
.align 4
p16weight:
.short 1,2,3,4,5,6,7,8
@@ -207,7 +206,7 @@ function ff_pred8x8_plane_neon, export=1
vrev32.8 d0, d0
vtrn.32 d2, d3
vsubl.u8 q2, d2, d0
- movrel r3, p16weight
+ adr r3, p16weight
vld1.16 {q0}, [r3,:128]
vmul.s16 d4, d4, d0
vmul.s16 d5, d5, d0
diff --git a/lib/ffmpeg/libavcodec/arm/simple_idct_neon.S b/lib/ffmpeg/libavcodec/arm/simple_idct_neon.S
index 17cde58..e61414e 100644
--- a/lib/ffmpeg/libavcodec/arm/simple_idct_neon.S
+++ b/lib/ffmpeg/libavcodec/arm/simple_idct_neon.S
@@ -239,7 +239,6 @@ function idct_col4_st8_neon
bx lr
endfunc
- .section .rodata
.align 4
idct_coeff_neon:
.short W1, W2, W3, W4, W5, W6, W7, W4c
@@ -249,7 +248,7 @@ idct_coeff_neon:
pld [\data]
pld [\data, #64]
vpush {d8-d15}
- movrel r3, idct_coeff_neon
+ adr r3, idct_coeff_neon
vld1.64 {d0,d1}, [r3,:128]
.endm
diff --git a/lib/ffmpeg/libavcodec/arm/vp3dsp_neon.S b/lib/ffmpeg/libavcodec/arm/vp3dsp_neon.S
index d97ed3d..74bf7ba 100644
--- a/lib/ffmpeg/libavcodec/arm/vp3dsp_neon.S
+++ b/lib/ffmpeg/libavcodec/arm/vp3dsp_neon.S
@@ -20,12 +20,9 @@
#include "asm.S"
-.section .rodata
+.text
.align 4
-vp3_idct_constants:
-.short 64277, 60547, 54491, 46341, 36410, 25080, 12785
-
#define xC1S7 d0[0]
#define xC2S6 d0[1]
#define xC3S5 d0[2]
@@ -34,8 +31,6 @@ vp3_idct_constants:
#define xC6S2 d1[1]
#define xC7S1 d1[2]
-.text
-
.macro vp3_loop_filter
vsubl.u8 q3, d18, d17
vsubl.u8 q2, d16, d19
@@ -109,10 +104,14 @@ function ff_vp3_h_loop_filter_neon, export=1
bx lr
endfunc
+.align 4
+vp3_idct_constants:
+.short 64277, 60547, 54491, 46341, 36410, 25080, 12785
+.align 4
function vp3_idct_start_neon
vpush {d8-d15}
- movrel r3, vp3_idct_constants
+ adr r3, vp3_idct_constants
vld1.64 {d0-d1}, [r3,:128]
vld1.64 {d16-d19}, [r2,:128]!
vld1.64 {d20-d23}, [r2,:128]!