// stubGenerator_arm.cpp (forked from openjdk/jdk)
/*
* Copyright (c) 2008, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "asm/assembler.inline.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/barrierSetNMethod.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_arm.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp
#define __ _masm->
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// -------------------------------------------------------------------------------------------------------------------------
// Stub Code definitions
// Platform dependent parameters for array copy stubs
// Note: we have observed huge variations in microbenchmark behavior
// from platform to platform, depending on the configuration.
// Instead of adding a series of command line options (which
// unfortunately would have to be done in the shared files and could not
// appear only in the ARM port), the measured results are hard-coded here
// in a set of configurations, selected by specifying 'ArmCopyPlatform'.
// Currently, this 'platform' is hard-coded to a value that is a good
// enough trade-off. However, one can easily modify this file to test
// the hard-coded configurations or create new ones. If the gain is
// significant, we could decide to either add command line options or
// add code to automatically choose a configuration.
// See the comments below for the various configurations created.
#define DEFAULT_ARRAYCOPY_CONFIG 0
#define TEGRA2_ARRAYCOPY_CONFIG 1
#define IMX515_ARRAYCOPY_CONFIG 2
// Hard coded choices (XXX: could be changed to a command line option)
#define ArmCopyPlatform DEFAULT_ARRAYCOPY_CONFIG
#define ArmCopyCacheLineSize 32 // not worth optimizing to 64 according to measured gains
// configuration for each kind of loop
typedef struct {
int pld_distance; // prefetch distance (0 => no prefetch, <0 => prefetch before)
bool split_ldm; // if true, split each LDM into LDMs with fewer registers
bool split_stm; // if true, split each STM into STMs with fewer registers
} arraycopy_loop_config;
// configuration for all loops
typedef struct {
// const char *description;
arraycopy_loop_config forward_aligned;
arraycopy_loop_config backward_aligned;
arraycopy_loop_config forward_shifted;
arraycopy_loop_config backward_shifted;
} arraycopy_platform_config;
// configured platforms
static arraycopy_platform_config arraycopy_configurations[] = {
// configuration parameters for arraycopy loops
// Configurations were chosen based on manual analysis of benchmark
// results, minimizing overhead with respect to best results on the
// different test cases.
// Prefetch before is always favored since it avoids dirtying the
// cache uselessly for small copies. Code for prefetch after has
// been kept in case the difference is significant for some
// platforms but we might consider dropping it.
// distance, ldm, stm
{
// default: trade-off between tegra2/imx515/nv-tegra2
// Notes on benchmarking:
// - not far from the optimal configuration on nv-tegra2
// - within 5% of the optimal configuration, except for backward aligned on IMX
// - up to 40% from the optimal configuration for backward shifted and backward aligned on tegra2,
// but still on par with the operating system copy
{-256, true, true }, // forward aligned
{-256, true, true }, // backward aligned
{-256, false, false }, // forward shifted
{-256, true, true } // backward shifted
},
{
// configuration tuned on tegra2-4.
// Warning: should not be used on nv-tegra2!
// Notes:
// - prefetch after gives a 40% gain on backward copies on tegra2-4,
// resulting in better numbers than the operating system
// copy. However, this can lead to a 300% loss on nv-tegra and has
// more impact on the cache (fetches further than what is
// copied). Use this configuration with care, in case it improves
// reference benchmarks.
{-256, true, true }, // forward aligned
{96, false, false }, // backward aligned
{-256, false, false }, // forward shifted
{96, false, false } // backward shifted
},
{
// configuration tuned on imx515
// Notes:
// - a smaller prefetch distance is sufficient to get good results and might be more stable
// - refined backward aligned options are within 5% of the optimal configuration except for
// tests where the arrays fit in the cache
{-160, false, false }, // forward aligned
{-160, false, false }, // backward aligned
{-160, false, false }, // forward shifted
{-160, true, true } // backward shifted
}
};
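// How these tables are consumed (illustrative; see
// generate_forward_aligned_copy_loop below): a copy loop reads, e.g.,
//   arraycopy_configurations[ArmCopyPlatform].forward_aligned
// so with the default platform this yields {-256, true, true}: prefetch
// 256 bytes ahead of the copied data ('prefetch before'), and split the
// LDM/STM bursts into smaller multi-register transfers.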
class StubGenerator: public StubCodeGenerator {
#ifdef PRODUCT
#define inc_counter_np(a,b,c) ((void)0)
#else
#define inc_counter_np(counter, t1, t2) \
BLOCK_COMMENT("inc_counter " #counter); \
__ inc_counter(&counter, t1, t2);
#endif
private:
address generate_call_stub(address& return_address) {
StubCodeMark mark(this, "StubRoutines", "call_stub");
address start = __ pc();
assert(frame::entry_frame_call_wrapper_offset == 0, "adjust this code");
__ mov(Rtemp, SP);
__ push(RegisterSet(FP) | RegisterSet(LR));
__ fpush_hardfp(FloatRegisterSet(D8, 8));
__ stmdb(SP, RegisterSet(R0, R2) | RegisterSet(R4, R6) | RegisterSet(R8, R10) | altFP_7_11, writeback);
__ mov(Rmethod, R3);
__ ldmia(Rtemp, RegisterSet(R1, R3) | Rthread); // stacked arguments
// XXX: TODO
// Would be better with respect to native tools if the following
// setting of FP was changed to conform to the native ABI, with FP
// pointing to the saved FP slot (and the corresponding modifications
// for entry_frame_call_wrapper_offset and frame::real_fp).
__ mov(FP, SP);
{
Label no_parameters, pass_parameters;
__ cmp(R3, 0);
__ b(no_parameters, eq);
__ bind(pass_parameters);
__ ldr(Rtemp, Address(R2, wordSize, post_indexed)); // Rtemp OK, unused and scratchable
__ subs(R3, R3, 1);
__ push(Rtemp);
__ b(pass_parameters, ne);
__ bind(no_parameters);
}
__ mov(Rsender_sp, SP);
__ blx(R1);
return_address = __ pc();
__ add(SP, FP, wordSize); // Skip link to JavaCallWrapper
__ pop(RegisterSet(R2, R3));
#ifndef __ABI_HARD__
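// Soft-float ABI: all results come back in core registers. The second
// word is stored only for T_LONG or T_DOUBLE: the first cmp sets eq for
// T_LONG, and the second (executed only on ne) sets eq for T_DOUBLE.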
__ cmp(R3, T_LONG);
__ cmp(R3, T_DOUBLE, ne);
__ str(R0, Address(R2));
__ str(R1, Address(R2, wordSize), eq);
#else
Label cont, l_float, l_double;
__ cmp(R3, T_DOUBLE);
__ b(l_double, eq);
__ cmp(R3, T_FLOAT);
__ b(l_float, eq);
__ cmp(R3, T_LONG);
__ str(R0, Address(R2));
__ str(R1, Address(R2, wordSize), eq);
__ b(cont);
__ bind(l_double);
__ fstd(D0, Address(R2));
__ b(cont);
__ bind(l_float);
__ fsts(S0, Address(R2));
__ bind(cont);
#endif
__ pop(RegisterSet(R4, R6) | RegisterSet(R8, R10) | altFP_7_11);
__ fpop_hardfp(FloatRegisterSet(D8, 8));
__ pop(RegisterSet(FP) | RegisterSet(PC));
return start;
}
// (in) Rexception_obj: exception oop
address generate_catch_exception() {
StubCodeMark mark(this, "StubRoutines", "catch_exception");
address start = __ pc();
__ str(Rexception_obj, Address(Rthread, Thread::pending_exception_offset()));
__ b(StubRoutines::_call_stub_return_address);
return start;
}
// (in) Rexception_pc: return address
address generate_forward_exception() {
StubCodeMark mark(this, "StubRoutines", "forward exception");
address start = __ pc();
__ mov(c_rarg0, Rthread);
__ mov(c_rarg1, Rexception_pc);
__ call_VM_leaf(CAST_FROM_FN_PTR(address,
SharedRuntime::exception_handler_for_return_address),
c_rarg0, c_rarg1);
__ ldr(Rexception_obj, Address(Rthread, Thread::pending_exception_offset()));
const Register Rzero = __ zero_register(Rtemp); // Rtemp OK (cleared by above call)
__ str(Rzero, Address(Rthread, Thread::pending_exception_offset()));
#ifdef ASSERT
// make sure exception is set
{ Label L;
__ cbnz(Rexception_obj, L);
__ stop("StubRoutines::forward exception: no pending exception (2)");
__ bind(L);
}
#endif
// Verify that there is really a valid exception in Rexception_obj.
__ verify_oop(Rexception_obj);
__ jump(R0); // handler is returned in R0 by runtime function
return start;
}
// Integer division shared routine
// Input:
// R0 - dividend
// R2 - divisor
// Output:
// R0 - remainder
// R1 - quotient
// Destroys:
// R2
// LR
address generate_idiv_irem() {
Label positive_arguments, negative_or_zero, call_slow_path;
Register dividend = R0;
Register divisor = R2;
Register remainder = R0;
Register quotient = R1;
Register tmp = LR;
assert(dividend == remainder, "must be");
address start = __ pc();
// Check for special cases: divisor <= 0 or dividend < 0
__ cmp(divisor, 0);
__ orrs(quotient, dividend, divisor, ne);
__ b(negative_or_zero, le);
__ bind(positive_arguments);
// Save return address on stack to free one extra register
__ push(LR);
// Approximate the maximum order of the quotient
__ clz(tmp, dividend);
__ clz(quotient, divisor);
__ subs(tmp, quotient, tmp);
__ mov(quotient, 0);
// Jump to the appropriate place in the unrolled loop below
__ ldr(PC, Address(PC, tmp, lsl, 2), pl);
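// Note (derived from ARM PC-read semantics): the PC operand reads as
// (address of this ldr + 8), which is exactly where the offset table
// below begins, so this loads offset_table[tmp].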
// If divisor is greater than dividend, return immediately
__ pop(PC);
// Offset table
Label offset_table[32];
int i;
for (i = 0; i <= 31; i++) {
__ emit_address(offset_table[i]);
}
// Unrolled loop of 32 division steps
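// Worked example (illustrative): dividend = 100, divisor = 7.
// clz(100) = 25, clz(7) = 29, so tmp = 4 and we enter at step i = 4:
//   i = 4: 7<<4 = 112 > 100, skip
//   i = 3: 100 - 56 = 44, quotient += 8
//   i = 2: 44 - 28 = 16, quotient += 4
//   i = 1: 16 - 14 = 2, quotient += 2
//   i = 0: 7 > 2, skip
// giving quotient = 14, remainder = 2.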
for (i = 31; i >= 0; i--) {
__ bind(offset_table[i]);
__ cmp(remainder, AsmOperand(divisor, lsl, i));
__ sub(remainder, remainder, AsmOperand(divisor, lsl, i), hs);
__ add(quotient, quotient, 1 << i, hs);
}
__ pop(PC);
__ bind(negative_or_zero);
// Find the combination of argument signs and jump to corresponding handler
__ andr(quotient, dividend, 0x80000000, ne);
__ orr(quotient, quotient, AsmOperand(divisor, lsr, 31), ne);
__ add(PC, PC, AsmOperand(quotient, ror, 26), ne);
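// Dispatch note (derived from the code above): 'quotient' now holds the
// dividend sign in bit 31 and the divisor sign in bit 0. ROR #26 moves
// these to bits 5 and 6, i.e. byte offsets 32 and 64, so PC advances by
// 32, 64 or 96 bytes into the fixed-size (8-instruction = 32-byte)
// handler blocks below. For divisor == 0 the flags are still eq, the
// conditional add is skipped, and execution falls through to the first block.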
__ str(LR, Address(Rthread, JavaThread::saved_exception_pc_offset()));
// The leaf runtime function can destroy R0-R3 and R12 registers which are still alive
RegisterSet saved_registers = RegisterSet(R3) | RegisterSet(R12);
#if R9_IS_SCRATCHED
// Safer to save R9 here since callers may have been written
// assuming R9 survives. This is suboptimal but may not be worth
// revisiting for this slow case.
// save also R10 for alignment
saved_registers = saved_registers | RegisterSet(R9, R10);
#endif
{
// divisor == 0
FixedSizeCodeBlock zero_divisor(_masm, 8, true);
__ push(saved_registers);
__ mov(R0, Rthread);
__ mov(R1, LR);
__ mov(R2, SharedRuntime::IMPLICIT_DIVIDE_BY_ZERO);
__ b(call_slow_path);
}
{
// divisor > 0 && dividend < 0
FixedSizeCodeBlock positive_divisor_negative_dividend(_masm, 8, true);
__ push(LR);
__ rsb(dividend, dividend, 0);
__ bl(positive_arguments);
__ rsb(remainder, remainder, 0);
__ rsb(quotient, quotient, 0);
__ pop(PC);
}
{
// divisor < 0 && dividend > 0
FixedSizeCodeBlock negative_divisor_positive_dividend(_masm, 8, true);
__ push(LR);
__ rsb(divisor, divisor, 0);
__ bl(positive_arguments);
__ rsb(quotient, quotient, 0);
__ pop(PC);
}
{
// divisor < 0 && dividend < 0
FixedSizeCodeBlock negative_divisor_negative_dividend(_masm, 8, true);
__ push(LR);
__ rsb(dividend, dividend, 0);
__ rsb(divisor, divisor, 0);
__ bl(positive_arguments);
__ rsb(remainder, remainder, 0);
__ pop(PC);
}
__ bind(call_slow_path);
__ call(CAST_FROM_FN_PTR(address, SharedRuntime::continuation_for_implicit_exception));
__ pop(saved_registers);
__ bx(R0);
return start;
}
// As per atomic.hpp the Atomic read-modify-write operations must be logically implemented as:
// <fence>; <op>; <membar StoreLoad|StoreStore>
// But for load-linked/store-conditional based systems a fence here simply means
// no load/store can be reordered with respect to the initial load-linked, so we have:
// <membar storeload|loadload> ; load-linked; <op>; store-conditional; <membar storeload|storestore>
// There are no memory actions in <op> so nothing further is needed.
//
// So we define the following for convenience:
#define MEMBAR_ATOMIC_OP_PRE \
MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad|MacroAssembler::LoadLoad)
#define MEMBAR_ATOMIC_OP_POST \
MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad|MacroAssembler::StoreStore)
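// For illustration, with ldrex/strex available the atomic_add stub
// generated below is logically:
//   <MEMBAR_ATOMIC_OP_PRE>        // barrier (dmb on ARMv7)
// retry:
//   ldrex newval, [dest]
//   add   newval, addval, newval
//   strex ok, newval, [dest]
//   cmp   ok, #0
//   bne   retry
//   <MEMBAR_ATOMIC_OP_POST>       // barrier (dmb on ARMv7)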
// Note: JDK 9 only supports ARMv7+ so we always have ldrexd available even though the
// code below allows for it to be otherwise. The else clause indicates an ARMv5 system
// for which we do not support MP and so membars are not necessary. This ARMv5 code will
// be removed in the future.
// Implementation of atomic_add(jint add_value, volatile jint* dest)
// used by Atomic::add(volatile jint* dest, jint add_value)
//
// Arguments :
//
// add_value: R0
// dest: R1
//
// Results:
//
// R0: the new value stored in dest
//
// Overwrites:
//
// R1, R2, R3
//
address generate_atomic_add() {
address start;
StubCodeMark mark(this, "StubRoutines", "atomic_add");
Label retry;
start = __ pc();
Register addval = R0;
Register dest = R1;
Register prev = R2;
Register ok = R2;
Register newval = R3;
if (VM_Version::supports_ldrex()) {
__ membar(MEMBAR_ATOMIC_OP_PRE, prev);
__ bind(retry);
__ ldrex(newval, Address(dest));
__ add(newval, addval, newval);
__ strex(ok, newval, Address(dest));
__ cmp(ok, 0);
__ b(retry, ne);
__ mov (R0, newval);
__ membar(MEMBAR_ATOMIC_OP_POST, prev);
} else {
__ bind(retry);
__ ldr (prev, Address(dest));
__ add(newval, addval, prev);
__ atomic_cas_bool(prev, newval, dest, 0, noreg/*ignored*/);
__ b(retry, ne);
__ mov (R0, newval);
}
__ bx(LR);
return start;
}
// Implementation of jint atomic_xchg(jint exchange_value, volatile jint* dest)
// used by Atomic::xchg(volatile jint* dest, jint exchange_value)
//
// Arguments :
//
// exchange_value: R0
// dest: R1
//
// Results:
//
// R0: the value previously stored in dest
//
// Overwrites:
//
// R1, R2, R3
//
address generate_atomic_xchg() {
address start;
StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
start = __ pc();
Register newval = R0;
Register dest = R1;
Register prev = R2;
Label retry;
if (VM_Version::supports_ldrex()) {
Register ok=R3;
__ membar(MEMBAR_ATOMIC_OP_PRE, prev);
__ bind(retry);
__ ldrex(prev, Address(dest));
__ strex(ok, newval, Address(dest));
__ cmp(ok, 0);
__ b(retry, ne);
__ mov (R0, prev);
__ membar(MEMBAR_ATOMIC_OP_POST, prev);
} else {
__ bind(retry);
__ ldr (prev, Address(dest));
__ atomic_cas_bool(prev, newval, dest, 0, noreg/*ignored*/);
__ b(retry, ne);
__ mov (R0, prev);
}
__ bx(LR);
return start;
}
// Implementation of jint atomic_cmpxchg(jint exchange_value, volatile jint *dest, jint compare_value)
// used by Atomic::cmpxchg(volatile jint *dest, jint compare_value, jint exchange_value)
//
// Arguments :
//
// compare_value: R0
// exchange_value: R1
// dest: R2
//
// Results:
//
// R0: the value previously stored in dest
//
// Overwrites:
//
// R0, R1, R2, R3, Rtemp
//
address generate_atomic_cmpxchg() {
address start;
StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
start = __ pc();
Register cmp = R0;
Register newval = R1;
Register dest = R2;
Register temp1 = R3;
Register temp2 = Rtemp; // Rtemp free (native ABI)
__ membar(MEMBAR_ATOMIC_OP_PRE, temp1);
// atomic_cas returns previous value in R0
__ atomic_cas(temp1, temp2, cmp, newval, dest, 0);
__ membar(MEMBAR_ATOMIC_OP_POST, temp1);
__ bx(LR);
return start;
}
// Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
// with the arguments reordered beforehand by a wrapper to (jlong compare_value, jlong exchange_value, volatile jlong *dest)
//
// Arguments :
//
// compare_value: R1 (High), R0 (Low)
// exchange_value: R3 (High), R2 (Low)
// dest: SP+0
//
// Results:
//
// R0:R1: the value previously stored in dest
//
// Overwrites:
//
address generate_atomic_cmpxchg_long() {
address start;
StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
start = __ pc();
Register cmp_lo = R0;
Register cmp_hi = R1;
Register newval_lo = R2;
Register newval_hi = R3;
Register addr = Rtemp; /* After load from stack */
Register temp_lo = R4;
Register temp_hi = R5;
Register temp_result = R8;
assert_different_registers(cmp_lo, newval_lo, temp_lo, addr, temp_result, R7);
assert_different_registers(cmp_hi, newval_hi, temp_hi, addr, temp_result, R7);
__ membar(MEMBAR_ATOMIC_OP_PRE, Rtemp); // Rtemp free (native ABI)
// Stack is unaligned, maintain double word alignment by pushing
// odd number of regs.
__ push(RegisterSet(temp_result) | RegisterSet(temp_lo, temp_hi));
__ ldr(addr, Address(SP, 12));
// atomic_cas64 returns previous value in temp_lo, temp_hi
__ atomic_cas64(temp_lo, temp_hi, temp_result, cmp_lo, cmp_hi,
newval_lo, newval_hi, addr, 0);
__ mov(R0, temp_lo);
__ mov(R1, temp_hi);
__ pop(RegisterSet(temp_result) | RegisterSet(temp_lo, temp_hi));
__ membar(MEMBAR_ATOMIC_OP_POST, Rtemp); // Rtemp free (native ABI)
__ bx(LR);
return start;
}
address generate_atomic_load_long() {
address start;
StubCodeMark mark(this, "StubRoutines", "atomic_load_long");
start = __ pc();
Register result_lo = R0;
Register result_hi = R1;
Register src = R0;
if (VM_Version::supports_ldrexd()) {
__ ldrexd(result_lo, Address(src));
__ clrex(); // FIXME: safe to remove?
} else if (!os::is_MP()) {
// Last-ditch attempt: we are allegedly running on uni-processor.
// Load the thing non-atomically and hope for the best.
__ ldmia(src, RegisterSet(result_lo, result_hi));
} else {
__ stop("Atomic load(jlong) unsupported on this platform");
}
__ bx(LR);
return start;
}
address generate_atomic_store_long() {
address start;
StubCodeMark mark(this, "StubRoutines", "atomic_store_long");
start = __ pc();
Register newval_lo = R0;
Register newval_hi = R1;
Register dest = R2;
Register scratch_lo = R2;
Register scratch_hi = R3;
Register result = R3;
if (VM_Version::supports_ldrexd()) {
__ mov(Rtemp, dest); // get dest to Rtemp
Label retry;
__ bind(retry);
__ ldrexd(scratch_lo, Address(Rtemp));
__ strexd(result, R0, Address(Rtemp));
__ rsbs(result, result, 1);
__ b(retry, eq);
} else if (!os::is_MP()) {
// Last-ditch attempt: we are allegedly running on uni-processor.
// Store the thing non-atomically and hope for the best.
__ stmia(dest, RegisterSet(newval_lo, newval_hi));
} else {
__ stop("Atomic store(jlong) unsupported on this platform");
}
__ bx(LR);
return start;
}
#ifdef COMPILER2
// Support for uint StubRoutine::Arm::partial_subtype_check( Klass sub, Klass super );
// Arguments :
//
// ret : R0, returned
// icc/xcc: set as R0 (depending on wordSize)
// sub : R1, argument, not changed
// super: R2, argument, not changed
// raddr: LR, blown by call
address generate_partial_subtype_check() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
address start = __ pc();
// based on SPARC check_klass_subtype_[fast|slow]_path (without CompressedOops)
// R0 used as tmp_reg (in addition to return reg)
Register sub_klass = R1;
Register super_klass = R2;
Register tmp_reg2 = R3;
Register tmp_reg3 = R4;
#define saved_set tmp_reg2, tmp_reg3
Label L_loop, L_fail;
int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
// fast check should be redundant
// slow check
{
__ raw_push(saved_set);
// a couple of useful fields in sub_klass:
int ss_offset = in_bytes(Klass::secondary_supers_offset());
// Do a linear scan of the secondary super-klass chain.
// This code is rarely used, so simplicity is a virtue here.
inc_counter_np(SharedRuntime::_partial_subtype_ctr, tmp_reg2, tmp_reg3);
Register scan_temp = tmp_reg2;
Register count_temp = tmp_reg3;
// We will consult the secondary-super array.
__ ldr(scan_temp, Address(sub_klass, ss_offset));
Register search_key = super_klass;
// Load the array length.
__ ldr_s32(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes()));
__ add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes());
__ add(count_temp, count_temp, 1);
// Top of search loop
__ bind(L_loop);
// Notes:
// scan_temp starts at the array elements
// count_temp is 1+size
__ subs(count_temp, count_temp, 1);
__ b(L_fail, eq); // not found in the array
// Load next super to check
// In the array of super classes elements are pointer sized.
int element_size = wordSize;
__ ldr(R0, Address(scan_temp, element_size, post_indexed));
// Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list
__ subs(R0, R0, search_key); // set R0 to 0 on success (and flags to eq)
// A miss means we are NOT a subtype and need to keep looping
__ b(L_loop, ne);
// Falling out the bottom means we found a hit; we ARE a subtype
// Success. Cache the super we found and proceed in triumph.
__ str(super_klass, Address(sub_klass, sc_offset));
// Return success
// R0 is already 0 and flags are already set to eq
__ raw_pop(saved_set);
__ ret();
// Return failure
__ bind(L_fail);
__ movs(R0, 1); // sets the flags
__ raw_pop(saved_set);
__ ret();
}
return start;
}
#undef saved_set
#endif // COMPILER2
//----------------------------------------------------------------------------------------------------
// Non-destructive plausibility checks for oops
address generate_verify_oop() {
StubCodeMark mark(this, "StubRoutines", "verify_oop");
address start = __ pc();
// Incoming arguments:
//
// R0: error message (char* )
// R1: address of register save area
// R2: oop to verify
//
// All registers are saved before calling this stub. However, condition flags should be saved here.
const Register oop = R2;
const Register klass = R3;
const Register tmp1 = R6;
const Register tmp2 = R8;
const Register flags = Rtmp_save0; // R4/R19
const Register ret_addr = Rtmp_save1; // R5/R20
assert_different_registers(oop, klass, tmp1, tmp2, flags, ret_addr, R7);
Label exit, error;
InlinedAddress verify_oop_count((address) StubRoutines::verify_oop_count_addr());
__ mrs(Assembler::CPSR, flags);
__ ldr_literal(tmp1, verify_oop_count);
__ ldr_s32(tmp2, Address(tmp1));
__ add(tmp2, tmp2, 1);
__ str_32(tmp2, Address(tmp1));
// make sure object is 'reasonable'
__ cbz(oop, exit); // if obj is NULL it is ok
// Check if the oop is in the right area of memory
// Note: oop_mask and oop_bits must be updated if the code is saved/reused
const address oop_mask = (address) Universe::verify_oop_mask();
const address oop_bits = (address) Universe::verify_oop_bits();
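// i.e. check that (oop & Universe::verify_oop_mask()) == Universe::verify_oop_bits()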
__ mov_address(tmp1, oop_mask);
__ andr(tmp2, oop, tmp1);
__ mov_address(tmp1, oop_bits);
__ cmp(tmp2, tmp1);
__ b(error, ne);
// make sure klass is 'reasonable'
__ load_klass(klass, oop); // get klass
__ cbz(klass, error); // if klass is NULL it is broken
// return if everything seems ok
__ bind(exit);
__ msr(Assembler::CPSR_f, flags);
__ ret();
// handle errors
__ bind(error);
__ mov(ret_addr, LR); // save return address
// R0: error message
// R1: register save area
__ call(CAST_FROM_FN_PTR(address, MacroAssembler::debug));
__ mov(LR, ret_addr);
__ b(exit);
__ bind_literal(verify_oop_count);
return start;
}
//----------------------------------------------------------------------------------------------------
// Array copy stubs
//
// Generate overlap test for array copy stubs
//
// Input:
// R0 - array1
// R1 - array2
// R2 - element count, 32-bit int
//
// input registers are preserved
//
void array_overlap_test(address no_overlap_target, int log2_elem_size, Register tmp1, Register tmp2) {
assert(no_overlap_target != NULL, "must be generated");
array_overlap_test(no_overlap_target, NULL, log2_elem_size, tmp1, tmp2);
}
void array_overlap_test(Label& L_no_overlap, int log2_elem_size, Register tmp1, Register tmp2) {
array_overlap_test(NULL, &L_no_overlap, log2_elem_size, tmp1, tmp2);
}
void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size, Register tmp1, Register tmp2) {
const Register from = R0;
const Register to = R1;
const Register count = R2;
const Register to_from = tmp1; // to - from
const Register byte_count = (log2_elem_size == 0) ? count : tmp2; // count << log2_elem_size
assert_different_registers(from, to, count, tmp1, tmp2);
// The no-overlap branch is taken if 'to' is lower (unsigned) than 'from',
// or if 'to' is at least (count*size) bytes above 'from'.
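// Example (illustrative): from = 0x1000, byte_count = 0x20.
//   to = 0x0ff0: to - from borrows -> lo -> branch (backward overlap is safe forward)
//   to = 0x1020: to - from == byte_count -> ge -> branch (regions disjoint)
//   to = 0x1010: 0x10 < 0x20 -> fall through, since a forward copy would
//   overwrite source bytes that have not been copied yet.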
BLOCK_COMMENT("Array Overlap Test:");
__ subs(to_from, to, from);
if (log2_elem_size != 0) {
__ mov(byte_count, AsmOperand(count, lsl, log2_elem_size));
}
if (NOLp == NULL)
__ b(no_overlap_target,lo);
else
__ b((*NOLp), lo);
__ cmp(to_from, byte_count);
if (NOLp == NULL)
__ b(no_overlap_target, ge);
else
__ b((*NOLp), ge);
}
// Probably we should choose between "prefetch-store before or after the store", not "before or after the load".
void prefetch(Register from, Register to, int offset, int to_delta = 0) {
__ prefetch_read(Address(from, offset));
}
// Generate the inner loop for forward aligned array copy
//
// Arguments
// from: src address, 64 bits aligned
// to: dst address, wordSize aligned
// count: number of elements (32-bit int)
// bytes_per_count: number of bytes for each unit of 'count'
//
// Return the minimum initial value for count
//
// Notes:
// - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA)
// - 'to' aligned on wordSize
// - 'count' must be greater or equal than the returned value
//
// Increases 'from' and 'to' by count*bytes_per_count.
//
// Scratches 'count', R3.
// R4-R10 are preserved (saved/restored).
//
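// Example (illustrative): for a jint copy, bytes_per_count = 4, so with
// bytes_per_loop = 8*wordSize = 32 each iteration moves 8 elements
// (count_per_loop = 8). With the default config (pld_distance = -256,
// i.e. prefetch before), 'count' is further pre-decreased by
// (32 + 256) / 4 = 72 and smaller copies branch to L_skip_pld.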
int generate_forward_aligned_copy_loop(Register from, Register to, Register count, int bytes_per_count, bool unsafe_copy = false) {
assert (from == R0 && to == R1 && count == R2, "adjust the implementation below");
const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration
arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_aligned;
int pld_offset = config->pld_distance;
const int count_per_loop = bytes_per_loop / bytes_per_count;
bool split_read= config->split_ldm;
bool split_write= config->split_stm;
// XXX optim: use VLDM/VSTM when available (Neon) with PLD
// NEONCopyPLD
// PLD [r1, #0xC0]
// VLDM r1!,{d0-d7}
// VSTM r0!,{d0-d7}
// SUBS r2,r2,#0x40
// BGE NEONCopyPLD
__ push(RegisterSet(R4,R10));
const bool prefetch_before = pld_offset < 0;
const bool prefetch_after = pld_offset > 0;
Label L_skip_pld;
{
// UnsafeCopyMemory page error: continue after ucm
UnsafeCopyMemoryMark ucmm(this, unsafe_copy, true);
// pre-decrease so the loop exits when fewer than count_per_loop elements remain
__ sub_32(count, count, count_per_loop);
if (pld_offset != 0) {
pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset;
prefetch(from, to, 0);
if (prefetch_before) {
// If prefetch is done ahead, final PLDs that overflow the
// copied area can be easily avoided. 'count' is predecreased
// by the prefetch distance to optimize the inner loop and the
// outer loop skips the PLD.
__ subs_32(count, count, (bytes_per_loop+pld_offset)/bytes_per_count);
// skip prefetch for small copies
__ b(L_skip_pld, lt);
}
int offset = ArmCopyCacheLineSize;
while (offset <= pld_offset) {
prefetch(from, to, offset);
offset += ArmCopyCacheLineSize;
}
}
{
// 32-bit ARM note: we have tried implementing loop unrolling to skip one
// PLD with 64 bytes cache line but the gain was not significant.
Label L_copy_loop;
__ align(OptoLoopAlignment);
__ BIND(L_copy_loop);
if (prefetch_before) {
prefetch(from, to, bytes_per_loop + pld_offset);
__ BIND(L_skip_pld);
}
if (split_read) {