PPC/s390: [deoptimizer] Change deopt entries into builtins
Port 7f58ced

Original Commit Message:

    While the overall goal of this commit is to change deoptimization
    entries into builtins, there are multiple related things happening:

    - Deoptimization entries, formerly stubs (i.e. Code objects generated
      at runtime, guaranteed to be immovable), have been converted into
      builtins. The major restriction is that we now need to preserve the
      kRootRegister, which was formerly used on most architectures to pass
      the deoptimization id. The solution differs based on platform (see
      the sketch after this list).
    - Renamed DEOPT_ENTRIES_OR_FOR_TESTING code kind to FOR_TESTING.
    - Removed heap/ support for immovable Code generation.
    - Removed the DeserializerData class (no longer needed).
    - arm64: to preserve 4-byte deopt exits, introduced a new optimization
      in which the final jump to the deoptimization entry is generated
      once per Code object, and deopt exits can continue to emit a
      near-call.
    - arm,ia32,x64: change to fixed-size deopt exits. This reduces exit
      sizes by 4/8, 5, and 5 bytes, respectively.
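
    To make the new calling convention concrete, here is a minimal sketch
    (illustrative C++ with assumed names, not V8's actual code): a deopt
    exit no longer materializes an entry address or a deopt id; it calls
    through a per-isolate table of builtin entries addressed off the
    reserved root register, one slot per deoptimization kind.

      #include <cstdint>

      // Assumed stand-ins for V8-internal types.
      enum class DeoptimizeKind { kEager, kSoft, kBailout, kLazy };

      struct IsolateData {
        // At runtime this table sits at a fixed offset from kRootRegister
        // (r10 on arm, x26 on arm64, r13 on x64 in the listings below).
        void (*deopt_entry[4])();
      };

      // A deopt exit is now an indirect call through the root-register-
      // relative slot for its kind, e.g. x64's `call [r13 + <entry offset>]`.
      inline void CallDeoptEntry(IsolateData* isolate_data,
                                 DeoptimizeKind kind) {
        isolate_data->deopt_entry[static_cast<int>(kind)]();
      }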

    On arm the deopt exit size is reduced from 12 (or 16) bytes to 8 bytes
    by using the same strategy as on arm64 (recalc deopt id from return
    address). Before:

     e300a002       movw r10, <id>
     e59fc024       ldr ip, [pc, <entry offset>]
     e12fff3c       blx ip

    After:

     e59acb35       ldr ip, [r10, <entry offset>]
     e12fff3c       blx ip

    On arm64 the deopt exit size remains 4 bytes (or 8 bytes in some cases
    with CFI). Additionally, up to 4 builtin jumps are emitted per Code
    object (max 32 bytes added overhead per Code object). Before:

     9401cdae       bl <entry offset>

    After:

     # eager deoptimization entry jump.
     f95b1f50       ldr x16, [x26, <eager entry offset>]
     d61f0200       br x16
     # lazy deoptimization entry jump.
     f95b2b50       ldr x16, [x26, <lazy entry offset>]
     d61f0200       br x16
     # the deopt exit.
     97fffffc       bl <eager deoptimization entry jump offset>

    On ia32 the deopt exit size is reduced from 10 to 5 bytes. Before:

     bb00000000     mov ebx,<id>
     e825f5372b     call <entry>

    After:

     e8ea2256ba     call <entry>

    On x64 the deopt exit size is reduced from 12 to 7 bytes. Before:

     49c7c511000000 REX.W movq r13,<id>
     e8ea2f0700     call <entry>

    After:

     41ff9560360000 call [r13+<entry offset>]
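
    Since the id no longer appears in the exit itself, the deoptimizer must
    recover it. A minimal sketch of that recalculation (illustrative C++ with
    an assumed helper name; V8's actual logic also distinguishes lazy from
    non-lazy exit sizes), assuming contiguous fixed-size exits:

      #include <cassert>
      #include <cstdint>

      // Every deopt exit occupies exactly kExitSize bytes; 7 matches the
      // x64 exit shown above, and the value is per-architecture.
      constexpr uintptr_t kExitSize = 7;

      inline int DeoptExitIndexFromReturnAddress(uintptr_t return_address,
                                                 uintptr_t first_exit_address) {
        // The return address points just past the call, i.e. to the end of
        // exit n, so subtract one exit's worth before dividing.
        uintptr_t offset = return_address - first_exit_address;
        assert(offset > 0 && offset % kExitSize == 0);
        return static_cast<int>(offset / kExitSize) - 1;
      }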

R=jgruber@chromium.org, joransiu@ca.ibm.com, jyan@ca.ibm.com, michael_dawson@ca.ibm.com, miladfar@ca.ibm.com
BUG=
LOG=N

Change-Id: I49e4c92759043e46beb3c76c97823285b16feeef
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2486225
Reviewed-by: Milad Fa <mfarazma@redhat.com>
Commit-Queue: Junliang Yan <junyan@redhat.com>
Cr-Commit-Position: refs/heads/master@{#70637}
Junliang Yan authored and Commit Bot committed Oct 20, 2020
1 parent 89d9eb7 commit 5d5ed19
Showing 10 changed files with 524 additions and 498 deletions.
src/builtins/ppc/builtins-ppc.cc (246 additions, 0 deletions)
@@ -3351,6 +3351,252 @@ void Builtins::Generate_DirectCEntry(MacroAssembler* masm) {
  __ blr();
}

namespace {

// This code tries to be close to ia32 code so that any changes can be
// easily ported.
void Generate_DeoptimizationEntry(MacroAssembler* masm,
                                  DeoptimizeKind deopt_kind) {
  Isolate* isolate = masm->isolate();

  // Unlike on ARM we don't save all the registers, just the useful ones.
  // For the rest, there are gaps on the stack, so the offsets remain the same.
  const int kNumberOfRegisters = Register::kNumRegisters;

  RegList restored_regs = kJSCallerSaved | kCalleeSaved;
  RegList saved_regs = restored_regs | sp.bit();

  const int kDoubleRegsSize = kDoubleSize * DoubleRegister::kNumRegisters;

  // Save all double registers before messing with them.
  __ subi(sp, sp, Operand(kDoubleRegsSize));
  const RegisterConfiguration* config = RegisterConfiguration::Default();
  for (int i = 0; i < config->num_allocatable_double_registers(); ++i) {
    int code = config->GetAllocatableDoubleCode(i);
    const DoubleRegister dreg = DoubleRegister::from_code(code);
    int offset = code * kDoubleSize;
    __ stfd(dreg, MemOperand(sp, offset));
  }

  // Push saved_regs (needed to populate FrameDescription::registers_).
  // Leave gaps for other registers.
  __ subi(sp, sp, Operand(kNumberOfRegisters * kSystemPointerSize));
  for (int16_t i = kNumberOfRegisters - 1; i >= 0; i--) {
    if ((saved_regs & (1 << i)) != 0) {
      __ StoreP(ToRegister(i), MemOperand(sp, kSystemPointerSize * i));
    }
  }
  {
    UseScratchRegisterScope temps(masm);
    Register scratch = temps.Acquire();
    __ Move(scratch, ExternalReference::Create(
                         IsolateAddressId::kCEntryFPAddress, isolate));
    __ StoreP(fp, MemOperand(scratch));
  }
  const int kSavedRegistersAreaSize =
      (kNumberOfRegisters * kSystemPointerSize) + kDoubleRegsSize;

  // The bailout id is no longer passed explicitly: deopt exits have a fixed
  // size, so pass the fixed exit size marker and let the deoptimizer
  // recalculate the id from the return address.
  __ mov(r5, Operand(Deoptimizer::kFixedExitSizeMarker));
  // Get the address of the location in the code object (r6) (return
  // address for lazy deoptimization) and compute the fp-to-sp delta in
  // register r7.
  __ mflr(r6);
  __ addi(r7, sp, Operand(kSavedRegistersAreaSize));
  __ sub(r7, fp, r7);

  // Allocate a new deoptimizer object.
  // Pass six arguments in r3 to r8.
  __ PrepareCallCFunction(6, r8);
  __ li(r3, Operand::Zero());
  Label context_check;
  __ LoadP(r4, MemOperand(fp, CommonFrameConstants::kContextOrFrameTypeOffset));
  __ JumpIfSmi(r4, &context_check);
  __ LoadP(r3, MemOperand(fp, StandardFrameConstants::kFunctionOffset));
  __ bind(&context_check);
  __ li(r4, Operand(static_cast<int>(deopt_kind)));
  // r5: bailout id already loaded.
  // r6: code address or 0 already loaded.
  // r7: Fp-to-sp delta.
  __ Move(r8, ExternalReference::isolate_address(isolate));
  // Call Deoptimizer::New().
  {
    AllowExternalCallThatCantCauseGC scope(masm);
    __ CallCFunction(ExternalReference::new_deoptimizer_function(), 6);
  }

  // Preserve "deoptimizer" object in register r3 and get the input
  // frame descriptor pointer to r4 (deoptimizer->input_);
  __ LoadP(r4, MemOperand(r3, Deoptimizer::input_offset()));

  // Copy core registers into FrameDescription::registers_[kNumRegisters].
  DCHECK_EQ(Register::kNumRegisters, kNumberOfRegisters);
  for (int i = 0; i < kNumberOfRegisters; i++) {
    int offset =
        (i * kSystemPointerSize) + FrameDescription::registers_offset();
    __ LoadP(r5, MemOperand(sp, i * kSystemPointerSize));
    __ StoreP(r5, MemOperand(r4, offset));
  }

  int double_regs_offset = FrameDescription::double_registers_offset();
  // Copy double registers to
  // double_registers_[DoubleRegister::kNumRegisters]
  for (int i = 0; i < config->num_allocatable_double_registers(); ++i) {
    int code = config->GetAllocatableDoubleCode(i);
    int dst_offset = code * kDoubleSize + double_regs_offset;
    int src_offset =
        code * kDoubleSize + kNumberOfRegisters * kSystemPointerSize;
    __ lfd(d0, MemOperand(sp, src_offset));
    __ stfd(d0, MemOperand(r4, dst_offset));
  }

  // Mark the stack as not iterable for the CPU profiler which won't be able to
  // walk the stack without the return address.
  {
    UseScratchRegisterScope temps(masm);
    Register is_iterable = temps.Acquire();
    Register zero = r7;
    __ Move(is_iterable, ExternalReference::stack_is_iterable_address(isolate));
    __ li(zero, Operand(0));
    __ stb(zero, MemOperand(is_iterable));
  }

  // Remove the saved registers from the stack.
  __ addi(sp, sp, Operand(kSavedRegistersAreaSize));

  // Compute a pointer to the unwinding limit in register r5; that is
  // the first stack slot not part of the input frame.
  __ LoadP(r5, MemOperand(r4, FrameDescription::frame_size_offset()));
  __ add(r5, r5, sp);

  // Unwind the stack down to - but not including - the unwinding
  // limit and copy the contents of the activation frame to the input
  // frame description.
  __ addi(r6, r4, Operand(FrameDescription::frame_content_offset()));
  Label pop_loop;
  Label pop_loop_header;
  __ b(&pop_loop_header);
  __ bind(&pop_loop);
  __ pop(r7);
  __ StoreP(r7, MemOperand(r6, 0));
  __ addi(r6, r6, Operand(kSystemPointerSize));
  __ bind(&pop_loop_header);
  __ cmp(r5, sp);
  __ bne(&pop_loop);

  // Compute the output frame in the deoptimizer.
  __ push(r3);  // Preserve deoptimizer object across call.
  // r3: deoptimizer object; r4: scratch.
  __ PrepareCallCFunction(1, r4);
  // Call Deoptimizer::ComputeOutputFrames().
  {
    AllowExternalCallThatCantCauseGC scope(masm);
    __ CallCFunction(ExternalReference::compute_output_frames_function(), 1);
  }
  __ pop(r3);  // Restore deoptimizer object (class Deoptimizer).

  __ LoadP(sp, MemOperand(r3, Deoptimizer::caller_frame_top_offset()));

  // Replace the current (input) frame with the output frames.
  Label outer_push_loop, inner_push_loop, outer_loop_header, inner_loop_header;
  // Outer loop state: r7 = current "FrameDescription** output_",
  // r4 = one past the last FrameDescription**.
  __ lwz(r4, MemOperand(r3, Deoptimizer::output_count_offset()));
  __ LoadP(r7, MemOperand(r3, Deoptimizer::output_offset()));  // r7 is output_.
  __ ShiftLeftImm(r4, r4, Operand(kSystemPointerSizeLog2));
  __ add(r4, r7, r4);
  __ b(&outer_loop_header);

  __ bind(&outer_push_loop);
  // Inner loop state: r5 = current FrameDescription*, r6 = loop index.
  __ LoadP(r5, MemOperand(r7, 0));  // output_[ix]
  __ LoadP(r6, MemOperand(r5, FrameDescription::frame_size_offset()));
  __ b(&inner_loop_header);

  __ bind(&inner_push_loop);
  __ addi(r6, r6, Operand(-sizeof(intptr_t)));
  __ add(r9, r5, r6);
  __ LoadP(r9, MemOperand(r9, FrameDescription::frame_content_offset()));
  __ push(r9);

  __ bind(&inner_loop_header);
  __ cmpi(r6, Operand::Zero());
  __ bne(&inner_push_loop);  // test for gt?

  __ addi(r7, r7, Operand(kSystemPointerSize));
  __ bind(&outer_loop_header);
  __ cmp(r7, r4);
  __ blt(&outer_push_loop);

  __ LoadP(r4, MemOperand(r3, Deoptimizer::input_offset()));
  for (int i = 0; i < config->num_allocatable_double_registers(); ++i) {
    int code = config->GetAllocatableDoubleCode(i);
    const DoubleRegister dreg = DoubleRegister::from_code(code);
    int src_offset = code * kDoubleSize + double_regs_offset;
    __ lfd(dreg, MemOperand(r4, src_offset));
  }

  // Push pc, and continuation from the last output frame.
  __ LoadP(r9, MemOperand(r5, FrameDescription::pc_offset()));
  __ push(r9);
  __ LoadP(r9, MemOperand(r5, FrameDescription::continuation_offset()));
  __ push(r9);

  // Restore the registers from the last output frame.
  {
    UseScratchRegisterScope temps(masm);
    Register scratch = temps.Acquire();
    DCHECK(!(scratch.bit() & restored_regs));
    __ mr(scratch, r5);
    for (int i = kNumberOfRegisters - 1; i >= 0; i--) {
      int offset =
          (i * kSystemPointerSize) + FrameDescription::registers_offset();
      if ((restored_regs & (1 << i)) != 0) {
        __ LoadP(ToRegister(i), MemOperand(scratch, offset));
      }
    }
  }

  {
    UseScratchRegisterScope temps(masm);
    Register is_iterable = temps.Acquire();
    Register one = r7;
    __ Move(is_iterable, ExternalReference::stack_is_iterable_address(isolate));
    __ li(one, Operand(1));
    __ stb(one, MemOperand(is_iterable));
  }

  {
    UseScratchRegisterScope temps(masm);
    Register scratch = temps.Acquire();
    __ pop(scratch);  // get continuation, leave pc on stack
    __ pop(r0);
    __ mtlr(r0);
    __ Jump(scratch);
  }

  __ stop();
}

} // namespace

void Builtins::Generate_DeoptimizationEntry_Eager(MacroAssembler* masm) {
  Generate_DeoptimizationEntry(masm, DeoptimizeKind::kEager);
}

void Builtins::Generate_DeoptimizationEntry_Soft(MacroAssembler* masm) {
  Generate_DeoptimizationEntry(masm, DeoptimizeKind::kSoft);
}

void Builtins::Generate_DeoptimizationEntry_Bailout(MacroAssembler* masm) {
  Generate_DeoptimizationEntry(masm, DeoptimizeKind::kBailout);
}

void Builtins::Generate_DeoptimizationEntry_Lazy(MacroAssembler* masm) {
  Generate_DeoptimizationEntry(masm, DeoptimizeKind::kLazy);
}
#undef __
} // namespace internal
} // namespace v8