
Commit 814625f

Mike Pall authored and igormunkin committed
ARM64: Fix exit stub patching.
Contributed by Javier Guerra Giraldez.

(cherry picked from commit 9da0653)

When the side trace is assembled, it is linked to its parent trace: the JIT runs through the parent trace mcode and updates the jump instructions targeted at the corresponding exitno. Prior to this patch, these instructions were patched unconditionally, which leads to errors if the jump target address is out of the value ranges specified in the ARM64 references [1][2][3][4][5][6].

With this patch, <lj_asm_patchexit> considers the value ranges of the jump targets and directly updates only those instructions whose target fits the particular jump range. Moreover, the corresponding jump in the pad leading to <lj_vm_exit_handler> is also patched, so the instructions that are not updated in place still reach the linked side trace through it. Additionally, this patch includes some refactoring of how jump targets are assembled.

Igor Munkin:
* added the description and the test for the problem

[1]: https://developer.arm.com/documentation/dui0801/g/A64-General-Instructions/B
[2]: https://developer.arm.com/documentation/dui0801/g/A64-General-Instructions/B-cond
[3]: https://developer.arm.com/documentation/dui0801/g/A64-General-Instructions/CBZ
[4]: https://developer.arm.com/documentation/dui0801/g/A64-General-Instructions/CBNZ
[5]: https://developer.arm.com/documentation/dui0801/g/A64-General-Instructions/TBZ
[6]: https://developer.arm.com/documentation/dui0801/g/A64-General-Instructions/TBNZ

Resolves tarantool/tarantool#6098
Part of tarantool/tarantool#5629

Reviewed-by: Sergey Kaplun <skaplun@tarantool.org>
Reviewed-by: Kirill Yukhin <kyukhin@tarantool.org>
Signed-off-by: Igor Munkin <imun@tarantool.org>
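The value ranges at stake differ per branch form: B takes a 26-bit signed word offset (roughly +-128 MiB of reach), B.cond/CBZ/CBNZ take 19 bits (roughly +-1 MiB), and TBZ/TBNZ take 14 bits (roughly +-32 KiB). The standalone C sketch below is not part of the commit; it merely mirrors the A64F_S_OK(x, b) check this patch adds to src/lj_target_arm64.h and shows how a given word offset passes or fails those checks.

#include <stdio.h>
#include <stddef.h>

/* Mirror of the A64F_S_OK(x, b) check added in src/lj_target_arm64.h:
** a branch with a b-bit signed immediate can encode the word offset x
** iff x fits into [-2^(b-1), 2^(b-1)-1]. */
static int fits_signed(ptrdiff_t x, int bits)
{
  return ((x + ((ptrdiff_t)1 << (bits - 1))) >> bits) == 0;
}

int main(void)
{
  /* ARM64 branch offsets are counted in 32-bit instruction words. */
  ptrdiff_t near = 0x1000;    /* +16 KiB: reachable by every branch form. */
  ptrdiff_t far = 0x100000;   /* +4 MiB: only B (26-bit) can reach it. */
  printf("B (26-bit, ~+-128 MiB): near=%d far=%d\n",
         fits_signed(near, 26), fits_signed(far, 26));
  printf("B.cond/CBZ/CBNZ (19-bit, ~+-1 MiB): near=%d far=%d\n",
         fits_signed(near, 19), fits_signed(far, 19));
  printf("TBZ/TBNZ (14-bit, ~+-32 KiB): near=%d far=%d\n",
         fits_signed(near, 14), fits_signed(far, 14));
  return 0;
}

Only the guard forms that pass the check can be re-pointed at the side trace directly; the rest keep branching to the exit stub, whose unconditional B is always re-pointed by the patched <lj_asm_patchexit>.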
1 parent 7570ff6 commit 814625f

7 files changed: +205 -38 lines changed

src/lj_asm_arm64.h

Lines changed: 37 additions & 27 deletions
@@ -56,11 +56,11 @@ static void asm_exitstub_setup(ASMState *as, ExitNo nexits)
     asm_mclimit(as);
   /* 1: str lr,[sp]; bl ->vm_exit_handler; movz w0,traceno; bl <1; bl <1; ... */
   for (i = nexits-1; (int32_t)i >= 0; i--)
-    *--mxp = A64I_LE(A64I_BL|((-3-i)&0x03ffffffu));
-  *--mxp = A64I_LE(A64I_MOVZw|A64F_U16(as->T->traceno));
+    *--mxp = A64I_LE(A64I_BL | A64F_S26(-3-i));
+  *--mxp = A64I_LE(A64I_MOVZw | A64F_U16(as->T->traceno));
   mxp--;
-  *mxp = A64I_LE(A64I_BL|(((MCode *)(void *)lj_vm_exit_handler-mxp)&0x03ffffffu));
-  *--mxp = A64I_LE(A64I_STRx|A64F_D(RID_LR)|A64F_N(RID_SP));
+  *mxp = A64I_LE(A64I_BL | A64F_S26(((MCode *)(void *)lj_vm_exit_handler-mxp)));
+  *--mxp = A64I_LE(A64I_STRx | A64F_D(RID_LR) | A64F_N(RID_SP));
   as->mctop = mxp;
 }
 
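A note on the stub group emitted above, since the patched <lj_asm_patchexit> relies on its fixed shape: label <1> holds the str/bl pair entering <lj_vm_exit_handler>, and the bl for exit number i sits 3+i words above it, which is where the -3-i offset comes from. Below is a tiny standalone check of that arithmetic (an illustration only, assuming the layout described by the comment in the hunk).

#include <assert.h>

/* Word layout at the top of the trace mcode, per the comment above:
**   [0] str lr, [sp]          <- label <1>
**   [1] bl ->vm_exit_handler
**   [2] movz w0, #traceno
**   [3] bl <1                 <- stub for exitno 0
**   [4] bl <1                 <- stub for exitno 1, and so on. */
int main(void)
{
  int i;
  for (i = 0; i < 8; i++)
    assert((3 + i) + (-3 - i) == 0);  /* each stub's bl lands on <1> */
  return 0;
}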

@@ -77,7 +77,7 @@ static void asm_guardcc(ASMState *as, A64CC cc)
   MCode *p = as->mcp;
   if (LJ_UNLIKELY(p == as->invmcp)) {
     as->loopinv = 1;
-    *p = A64I_B | ((target-p) & 0x03ffffffu);
+    *p = A64I_B | A64F_S26(target-p);
     emit_cond_branch(as, cc^1, p-1);
     return;
   }
@@ -91,7 +91,7 @@ static void asm_guardtnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit)
   MCode *p = as->mcp;
   if (LJ_UNLIKELY(p == as->invmcp)) {
     as->loopinv = 1;
-    *p = A64I_B | ((target-p) & 0x03ffffffu);
+    *p = A64I_B | A64F_S26(target-p);
     emit_tnb(as, ai^0x01000000u, r, bit, p-1);
     return;
   }
@@ -105,7 +105,7 @@ static void asm_guardcnb(ASMState *as, A64Ins ai, Reg r)
   MCode *p = as->mcp;
   if (LJ_UNLIKELY(p == as->invmcp)) {
     as->loopinv = 1;
-    *p = A64I_B | ((target-p) & 0x03ffffffu);
+    *p = A64I_B | A64F_S26(target-p);
     emit_cnb(as, ai^0x01000000u, r, p-1);
     return;
   }
@@ -1858,7 +1858,7 @@ static void asm_loop_fixup(ASMState *as)
     p[-2] |= ((uint32_t)delta & mask) << 5;
   } else {
     ptrdiff_t delta = target - (p - 1);
-    p[-1] = A64I_B | ((uint32_t)(delta) & 0x03ffffffu);
+    p[-1] = A64I_B | A64F_S26(delta);
   }
 }
 
@@ -1927,7 +1927,7 @@ static void asm_tail_fixup(ASMState *as, TraceNo lnk)
   }
   /* Patch exit branch. */
   target = lnk ? traceref(as->J, lnk)->mcode : (MCode *)lj_vm_exit_interp;
-  p[-1] = A64I_B | (((target-p)+1)&0x03ffffffu);
+  p[-1] = A64I_B | A64F_S26((target-p)+1);
 }
 
 /* Prepare tail of code. */
@@ -1990,40 +1990,50 @@ void lj_asm_patchexit(jit_State *J, GCtrace *T, ExitNo exitno, MCode *target)
 {
   MCode *p = T->mcode;
   MCode *pe = (MCode *)((char *)p + T->szmcode);
-  MCode *cstart = NULL, *cend = p;
+  MCode *cstart = NULL;
   MCode *mcarea = lj_mcode_patch(J, p, 0);
   MCode *px = exitstub_trace_addr(T, exitno);
+  /* Note: this assumes a trace exit is only ever patched once. */
   for (; p < pe; p++) {
     /* Look for exitstub branch, replace with branch to target. */
+    ptrdiff_t delta = target - p;
     MCode ins = A64I_LE(*p);
     if ((ins & 0xff000000u) == 0x54000000u &&
         ((ins ^ ((px-p)<<5)) & 0x00ffffe0u) == 0) {
-      /* Patch bcc exitstub. */
-      *p = A64I_LE((ins & 0xff00001fu) | (((target-p)<<5) & 0x00ffffe0u));
-      cend = p+1;
-      if (!cstart) cstart = p;
+      /* Patch bcc, if within range. */
+      if (A64F_S_OK(delta, 19)) {
+        *p = A64I_LE((ins & 0xff00001fu) | A64F_S19(delta));
+        if (!cstart) cstart = p;
+      }
     } else if ((ins & 0xfc000000u) == 0x14000000u &&
                ((ins ^ (px-p)) & 0x03ffffffu) == 0) {
-      /* Patch b exitstub. */
-      *p = A64I_LE((ins & 0xfc000000u) | ((target-p) & 0x03ffffffu));
-      cend = p+1;
+      /* Patch b. */
+      lua_assert(A64F_S_OK(delta, 26));
+      *p = A64I_LE((ins & 0xfc000000u) | A64F_S26(delta));
       if (!cstart) cstart = p;
     } else if ((ins & 0x7e000000u) == 0x34000000u &&
                ((ins ^ ((px-p)<<5)) & 0x00ffffe0u) == 0) {
-      /* Patch cbz/cbnz exitstub. */
-      *p = A64I_LE((ins & 0xff00001f) | (((target-p)<<5) & 0x00ffffe0u));
-      cend = p+1;
-      if (!cstart) cstart = p;
+      /* Patch cbz/cbnz, if within range. */
+      if (A64F_S_OK(delta, 19)) {
+        *p = A64I_LE((ins & 0xff00001fu) | A64F_S19(delta));
+        if (!cstart) cstart = p;
+      }
     } else if ((ins & 0x7e000000u) == 0x36000000u &&
                ((ins ^ ((px-p)<<5)) & 0x0007ffe0u) == 0) {
-      /* Patch tbz/tbnz exitstub. */
-      *p = A64I_LE((ins & 0xfff8001fu) | (((target-p)<<5) & 0x0007ffe0u));
-      cend = p+1;
-      if (!cstart) cstart = p;
+      /* Patch tbz/tbnz, if within range. */
+      if (A64F_S_OK(delta, 14)) {
+        *p = A64I_LE((ins & 0xfff8001fu) | A64F_S14(delta));
+        if (!cstart) cstart = p;
+      }
     }
   }
-  lua_assert(cstart != NULL);
-  lj_mcode_sync(cstart, cend);
+  { /* Always patch long-range branch in exit stub itself. */
+    ptrdiff_t delta = target - px;
+    lua_assert(A64F_S_OK(delta, 26));
+    *px = A64I_B | A64F_S26(delta);
+    if (!cstart) cstart = px;
+  }
+  lj_mcode_sync(cstart, px+1);
   lj_mcode_patch(J, mcarea, 1);
 }
 
src/lj_emit_arm64.h

Lines changed: 9 additions & 9 deletions
@@ -241,7 +241,7 @@ static void emit_loadk(ASMState *as, Reg rd, uint64_t u64, int is64)
 #define mcpofs(as, k) \
   ((intptr_t)((uintptr_t)(k) - (uintptr_t)(as->mcp - 1)))
 #define checkmcpofs(as, k) \
-  ((((mcpofs(as, k)>>2) + 0x00040000) >> 19) == 0)
+  (A64F_S_OK(mcpofs(as, k)>>2, 19))
 
 static Reg ra_allock(ASMState *as, intptr_t k, RegSet allow);
 
@@ -312,32 +312,32 @@ static void emit_cond_branch(ASMState *as, A64CC cond, MCode *target)
 {
   MCode *p = --as->mcp;
   ptrdiff_t delta = target - p;
-  lua_assert(((delta + 0x40000) >> 19) == 0);
+  lua_assert(A64F_S_OK(delta, 19));
   *p = A64I_BCC | A64F_S19(delta) | cond;
 }
 
 static void emit_branch(ASMState *as, A64Ins ai, MCode *target)
 {
   MCode *p = --as->mcp;
   ptrdiff_t delta = target - p;
-  lua_assert(((delta + 0x02000000) >> 26) == 0);
-  *p = ai | ((uint32_t)delta & 0x03ffffffu);
+  lua_assert(A64F_S_OK(delta, 26));
+  *p = ai | A64F_S26(delta);
 }
 
 static void emit_tnb(ASMState *as, A64Ins ai, Reg r, uint32_t bit, MCode *target)
 {
   MCode *p = --as->mcp;
   ptrdiff_t delta = target - p;
-  lua_assert(bit < 63 && ((delta + 0x2000) >> 14) == 0);
+  lua_assert(bit < 63 && A64F_S_OK(delta, 14));
   if (bit > 31) ai |= A64I_X;
-  *p = ai | A64F_BIT(bit & 31) | A64F_S14((uint32_t)delta & 0x3fffu) | r;
+  *p = ai | A64F_BIT(bit & 31) | A64F_S14(delta) | r;
 }
 
 static void emit_cnb(ASMState *as, A64Ins ai, Reg r, MCode *target)
 {
   MCode *p = --as->mcp;
   ptrdiff_t delta = target - p;
-  lua_assert(((delta + 0x40000) >> 19) == 0);
+  lua_assert(A64F_S_OK(delta, 19));
   *p = ai | A64F_S19(delta) | r;
 }
 
@@ -347,8 +347,8 @@ static void emit_call(ASMState *as, void *target)
 {
   MCode *p = --as->mcp;
   ptrdiff_t delta = (char *)target - (char *)p;
-  if ((((delta>>2) + 0x02000000) >> 26) == 0) {
-    *p = A64I_BL | ((uint32_t)(delta>>2) & 0x03ffffffu);
+  if (A64F_S_OK(delta>>2, 26)) {
+    *p = A64I_BL | A64F_S26(delta>>2);
   } else {  /* Target out of range: need indirect call. But don't use R0-R7. */
     Reg r = ra_allock(as, i64ptr(target),
                       RSET_RANGE(RID_X8, RID_MAX_GPR)-RSET_FIXED);
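One detail worth calling out in the emit_call() hunk above: delta is computed on char pointers, i.e. in bytes, while the BL immediate counts 32-bit words, hence the delta>>2 fed to both A64F_S_OK() and A64F_S26(). The other emitters subtract MCode pointers, which already yields a word count. A minimal standalone illustration (not from the patch):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

int main(void)
{
  uint32_t code[4] = {0, 0, 0, 0};
  /* Byte distance between two instruction slots... */
  ptrdiff_t bytes = (char *)&code[3] - (char *)&code[0];
  /* ...shifted right by 2 gives the word distance a B/BL offset encodes. */
  assert((bytes >> 2) == (&code[3] - &code[0]));
  return 0;
}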

src/lj_target_arm64.h

Lines changed: 5 additions & 2 deletions
@@ -132,9 +132,9 @@ static LJ_AINLINE uint32_t *exitstub_trace_addr_(uint32_t *p, uint32_t exitno)
 #define A64F_IMMR(x) ((x) << 16)
 #define A64F_U16(x) ((x) << 5)
 #define A64F_U12(x) ((x) << 10)
-#define A64F_S26(x) (x)
+#define A64F_S26(x) (((uint32_t)(x) & 0x03ffffffu))
 #define A64F_S19(x) (((uint32_t)(x) & 0x7ffffu) << 5)
-#define A64F_S14(x) ((x) << 5)
+#define A64F_S14(x) (((uint32_t)(x) & 0x3fffu) << 5)
 #define A64F_S9(x) ((x) << 12)
 #define A64F_BIT(x) ((x) << 19)
 #define A64F_SH(sh, x) (((sh) << 22) | ((x) << 10))
@@ -145,6 +145,9 @@ static LJ_AINLINE uint32_t *exitstub_trace_addr_(uint32_t *p, uint32_t exitno)
 #define A64F_LSL16(x) (((x) / 16) << 21)
 #define A64F_BSH(sh) ((sh) << 10)
 
+/* Check for valid field range. */
+#define A64F_S_OK(x, b) ((((x) + (1 << (b-1))) >> (b)) == 0)
+
 typedef enum A64Ins {
   A64I_S = 0x20000000,
   A64I_X = 0x80000000,
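To see the reworked field encoders at work: A64F_S26() masks a signed word offset into bits 0..25 of B/BL, A64F_S19() masks it into bits 5..23 of B.cond/CBZ/CBNZ, and negative offsets simply wrap into the field. A small standalone self-check (the two macros are copied from the hunk above; the expected opcode words are standard ARM64 encodings):

#include <assert.h>
#include <stdint.h>

#define A64F_S26(x) (((uint32_t)(x) & 0x03ffffffu))
#define A64F_S19(x) (((uint32_t)(x) & 0x7ffffu) << 5)

int main(void)
{
  /* b .+8: opcode 0x14000000 with a word offset of +2. */
  assert((0x14000000u | A64F_S26(2)) == 0x14000002u);
  /* b.eq .+8: opcode 0x54000000, cond=EQ (0), word offset of +2. */
  assert((0x54000000u | A64F_S19(2)) == 0x54000040u);
  /* b .-4: a negative offset wraps into the 26-bit field. */
  assert((0x14000000u | A64F_S26(-1)) == 0x17ffffffu);
  return 0;
}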

test/tarantool-tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -57,6 +57,7 @@ macro(BuildTestCLib lib sources)
 endmacro()
 
 add_subdirectory(gh-4427-ffi-sandwich)
+add_subdirectory(gh-6098-fix-side-exit-patching-on-arm64)
 add_subdirectory(gh-6189-cur_L)
 add_subdirectory(lj-49-bad-lightuserdata)
 add_subdirectory(lj-flush-on-trace)
Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
+local tap = require('tap')
+local test = tap.test('gh-6098-fix-side-exit-patching-on-arm64')
+test:plan(1)
+
+-- The function to be tested for side exit patching:
+-- * At the beginning of the test case, the <if> branch is
+--   recorded as a root trace.
+-- * After <refuncs> (and some other hotspots) are recorded, the
+--   <else> branch is recorded as a side trace.
+-- When JIT is linking the side trace to the corresponding side
+-- exit, it patches the jump targets.
+local function cbool(cond)
+  if cond then
+    return 1
+  else
+    return 0
+  end
+end
+
+-- XXX: Function template below produces 8Kb mcode for ARM64, so
+-- we need to compile at least 128 traces to exceed 1Mb delta
+-- between <cbool> root trace side exit and <cbool> side trace.
+-- Unfortunately, we have no other option for extending this jump
+-- delta, since the base of the current mcode area (J->mcarea) is
+-- used as a hint for mcode allocator (see lj_mcode.c for info).
+local FUNCS = 128
+local recfuncs = { }
+for i = 1, FUNCS do
+  -- This is a quite heavy workload (though it doesn't look like
+  -- one at first). Each load from a table is type guarded. Each
+  -- table lookup (for both stores and loads) is guarded for table
+  -- <hmask> value and metatable presence. The code below results
+  -- to 8Kb of mcode for ARM64 in practice.
+  recfuncs[i] = assert(load(([[
+    return function(src)
+      local p = %d
+      local tmp = { }
+      local dst = { }
+      for i = 1, 3 do
+        tmp.a = src.a * p tmp.j = src.j * p tmp.s = src.s * p
+        tmp.b = src.b * p tmp.k = src.k * p tmp.t = src.t * p
+        tmp.c = src.c * p tmp.l = src.l * p tmp.u = src.u * p
+        tmp.d = src.d * p tmp.m = src.m * p tmp.v = src.v * p
+        tmp.e = src.e * p tmp.n = src.n * p tmp.w = src.w * p
+        tmp.f = src.f * p tmp.o = src.o * p tmp.x = src.x * p
+        tmp.g = src.g * p tmp.p = src.p * p tmp.y = src.y * p
+        tmp.h = src.h * p tmp.q = src.q * p tmp.z = src.z * p
+        tmp.i = src.i * p tmp.r = src.r * p
+
+        dst.a = tmp.z + p dst.j = tmp.q + p dst.s = tmp.h + p
+        dst.b = tmp.y + p dst.k = tmp.p + p dst.t = tmp.g + p
+        dst.c = tmp.x + p dst.l = tmp.o + p dst.u = tmp.f + p
+        dst.d = tmp.w + p dst.m = tmp.n + p dst.v = tmp.e + p
+        dst.e = tmp.v + p dst.n = tmp.m + p dst.w = tmp.d + p
+        dst.f = tmp.u + p dst.o = tmp.l + p dst.x = tmp.c + p
+        dst.g = tmp.t + p dst.p = tmp.k + p dst.y = tmp.b + p
+        dst.h = tmp.s + p dst.q = tmp.j + p dst.z = tmp.a + p
+        dst.i = tmp.r + p dst.r = tmp.i + p
+      end
+      dst.tmp = tmp
+      return dst
+    end
+  ]]):format(i)), ('Syntax error in function recfuncs[%d]'):format(i))()
+end
+
+-- Make compiler work hard:
+-- * No optimizations at all to produce more mcode.
+-- * Try to compile all compiled paths as early as JIT can.
+-- * Allow to compile 2Mb of mcode to be sure the issue occurs.
+jit.opt.start(0, 'hotloop=1', 'hotexit=1', 'maxmcode=2048')
+
+-- First call makes <cbool> hot enough to be recorded next time.
+cbool(true)
+-- Second call records <cbool> body (i.e. <if> branch). This is
+-- a root trace for <cbool>.
+cbool(true)
+
+for i = 1, FUNCS do
+  -- XXX: FNEW is NYI, hence loop recording fails at this point.
+  -- The recording is aborted on purpose: we are going to record
+  -- <FUNCS> number of traces for functions in <recfuncs>.
+  -- Otherwise, loop recording might lead to a very long trace
+  -- error (via return to a lower frame), or a trace with lots of
+  -- side traces. We need neither of this, but just bunch of
+  -- traces filling the available mcode area.
+  local function tnew(p)
+    return {
+      a = p + 1, f = p + 6, k = p + 11, p = p + 16, u = p + 21, z = p + 26,
+      b = p + 2, g = p + 7, l = p + 12, q = p + 17, v = p + 22,
+      c = p + 3, h = p + 8, m = p + 13, r = p + 18, w = p + 23,
+      d = p + 4, i = p + 9, n = p + 14, s = p + 19, x = p + 24,
+      e = p + 5, j = p + 10, o = p + 15, t = p + 20, y = p + 25,
+    }
+  end
+  -- Each function call produces a trace (see the template for the
+  -- function definition above).
+  recfuncs[i](tnew(i))
+end
+
+-- XXX: I tried to make the test in pure Lua, but I failed to
+-- implement the robust solution. As a result I've implemented a
+-- tiny Lua C API module to route the flow through C frames and
+-- make JIT work the way I need to reproduce the fail. See the
+-- usage below.
+-- <pxcall> is just a wrapper for <lua_call> with "multiargs" and
+-- "multiret" with the same signature as <pcall>.
+local pxcall = require('libproxy').proxycall
+
+-- XXX: Here is the dessert: JIT is aimed to work better for
+-- highly biased code. It means, the root trace should be the
+-- most popular flow. Furthermore, JIT also considers the fact,
+-- that frequently taken side exits are *also* popular, and
+-- compiles the side traces for such popular exits. However,
+-- to recoup his attempts JIT try to compile the flow as far
+-- as it can (see <lj_record_ret> in lj_record.c for more info).
+--
+-- Such "kind" behaviour occurs in our case: if one calls <cbool>
+-- the native way, JIT continues recording in a lower frame after
+-- returning from <cbool>. As a result, the second call is also
+-- recorded, but it has to trigger the far jump to the side trace.
+-- However, if the lower frame is not the Lua one, JIT doesn't
+-- proceed the further flow recording and assembles the trace. In
+-- this case, the second call jumps to <cbool> root trace, hits
+-- the assertion guard and jumps to <cbool> side trace.
+pxcall(cbool, false)
+cbool(false)
+
+test:ok(true)
+os.exit(test:check() and 0 or 1)
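The sizing mentioned in the comments near the top of the test is the crux of the reproducer: about 8 KiB of mcode per generated trace times 128 traces puts roughly 1 MiB of machine code between the root trace's side exit and the side trace, i.e. at the very edge of what a 19-bit conditional branch can reach. A back-of-the-envelope check (illustrative numbers taken from the test's comments, not from the patch):

#include <assert.h>

int main(void)
{
  /* ~8 KiB of mcode per generated trace, 128 traces, versus the ~1 MiB
  ** (2^18 words * 4 bytes) reach of a 19-bit conditional branch offset. */
  long mcode = 128L * 8 * 1024;
  long bcond_reach = (1L << 18) * 4;
  assert(mcode >= bcond_reach);  /* enough distance to push the side exit out of range */
  return 0;
}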
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+BuildTestCLib(libproxy libproxy.c)
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+#include <lua.h>
+#include <lauxlib.h>
+
+/*
+ * Function with the signature similar to Lua <pcall> builtin,
+ * that routes the flow through C frame.
+ */
+static int proxycall(lua_State *L)
+{
+  lua_call(L, lua_gettop(L) - 1, LUA_MULTRET);
+  return lua_gettop(L);
+}
+
+static const struct luaL_Reg libproxy[] = {
+  {"proxycall", proxycall},
+  {NULL, NULL}
+};
+
+LUA_API int luaopen_libproxy(lua_State *L)
+{
+  luaL_register(L, "libproxy", libproxy);
+  return 1;
+}
