New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
u24 generates inefficient code #7336
Comments
I looked at a few other cases, with slight modifications. The inefficient code generation seems to have something to do with the export. If you can convince the compiler to reinterpret the pointers then the generated code looks better.
export fn bad_foo(bytes: *const [2]u24) u32 {
const a = bytes[0] + bytes[1];
return a;
}
fn foo_style_internal(bytes: *const [2]u24) u32 {
const a = bytes[0] + bytes[1];
return a;
}
export fn foo_style_u32_ptr_truncated(bytes: *const [2]u32) u32 {
const a = @truncate(u24, bytes[0]);
const b = @truncate(u24, bytes[1]);
return a + b;
}
export fn foo_style_u25_ptr_truncated(bytes: *const [2]u25) u32 {
const a = @truncate(u24, bytes[0]);
const b = @truncate(u24, bytes[1]);
return a + b;
}
export fn export_foo_style_internal_with_foo_style_u32_ptr(bytes: *const [2]u32) u32 {
const c: *[2]u24 = &[2]u24{
@truncate(u24, bytes[0]),
@truncate(u24, bytes[1])
};
return foo_style_internal(c);
}
export fn export_foo_style_internal_with_foo_style_u25_ptr(bytes: *const [2]u25) u32 {
const c: *[2]u24 = &[2]u24{
@truncate(u24,bytes[0]),
@truncate(u24,bytes[1])
};
return foo_style_internal(c);
}
export fn export_foo_style_internal_with_foo_style_u24_ptr_export_upconvert_downconvert(bytes2: *const [2]u24) u32 {
const bytes = @ptrCast(*const [2]u32, bytes2);
const c: *[2]u24 = &[2]u24{
@truncate(u24, bytes[0]),
@truncate(u24, bytes[1])
};
return foo_style_internal(c);
} bad_foo:
movzx eax, word ptr [rdi]
movzx ecx, byte ptr [rdi + 2]
shl ecx, 16
or ecx, eax
movzx edx, word ptr [rdi + 4]
movzx eax, byte ptr [rdi + 6]
shl eax, 16
or eax, edx
add eax, ecx
and eax, 16777215
ret
foo_style_u32_ptr_truncated:
mov eax, dword ptr [rdi + 4]
add eax, dword ptr [rdi]
and eax, 16777215
ret
foo_style_u25_ptr_truncated:
mov eax, dword ptr [rdi]
add eax, dword ptr [rdi + 4]
and eax, 16777215
ret
export_foo_style_internal_with_foo_style_u32_ptr:
mov eax, dword ptr [rdi + 4]
add eax, dword ptr [rdi]
and eax, 16777215
ret
export_foo_style_internal_with_foo_style_u25_ptr:
mov eax, dword ptr [rdi]
add eax, dword ptr [rdi + 4]
and eax, 16777215
ret
export_foo_style_internal_with_foo_style_u24_ptr_export_upconvert_downconvert:
mov eax, dword ptr [rdi + 4]
add eax, dword ptr [rdi]
and eax, 16777215
ret |
define i32 @bad_foo([2 x i24]* nocapture nonnull readonly %0) local_unnamed_addr #0 {
Entry:
%1 = bitcast [2 x i24]* %0 to i24*
%2 = load i24, i24* %1, align 4
%3 = getelementptr inbounds [2 x i24], [2 x i24]* %0, i64 0, i64 1
%4 = load i24, i24* %3, align 4
%5 = add nuw i24 %4, %2
%6 = zext i24 %5 to i32
ret i32 %6
} define i32 @export_foo_style_internal_with_foo_style_u24_ptr_export_upconvert_downconvert([2 x i24]* nocapture nonnull readonly %0) local_unnamed_addr #0 {
Entry:
%1 = bitcast [2 x i24]* %0 to i32*
%2 = load i32, i32* %1, align 4
%3 = getelementptr inbounds [2 x i24], [2 x i24]* %0, i64 0, i64 1
%4 = bitcast i24* %3 to i32*
%5 = load i32, i32* %4, align 4
%6 = add i32 %5, %2
%7 = and i32 %6, 16777215
ret i32 %7
} No wonder the codegen is better, it's operating on |
I think the OP example should be a compilation error - u24 is not a type recognized by the C ABI. |
My apologies, I didn't see the example clearly:
This is a pointer, and the current stance on pointers in C calling convention functions is that they are allowed because they basically constitute an opaque pointer to C but in zig they have whatever true type they have. We have plenty of examples of this already, here's a couple: Lines 275 to 340 in ce65533
This all works great. So I would expect u24 to follow the same rules. As for the inefficient code, I think because we tell LLVM the function is external, LLVM thinks it has to follow some specific kind of ABI. But we should be able to massage the parameters into telling LLVM how the real ABI of the u24 type works for zig. |
At the heart of this is the question "how big is a u24?".
@sizeOf
would say it's 4 bytes. This is how much space is reserved for it. But the compiler seems to only ever read three bytes.I would expect the compiler to generate this code:
But instead it generates this in release-fast:
This means we're getting the worst of both worlds. We're paying the performance cost of using only three bytes for a u24 and the memory cost of using four bytes for a u24. This is just silly. We need to resolve this one way or the other. Either keep the three-byte reads and say
@sizeOf(u24) == 3
and@alignOf(u24) == 1
, or switch to four byte reads and keep the current size and alignment.The text was updated successfully, but these errors were encountered: