Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

u24 generates inefficient code #7336

Open
SpexGuy opened this issue Dec 8, 2020 · 4 comments
Open

u24 generates inefficient code #7336

SpexGuy opened this issue Dec 8, 2020 · 4 comments
Labels
enhancement Solving this issue will likely involve adding new logic or components to the codebase.
Milestone

Comments

@SpexGuy
Copy link
Contributor

SpexGuy commented Dec 8, 2020

At the heart of this is the question "how big is a u24?". @sizeOf would say it's 4 bytes. This is how much space is reserved for it. But the compiler seems to only ever read three bytes.

export fn foo(bytes: *const [2]u24) u32 {
    return bytes[0] + bytes[1];
}

I would expect the compiler to generate this code:

mov     eax, dword ptr [rdi]
add     eax, dword ptr [rdi + 4]
and     eax, 16777215
ret

But instead it generates this in release-fast:

movzx   eax, word ptr [rdi]
movzx   ecx, byte ptr [rdi + 2]
shl     ecx, 16
or      ecx, eax
movzx   edx, word ptr [rdi + 4]
movzx   eax, byte ptr [rdi + 6]
shl     eax, 16
or      eax, edx
add     eax, ecx
and     eax, 16777215
ret

This means we're getting the worst of both worlds. We're paying the performance cost of using only three bytes for a u24 and the memory cost of using four bytes for a u24. This is just silly. We need to resolve this one way or the other. Either keep the three-byte reads and say @sizeOf(u24) == 3 and @alignOf(u24) == 1, or switch to four byte reads and keep the current size and alignment.

@Vexu Vexu added the enhancement Solving this issue will likely involve adding new logic or components to the codebase. label Dec 8, 2020
@Vexu Vexu added this to the 0.9.0 milestone Dec 8, 2020
@kliberty
Copy link

kliberty commented Dec 17, 2020

I looked at a few other cases, with slight modifications. The inefficient code generation seems to have something to do with the export. If you can convince the compiler to reinterpret the pointers then the generated code looks better.

export_foo_style_internal_with_foo_style_u24_ptr_export_upconvert_downconvert seems the most telling. It has the same signature as your example and generates the code you expected.

export fn bad_foo(bytes: *const [2]u24) u32 {
    const a = bytes[0] + bytes[1];
    return a;
}


fn foo_style_internal(bytes: *const [2]u24) u32 {
    const a = bytes[0] + bytes[1];
    return a;
}




export fn foo_style_u32_ptr_truncated(bytes: *const [2]u32) u32 {
    const a = @truncate(u24, bytes[0]);
    const b = @truncate(u24, bytes[1]);
    return a + b;
}


export fn foo_style_u25_ptr_truncated(bytes: *const [2]u25) u32 {
    const a = @truncate(u24, bytes[0]);
    const b = @truncate(u24, bytes[1]);
    return a + b;
}




export fn export_foo_style_internal_with_foo_style_u32_ptr(bytes: *const [2]u32) u32 {
    const c: *[2]u24 = &[2]u24{
        @truncate(u24, bytes[0]),
        @truncate(u24, bytes[1])
    };
    return foo_style_internal(c);
}

export fn export_foo_style_internal_with_foo_style_u25_ptr(bytes: *const [2]u25) u32 {
    const c: *[2]u24 = &[2]u24{
        @truncate(u24,bytes[0]),
        @truncate(u24,bytes[1])
    };
    return foo_style_internal(c);
}


export fn export_foo_style_internal_with_foo_style_u24_ptr_export_upconvert_downconvert(bytes2: *const [2]u24) u32 {
    const bytes = @ptrCast(*const [2]u32, bytes2);
    const c: *[2]u24 = &[2]u24{
        @truncate(u24, bytes[0]),
        @truncate(u24, bytes[1])
    };
    return foo_style_internal(c);
}
bad_foo:
        movzx   eax, word ptr [rdi]
        movzx   ecx, byte ptr [rdi + 2]
        shl     ecx, 16
        or      ecx, eax
        movzx   edx, word ptr [rdi + 4]
        movzx   eax, byte ptr [rdi + 6]
        shl     eax, 16
        or      eax, edx
        add     eax, ecx
        and     eax, 16777215
        ret

foo_style_u32_ptr_truncated:
        mov     eax, dword ptr [rdi + 4]
        add     eax, dword ptr [rdi]
        and     eax, 16777215
        ret

foo_style_u25_ptr_truncated:
        mov     eax, dword ptr [rdi]
        add     eax, dword ptr [rdi + 4]
        and     eax, 16777215
        ret

export_foo_style_internal_with_foo_style_u32_ptr:
        mov     eax, dword ptr [rdi + 4]
        add     eax, dword ptr [rdi]
        and     eax, 16777215
        ret

export_foo_style_internal_with_foo_style_u25_ptr:
        mov     eax, dword ptr [rdi]
        add     eax, dword ptr [rdi + 4]
        and     eax, 16777215
        ret

export_foo_style_internal_with_foo_style_u24_ptr_export_upconvert_downconvert:
        mov     eax, dword ptr [rdi + 4]
        add     eax, dword ptr [rdi]
        and     eax, 16777215
        ret

@LemonBoy
Copy link
Contributor

define i32 @bad_foo([2 x i24]* nocapture nonnull readonly %0) local_unnamed_addr #0 {
Entry:
  %1 = bitcast [2 x i24]* %0 to i24*
  %2 = load i24, i24* %1, align 4
  %3 = getelementptr inbounds [2 x i24], [2 x i24]* %0, i64 0, i64 1
  %4 = load i24, i24* %3, align 4
  %5 = add nuw i24 %4, %2
  %6 = zext i24 %5 to i32
  ret i32 %6
}
define i32 @export_foo_style_internal_with_foo_style_u24_ptr_export_upconvert_downconvert([2 x i24]* nocapture nonnull readonly %0) local_unnamed_addr #0 {
Entry:
  %1 = bitcast [2 x i24]* %0 to i32*
  %2 = load i32, i32* %1, align 4
  %3 = getelementptr inbounds [2 x i24], [2 x i24]* %0, i64 0, i64 1
  %4 = bitcast i24* %3 to i32*
  %5 = load i32, i32* %4, align 4
  %6 = add i32 %5, %2
  %7 = and i32 %6, 16777215
  ret i32 %7
}

No wonder the codegen is better, it's operating on i32.

@andrewrk
Copy link
Member

I think the OP example should be a compilation error - u24 is not a type recognized by the C ABI.

@andrewrk
Copy link
Member

andrewrk commented Dec 19, 2020

My apologies, I didn't see the example clearly:

export fn foo(bytes: *const [2]u24) u32 {

This is a pointer, and the current stance on pointers in C calling convention functions is that they are allowed because they basically constitute an opaque pointer to C but in zig they have whatever true type they have.

We have plenty of examples of this already, here's a couple:

zig/src/stage1.zig

Lines 275 to 340 in ce65533

// ABI warning
export fn stage2_progress_create() *std.Progress {
const ptr = std.heap.c_allocator.create(std.Progress) catch @panic("out of memory");
ptr.* = std.Progress{};
return ptr;
}
// ABI warning
export fn stage2_progress_destroy(progress: *std.Progress) void {
std.heap.c_allocator.destroy(progress);
}
// ABI warning
export fn stage2_progress_start_root(
progress: *std.Progress,
name_ptr: [*]const u8,
name_len: usize,
estimated_total_items: usize,
) *std.Progress.Node {
return progress.start(
name_ptr[0..name_len],
if (estimated_total_items == 0) null else estimated_total_items,
) catch @panic("timer unsupported");
}
// ABI warning
export fn stage2_progress_disable_tty(progress: *std.Progress) void {
progress.terminal = null;
}
// ABI warning
export fn stage2_progress_start(
node: *std.Progress.Node,
name_ptr: [*]const u8,
name_len: usize,
estimated_total_items: usize,
) *std.Progress.Node {
const child_node = std.heap.c_allocator.create(std.Progress.Node) catch @panic("out of memory");
child_node.* = node.start(
name_ptr[0..name_len],
if (estimated_total_items == 0) null else estimated_total_items,
);
child_node.activate();
return child_node;
}
// ABI warning
export fn stage2_progress_end(node: *std.Progress.Node) void {
node.end();
if (&node.context.root != node) {
std.heap.c_allocator.destroy(node);
}
}
// ABI warning
export fn stage2_progress_complete_one(node: *std.Progress.Node) void {
node.completeOne();
}
// ABI warning
export fn stage2_progress_update_node(node: *std.Progress.Node, done_count: usize, total_count: usize) void {
node.completed_items = done_count;
node.estimated_total_items = total_count;
node.activate();
node.context.maybeRefresh();
}

This all works great.

So I would expect u24 to follow the same rules. As for the inefficient code, I think because we tell LLVM the function is external, LLVM thinks it has to follow some specific kind of ABI. But we should be able to massage the parameters into telling LLVM how the real ABI of the u24 type works for zig.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
enhancement Solving this issue will likely involve adding new logic or components to the codebase.
Projects
None yet
Development

No branches or pull requests

5 participants