u24 generates inefficient code #7336

SpexGuy · 2020-12-08T07:18:33Z

At the heart of this is the question "how big is a u24?". @sizeOf would say it's 4 bytes. This is how much space is reserved for it. But the compiler seems to only ever read three bytes.

export fn foo(bytes: *const [2]u24) u32 {
    return bytes[0] + bytes[1];
}

I would expect the compiler to generate this code:

mov     eax, dword ptr [rdi]
add     eax, dword ptr [rdi + 4]
and     eax, 16777215
ret

But instead it generates this in release-fast:

movzx   eax, word ptr [rdi]
movzx   ecx, byte ptr [rdi + 2]
shl     ecx, 16
or      ecx, eax
movzx   edx, word ptr [rdi + 4]
movzx   eax, byte ptr [rdi + 6]
shl     eax, 16
or      eax, edx
add     eax, ecx
and     eax, 16777215
ret

This means we're getting the worst of both worlds. We're paying the performance cost of using only three bytes for a u24 and the memory cost of using four bytes for a u24. This is just silly. We need to resolve this one way or the other. Either keep the three-byte reads and say @sizeOf(u24) == 3 and @alignOf(u24) == 1, or switch to four byte reads and keep the current size and alignment.

The text was updated successfully, but these errors were encountered:

kliberty · 2020-12-17T17:48:56Z

I looked at a few other cases, with slight modifications. The inefficient code generation seems to have something to do with the export. If you can convince the compiler to reinterpret the pointers then the generated code looks better.

export_foo_style_internal_with_foo_style_u24_ptr_export_upconvert_downconvert seems the most telling. It has the same signature as your example and generates the code you expected.

export fn bad_foo(bytes: *const [2]u24) u32 {
    const a = bytes[0] + bytes[1];
    return a;
}


fn foo_style_internal(bytes: *const [2]u24) u32 {
    const a = bytes[0] + bytes[1];
    return a;
}




export fn foo_style_u32_ptr_truncated(bytes: *const [2]u32) u32 {
    const a = @truncate(u24, bytes[0]);
    const b = @truncate(u24, bytes[1]);
    return a + b;
}


export fn foo_style_u25_ptr_truncated(bytes: *const [2]u25) u32 {
    const a = @truncate(u24, bytes[0]);
    const b = @truncate(u24, bytes[1]);
    return a + b;
}




export fn export_foo_style_internal_with_foo_style_u32_ptr(bytes: *const [2]u32) u32 {
    const c: *[2]u24 = &[2]u24{
        @truncate(u24, bytes[0]),
        @truncate(u24, bytes[1])
    };
    return foo_style_internal(c);
}

export fn export_foo_style_internal_with_foo_style_u25_ptr(bytes: *const [2]u25) u32 {
    const c: *[2]u24 = &[2]u24{
        @truncate(u24,bytes[0]),
        @truncate(u24,bytes[1])
    };
    return foo_style_internal(c);
}


export fn export_foo_style_internal_with_foo_style_u24_ptr_export_upconvert_downconvert(bytes2: *const [2]u24) u32 {
    const bytes = @ptrCast(*const [2]u32, bytes2);
    const c: *[2]u24 = &[2]u24{
        @truncate(u24, bytes[0]),
        @truncate(u24, bytes[1])
    };
    return foo_style_internal(c);
}

bad_foo:
        movzx   eax, word ptr [rdi]
        movzx   ecx, byte ptr [rdi + 2]
        shl     ecx, 16
        or      ecx, eax
        movzx   edx, word ptr [rdi + 4]
        movzx   eax, byte ptr [rdi + 6]
        shl     eax, 16
        or      eax, edx
        add     eax, ecx
        and     eax, 16777215
        ret

foo_style_u32_ptr_truncated:
        mov     eax, dword ptr [rdi + 4]
        add     eax, dword ptr [rdi]
        and     eax, 16777215
        ret

foo_style_u25_ptr_truncated:
        mov     eax, dword ptr [rdi]
        add     eax, dword ptr [rdi + 4]
        and     eax, 16777215
        ret

export_foo_style_internal_with_foo_style_u32_ptr:
        mov     eax, dword ptr [rdi + 4]
        add     eax, dword ptr [rdi]
        and     eax, 16777215
        ret

export_foo_style_internal_with_foo_style_u25_ptr:
        mov     eax, dword ptr [rdi]
        add     eax, dword ptr [rdi + 4]
        and     eax, 16777215
        ret

export_foo_style_internal_with_foo_style_u24_ptr_export_upconvert_downconvert:
        mov     eax, dword ptr [rdi + 4]
        add     eax, dword ptr [rdi]
        and     eax, 16777215
        ret

LemonBoy · 2020-12-17T18:00:20Z

define i32 @bad_foo([2 x i24]* nocapture nonnull readonly %0) local_unnamed_addr #0 {
Entry:
  %1 = bitcast [2 x i24]* %0 to i24*
  %2 = load i24, i24* %1, align 4
  %3 = getelementptr inbounds [2 x i24], [2 x i24]* %0, i64 0, i64 1
  %4 = load i24, i24* %3, align 4
  %5 = add nuw i24 %4, %2
  %6 = zext i24 %5 to i32
  ret i32 %6
}

define i32 @export_foo_style_internal_with_foo_style_u24_ptr_export_upconvert_downconvert([2 x i24]* nocapture nonnull readonly %0) local_unnamed_addr #0 {
Entry:
  %1 = bitcast [2 x i24]* %0 to i32*
  %2 = load i32, i32* %1, align 4
  %3 = getelementptr inbounds [2 x i24], [2 x i24]* %0, i64 0, i64 1
  %4 = bitcast i24* %3 to i32*
  %5 = load i32, i32* %4, align 4
  %6 = add i32 %5, %2
  %7 = and i32 %6, 16777215
  ret i32 %7
}

No wonder the codegen is better, it's operating on i32.

andrewrk · 2020-12-17T18:18:29Z

I think the OP example should be a compilation error - u24 is not a type recognized by the C ABI.

andrewrk · 2020-12-19T02:47:51Z

My apologies, I didn't see the example clearly:

export fn foo(bytes: *const [2]u24) u32 {

This is a pointer, and the current stance on pointers in C calling convention functions is that they are allowed because they basically constitute an opaque pointer to C but in zig they have whatever true type they have.

We have plenty of examples of this already, here's a couple:

zig/src/stage1.zig

Lines 275 to 340 in ce65533

    
           // ABI warning 
        
           export fn stage2_progress_create() *std.Progress { 
        
               const ptr = std.heap.c_allocator.create(std.Progress) catch @panic("out of memory"); 
        
               ptr.* = std.Progress{}; 
        
               return ptr; 
        
           } 
        
           // ABI warning 
        
           export fn stage2_progress_destroy(progress: *std.Progress) void { 
        
               std.heap.c_allocator.destroy(progress); 
        
           } 
        
           // ABI warning 
        
           export fn stage2_progress_start_root( 
        
               progress: *std.Progress, 
        
               name_ptr: [*]const u8, 
        
               name_len: usize, 
        
               estimated_total_items: usize, 
        
           ) *std.Progress.Node { 
        
               return progress.start( 
        
                   name_ptr[0..name_len], 
        
                   if (estimated_total_items == 0) null else estimated_total_items, 
        
               ) catch @panic("timer unsupported"); 
        
           } 
        
           // ABI warning 
        
           export fn stage2_progress_disable_tty(progress: *std.Progress) void { 
        
               progress.terminal = null; 
        
           } 
        
           // ABI warning 
        
           export fn stage2_progress_start( 
        
               node: *std.Progress.Node, 
        
               name_ptr: [*]const u8, 
        
               name_len: usize, 
        
               estimated_total_items: usize, 
        
           ) *std.Progress.Node { 
        
               const child_node = std.heap.c_allocator.create(std.Progress.Node) catch @panic("out of memory"); 
        
               child_node.* = node.start( 
        
                   name_ptr[0..name_len], 
        
                   if (estimated_total_items == 0) null else estimated_total_items, 
        
               ); 
        
               child_node.activate(); 
        
               return child_node; 
        
           } 
        
           // ABI warning 
        
           export fn stage2_progress_end(node: *std.Progress.Node) void { 
        
               node.end(); 
        
               if (&node.context.root != node) { 
        
                   std.heap.c_allocator.destroy(node); 
        
               } 
        
           } 
        
           // ABI warning 
        
           export fn stage2_progress_complete_one(node: *std.Progress.Node) void { 
        
               node.completeOne(); 
        
           } 
        
           // ABI warning 
        
           export fn stage2_progress_update_node(node: *std.Progress.Node, done_count: usize, total_count: usize) void { 
        
               node.completed_items = done_count; 
        
               node.estimated_total_items = total_count; 
        
               node.activate(); 
        
               node.context.maybeRefresh(); 
        
           }

This all works great.

So I would expect u24 to follow the same rules. As for the inefficient code, I think because we tell LLVM the function is external, LLVM thinks it has to follow some specific kind of ABI. But we should be able to massage the parameters into telling LLVM how the real ABI of the u24 type works for zig.

Vexu added the enhancement Solving this issue will likely involve adding new logic or components to the codebase. label Dec 8, 2020

Vexu added this to the 0.9.0 milestone Dec 8, 2020

LemonBoy mentioned this issue Dec 17, 2020

More extern type validation #7477

Closed

andrewrk modified the milestones: 0.9.0, 0.10.0 May 19, 2021

InspectorBoat mentioned this issue Nov 14, 2023

std.PackedInt(Array/Slice/Io) are unsound #17997

Open

alichraghi mentioned this issue Jan 28, 2024

sysaudio: runtime alignment error when converting samples on Windows hexops/mach#1152

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

u24 generates inefficient code #7336

u24 generates inefficient code #7336

SpexGuy commented Dec 8, 2020 •

edited

kliberty commented Dec 17, 2020 •

edited

LemonBoy commented Dec 17, 2020

andrewrk commented Dec 17, 2020

andrewrk commented Dec 19, 2020 •

edited

u24 generates inefficient code #7336

u24 generates inefficient code #7336

Comments

SpexGuy commented Dec 8, 2020 • edited

kliberty commented Dec 17, 2020 • edited

LemonBoy commented Dec 17, 2020

andrewrk commented Dec 17, 2020

andrewrk commented Dec 19, 2020 • edited

SpexGuy commented Dec 8, 2020 •

edited

kliberty commented Dec 17, 2020 •

edited

andrewrk commented Dec 19, 2020 •

edited