# libaco

[github](https://github.com/hnes/libaco)

总体架构

1. 有一个main_co,用线程本身的stack
2. 有一个shared_stack, 多个coroutine分享，但是同一时刻只能有一个运行
3. 当两个coroutine切换的时候(from_co,to_co)，先把当前的context,包括registers和stack上的内容，都复制到from_co的save_stack中保存，然后把to_co的save_stack中的内容和reg的内容复制到shared_stack和registers中，做switch
   
个人认为，如果内存足够大，完全可以每个coroutine都分配自己的stack空间，这样在做切换的时候，不需要复制stack上的内容，只需要保存和恢复寄存器，提高效率

----------------------------

## aco.c

```c
/*
 * Allocate and initialise a share stack descriptor.
 *
 * sz                 - requested stack size in bytes; 0 selects a 2 MiB
 *                      default and anything below 4096 is raised to 4096.
 * guard_page_enabled - non-zero: the stack is obtained with mmap() and its
 *                      lowest page is made read-only; zero: the stack is
 *                      obtained with malloc() and has no guard page.
 *
 * Returns a pointer to the new aco_share_stack_t. Failures are not reported
 * through the return value; they go through the assert*/assertalloc_* macros
 * (presumably these abort the process - confirm against their definitions).
 */
aco_share_stack_t* aco_share_stack_new2(size_t sz, char guard_page_enabled){
    // Normalise the requested size: default to 2 MiB, enforce a 4096-byte floor.
    if(sz == 0){
        sz = 1024 * 1024 * 2;
    }
    if(sz < 4096){
        sz = 4096;
    }
    assert(sz > 0);

    size_t u_pgsz = 0;
    if(guard_page_enabled != 0){
        // although gcc's Built-in Functions to Perform Arithmetic with
        // Overflow Checking is better, but it would require gcc >= 5.0
        long pgsz = sysconf(_SC_PAGESIZE);
        // pgsz must be > 0 && a power of two
        assert(pgsz > 0 && (((pgsz - 1) & pgsz) == 0));
        u_pgsz = (size_t)((unsigned long)pgsz);
        // it should be always true in real life
        // (the conversion to size_t lost nothing and doubling the page size
        // does not overflow)
        assert(u_pgsz == (unsigned long)pgsz && ((u_pgsz << 1) >> 1) == u_pgsz);
        // Grow sz so the mapping holds one guard page plus a usable area that
        // is a whole number of pages.
        if(sz <= u_pgsz){
            // One guard page + one usable page.
            sz = u_pgsz << 1;
        } else {
            size_t new_sz;
            if((sz & (u_pgsz - 1)) != 0){
                // Not page aligned: round down, then add two pages
                // (equivalent to rounding up and adding one guard page).
                new_sz = (sz & (~(u_pgsz - 1)));
                assert(new_sz >= u_pgsz);
                // NOTE(review): presumably asserts the addition cannot
                // overflow size_t - confirm against the macro definition.
                aco_size_t_safe_add_assert(new_sz, (u_pgsz << 1));
                new_sz = new_sz + (u_pgsz << 1);
                assert(sz / u_pgsz + 2 == new_sz / u_pgsz);
            } else {
                // Already page aligned: just add the guard page.
                aco_size_t_safe_add_assert(sz, u_pgsz);
                new_sz = sz + u_pgsz;
                assert(sz / u_pgsz + 1 == new_sz / u_pgsz);
            }
            sz = new_sz;
            assert((sz / u_pgsz > 1) && ((sz & (u_pgsz - 1)) == 0));
        }
    }

    // Allocate the descriptor and zero every field.
    aco_share_stack_t* p = (aco_share_stack_t*)malloc(sizeof(aco_share_stack_t));
    assertalloc_ptr(p);
    memset(p, 0, sizeof(aco_share_stack_t));

    if(guard_page_enabled != 0){
        // Anonymous private mapping covering guard page + usable stack.
        p->real_ptr = mmap(
            NULL, sz, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0
        );
        assertalloc_bool(p->real_ptr != MAP_FAILED);
        p->guard_page_enabled = 1;
        // Make the lowest page read-only so a write into it faults.
        // NOTE(review): the mprotect() call sits inside assert(); with the
        // standard <assert.h> and NDEBUG it would be compiled out - confirm
        // that the project's assert is always active.
        assert(0 == mprotect(p->real_ptr, u_pgsz, PROT_READ));

        // The usable stack starts right after the guard page; real_ptr and
        // real_sz keep the extent of the whole mapping.
        p->ptr = (void*)(((uintptr_t)p->real_ptr) + u_pgsz);
        p->real_sz = sz;
        assert(sz >= (u_pgsz << 1));
        p->sz = sz - u_pgsz;
    } else {
        //p->guard_page_enabled = 0;
        p->sz = sz;
        p->ptr = malloc(sz);
        assertalloc_ptr(p->ptr);
    }

    p->owner = NULL;
#ifdef ACO_USE_VALGRIND
    // Register the range [ptr, ptr + sz) with Valgrind as a stack.
    p->valgrind_stk_id = VALGRIND_STACK_REGISTER(
        p->ptr, (void*)((uintptr_t)p->ptr + p->sz)
    );
#endif
#if defined(__i386__) || defined(__x86_64__)
    // Take the address two pointer-sizes below the end of the usable area
    // and round it down to a 16-byte boundary.
    uintptr_t u_p = (uintptr_t)(p->sz - (sizeof(void*) << 1) + (uintptr_t)p->ptr);
    u_p = (u_p >> 4) << 4;
    p->align_highptr = (void*)u_p;
    // The slot one pointer below the aligned top holds the address of
    // aco_funcp_protector_asm (presumably used as the return address of a
    // coroutine's entry function - confirm against the context-switch code).
    p->align_retptr  = (void*)(u_p - sizeof(void*));
    *((void**)(p->align_retptr)) = (void*)(aco_funcp_protector_asm);
    assert(p->sz > (16 + (sizeof(void*) << 1) + sizeof(void*)));
    // Usable size minus 16 bytes and two pointer slots.
    p->align_limit = p->sz - 16 - (sizeof(void*) << 1);
#else
    #error "platform no support yet"
#endif
    return p;
}
```

1. `assert(0 == mprotect(p->real_ptr, u_pgsz, PROT_READ));`

![](mprotect01.png)

A READ-ONLY guard page is placed at the lowest address of the stack to catch stack overflow. The x86 stack grows downward, so an overflowing stack eventually writes into the guard page; that write faults and the system aborts the process. Nice trick!

2. `assert(pgsz > 0 && (((pgsz - 1) & pgsz) == 0));` 

`((a - 1) & a) == 0` checks whether `a` is a power of 2 (for `a > 0`, which the assert also requires).

3. `long pgsz = sysconf(_SC_PAGESIZE);`
   
   ![](sysconf01.png)

4. x86 stack frame

![](stackframe01.png)
![](stackframe02.png)


--------------------------------

## aco.c  memcpy

```c
#define aco_amd64_inline_short_aligned_memcpy(dst, src, sz) do {\
    __uint128_t xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7; \
    switch((sz) >> 4){ \
    case 0:  \
        break;  \
    case 1:  \
        xmm0 = *((__uint128_t*)(src) + 0);  \
        *((__uint128_t*)(dst) + 0) = xmm0; \
        break;  \
    case 2:  \
        xmm0 = *((__uint128_t*)(src) + 0);  \
        xmm1 = *((__uint128_t*)(src) + 1);  \
        *((__uint128_t*)(dst) + 0) = xmm0; \
        *((__uint128_t*)(dst) + 1) = xmm1; \
        break;  \
    case 3:  \
        xmm0 = *((__uint128_t*)(src) + 0);  \
        xmm1 = *((__uint128_t*)(src) + 1);  \
        xmm2 = *((__uint128_t*)(src) + 2);  \
        *((__uint128_t*)(dst) + 0) = xmm0; \
        *((__uint128_t*)(dst) + 1) = xmm1; \
        *((__uint128_t*)(dst) + 2) = xmm2; \
        break;  \
```

1. Uses 128-bit (`__uint128_t`) loads and stores to copy 16 bytes at a time, which improves memory-copy performance.

-----------------------------------

## acosw.S  acosw

```asm
/*
    0x00             -->               0xff
    eip esp ebp edi esi ebx fpucw16 mxcsr32
    0   4   8   c   10  14  18      1c
*/

#ifdef __i386__

# xitongsys 
# Stack frame struct. 
# acosw(from_co, to_co)
# low ----------------------------- high
#  return_address   from_co  to_co
# 
# [esp+0x0] = return address
# [esp+0x4] = from_co
# [esp+0x8] = to_co

# this part is copying the register values to from_co.reg[8]

    mov     eax,DWORD PTR [esp+0x4]     // from_co
    mov     edx,DWORD PTR [esp]         // retaddr
    lea     ecx,[esp+0x4]               // esp
    mov     DWORD PTR [eax+0x8],ebp     //<ebp
    mov     DWORD PTR [eax+0x4],ecx     //<esp
    mov     DWORD PTR [eax+0x0],edx     //<retaddr
    mov     DWORD PTR [eax+0xc],edi     //<edi
    mov     ecx,DWORD PTR [esp+0x8]     // to_co
    mov     DWORD PTR [eax+0x10],esi    //<esi
    mov     DWORD PTR [eax+0x14],ebx    //<ebx
#ifndef ACO_CONFIG_SHARE_FPU_MXCSR_ENV
    fnstcw  WORD  PTR [eax+0x18]        //<fpucw
    stmxcsr DWORD PTR [eax+0x1c]        //<mxcsr
#endif


# this part is copying the to_co.reg[8] to registers

    mov     edx,DWORD PTR [ecx+0x4]     //>esp
    mov     ebp,DWORD PTR [ecx+0x8]     //>ebp
    mov     eax,DWORD PTR [ecx+0x0]     //>retaddr
    mov     edi,DWORD PTR [ecx+0xc]     //>edi
    mov     esi,DWORD PTR [ecx+0x10]    //>esi
    mov     ebx,DWORD PTR [ecx+0x14]    //>ebx
#ifndef ACO_CONFIG_SHARE_FPU_MXCSR_ENV
    fldcw   WORD  PTR     [ecx+0x18]        //>fpucw
    ldmxcsr DWORD PTR     [ecx+0x1c]        //>mxcsr
#endif
    xor     ecx,ecx
    mov     esp,edx
    xor     edx,edx
    jmp     eax
```

1. see comments

-------------------------