Use intrinsic atomics #1123

Merged
merged 2 commits on Jul 10, 2012
476 xbmc/threads/Atomics.cpp
@@ -21,62 +21,62 @@
#include "Atomics.h"
+// the only safe way to be absolutely sure that
+// gcc intrinsics are present when using an unknown GCC
+#if defined(__GNUC__) && defined(__GNUC_MINOR__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
+ #define HAS_GCC_INTRINSICS
+#elif defined(TARGET_DARWIN)
+ // safe under darwin gcc-4.2, llvm-gcc-4.2 and clang
+ #define HAS_GCC_INTRINSICS
+#endif
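
For reference, the same compiler check can be written by folding the version macros into one comparable number; a minimal equivalent sketch (not part of this patch, the macro name GCC_VERSION_NUM is made up here):

// Equivalent spelling of the check above: GCC 4.4 folds to 404, so any
// GCC >= 4.4 defines HAS_GCC_INTRINSICS.  Apple's compilers report themselves
// as GCC 4.2 (402), which is why the separate TARGET_DARWIN branch is needed.
#if defined(__GNUC__) && defined(__GNUC_MINOR__)
  #define GCC_VERSION_NUM (__GNUC__ * 100 + __GNUC_MINOR__)
  #if GCC_VERSION_NUM >= 404
    #define HAS_GCC_INTRINSICS
  #endif
#endif
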
///////////////////////////////////////////////////////////////////////////
// 32-bit atomic compare-and-swap
// Returns previous value of *pAddr
///////////////////////////////////////////////////////////////////////////
-#if defined(__ppc__) || defined(__powerpc__) // PowerPC
-
long cas(volatile long *pAddr, long expectedVal, long swapVal)
{
+#if defined(__ppc__) || defined(__powerpc__) // PowerPC
unsigned int prev;
-
__asm__ __volatile__ (
- " 1: lwarx %0,0,%2 \n" /* Load the current value of *pAddr(%2) into prev (%0) and lock pAddr, */
- " cmpw 0,%0,%3 \n" /* Verify that the current value (%2) == old value (%3) */
- " bne- 2f \n" /* Bail if the two values are not equal [not as expected] */
- " stwcx. %4,0,%2 \n" /* Attempt to store swapVal (%4) value into *pAddr (%2) [p must still be reserved] */
- " bne- 1b \n" /* Loop if p was no longer reserved */
- " isync \n" /* Reconcile multiple processors [if present] */
- " 2: \n"
- : "=&r" (prev), "+m" (*pAddr) /* Outputs [prev, *pAddr] */
- : "r" (pAddr), "r" (expectedVal), "r" (swapVal) /* Inputs [pAddr, expectedVal, swapVal] */
- : "cc", "memory"); /* Clobbers */
-
+ " 1: lwarx %0,0,%2 \n" /* Load the current value of *pAddr(%2) into prev (%0) and lock pAddr, */
+    " cmpw 0,%0,%3 \n" /* Verify that the current value (%0) == old value (%3) */
+ " bne- 2f \n" /* Bail if the two values are not equal [not as expected] */
+ " stwcx. %4,0,%2 \n" /* Attempt to store swapVal (%4) value into *pAddr (%2) [p must still be reserved] */
+ " bne- 1b \n" /* Loop if p was no longer reserved */
+ " isync \n" /* Reconcile multiple processors [if present] */
+ " 2: \n"
+ : "=&r" (prev), "+m" (*pAddr) /* Outputs [prev, *pAddr] */
+ : "r" (pAddr), "r" (expectedVal), "r" (swapVal) /* Inputs [pAddr, expectedVal, swapVal] */
+ : "cc", "memory"); /* Clobbers */
return prev;
-}
#elif defined(__arm__)
-long cas(volatile long* pAddr, long expectedVal, long swapVal)
-{
register long prev;
asm volatile (
- "dmb ish \n" // Memory barrier. Make sure all memory accesses appearing before this complete before any that appear after
- "1: \n"
- "ldrex %0, [%1] \n" // Load the current value of *pAddr(%1) into prev (%0) and lock pAddr,
- "cmp %0, %2 \n" // Verify that the current value (%0) == old value (%2)
- "bne 2f \n" // Bail if the two values are not equal [not as expected]
- "strex r1, %3, [%1] \n"
- "cmp r1, #0 \n"
- "bne 1b \n"
- "dmb ish \n" // Memory barrier.
- "2: \n"
- : "=&r" (prev)
- : "r"(pAddr), "r"(expectedVal),"r"(swapVal)
- : "r1"
- );
+ "dmb ish \n" // Memory barrier. Make sure all memory accesses appearing before this complete before any that appear after
+ "1: \n"
+ "ldrex %0, [%1] \n" // Load the current value of *pAddr(%1) into prev (%0) and lock pAddr,
+ "cmp %0, %2 \n" // Verify that the current value (%0) == old value (%2)
+ "bne 2f \n" // Bail if the two values are not equal [not as expected]
+ "strex r1, %3, [%1] \n"
+ "cmp r1, #0 \n"
+ "bne 1b \n"
+ "dmb ish \n" // Memory barrier.
+ "2: \n"
+ : "=&r" (prev)
+ : "r"(pAddr), "r"(expectedVal),"r"(swapVal)
+ : "r1"
+ );
return prev;
-}
#elif defined(__mips__)
// TODO:
+ unsigned int prev;
+ #error atomic cas undefined for mips
+ return prev;
#elif defined(WIN32)
-
-long cas(volatile long* pAddr, long expectedVal, long swapVal)
-{
long prev;
-
__asm
{
// Load parameters
@@ -90,41 +90,33 @@ long cas(volatile long* pAddr, long expectedVal, long swapVal)
// Store the return value
mov prev, eax;
}
-
return prev;
-}
#else // Linux / OSX86 (GCC)
-
-long cas(volatile long* pAddr,long expectedVal, long swapVal)
-{
long prev;
-
__asm__ __volatile__ (
- "lock/cmpxchg %1, %2"
- : "=a" (prev)
- : "r" (swapVal), "m" (*pAddr), "0" (expectedVal)
- : "memory" );
+ "lock/cmpxchg %1, %2"
+ : "=a" (prev)
+ : "r" (swapVal), "m" (*pAddr), "0" (expectedVal)
+ : "memory" );
return prev;
-}
-
#endif
+}
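
cas() itself stays on per-architecture assembly in this change even when HAS_GCC_INTRINSICS is defined; a hedged sketch of what an intrinsic-backed variant could look like (illustrative only, the helper name is made up):

// Illustrative sketch: compare-and-swap via the GCC builtin.
// __sync_val_compare_and_swap returns the value that was in *pAddr before the
// operation, which matches cas()'s "returns previous value" contract.
#if defined(HAS_GCC_INTRINSICS)
long cas_intrinsic(volatile long* pAddr, long expectedVal, long swapVal)
{
  return __sync_val_compare_and_swap(pAddr, expectedVal, swapVal);
}
#endif
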
///////////////////////////////////////////////////////////////////////////
// 64-bit atomic compare-and-swap
// Returns previous value of *pAddr
///////////////////////////////////////////////////////////////////////////
+long long cas2(volatile long long* pAddr, long long expectedVal, long long swapVal)
+{
#if defined(__ppc__) || defined(__powerpc__) || defined(__arm__) || defined(__mips__) // PowerPC, ARM, and MIPS
-
// Not available/required
+// Hack to allow compilation
+ throw "cas2 is not implemented";
#elif defined(WIN32)
-
-long long cas2(volatile long long* pAddr, long long expectedVal, long long swapVal)
-{
long long prev;
-
__asm
{
mov esi, pAddr ;
@@ -136,86 +128,74 @@ long long cas2(volatile long long* pAddr, long long expectedVal, long long swapV
mov dword ptr [prev], eax ;
mov dword ptr prev[4], edx ;
}
-
return prev;
-}
#else // Linux / OSX86 (GCC)
-#if !defined (__x86_64)
-long long cas2(volatile long long* pAddr, long long expectedVal, long long swapVal)
-{
- long long prev;
-
- __asm__ volatile (
- " push %%ebx \n" // We have to manually handle ebx, because PIC uses it and the compiler refuses to build anything that touches it
- " mov %4, %%ebx \n"
- " lock/cmpxchg8b (%%esi) \n"
- " pop %%ebx"
- : "=A" (prev)
- : "c" ((unsigned long)(swapVal >> 32)), "0" (expectedVal), "S" (pAddr), "m" (swapVal)
- : "memory");
- return prev;
-}
-#else
-// Hack to allow compilation on x86_64
-long long cas2(volatile long long* pAddr, long long expectedVal, long long swapVal)
-{
- throw "cas2 is not implemented on x86_64!";
-}
-#endif // !defined (__x86_64)
+ #if !defined (__x86_64)
+ long long prev;
+ __asm__ volatile (
+ " push %%ebx \n" // We have to manually handle ebx, because PIC uses it and the compiler refuses to build anything that touches it
+ " mov %4, %%ebx \n"
+ " lock/cmpxchg8b (%%esi) \n"
+ " pop %%ebx"
+ : "=A" (prev)
+ : "c" ((unsigned long)(swapVal >> 32)), "0" (expectedVal), "S" (pAddr), "m" (swapVal)
+ : "memory");
+ return prev;
+ #else
+ // Hack to allow compilation on x86_64
+ throw "cas2 is not implemented on x86_64!";
+ #endif
#endif
+}
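
The same builtin also covers the 64-bit case; a sketch of a cas2 replacement, under the assumption that the 8-byte __sync builtin is available on the target (on 32-bit x86 GCC only inlines it when the chosen -march provides cmpxchg8b):

// Illustrative sketch: 64-bit compare-and-swap via the GCC builtin.  On
// x86_64 this compiles to a lock cmpxchg; on 32-bit x86 it needs a CPU
// target with cmpxchg8b support to be inlined.
#if defined(HAS_GCC_INTRINSICS)
long long cas2_intrinsic(volatile long long* pAddr, long long expectedVal, long long swapVal)
{
  return __sync_val_compare_and_swap(pAddr, expectedVal, swapVal);
}
#endif
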
///////////////////////////////////////////////////////////////////////////
// 32-bit atomic increment
// Returns new value of *pAddr
///////////////////////////////////////////////////////////////////////////
-#if defined(__ppc__) || defined(__powerpc__) // PowerPC
-
long AtomicIncrement(volatile long* pAddr)
{
- long val;
+#if defined(HAS_GCC_INTRINSICS)
+ return __sync_add_and_fetch(pAddr, 1);
+#elif defined(__ppc__) || defined(__powerpc__) // PowerPC
+ long val;
__asm__ __volatile__ (
- "sync \n"
- "1: lwarx %0, 0, %1 \n"
- "addic %0, %0, 1 \n"
- "stwcx. %0, 0, %1 \n"
- "bne- 1b \n"
- "isync"
- : "=&r" (val)
- : "r" (pAddr)
- : "cc", "xer", "memory");
+ "sync \n"
+ "1: lwarx %0, 0, %1 \n"
+ "addic %0, %0, 1 \n"
+ "stwcx. %0, 0, %1 \n"
+ "bne- 1b \n"
+ "isync"
+ : "=&r" (val)
+ : "r" (pAddr)
+ : "cc", "xer", "memory");
return val;
-}
#elif defined(__arm__)
-
-long AtomicIncrement(volatile long* pAddr)
-{
register long val;
asm volatile (
- "dmb ish \n" // Memory barrier. Make sure all memory accesses appearing before this complete before any that appear after
- "1: \n"
- "ldrex %0, [%1] \n" // (val = *pAddr)
- "add %0, #1 \n" // (val += 1)
- "strex r1, %0, [%1] \n"
- "cmp r1, #0 \n"
- "bne 1b \n"
- "dmb ish \n" // Memory barrier.
- : "=&r" (val)
- : "r"(pAddr)
- : "r1"
- );
+ "dmb ish \n" // Memory barrier. Make sure all memory accesses appearing before this complete before any that appear after
+ "1: \n"
+ "ldrex %0, [%1] \n" // (val = *pAddr)
+ "add %0, #1 \n" // (val += 1)
+ "strex r1, %0, [%1] \n"
+ "cmp r1, #0 \n"
+ "bne 1b \n"
+ "dmb ish \n" // Memory barrier.
+ : "=&r" (val)
+ : "r"(pAddr)
+ : "r1"
+ );
return val;
-}
#elif defined(__mips__)
// TODO:
+ long val;
+ #error AtomicIncrement undefined for mips
+ return val;
#elif defined(WIN32)
-
-long AtomicIncrement(volatile long* pAddr)
-{
long val;
__asm
{
@@ -225,76 +205,75 @@ long AtomicIncrement(volatile long* pAddr)
mov val, eax ;
}
return val;
-}
-#else // Linux / OSX86 (GCC)
+#elif defined(__x86_64__)
+ register long result;
+ __asm__ __volatile__ (
+ "lock/xaddq %q0, %1"
+ : "=r" (result), "=m" (*pAddr)
+ : "0" ((long) (1)), "m" (*pAddr));
+ return *pAddr;
-long AtomicIncrement(volatile long* pAddr)
-{
+#else // Linux / OSX86 (GCC)
register long reg __asm__ ("eax") = 1;
__asm__ __volatile__ (
- "lock/xadd %0, %1 \n"
- "inc %%eax"
- : "+r" (reg)
- : "m" (*pAddr)
- : "memory" );
+ "lock/xadd %0, %1 \n"
+ "inc %%eax"
+ : "+r" (reg)
+ : "m" (*pAddr)
+ : "memory" );
return reg;
-}
#endif
+}
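
Note that __sync_add_and_fetch returns the value produced by this particular increment, which is the "new value of *pAddr" the header promises, whereas the new x86_64 branch re-reads *pAddr after the xadd and can therefore observe a later value under contention. A small usage sketch of the intended contract (the function name is illustrative):

// Illustrative usage: reference counting relies on getting the value produced
// by this exact operation, not a later re-read of *pAddr.
static void RefCountExample()
{
  volatile long refCount = 1;
  AtomicIncrement(&refCount);           // atomically 1 -> 2, returns 2
  if (AtomicDecrement(&refCount) == 0)  // atomically 2 -> 1, returns 1: no release
  {
    // last reference dropped: release the object here
  }
}
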
///////////////////////////////////////////////////////////////////////////
// 32-bit atomic add
// Returns new value of *pAddr
///////////////////////////////////////////////////////////////////////////
-
-#if defined(__ppc__) || defined(__powerpc__) // PowerPC
-
long AtomicAdd(volatile long* pAddr, long amount)
{
- long val;
+#if defined(HAS_GCC_INTRINSICS)
+ return __sync_add_and_fetch(pAddr, amount);
+#elif defined(__ppc__) || defined(__powerpc__) // PowerPC
+ long val;
__asm__ __volatile__ (
- "sync \n"
- "1: lwarx %0, 0, %1 \n"
- "add %0, %2, %0 \n"
- "stwcx. %0, 0, %1 \n"
- "bne- 1b \n"
- "isync"
- : "=&r" (val)
- : "r" (pAddr), "r" (amount)
- : "cc", "memory");
+ "sync \n"
+ "1: lwarx %0, 0, %1 \n"
+ "add %0, %2, %0 \n"
+ "stwcx. %0, 0, %1 \n"
+ "bne- 1b \n"
+ "isync"
+ : "=&r" (val)
+ : "r" (pAddr), "r" (amount)
+ : "cc", "memory");
return val;
-}
#elif defined(__arm__)
-
-long AtomicAdd(volatile long* pAddr, long amount)
-{
register long val;
asm volatile (
- "dmb ish \n" // Memory barrier. Make sure all memory accesses appearing before this complete before any that appear after
- "1: \n"
- "ldrex %0, [%1] \n" // (val = *pAddr)
- "add %0, %2 \n" // (val += amount)
- "strex r1, %0, [%1] \n"
- "cmp r1, #0 \n"
- "bne 1b \n"
- "dmb ish \n" // Memory barrier.
- : "=&r" (val)
- : "r"(pAddr), "r"(amount)
- : "r1"
- );
+ "dmb ish \n" // Memory barrier. Make sure all memory accesses appearing before this complete before any that appear after
+ "1: \n"
+ "ldrex %0, [%1] \n" // (val = *pAddr)
+ "add %0, %2 \n" // (val += amount)
+ "strex r1, %0, [%1] \n"
+ "cmp r1, #0 \n"
+ "bne 1b \n"
+ "dmb ish \n" // Memory barrier.
+ : "=&r" (val)
+ : "r"(pAddr), "r"(amount)
+ : "r1"
+ );
return val;
-}
#elif defined(__mips__)
// TODO:
+ long val;
+ #error AtomicAdd undefined for mips
+ return val;
#elif defined(WIN32)
-
-long AtomicAdd(volatile long* pAddr, long amount)
-{
__asm
{
mov eax, amount;
@@ -304,76 +283,75 @@ long AtomicAdd(volatile long* pAddr, long amount)
mov amount, ebx;
}
return amount;
-}
-#else // Linux / OSX86 (GCC)
+#elif defined(__x86_64__)
+ register long result;
+ __asm__ __volatile__ (
+ "lock/xaddq %q0, %1"
+ : "=r" (result), "=m" (*pAddr)
+ : "0" ((long) (amount)), "m" (*pAddr));
+ return *pAddr;
-long AtomicAdd(volatile long* pAddr, long amount)
-{
+#else // Linux / OSX86 (GCC)
register long reg __asm__ ("eax") = amount;
__asm__ __volatile__ (
- "lock/xadd %0, %1 \n"
- "dec %%eax"
- : "+r" (reg)
- : "m" (*pAddr)
- : "memory" );
+ "lock/xadd %0, %1 \n"
+ "dec %%eax"
+ : "+r" (reg)
+ : "m" (*pAddr)
+ : "memory" );
return reg;
-}
#endif
+}
///////////////////////////////////////////////////////////////////////////
// 32-bit atomic decrement
// Returns new value of *pAddr
///////////////////////////////////////////////////////////////////////////
-#if defined(__ppc__) || defined(__powerpc__) // PowerPC
-
long AtomicDecrement(volatile long* pAddr)
{
- long val;
+#if defined(HAS_GCC_INTRINSICS)
+ return __sync_sub_and_fetch(pAddr, 1);
+#elif defined(__ppc__) || defined(__powerpc__) // PowerPC
+ long val;
__asm__ __volatile__ (
- "sync \n"
- "1: lwarx %0, 0, %1 \n"
- "addic %0, %0, -1 \n"
- "stwcx. %0, 0, %1 \n"
- "bne- 1b \n"
- "isync"
- : "=&r" (val)
- : "r" (pAddr)
- : "cc", "xer", "memory");
+ "sync \n"
+"1: lwarx %0, 0, %1 \n"
+ "addic %0, %0, -1 \n"
+ "stwcx. %0, 0, %1 \n"
+ "bne- 1b \n"
+ "isync"
+ : "=&r" (val)
+ : "r" (pAddr)
+ : "cc", "xer", "memory");
return val;
-}
#elif defined(__arm__)
-
-long AtomicDecrement(volatile long* pAddr)
-{
register long val;
asm volatile (
- "dmb ish \n" // Memory barrier. Make sure all memory accesses appearing before this complete before any that appear after
- "1: \n"
- "ldrex %0, [%1] \n" // (val = *pAddr)
- "sub %0, #1 \n" // (val -= 1)
- "strex r1, %0, [%1] \n"
- "cmp r1, #0 \n"
- "bne 1b \n"
- "dmb ish \n" // Memory barrier.
- : "=&r" (val)
- : "r"(pAddr)
- : "r1"
- );
-
+ "dmb ish \n" // Memory barrier. Make sure all memory accesses appearing before this complete before any that appear after
+ "1: \n"
+ "ldrex %0, [%1] \n" // (val = *pAddr)
+ "sub %0, #1 \n" // (val -= 1)
+ "strex r1, %0, [%1] \n"
+ "cmp r1, #0 \n"
+ "bne 1b \n"
+ "dmb ish \n" // Memory barrier.
+ : "=&r" (val)
+ : "r"(pAddr)
+ : "r1"
+ );
return val;
-}
#elif defined(__mips__)
// TODO:
+ long val;
+ #error AtomicDecrement undefined for mips
+ return val;
#elif defined(WIN32)
-
-long AtomicDecrement(volatile long* pAddr)
-{
long val;
__asm
{
@@ -383,77 +361,75 @@ long AtomicDecrement(volatile long* pAddr)
mov val, eax ;
}
return val;
-}
-#else // Linux / OSX86 (GCC)
+#elif defined(__x86_64__)
+ register long result;
+ __asm__ __volatile__ (
+ "lock/xaddq %q0, %1"
+ : "=r" (result), "=m" (*pAddr)
+ : "0" ((long) (-1)), "m" (*pAddr));
+ return *pAddr;
-long AtomicDecrement(volatile long* pAddr)
-{
+#else // Linux / OSX86 (GCC)
register long reg __asm__ ("eax") = -1;
__asm__ __volatile__ (
- "lock/xadd %0, %1 \n"
- "dec %%eax"
- : "+r" (reg)
- : "m" (*pAddr)
- : "memory" );
+ "lock/xadd %0, %1 \n"
+ "dec %%eax"
+ : "+r" (reg)
+ : "m" (*pAddr)
+ : "memory" );
return reg;
-}
#endif
+}
///////////////////////////////////////////////////////////////////////////
// 32-bit atomic subtract
// Returns new value of *pAddr
///////////////////////////////////////////////////////////////////////////
-#if defined(__ppc__) || defined(__powerpc__) // PowerPC
-
long AtomicSubtract(volatile long* pAddr, long amount)
{
+#if defined(HAS_GCC_INTRINSICS)
+ return __sync_sub_and_fetch(pAddr, amount);
+
+#elif defined(__ppc__) || defined(__powerpc__) // PowerPC
long val;
amount *= -1;
-
__asm__ __volatile__ (
- "sync \n"
- "1: lwarx %0, 0, %1 \n"
- "add %0, %2, %0 \n"
- "stwcx. %0, 0, %1 \n"
- "bne- 1b \n"
- "isync"
- : "=&r" (val)
- : "r" (pAddr), "r" (amount)
- : "cc", "memory");
+ "sync \n"
+ "1: lwarx %0, 0, %1 \n"
+ "add %0, %2, %0 \n"
+ "stwcx. %0, 0, %1 \n"
+ "bne- 1b \n"
+ "isync"
+ : "=&r" (val)
+ : "r" (pAddr), "r" (amount)
+ : "cc", "memory");
return val;
-}
#elif defined(__arm__)
-
-long AtomicSubtract(volatile long* pAddr, long amount)
-{
register long val;
asm volatile (
- "dmb ish \n" // Memory barrier. Make sure all memory accesses appearing before this complete before any that appear after
- "1: \n"
- "ldrex %0, [%1] \n" // (val = *pAddr)
- "sub %0, %2 \n" // (val -= amount)
- "strex r1, %0, [%1] \n"
- "cmp r1, #0 \n"
- "bne 1b \n"
- "dmb ish \n" // Memory barrier.
- : "=&r" (val)
- : "r"(pAddr), "r"(amount)
- : "r1"
- );
-
+ "dmb ish \n" // Memory barrier. Make sure all memory accesses appearing before this complete before any that appear after
+ "1: \n"
+ "ldrex %0, [%1] \n" // (val = *pAddr)
+ "sub %0, %2 \n" // (val -= amount)
+ "strex r1, %0, [%1] \n"
+ "cmp r1, #0 \n"
+ "bne 1b \n"
+ "dmb ish \n" // Memory barrier.
+ : "=&r" (val)
+ : "r"(pAddr), "r"(amount)
+ : "r1"
+ );
return val;
-}
#elif defined(__mips__)
// TODO:
+  long val;
+  #error AtomicSubtract undefined for mips
+  return val;
#elif defined(WIN32)
-
-long AtomicSubtract(volatile long* pAddr, long amount)
-{
amount *= -1;
__asm
{
@@ -464,23 +440,27 @@ long AtomicSubtract(volatile long* pAddr, long amount)
mov amount, ebx;
}
return amount;
-}
-#else // Linux / OSX86 (GCC)
+#elif defined(__x86_64__)
+ register long result;
+ __asm__ __volatile__ (
+ "lock/xaddq %q0, %1"
+ : "=r" (result), "=m" (*pAddr)
+ : "0" ((long) (-1 * amount)), "m" (*pAddr));
+ return *pAddr;
-long AtomicSubtract(volatile long* pAddr, long amount)
-{
+#else // Linux / OSX86 (GCC)
register long reg __asm__ ("eax") = -1 * amount;
__asm__ __volatile__ (
- "lock/xadd %0, %1 \n"
- "dec %%eax"
- : "+r" (reg)
- : "m" (*pAddr)
- : "memory" );
+ "lock/xadd %0, %1 \n"
+ "dec %%eax"
+ : "+r" (reg)
+ : "m" (*pAddr)
+ : "memory" );
return reg;
-}
#endif
+}
///////////////////////////////////////////////////////////////////////////
// Fast spinlock implementation. No backoff when busy
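
A minimal sketch of the kind of cas()-based spinlock this section introduces, assuming the usual 0 = unlocked / 1 = held convention (illustrative only; the function names are made up and the class that actually follows in the file may differ):

// Illustrative sketch: busy-wait spinlock built on cas(), with no backoff.
void SpinLock(volatile long& lock)
{
  while (cas(&lock, 0, 1) != 0)
    ;                         // keep retrying until we swap 0 -> 1
}

void SpinUnlock(volatile long& lock)
{
  cas(&lock, 1, 0);           // release by swapping 1 -> 0
}
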