Skip to content

Commit

Permalink
Add faster sqrt code for xBRZ filters.
Browse files Browse the repository at this point in the history
We use inline ASM code for Windows `x86` and Linux desktop builds. All other
platforms remain unchanged and keep using `std::sqrt`.
  • Loading branch information
denisfa authored and rkitover committed Sep 16, 2019
1 parent 10f5576 commit efa09bd
Showing 1 changed file with 21 additions and 14 deletions.
35 changes: 21 additions & 14 deletions src/filters/xBRZ/xbrz.cpp
Expand Up @@ -64,16 +64,23 @@ uint32_t gradientARGB(uint32_t pixFront, uint32_t pixBack) //find intermediate c
}


//inline
//double fastSqrt(double n)
//{
// __asm //speeds up xBRZ by about 9% compared to std::sqrt which internally uses the same assembler instructions but adds some "fluff"
// {
// fld n
// fsqrt
// }
//}
//
// Square root used by the colour-distance functions.
//
// On x86 targets the x87 `fsqrt` instruction is issued directly through
// inline assembly; every other compiler/target combination falls back to
// std::sqrt.
//
// Note on the preprocessor tests: `#ifdef` checks exactly one macro, so
// several macros must be combined with `#if defined(A) || defined(B)`.
// MinGW compilers define __GNUC__, so they need no separate check.
inline double fastSqrt(double n)
{
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__x86_64__))
    // GNU-style extended asm. The "+t" constraint places n on the top of the
    // x87 register stack as a read-write operand, so `fsqrt` replaces it with
    // its square root in place. The x86 guard is required because `fsqrt` and
    // the "t" constraint do not exist on other architectures.
    __asm__ ("fsqrt" : "+t" (n));
    return n;
#elif defined(_MSC_VER) && defined(_M_IX86)
    // speeds up xBRZ by about 9% compared to std::sqrt which internally uses
    // the same assembler instructions but adds some "fluff"
    //
    // There is intentionally no return statement: the result is left in
    // ST(0), which is where 32-bit MSVC expects a floating-point return value.
    __asm {
        fld n
        fsqrt
    }
#else // _MSC_VER && _M_X64 OR other platforms
    // VisualStudio x86_64 does not allow inline ASM
    return std::sqrt(n);
#endif
}


#ifdef _MSC_VER
Expand Down Expand Up @@ -147,7 +154,7 @@ double distRGB(uint32_t pix1, uint32_t pix2)
const double b_diff = static_cast<int>(getBlue (pix1)) - getBlue (pix2);

//euklidean RGB distance
return std::sqrt(square(r_diff) + square(g_diff) + square(b_diff));
return fastSqrt(square(r_diff) + square(g_diff) + square(b_diff));
}
#endif

Expand Down Expand Up @@ -175,7 +182,7 @@ double distYCbCr(uint32_t pix1, uint32_t pix2, double lumaWeight)
const double c_r = scale_r * (r_diff - y);

//we skip division by 255 to have similar range like other distance functions
return std::sqrt(square(lumaWeight * y) + square(c_b) + square(c_r));
return fastSqrt(square(lumaWeight * y) + square(c_b) + square(c_r));
}


Expand Down Expand Up @@ -205,7 +212,7 @@ double distYCbCrBuffered(uint32_t pix1, uint32_t pix2)
const double c_b = scale_b * (b_diff - y);
const double c_r = scale_r * (r_diff - y);

tmp.push_back(static_cast<float>(std::sqrt(square(y) + square(c_b) + square(c_r))));
tmp.push_back(static_cast<float>(fastSqrt(square(y) + square(c_b) + square(c_r))));
}
return tmp;
}();
Expand Down Expand Up @@ -1042,7 +1049,7 @@ struct ColorDistanceARGB
else
return a2 * d + 255 * (a1 - a2);

//alternative? return std::sqrt(a1 * a2 * square(distYCbCrBuffered(pix1, pix2)) + square(255 * (a1 - a2)));
//alternative? return fastSqrt(a1 * a2 * square(distYCbCrBuffered(pix1, pix2)) + square(255 * (a1 - a2)));
}
};

Expand Down

0 comments on commit efa09bd

Please sign in to comment.