Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Merge pull request #1237 from theuni/android-AE-optims

Android AE Optims
  • Loading branch information...
commit 54722924c3bdc4542ec085aaeafaf9b250b1b08f 2 parents f7a6e2b + 5d28aa3
@davilla davilla authored
View
145 xbmc/cores/AudioEngine/Engines/SoftAE/SoftAE.cpp
@@ -162,6 +162,7 @@ void CSoftAE::OpenSink()
m_reOpenEvent.Reset();
m_reOpen = true;
m_reOpenEvent.Wait();
+ m_wake.Set();
}
/* this must NEVER be called from outside the main thread or Initialization */
@@ -322,6 +323,10 @@ void CSoftAE::InternalOpenSink()
m_sinkFormatSampleRateMul = 1.0 / (float)newFormat.m_sampleRate;
m_sinkFormatFrameSizeMul = 1.0 / (float)newFormat.m_frameSize;
m_sinkBlockSize = newFormat.m_frames * newFormat.m_frameSize;
+ // check if sink controls volume, if so, init the volume.
+ m_sinkHandlesVolume = m_sink->HasVolume();
+ if (m_sinkHandlesVolume)
+ m_sink->SetVolume(m_volume);
/* invalidate the buffer */
m_buffer.Empty();
@@ -425,6 +430,7 @@ void CSoftAE::InternalOpenSink()
/* notify any event listeners that we are done */
m_reOpen = false;
m_reOpenEvent.Set();
+ m_wake.Set();
}
void CSoftAE::ResetEncoder()
@@ -666,6 +672,7 @@ void CSoftAE::ResumeStream(CSoftAEStream *stream)
void CSoftAE::Stop()
{
m_running = false;
+ m_wake.Set();
/* wait for the thread to stop */
CSingleLock lock(m_runningLock);
@@ -732,6 +739,7 @@ void CSoftAE::PlaySound(IAESound *sound)
((CSoftAESound*)sound)->GetSampleCount()
};
m_playing_sounds.push_back(ss);
+ m_wake.Set();
}
void CSoftAE::FreeSound(IAESound *sound)
@@ -795,34 +803,34 @@ IAEStream *CSoftAE::FreeStream(IAEStream *stream)
double CSoftAE::GetDelay()
{
+ double delay = (double)m_buffer.Used() * m_sinkFormatFrameSizeMul *m_sinkFormatSampleRateMul;
CSharedLock sinkLock(m_sinkLock);
+ if (m_sink)
+ delay += m_sink->GetDelay();
+ sinkLock.Leave();
- double delay = m_sink->GetDelay();
if (m_transcode && m_encoder && !m_rawPassthrough)
delay += m_encoder->GetDelay((double)m_encodedBuffer.Used() * m_encoderFrameSizeMul);
- double buffered = (double)m_buffer.Used() * m_sinkFormatFrameSizeMul;
- return delay + (buffered * m_sinkFormatSampleRateMul);
+ return delay;
}
double CSoftAE::GetCacheTime()
{
+ double time = (double)m_buffer.Used() * m_sinkFormatFrameSizeMul * m_sinkFormatSampleRateMul;
CSharedLock sinkLock(m_sinkLock);
-
- double time;
- time = (double)m_buffer.Used() * m_sinkFormatFrameSizeMul * m_sinkFormatSampleRateMul;
- time += m_sink->GetCacheTime();
+ if (m_sink)
+ time += m_sink->GetCacheTime();
return time;
}
double CSoftAE::GetCacheTotal()
{
+ double total = (double)m_buffer.Size() * m_sinkFormatFrameSizeMul * m_sinkFormatSampleRateMul;
CSharedLock sinkLock(m_sinkLock);
-
- double total;
- total = (double)m_buffer.Size() * m_sinkFormatFrameSizeMul * m_sinkFormatSampleRateMul;
- total += m_sink->GetCacheTotal();
+ if (m_sink)
+ total += m_sink->GetCacheTotal();
return total;
}
@@ -835,6 +843,12 @@ float CSoftAE::GetVolume()
void CSoftAE::SetVolume(float volume)
{
m_volume = volume;
+ if (!m_sinkHandlesVolume)
+ return;
+
+ CSharedLock sinkLock(m_sinkLock);
+ if (m_sink)
+ m_sink->SetVolume(m_volume);
}
void CSoftAE::StopAllSounds()
@@ -885,10 +899,19 @@ void CSoftAE::Run()
CLog::Log(LOGDEBUG, "CSoftAE::Run - Sink restart flagged");
InternalOpenSink();
}
+#if defined(TARGET_ANDROID)
+ else if (m_playingStreams.empty() && m_playing_sounds.empty())
+ {
+ // if we have nothing to do, sleep until woken or the 100ms wait times out.
+ // we do not have to take a lock just to check empty.
+ // this keeps AE from sucking CPU if nothing is going on.
+ m_wake.WaitMSec(100);
+ }
+#endif
}
}
-void CSoftAE::AllocateConvIfNeeded(size_t convertedSize)
+void CSoftAE::AllocateConvIfNeeded(size_t convertedSize, bool prezero)
{
if (m_convertedSize < convertedSize)
{
@@ -896,10 +919,17 @@ void CSoftAE::AllocateConvIfNeeded(size_t convertedSize)
m_converted = (uint8_t *)_aligned_malloc(convertedSize, 16);
m_convertedSize = convertedSize;
}
+ if (prezero)
+ memset(m_converted, 0x00, convertedSize);
}
unsigned int CSoftAE::MixSounds(float *buffer, unsigned int samples)
{
+ // no point doing anything if we have no sounds,
+ // we do not have to take a lock just to check empty
+ if (m_playing_sounds.empty())
+ return 0;
+
SoundStateList::iterator itt;
unsigned int mixed = 0;
@@ -922,8 +952,9 @@ unsigned int CSoftAE::MixSounds(float *buffer, unsigned int samples)
#ifdef __SSE__
CAEUtil::SSEMulAddArray(buffer, ss->samples, volume, mixSamples);
#else
+ float *sample_buffer = ss->samples;
for (unsigned int i = 0; i < mixSamples; ++i)
- buffer[i] = (buffer[i] + (ss->samples[i] * volume));
+ *buffer++ = *sample_buffer++ * volume;
#endif
ss->sampleCount -= mixSamples;
@@ -951,34 +982,27 @@ bool CSoftAE::FinalizeSamples(float *buffer, unsigned int samples, bool hasAudio
}
/* deamplify */
- bool clamp = false;
- if (m_volume < 1.0)
+ if (!m_sinkHandlesVolume && m_volume < 1.0)
{
#ifdef __SSE__
CAEUtil::SSEMulArray(buffer, m_volume, samples);
- for (unsigned int i = 0; i < samples; ++i)
- if (buffer[i] < -1.0f || buffer[i] > 1.0f)
- {
- clamp = true;
- break;
- }
#else
- for (unsigned int i = 0; i < samples; ++i)
- {
- buffer[i] *= m_volume;
- if (!clamp && (buffer[i] < -1.0f || buffer[i] > 1.0f))
- clamp = true;
- }
+ float *fbuffer = buffer;
+ for (unsigned int i = 0; i < samples; i++)
+ *fbuffer++ *= m_volume;
#endif
}
- else
+
+ /* check if we need to clamp */
+ bool clamp = false;
+ float *fbuffer = buffer;
+ for (unsigned int i = 0; i < samples; i++, fbuffer++)
{
- for (unsigned int i = 0; i < samples; ++i)
- if (buffer[i] < -1.0f || buffer[i] > 1.0f)
- {
- clamp = true;
- break;
- }
+ if (*fbuffer < -1.0f || *fbuffer > 1.0f)
+ {
+ clamp = true;
+ break;
+ }
}
/* if there were no samples outside of the range, don't clamp the buffer */
@@ -1004,8 +1028,9 @@ int CSoftAE::RunOutputStage(bool hasAudio)
if (m_convertFn)
{
const unsigned int convertedBytes = m_sinkFormat.m_frames * m_sinkFormat.m_frameSize;
- AllocateConvIfNeeded(convertedBytes);
- m_convertFn((float*)data, needSamples, m_converted);
+ AllocateConvIfNeeded(convertedBytes, !hasAudio);
+ if (hasAudio)
+ m_convertFn((float*)data, needSamples, m_converted);
data = m_converted;
}
@@ -1042,8 +1067,9 @@ int CSoftAE::RunRawOutputStage(bool hasAudio)
* tell it the needed format from here, so do it here for now (better than
* nothing)...
*/
- AllocateConvIfNeeded(m_sinkBlockSize);
- Endian_Swap16_buf((uint16_t *)m_converted, (uint16_t *)data, m_sinkBlockSize / 2);
+ AllocateConvIfNeeded(m_sinkBlockSize, !hasAudio);
+ if (hasAudio)
+ Endian_Swap16_buf((uint16_t *)m_converted, (uint16_t *)data, m_sinkBlockSize / 2);
data = m_converted;
}
@@ -1076,17 +1102,10 @@ int CSoftAE::RunTranscodeStage(bool hasAudio)
if (m_convertFn)
{
unsigned int newsize = m_encoderFormat.m_frames * m_encoderFormat.m_frameSize;
- if (m_convertedSize < newsize)
- {
- _aligned_free(m_converted);
- m_converted = (uint8_t *)_aligned_malloc(newsize, 16);
- m_convertedSize = newsize;
- }
- m_convertFn(
- (float*)m_buffer.Raw(block),
- m_encoderFormat.m_frames * m_encoderFormat.m_channelLayout.Count(),
- m_converted
- );
+ AllocateConvIfNeeded(newsize, !hasAudio);
+ if (hasAudio)
+ m_convertFn((float*)m_buffer.Raw(block),
+ m_encoderFormat.m_frames * m_encoderFormat.m_channelLayout.Count(), m_converted);
buffer = m_converted;
}
else
@@ -1169,16 +1188,17 @@ unsigned int CSoftAE::RunRawStreamStage(unsigned int channelCount, void *out, bo
unsigned int CSoftAE::RunStreamStage(unsigned int channelCount, void *out, bool &restart)
{
+ // no point doing anything if we have no streams,
+ // we do not have to take a lock just to check empty
+ if (m_playingStreams.empty())
+ return 0;
+
float *dst = (float*)out;
unsigned int mixed = 0;
/* identify the master stream */
CSingleLock streamLock(m_streamLock);
- /* no point doing anything if we have no streams */
- if (m_playingStreams.empty())
- return mixed;
-
/* mix in any running streams */
StreamList resumeStreams;
for (StreamList::iterator itt = m_playingStreams.begin(); itt != m_playingStreams.end(); ++itt)
@@ -1199,23 +1219,8 @@ unsigned int CSoftAE::RunStreamStage(unsigned int channelCount, void *out, bool
else
#endif
{
- /* unrolled loop for performance */
- unsigned int blocks = channelCount & ~0x3;
- unsigned int i = 0;
- for (i = 0; i < blocks; i += 4)
- {
- dst[i+0] += frame[i+0] * volume;
- dst[i+1] += frame[i+1] * volume;
- dst[i+2] += frame[i+2] * volume;
- dst[i+3] += frame[i+3] * volume;
- }
-
- switch (channelCount & 0x3)
- {
- case 3: dst[i] += frame[i] * volume; ++i;
- case 2: dst[i] += frame[i] * volume; ++i;
- case 1: dst[i] += frame[i] * volume;
- }
+ for (unsigned int i = 0; i < channelCount; ++i)
+ *dst++ += *frame++ * volume;
}
++mixed;
View
4 xbmc/cores/AudioEngine/Engines/SoftAE/SoftAE.h
@@ -130,6 +130,7 @@ class CSoftAE : public IThreadedAE
/* internal vars */
bool m_running, m_reOpen;
CEvent m_reOpenEvent;
+ CEvent m_wake;
CCriticalSection m_runningLock; /* released when the thread exits */
CCriticalSection m_streamLock; /* m_streams lock */
@@ -150,6 +151,7 @@ class CSoftAE : public IThreadedAE
float m_sinkFormatSampleRateMul;
float m_sinkFormatFrameSizeMul;
unsigned int m_sinkBlockSize;
+ bool m_sinkHandlesVolume;
AEAudioFormat m_encoderFormat;
float m_encoderFrameSizeMul;
unsigned int m_bytesPerSample;
@@ -186,7 +188,7 @@ class CSoftAE : public IThreadedAE
uint8_t *m_converted;
size_t m_convertedSize;
- void AllocateConvIfNeeded(size_t convertedSize);
+ void AllocateConvIfNeeded(size_t convertedSize, bool prezero = false);
/* thread run stages */
View
4 xbmc/cores/AudioEngine/Engines/SoftAE/SoftAEStream.cpp
@@ -136,7 +136,9 @@ void CSoftAEStream::Initialize()
m_aeChannelLayout = AE.GetChannelLayout();
m_aeBytesPerFrame = AE_IS_RAW(m_initDataFormat) ? m_bytesPerFrame : (m_samplesPerFrame * sizeof(float));
- m_waterLevel = AE.GetSampleRate() / 2;
+ // set the waterlevel to 75 percent of the number of frames per second.
+ // this lets us drain the main buffer down further before flagging an underrun.
+ m_waterLevel = AE.GetSampleRate() - (AE.GetSampleRate() / 4);
m_refillBuffer = m_waterLevel;
m_format.m_dataFormat = useDataFormat;
View
10 xbmc/cores/AudioEngine/Interfaces/AESink.h
@@ -78,5 +78,15 @@ class IAESink
Drain the sink
*/
virtual void Drain() {};
+
+ /*
+ Indicates if sink can handle volume control.
+ */
+ virtual bool HasVolume() {return false;};
+
+ /*
+ This method sets the volume control; volume ranges from 0.0 to 1.0.
+ */
+ virtual void SetVolume(float volume) {};
};
View
1  xbmc/cores/AudioEngine/Sinks/AESinkNULL.h
@@ -42,6 +42,7 @@ class CAESinkNULL : public IAESink
virtual double GetCacheTotal () { return 0.0; }
virtual unsigned int AddPackets (uint8_t *data, unsigned int frames, bool hasAudio);
virtual void Drain ();
+
static void EnumerateDevices(AEDeviceList &devices, bool passthrough);
private:
int64_t m_ts;
View
5 xbmc/cores/AudioEngine/Utils/AEBuffer.h
@@ -132,7 +132,10 @@ class CAEBuffer
#endif
if (dst)
memcpy(dst, m_buffer, size);
- memmove(m_buffer, m_buffer + size, m_bufferSize - size);
+ // when the shift consumes everything buffered there is nothing to move;
+ // adjusting m_bufferPos below is enough.
+ if (m_bufferPos != size)
+ memmove(m_buffer, m_buffer + size, m_bufferSize - size);
m_bufferPos -= size;
}
View
229 xbmc/cores/AudioEngine/Utils/AEConvert.cpp
@@ -89,8 +89,13 @@ CAEConvert::AEConvertToFn CAEConvert::ToFloat(enum AEDataFormat dataFormat)
case AE_FMT_S24BE4: return &S24BE4_Float;
case AE_FMT_S24LE3: return &S24LE3_Float;
case AE_FMT_S24BE3: return &S24BE3_Float;
+#if defined(__ARM_NEON__)
+ case AE_FMT_S32LE : return &S32LE_Float_Neon
+ case AE_FMT_S32BE : return &S32BE_Float_Neon;
+#else
case AE_FMT_S32LE : return &S32LE_Float;
case AE_FMT_S32BE : return &S32BE_Float;
+#endif
case AE_FMT_DOUBLE: return &DOUBLE_Float;
default:
return NULL;
@@ -114,8 +119,13 @@ CAEConvert::AEConvertFrFn CAEConvert::FrFloat(enum AEDataFormat dataFormat)
case AE_FMT_S16BE : return &Float_S16BE;
case AE_FMT_S24NE4: return &Float_S24NE4;
case AE_FMT_S24NE3: return &Float_S24NE3;
+#if defined(__ARM_NEON__)
+ case AE_FMT_S32LE : return &Float_S32LE_Neon;
+ case AE_FMT_S32BE : return &Float_S32BE_Neon;
+#else
case AE_FMT_S32LE : return &Float_S32LE;
case AE_FMT_S32BE : return &Float_S32BE;
+#endif
case AE_FMT_DOUBLE: return &Float_DOUBLE;
default:
return NULL;
@@ -126,8 +136,8 @@ unsigned int CAEConvert::U8_Float(uint8_t *data, const unsigned int samples, flo
{
const float mul = 2.0f / UINT8_MAX;
- for (unsigned int i = 0; i < samples; ++i, ++data, ++dest)
- *dest = *(uint8_t*)data * mul - 1.0f;
+ for (unsigned int i = 0; i < samples; ++i)
+ *dest++ = *data++ * mul - 1.0f;
return samples;
}
@@ -136,8 +146,8 @@ unsigned int CAEConvert::S8_Float(uint8_t *data, const unsigned int samples, flo
{
const float mul = 1.0f / (INT8_MAX + 0.5f);
- for (unsigned int i = 0; i < samples; ++i, ++data, ++dest)
- *dest = *(int8_t*)data * mul;
+ for (unsigned int i = 0; i < samples; ++i)
+ *dest++ = *data++ * mul;
return samples;
}
@@ -147,7 +157,7 @@ unsigned int CAEConvert::S16LE_Float(uint8_t* data, const unsigned int samples,
static const float mul = 1.0f / (INT16_MAX + 0.5f);
#ifdef __arm__
- for (int i = 0; i < samples; i++)
+ for (unsigned int i = 0; i < samples; i++)
{
__asm__ __volatile__ (
"ldrsh r1,[%[in]] \n\t" // Read a halfword from the source address
@@ -166,8 +176,8 @@ unsigned int CAEConvert::S16LE_Float(uint8_t* data, const unsigned int samples,
dest++;
}
#else
- for (unsigned int i = 0; i < samples; ++i, data += 2, ++dest)
- *dest = Endian_SwapLE16(*(int16_t*)data) * mul;
+ for (unsigned int i = 0; i < samples; ++i, data += 2)
+ *dest++ = Endian_SwapLE16(*(int16_t*)data) * mul;
#endif
return samples;
@@ -178,7 +188,7 @@ unsigned int CAEConvert::S16BE_Float(uint8_t* data, const unsigned int samples,
static const float mul = 1.0f / (INT16_MAX + 0.5f);
#ifdef __arm__
- for (int i = 0; i < samples; i++)
+ for (unsigned int i = 0; i < samples; i++)
{
__asm__ __volatile__ (
"ldrsh r1,[%[in]] \n\t" // Read a halfword from the source address
@@ -197,8 +207,8 @@ unsigned int CAEConvert::S16BE_Float(uint8_t* data, const unsigned int samples,
dest++;
}
#else
- for (unsigned int i = 0; i < samples; ++i, data += 2, ++dest)
- *dest = Endian_SwapBE16(*(int16_t*)data) * mul;
+ for (unsigned int i = 0; i < samples; ++i, data += 2)
+ *dest++ = Endian_SwapBE16(*(int16_t*)data) * mul;
#endif
return samples;
@@ -206,40 +216,40 @@ unsigned int CAEConvert::S16BE_Float(uint8_t* data, const unsigned int samples,
unsigned int CAEConvert::S24LE4_Float(uint8_t *data, const unsigned int samples, float *dest)
{
- for (unsigned int i = 0; i < samples; ++i, ++dest, data += 4)
+ for (unsigned int i = 0; i < samples; ++i, data += 4)
{
int s = (data[2] << 24) | (data[1] << 16) | (data[0] << 8);
- *dest = (float)s * INT32_SCALE;
+ *dest++ = (float)s * INT32_SCALE;
}
return samples;
}
unsigned int CAEConvert::S24BE4_Float(uint8_t *data, const unsigned int samples, float *dest)
{
- for (unsigned int i = 0; i < samples; ++i, ++dest, data += 4)
+ for (unsigned int i = 0; i < samples; ++i, data += 4)
{
int s = (data[0] << 24) | (data[1] << 16) | (data[2] << 8);
- *dest = (float)s * INT32_SCALE;
+ *dest++ = (float)s * INT32_SCALE;
}
return samples;
}
unsigned int CAEConvert::S24LE3_Float(uint8_t *data, const unsigned int samples, float *dest)
{
- for (unsigned int i = 0; i < samples; ++i, ++dest, data += 3)
+ for (unsigned int i = 0; i < samples; ++i, data += 3)
{
int s = (data[2] << 24) | (data[1] << 16) | (data[0] << 8);
- *dest = (float)s * INT32_SCALE;
+ *dest++ = (float)s * INT32_SCALE;
}
return samples;
}
unsigned int CAEConvert::S24BE3_Float(uint8_t *data, const unsigned int samples, float *dest)
{
- for (unsigned int i = 0; i < samples; ++i, ++dest, data += 3)
+ for (unsigned int i = 0; i < samples; ++i, data += 3)
{
int s = (data[1] << 24) | (data[2] << 16) | (data[3] << 8);
- *dest = (float)s * INT32_SCALE;
+ *dest++ = (float)s * INT32_SCALE;
}
return samples;
}
@@ -249,7 +259,27 @@ unsigned int CAEConvert::S32LE_Float(uint8_t *data, const unsigned int samples,
static const float factor = 1.0f / (float)INT32_MAX;
int32_t *src = (int32_t*)data;
+ /* do this in groups of 4 to give the compiler a better chance of optimizing this */
+ for (float *end = dest + (samples & ~0x3); dest < end;)
+ {
+ *dest++ = (float)Endian_SwapLE32(*src++) * factor;
+ *dest++ = (float)Endian_SwapLE32(*src++) * factor;
+ *dest++ = (float)Endian_SwapLE32(*src++) * factor;
+ *dest++ = (float)Endian_SwapLE32(*src++) * factor;
+ }
+
+ /* process any remaining samples */
+ for (float *end = dest + (samples & 0x3); dest < end;)
+ *dest++ = (float)Endian_SwapLE32(*src++) * factor;
+
+ return samples;
+}
+
+unsigned int CAEConvert::S32LE_Float_Neon(uint8_t *data, const unsigned int samples, float *dest)
+{
#if defined(__ARM_NEON__)
+ static const float factor = 1.0f / (float)INT32_MAX;
+ int32_t *src = (int32_t*)data;
/* groups of 4 samples */
for (float *end = dest + (samples & ~0x3); dest < end; src += 4, dest += 4)
@@ -259,7 +289,7 @@ unsigned int CAEConvert::S32LE_Float(uint8_t *data, const unsigned int samples,
val = vrev64q_s32(val);
#endif
float32x4_t ret = vmulq_n_f32(vcvtq_f32_s32(val), factor);
- vst1q_f32((float32_t *)dest, ret);
+ vst1q_f32((float32_t*)dest, ret);
}
/* if there are >= 2 remaining samples */
@@ -279,33 +309,37 @@ unsigned int CAEConvert::S32LE_Float(uint8_t *data, const unsigned int samples,
if (samples & 0x1)
dest[0] = (float)src[0] * factor;
-#else /* !defined(__ARM_NEON__) */
+#endif /* defined(__ARM_NEON__) */
+ return samples;
+}
+
+unsigned int CAEConvert::S32BE_Float(uint8_t *data, const unsigned int samples, float *dest)
+{
+ static const float factor = 1.0f / (float)INT32_MAX;
+ int32_t *src = (int32_t*)data;
/* do this in groups of 4 to give the compiler a better chance of optimizing this */
- for (float *end = dest + (samples & ~0x3); dest < end; src += 4, dest += 4)
+ for (float *end = dest + (samples & ~0x3); dest < end;)
{
- dest[0] = (float)Endian_SwapLE32(src[0]) * factor;
- dest[1] = (float)Endian_SwapLE32(src[1]) * factor;
- dest[2] = (float)Endian_SwapLE32(src[2]) * factor;
- dest[3] = (float)Endian_SwapLE32(src[3]) * factor;
+ *dest++ = (float)Endian_SwapBE32(*src++) * factor;
+ *dest++ = (float)Endian_SwapBE32(*src++) * factor;
+ *dest++ = (float)Endian_SwapBE32(*src++) * factor;
+ *dest++ = (float)Endian_SwapBE32(*src++) * factor;
}
/* process any remaining samples */
- for (float *end = dest + (samples & 0x3); dest < end; ++src, ++dest)
- dest[0] = (float)Endian_SwapLE32(src[0]) * factor;
-
-#endif
+ for (float *end = dest + (samples & 0x3); dest < end;)
+ *dest++ = (float)Endian_SwapBE32(*src++) * factor;
return samples;
}
-unsigned int CAEConvert::S32BE_Float(uint8_t *data, const unsigned int samples, float *dest)
+unsigned int CAEConvert::S32BE_Float_Neon(uint8_t *data, const unsigned int samples, float *dest)
{
+#if defined(__ARM_NEON__)
static const float factor = 1.0f / (float)INT32_MAX;
int32_t *src = (int32_t*)data;
-#if defined(__ARM_NEON__)
-
/* groups of 4 samples */
for (float *end = dest + (samples & ~0x3); dest < end; src += 4, dest += 4)
{
@@ -334,31 +368,15 @@ unsigned int CAEConvert::S32BE_Float(uint8_t *data, const unsigned int samples,
if (samples & 0x1)
dest[0] = (float)src[0] * factor;
-#else /* !defined(__ARM_NEON__) */
-
- /* do this in groups of 4 to give the compiler a better chance of optimizing this */
- for (float *end = dest + (samples & ~0x3); dest < end; src += 4, dest += 4)
- {
- dest[0] = (float)Endian_SwapBE32(src[0]) * factor;
- dest[1] = (float)Endian_SwapBE32(src[1]) * factor;
- dest[2] = (float)Endian_SwapBE32(src[2]) * factor;
- dest[3] = (float)Endian_SwapBE32(src[3]) * factor;
- }
-
- /* process any remaining samples */
- for (float *end = dest + (samples & 0x3); dest < end; ++src, ++dest)
- dest[0] = (float)Endian_SwapBE32(src[0]) * factor;
-
-#endif
-
+#endif /* defined(__ARM_NEON__) */
return samples;
}
unsigned int CAEConvert::DOUBLE_Float(uint8_t *data, const unsigned int samples, float *dest)
{
double *src = (double*)data;
- for (unsigned int i = 0; i < samples; ++i, ++src, ++dest)
- *dest = CLAMP(*src / (float)INT32_MAX);
+ for (unsigned int i = 0; i < samples; ++i)
+ *dest++ = CLAMP(*src++ / (float)INT32_MAX);
return samples;
}
@@ -380,17 +398,17 @@ unsigned int CAEConvert::Float_U8(float *data, const unsigned int samples, uint8
}
const uint32_t even = count & ~0x3;
- for (uint32_t i = 0; i < even; i += 4, data += 4, dest += 4)
+ for (uint32_t i = 0; i < even; i += 4, data += 4)
{
__m128 in = _mm_mul_ps(_mm_add_ps(_mm_load_ps(data), add), mul);
__m64 con = _mm_cvtps_pi16(in);
int16_t temp[4];
memcpy(temp, &con, sizeof(temp));
- dest[0] = (uint8_t)temp[0];
- dest[1] = (uint8_t)temp[1];
- dest[2] = (uint8_t)temp[2];
- dest[3] = (uint8_t)temp[3];
+ *dest++ = (uint8_t)temp[0];
+ *dest++ = (uint8_t)temp[1];
+ *dest++ = (uint8_t)temp[2];
+ *dest++ = (uint8_t)temp[3];
}
if (count != even)
@@ -431,8 +449,8 @@ unsigned int CAEConvert::Float_U8(float *data, const unsigned int samples, uint8
}
_mm_empty();
#else /* no SSE */
- for (uint32_t i = 0; i < samples; ++i, ++data, ++dest)
- dest[0] = safeRound((data[0] + 1.0f) * ((float)INT8_MAX+.5f));
+ for (uint32_t i = 0; i < samples; ++i)
+ *dest++ = safeRound((*data++ + 1.0f) * ((float)INT8_MAX+.5f));
#endif
return samples;
@@ -490,8 +508,8 @@ unsigned int CAEConvert::Float_S8(float *data, const unsigned int samples, uint8
}
_mm_empty();
#else /* no SSE */
- for (uint32_t i = 0; i < samples; ++i, ++data, ++dest)
- dest[0] = safeRound(data[0] * ((float)INT8_MAX+.5f));
+ for (uint32_t i = 0; i < samples; ++i)
+ *dest++ = safeRound(*data++ * ((float)INT8_MAX+.5f));
#endif
return samples;
@@ -611,20 +629,20 @@ unsigned int CAEConvert::Float_S16LE(float *data, const unsigned int samples, ui
uint32_t i = 0;
uint32_t even = samples & ~0x3;
- for(; i < even; i += 4, data += 4, dst += 4)
+ for(; i < even; i += 4)
{
/* random round to dither */
float rand[4];
CAEUtil::FloatRand4(-0.5f, 0.5f, rand);
- dst[0] = Endian_SwapLE16(safeRound(data[0] * ((float)INT16_MAX + rand[0])));
- dst[1] = Endian_SwapLE16(safeRound(data[1] * ((float)INT16_MAX + rand[1])));
- dst[2] = Endian_SwapLE16(safeRound(data[2] * ((float)INT16_MAX + rand[2])));
- dst[3] = Endian_SwapLE16(safeRound(data[3] * ((float)INT16_MAX + rand[3])));
+ *dst++ = Endian_SwapLE16(safeRound(*data++ * ((float)INT16_MAX + rand[0])));
+ *dst++ = Endian_SwapLE16(safeRound(*data++ * ((float)INT16_MAX + rand[1])));
+ *dst++ = Endian_SwapLE16(safeRound(*data++ * ((float)INT16_MAX + rand[2])));
+ *dst++ = Endian_SwapLE16(safeRound(*data++ * ((float)INT16_MAX + rand[3])));
}
- for(; i < samples; ++i, ++data, ++dst)
- dst[0] = Endian_SwapLE16(safeRound(data[0] * ((float)INT16_MAX + CAEUtil::FloatRand1(-0.5f, 0.5f))));
+ for(; i < samples; ++i)
+ *dst++ = Endian_SwapLE16(safeRound(*data++ * ((float)INT16_MAX + CAEUtil::FloatRand1(-0.5f, 0.5f))));
#endif
@@ -745,20 +763,20 @@ unsigned int CAEConvert::Float_S16BE(float *data, const unsigned int samples, ui
uint32_t i = 0;
uint32_t even = samples & ~0x3;
- for(; i < even; i += 4, data += 4, dst += 4)
+ for(; i < even; i += 4)
{
/* random round to dither */
float rand[4];
CAEUtil::FloatRand4(-0.5f, 0.5f, rand);
- dst[0] = Endian_SwapBE16(safeRound(data[0] * ((float)INT16_MAX + rand[0])));
- dst[1] = Endian_SwapBE16(safeRound(data[1] * ((float)INT16_MAX + rand[1])));
- dst[2] = Endian_SwapBE16(safeRound(data[2] * ((float)INT16_MAX + rand[2])));
- dst[3] = Endian_SwapBE16(safeRound(data[3] * ((float)INT16_MAX + rand[3])));
+ *dst++ = Endian_SwapBE16(safeRound(*data++ * ((float)INT16_MAX + rand[0])));
+ *dst++ = Endian_SwapBE16(safeRound(*data++ * ((float)INT16_MAX + rand[1])));
+ *dst++ = Endian_SwapBE16(safeRound(*data++ * ((float)INT16_MAX + rand[2])));
+ *dst++ = Endian_SwapBE16(safeRound(*data++ * ((float)INT16_MAX + rand[3])));
}
- for(; i < samples; ++i, ++data, ++dst)
- dst[0] = Endian_SwapBE16(safeRound(data[0] * ((float)INT16_MAX + CAEUtil::FloatRand1(-0.5f, 0.5f))));
+ for(; i < samples; ++i, data++, dst++)
+ *dst++ = Endian_SwapBE16(safeRound(*data++ * ((float)INT16_MAX + CAEUtil::FloatRand1(-0.5f, 0.5f))));
#endif
@@ -819,8 +837,8 @@ unsigned int CAEConvert::Float_S24NE4(float *data, const unsigned int samples, u
}
_mm_empty();
#else /* no SSE */
- for (uint32_t i = 0; i < samples; ++i, ++data, ++dst)
- *dst = (safeRound(*data * ((float)INT24_MAX+.5f)) & 0xFFFFFF) << 8;
+ for (uint32_t i = 0; i < samples; ++i)
+ *dst++ = (safeRound(*data++ * ((float)INT24_MAX+.5f)) & 0xFFFFFF) << 8;
#endif
return samples << 2;
@@ -966,9 +984,23 @@ unsigned int CAEConvert::Float_S32LE(float *data, const unsigned int samples, ui
}
}
_mm_empty();
+ #else
+
+ /* no SIMD */
+ for (uint32_t i = 0; i < samples; ++i, ++data, ++dst)
+ {
+ dst[0] = safeRound(data[0] * (float)INT32_MAX);
+ dst[0] = Endian_SwapLE32(dst[0]);
+ }
+ #endif
+ return samples << 2;
+}
- #elif defined(__ARM_NEON__)
+unsigned int CAEConvert::Float_S32LE_Neon(float *data, const unsigned int samples, uint8_t *dest)
+{
+#if defined(__ARM_NEON__)
+ int32_t *dst = (int32_t*)dest;
for (float *end = data + (samples & ~0x3); data < end; data += 4, dst += 4)
{
float32x4_t val = vmulq_n_f32(vld1q_f32((const float32_t *)data), INT32_MAX);
@@ -996,17 +1028,7 @@ unsigned int CAEConvert::Float_S32LE(float *data, const unsigned int samples, ui
dst[0] = safeRound(data[0] * (float)INT32_MAX);
dst[0] = Endian_SwapLE32(dst[0]);
}
-
- #else
-
- /* no SIMD */
- for (uint32_t i = 0; i < samples; ++i, ++data, ++dst)
- {
- dst[0] = safeRound(data[0] * (float)INT32_MAX);
- dst[0] = Endian_SwapLE32(dst[0]);
- }
- #endif
-
+#endif
return samples << 2;
}
@@ -1071,9 +1093,22 @@ unsigned int CAEConvert::Float_S32BE(float *data, const unsigned int samples, ui
}
}
_mm_empty();
+ #else
+ /* no SIMD */
+ for (uint32_t i = 0; i < samples; ++i, ++data, ++dst)
+ {
+ dst[0] = safeRound(data[0] * (float)INT32_MAX);
+ dst[0] = Endian_SwapBE32(dst[0]);
+ }
+ #endif
- #elif defined(__ARM_NEON__)
+ return samples << 2;
+}
+unsigned int CAEConvert::Float_S32BE_Neon(float *data, const unsigned int samples, uint8_t *dest)
+{
+#if defined(__ARM_NEON__)
+ int32_t *dst = (int32_t*)dest;
for (float *end = data + (samples & ~0x3); data < end; data += 4, dst += 4)
{
float32x4_t val = vmulq_n_f32(vld1q_f32((const float32_t *)data), INT32_MAX);
@@ -1101,25 +1136,15 @@ unsigned int CAEConvert::Float_S32BE(float *data, const unsigned int samples, ui
dst[0] = safeRound(data[0] * (float)INT32_MAX);
dst[0] = Endian_SwapBE32(dst[0]);
}
-
- #else
-
- /* no SIMD */
- for (uint32_t i = 0; i < samples; ++i, ++data, ++dst)
- {
- dst[0] = safeRound(data[0] * (float)INT32_MAX);
- dst[0] = Endian_SwapBE32(dst[0]);
- }
- #endif
-
+#endif
return samples << 2;
}
unsigned int CAEConvert::Float_DOUBLE(float *data, const unsigned int samples, uint8_t *dest)
{
double *dst = (double*)dest;
- for (unsigned int i = 0; i < samples; ++i, ++data, ++dst)
- *dst = *data;
+ for (unsigned int i = 0; i < samples; ++i)
+ *dst++ = *data++;
return samples * sizeof(double);
}
View
6 xbmc/cores/AudioEngine/Utils/AEConvert.h
@@ -46,6 +46,12 @@ class CAEConvert{
static unsigned int Float_S32LE (float *data, const unsigned int samples, uint8_t *dest);
static unsigned int Float_S32BE (float *data, const unsigned int samples, uint8_t *dest);
static unsigned int Float_DOUBLE(float *data, const unsigned int samples, uint8_t *dest);
+
+ static unsigned int S32LE_Float_Neon (uint8_t *data, const unsigned int samples, float *dest);
+ static unsigned int S32BE_Float_Neon (uint8_t *data, const unsigned int samples, float *dest);
+ static unsigned int Float_S32LE_Neon (float *data, const unsigned int samples, uint8_t *dest);
+ static unsigned int Float_S32BE_Neon (float *data, const unsigned int samples, uint8_t *dest);
+
public:
typedef unsigned int (*AEConvertToFn)(uint8_t *data, const unsigned int samples, float *dest);
typedef unsigned int (*AEConvertFrFn)(float *data, const unsigned int samples, uint8_t *dest);
View
44 xbmc/cores/AudioEngine/Utils/AERemap.cpp
@@ -290,19 +290,20 @@ void CAERemap::Remap(float * const in, float * const out, const unsigned int fra
if (!info->in_dst)
{
unsigned int f = 0;
+ unsigned int odx = 0;
for(; f < frameBlocks; f += 4)
{
- out[((f + 0) * m_outChannels) + o] = 0.0f;
- out[((f + 1) * m_outChannels) + o] = 0.0f;
- out[((f + 2) * m_outChannels) + o] = 0.0f;
- out[((f + 3) * m_outChannels) + o] = 0.0f;
+ out[odx + o] = 0.0f, odx += m_outChannels;
+ out[odx + o] = 0.0f, odx += m_outChannels;
+ out[odx + o] = 0.0f, odx += m_outChannels;
+ out[odx + o] = 0.0f, odx += m_outChannels;
}
switch (frames & 0x3)
{
- case 3: out[(f * m_outChannels) + o] = 0.0f; ++f;
- case 2: out[(f * m_outChannels) + o] = 0.0f; ++f;
- case 1: out[(f * m_outChannels) + o] = 0.0f;
+ case 3: out[odx + o] = 0.0f, odx += m_outChannels;
+ case 2: out[odx + o] = 0.0f, odx += m_outChannels;
+ case 1: out[odx + o] = 0.0f;
}
continue;
}
@@ -311,20 +312,23 @@ void CAERemap::Remap(float * const in, float * const out, const unsigned int fra
if (info->srcCount == 1)
{
unsigned int f = 0;
+ unsigned int idx = 0;
+ unsigned int odx = 0;
+ unsigned int srcIndex = info->srcIndex[0].index;
/* the compiler has a better chance of optimizing this if it is done in parallel */
for (; f < frameBlocks; f += 4)
{
- out[((f + 0) * m_outChannels) + o] = in[((f + 0) * m_inChannels) + info->srcIndex[0].index];
- out[((f + 1) * m_outChannels) + o] = in[((f + 1) * m_inChannels) + info->srcIndex[0].index];
- out[((f + 2) * m_outChannels) + o] = in[((f + 2) * m_inChannels) + info->srcIndex[0].index];
- out[((f + 3) * m_outChannels) + o] = in[((f + 3) * m_inChannels) + info->srcIndex[0].index];
+ out[odx + o] = in[idx + srcIndex], idx += m_inChannels, odx += m_outChannels;
+ out[odx + o] = in[idx + srcIndex], idx += m_inChannels, odx += m_outChannels;
+ out[odx + o] = in[idx + srcIndex], idx += m_inChannels, odx += m_outChannels;
+ out[odx + o] = in[idx + srcIndex], idx += m_inChannels, odx += m_outChannels;
}
switch (frames & 0x3)
{
- case 3: out[(f * m_outChannels) + o] = in[(f * m_inChannels) + info->srcIndex[0].index]; ++f;
- case 2: out[(f * m_outChannels) + o] = in[(f * m_inChannels) + info->srcIndex[0].index]; ++f;
- case 1: out[(f * m_outChannels) + o] = in[(f * m_inChannels) + info->srcIndex[0].index];
+ case 3: out[odx + o] = in[idx + srcIndex], idx += m_inChannels, odx += m_outChannels;
+ case 2: out[odx + o] = in[idx + srcIndex], idx += m_inChannels, odx += m_outChannels;
+ case 1: out[odx + o] = in[idx + srcIndex];
}
}
else
@@ -341,17 +345,17 @@ void CAERemap::Remap(float * const in, float * const out, const unsigned int fra
int i = 0;
for (; i < blocks; i += 4)
{
- *outOffset += inOffset[info->srcIndex[i + 0].index] * info->srcIndex[i + 0].level;
- *outOffset += inOffset[info->srcIndex[i + 1].index] * info->srcIndex[i + 1].level;
- *outOffset += inOffset[info->srcIndex[i + 2].index] * info->srcIndex[i + 2].level;
- *outOffset += inOffset[info->srcIndex[i + 3].index] * info->srcIndex[i + 3].level;
+ *outOffset += inOffset[info->srcIndex[i].index] * info->srcIndex[i].level, i++;
+ *outOffset += inOffset[info->srcIndex[i].index] * info->srcIndex[i].level, i++;
+ *outOffset += inOffset[info->srcIndex[i].index] * info->srcIndex[i].level, i++;
+ *outOffset += inOffset[info->srcIndex[i].index] * info->srcIndex[i].level, i++;
}
/* unrolled loop for higher performance */
switch (info->srcCount & 0x3)
{
- case 3: *outOffset += inOffset[info->srcIndex[i].index] * info->srcIndex[i].level; ++i;
- case 2: *outOffset += inOffset[info->srcIndex[i].index] * info->srcIndex[i].level; ++i;
+ case 3: *outOffset += inOffset[info->srcIndex[i].index] * info->srcIndex[i].level, i++;
+ case 2: *outOffset += inOffset[info->srcIndex[i].index] * info->srcIndex[i].level, i++;
case 1: *outOffset += inOffset[info->srcIndex[i].index] * info->srcIndex[i].level;
}
}
Please sign in to comment.
Something went wrong with that request. Please try again.