Skip to content

Commit

Permalink
Linux/MacOS: Greatly improve performance (#370)
Browse files Browse the repository at this point in the history
std::unordered_set is implemented as a flat hashtable on libstdc++, which makes clearing expensive because it invokes memset on the entire table. To get the best performance across all platforms, this replaces the unordered_set with a custom high-performance sparse bitset.
  • Loading branch information
Exzap committed Oct 14, 2022
1 parent a19ed46 commit ada8bbb
Showing 1 changed file with 65 additions and 8 deletions.
73 changes: 65 additions & 8 deletions src/Cafe/HW/Latte/Core/LatteBufferCache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1005,8 +1005,67 @@ void LatteBufferCache_getStats(uint32& heapSize, uint32& allocationSize, uint32&
}

FSpinlock g_spinlockDCFlushQueue;
std::unordered_set<uint32>* g_DCFlushQueue = new std::unordered_set<uint32>(); // queued pages
std::unordered_set<uint32>* g_DCFlushQueueAlternate = new std::unordered_set<uint32>();

// Set of uint32 indices optimized for cheap clearing.
//
// Indices are distributed over TABLE_SIZE buckets selected by the low bits of the index
// (index & TABLE_MASK). A side list records which buckets currently hold at least one entry,
// so Empty(), Clear() and ForAllAndClear() only touch buckets that are actually in use
// instead of walking the whole table.
//
// Non-empty buckets are tracked by bucket index rather than by pointer, so the implicitly
// generated copy/move operations yield an independent object that refers to its own buckets.
//
// The class performs no synchronization of its own.
class SparseBitset
{
	static inline constexpr size_t TABLE_MASK = 0xFF;
	static inline constexpr size_t TABLE_SIZE = TABLE_MASK + 1;

public:
	// Returns true when no index is stored.
	bool Empty() const
	{
		return m_numNonEmptyBuckets == 0;
	}

	// Adds index to the set. Adding an index that is already present is a no-op.
	void Set(uint32 index)
	{
		const size_t bucketIndex = index & TABLE_MASK;
		auto& bucket = m_bits[bucketIndex];
		if (std::find(bucket.cbegin(), bucket.cend(), index) != bucket.cend())
			return; // already present
		if (bucket.empty())
		{
			// first entry of this bucket: register it in the list of buckets to visit/clear.
			// A bucket is registered at most once, so the list can never exceed TABLE_SIZE entries
			m_nonEmptyBuckets[m_numNonEmptyBuckets] = bucketIndex;
			m_numNonEmptyBuckets++;
		}
		bucket.emplace_back(index);
	}

	// Invokes callbackFunc(index) once for every stored index, then removes all indices.
	// Buckets are visited in the order they first became non-empty and entries within a
	// bucket in insertion order.
	// callbackFunc must not call Set() on this same object: the bucket being iterated could
	// grow and invalidate the iteration.
	template<typename TFunc>
	void ForAllAndClear(TFunc callbackFunc)
	{
		for (size_t i = 0; i < m_numNonEmptyBuckets; i++)
		{
			auto& bucket = m_bits[m_nonEmptyBuckets[i]];
			for (const auto& it : bucket)
				callbackFunc(it);
			bucket.clear(); // keeps the capacity, so refilling does not need to reallocate
		}
		m_numNonEmptyBuckets = 0;
	}

	// Removes all indices without visiting them.
	void Clear()
	{
		ForAllAndClear([](uint32) {});
	}

private:
	std::vector<uint32> m_bits[TABLE_SIZE]; // one bucket of indices per (index & TABLE_MASK)
	size_t m_nonEmptyBuckets[TABLE_SIZE]{}; // bucket indices currently in use; only the first m_numNonEmptyBuckets entries are meaningful
	size_t m_numNonEmptyBuckets{ 0 };
};

// Queue that LatteBufferCache_notifyDCFlush adds flushed pages to while holding g_spinlockDCFlushQueue.
SparseBitset* s_DCFlushQueue = new SparseBitset();
// Second queue: LatteBufferCache_processDCFlushQueue swaps it with s_DCFlushQueue under the lock and then drains it after releasing the lock.
SparseBitset* s_DCFlushQueueAlternate = new SparseBitset();

void LatteBufferCache_notifyDCFlush(MPTR address, uint32 size)
{
Expand All @@ -1017,20 +1076,18 @@ void LatteBufferCache_notifyDCFlush(MPTR address, uint32 size)
uint32 lastPage = (address + size - 1) / CACHE_PAGE_SIZE;
g_spinlockDCFlushQueue.acquire();
for (uint32 i = firstPage; i <= lastPage; i++)
g_DCFlushQueue->emplace(i);
s_DCFlushQueue->Set(i);
g_spinlockDCFlushQueue.release();
}

// Invalidates every page that was queued by LatteBufferCache_notifyDCFlush.
// The active queue is swapped with the alternate queue while holding g_spinlockDCFlushQueue;
// the swapped-out queue is then walked and cleared after the lock has been released.
// NOTE(review): this span is a rendered diff, so the pre-change statements (g_DCFlushQueue, std::unordered_set)
// and the post-change statements (s_DCFlushQueue, SparseBitset) appear interleaved.
void LatteBufferCache_processDCFlushQueue()
{
if (g_DCFlushQueue->empty()) // accessing this outside of the lock is technically undefined/unsafe behavior but on all known implementations this is fine and we can avoid the spinlock
if (s_DCFlushQueue->Empty()) // quick check to avoid locking if there is no work to do
return;
// only the pointer swap happens under the lock; page invalidation below runs without holding it
g_spinlockDCFlushQueue.acquire();
std::swap(g_DCFlushQueue, g_DCFlushQueueAlternate);
std::swap(s_DCFlushQueue, s_DCFlushQueueAlternate);
g_spinlockDCFlushQueue.release();
for (auto& itr : *g_DCFlushQueueAlternate)
LatteBufferCache_invalidatePage(itr * CACHE_PAGE_SIZE);
g_DCFlushQueueAlternate->clear();
// queued values are page indices; multiply by CACHE_PAGE_SIZE to get the address passed to LatteBufferCache_invalidatePage
s_DCFlushQueueAlternate->ForAllAndClear([](uint32 index) {LatteBufferCache_invalidatePage(index * CACHE_PAGE_SIZE); });
}

void LatteBufferCache_notifyDrawDone()
Expand Down

1 comment on commit ada8bbb

@Docmine17
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, this has greatly improved the performance on my Fedora.

Please sign in to comment.