Skip to content

Cache Hit Ratio Benchmark

Hüseyin Tuğrul BÜYÜKIŞIK edited this page Mar 16, 2021 · 1 revision

Active pages are updated with Least-Recently-Used eviction algorithm. This lets all recently visited pages remain in cache and be as fast as RAM or L1,L2,L3 caches depending on size of LRU cache and the access pattern. All benchmarks were run on FX8150 3.6GHz + 3 low end graphics cards with pcie v2.0 4x-8x-4x setup + 4GB DDR3 RAM (single channel, 1333MHz)

64 Threads, 60kB Object Size, Random-Access:

  • the first speed-up over the pcie bridge is seen when the randomly-accessed data set size equals the virtual array's cache size
  • second speed-up starts from fully L3-sized random-accesses
  • L2 cache of CPU is 2MB per module so there is another speedup on x=2MB point
  • For this data set only, page-lock contention decreases bandwidth before full L1-cache (384kB) efficiency is reached

Single Thread, 60kB Object Size, Random-Access:

  • without page-locking contention, performance only increases with decreased data set size or increased LRU cache size
  • This is also visible when multiple CPU threads access different virtual arrays

Source code for benchmark:

#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"
#include "CpuBenchmarker.h"

// testing
#include <random>
#include <iostream>
#include "omp.h"
#include <fstream>
#include <cstring>

constexpr bool TEST_BANDWIDTH=true;
constexpr bool TEST_LATENCY=false;
constexpr bool testType = TEST_BANDWIDTH;

constexpr int TEST_OBJ_SIZE = 1024*60;

/// Test payload occupying TEST_OBJ_SIZE bytes (data buffer + id).
/// The constructor stamps a marker byte at a position derived from id so
/// getId() can verify the payload survived a round trip through the
/// virtual array.
class Object
{
public:
	/// Sentinel object: id = -1 means "never written"; data stays uninitialized.
	Object():id(-1){}

	/// Tags the payload with id p and writes the marker byte.
	/// Negative ids skip the marker write — the original indexed
	/// data[id%size] with a negative remainder, which is out-of-bounds UB.
	Object(int p):id(p){
		constexpr int size = TEST_OBJ_SIZE-sizeof(int);
		if(id>=0)
			data[id%size]='A';
	}

	/// Returns id if the marker byte is intact, -1 otherwise.
	/// (Dropped the meaningless top-level const on the by-value return.)
	int getId() const {
		constexpr int size = TEST_OBJ_SIZE-sizeof(int);
		if(id<0)
			return -1; // default-constructed/invalid: never read data[-1]
		if(data[id%size]=='A')
			return id;
		else
			return -1;
	}

	Object(const Object& o)=default;
	Object(Object && o)=default;
	Object& operator = (Object && o)=default;
	Object& operator = (const Object& o)=default;

private:

	char data[TEST_OBJ_SIZE-sizeof(int)];
	int id;
};

/// Random-access bandwidth benchmark: 64 threads fetch single Objects at
/// uniformly random indices from a working set that shrinks each iteration,
/// so the LRU hit ratio climbs step by step.
int main(int argc, const char * argv[])
{
	const size_t numThr = 64;
	std::vector<int> numLRU = {5,5,5}; // 5 OpenCL data channels + 5 LRU caches per physical GPU (development computer has gt1030 + k420 + k420)
	int totalLRUs = 0;
	for(const auto& e:numLRU)
		totalLRUs += e;
	const long long pageSize = 1; // cache line size (elements)
	const int pagesPerLRU = 50; // cache lines
	int numElementsForAllLRUs = totalLRUs*pagesPerLRU * pageSize;
	const long long n = numElementsForAllLRUs*100; // total elements = 100x cache capacity
	std::cout<<"total array size = "<<n*sizeof(Object)<<" bytes"<<std::endl;
	std::cout<<"total cache size = "<<numElementsForAllLRUs*sizeof(Object)<<" bytes"<<std::endl;
	VirtualMultiArray<Object> test(n,GraphicsCardSupplyDepot().requestGpus(),pageSize,pagesPerLRU,numLRU);


	// init: write every element so the reads below can be validated
	std::cout<<"init..."<<std::endl;

	// simple (x,y) sample for the two output graphs
	struct Graph2D
	{
		float x;
		float y;
		Graph2D():x(0),y(0){}
		Graph2D(float a, float b):x(a),y(b){}
	};
	std::vector<Graph2D> log1; // hit ratio (%) vs bandwidth (MB/s)
	std::vector<Graph2D> log2; // data set size (MB) vs bandwidth (MB/s)
	#pragma omp parallel for num_threads(64)
	for(long long j=0;j<n;j++)
	{
			test.set(j,Object(j));
	}
	std::cout<<"...complete"<<std::endl;


	// benchmark: shrink the randomly-accessed working set by 5% per step
	double limit = (numElementsForAllLRUs/100>=2)?(numElementsForAllLRUs/100):2;
	for(double size = n-1; size>=limit; size*=0.95)
	{
		double hitRatio = (numElementsForAllLRUs / (double)size)*(100.0); // cache capacity as % of working set
		int numTestsPerThread = 8000/numThr;

		// label: hit rate while cache < data set, cache/data ratio afterwards
		std::string benchString;
		if(hitRatio<100.001)
		{
			benchString = std::string("hit-rate=")+std::to_string(hitRatio)+std::string("%");
		}
		else
		{
			benchString = std::string("cache size=")+std::to_string((hitRatio/100.0))+std::string("x of data set");
		}

		double seconds = 0;

		{
			CpuBenchmarker bench(numThr*numTestsPerThread * sizeof(Object),benchString);
			bench.addTimeWriteTarget(&seconds);

			#pragma omp parallel for num_threads(numThr)
			for(int i=0;i<(int)numThr;i++) // cast: avoid signed/unsigned comparison with size_t numThr
			{
				// per-thread RNG so threads don't share generator state
				std::random_device rd;
				std::mt19937 rng(rd());
				std::uniform_real_distribution<float> rnd(0,size);
				for(int k=0;k<numTestsPerThread;k++)
				{
					int rndIndex = rnd(rng);
					const Object obj = test.get(rndIndex);
					// NOTE(review): a throw escaping an OpenMP parallel region
					// is not defined behavior; kept as a hard-failure signal
					// since it should never fire.
					if(obj.getId()!=rndIndex)
					{
						throw std::invalid_argument("Error: index != data");
					}
				}
			}
		}
		// MB/s per hit ratio
		log1.push_back(Graph2D(hitRatio,(numThr*numTestsPerThread * sizeof(Object)/seconds)/1000000.0));

		// MB/s per data set size
		log2.push_back(Graph2D(size*sizeof(Object)/1000000.0,(numThr*numTestsPerThread * sizeof(Object)/seconds)/1000000.0));
	}

	// append both graphs as "x y" text lines
	std::ofstream logFile("logfileHitRatioVsBandwidth.txt", std::ios_base::app | std::ios_base::out);
	for(const auto& point:log1)
	{
		std::string line;
		line += std::to_string(point.x);
		line += " ";
		line += std::to_string(point.y);
		line += "\r\n";
		logFile<<line;
	}
	std::ofstream logFile2("logfileDataSetSizeVsBandwidth.txt", std::ios_base::app | std::ios_base::out);
	for(const auto& point:log2)
	{
		std::string line;
		line += std::to_string(point.x);
		line += " ";
		line += std::to_string(point.y);
		line += "\r\n";
		logFile2<<line;
	}
	return 0;
}

When doing only scalar accesses on small objects (less than 4kB), the LRU overhead and page locking make object copying less efficient.

64 Threads, 64-Byte Object Size, Scalar Random Access

Single Thread, 64-Byte Object Size, Scalar Random Access

Source Code:

#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"
#include "CpuBenchmarker.h"

// testing
#include <random>
#include <iostream>
#include "omp.h"
#include <fstream>
#include <cstring>

constexpr bool TEST_BANDWIDTH=true;
constexpr bool TEST_LATENCY=false;
constexpr bool testType = TEST_BANDWIDTH;

constexpr int TEST_OBJ_SIZE = 64;

/// Test payload occupying TEST_OBJ_SIZE bytes (data buffer + id).
/// The constructor stamps a marker byte at a position derived from id so
/// getId() can verify the payload survived a round trip through the
/// virtual array.
class Object
{
public:
	/// Sentinel object: id = -1 means "never written"; data stays uninitialized.
	Object():id(-1){}

	/// Tags the payload with id p and writes the marker byte.
	/// Negative ids skip the marker write — the original indexed
	/// data[id%size] with a negative remainder, which is out-of-bounds UB.
	Object(int p):id(p){
		constexpr int size = TEST_OBJ_SIZE-sizeof(int);
		if(id>=0)
			data[id%size]='A';
	}

	/// Returns id if the marker byte is intact, -1 otherwise.
	/// (Dropped the meaningless top-level const on the by-value return.)
	int getId() const {
		constexpr int size = TEST_OBJ_SIZE-sizeof(int);
		if(id<0)
			return -1; // default-constructed/invalid: never read data[-1]
		if(data[id%size]=='A')
			return id;
		else
			return -1;
	}

	Object(const Object& o)=default;
	Object(Object && o)=default;
	Object& operator = (Object && o)=default;
	Object& operator = (const Object& o)=default;

private:

	char data[TEST_OBJ_SIZE-sizeof(int)];
	int id;
};

/// Scalar random-access benchmark for small (64-byte) objects: 64 threads
/// fetch single Objects at uniformly random indices from a working set that
/// shrinks each iteration, so the LRU hit ratio climbs step by step.
int main(int argc, const char * argv[])
{
	const size_t numThr = 64;
	std::vector<int> numLRU = {10,10,10}; // 10 OpenCL data channels + 10 LRU caches per physical GPU (development computer has gt1030 + k420 + k420)
	int totalLRUs = 0;
	for(const auto& e:numLRU)
		totalLRUs += e;
	const long long pageSize = 40; // cache line size (elements)
	const int pagesPerLRU = 500; // cache lines
	int numElementsForAllLRUs = totalLRUs*pagesPerLRU * pageSize;
	const long long n = numElementsForAllLRUs*100; // total elements = 100x cache capacity
	std::cout<<"total array size = "<<n*sizeof(Object)<<" bytes"<<std::endl;
	std::cout<<"total cache size = "<<numElementsForAllLRUs*sizeof(Object)<<" bytes"<<std::endl;
	VirtualMultiArray<Object> test(n,GraphicsCardSupplyDepot().requestGpus(),pageSize,pagesPerLRU,numLRU);


	// init: write every element so the reads below can be validated
	std::cout<<"init..."<<std::endl;

	// simple (x,y) sample for the two output graphs
	struct Graph2D
	{
		float x;
		float y;
		Graph2D():x(0),y(0){}
		Graph2D(float a, float b):x(a),y(b){}
	};
	std::vector<Graph2D> log1; // hit ratio (%) vs bandwidth (MB/s)
	std::vector<Graph2D> log2; // data set size (MB) vs bandwidth (MB/s)
	#pragma omp parallel for num_threads(64)
	for(long long j=0;j<n;j++)
	{
			test.set(j,Object(j));
	}
	std::cout<<"...complete"<<std::endl;


	// benchmark: shrink the randomly-accessed working set by 5% per step
	double limit = (numElementsForAllLRUs/100>=2)?(numElementsForAllLRUs/100):2;
	for(double size = n-1; size>=limit; size*=0.95)
	{
		double hitRatio = (numElementsForAllLRUs / (double)size)*(100.0); // cache capacity as % of working set
		int numTestsPerThread = 8000/numThr;

		// label: hit rate while cache < data set, cache/data ratio afterwards
		std::string benchString;
		if(hitRatio<100.001)
		{
			benchString = std::string("hit-rate=")+std::to_string(hitRatio)+std::string("%");
		}
		else
		{
			benchString = std::string("cache size=")+std::to_string((hitRatio/100.0))+std::string("x of data set");
		}

		double seconds = 0;

		{
			CpuBenchmarker bench(numThr*numTestsPerThread * sizeof(Object),benchString);
			bench.addTimeWriteTarget(&seconds);

			#pragma omp parallel for num_threads(numThr)
			for(int i=0;i<(int)numThr;i++) // cast: avoid signed/unsigned comparison with size_t numThr
			{
				// per-thread RNG so threads don't share generator state
				std::random_device rd;
				std::mt19937 rng(rd());
				std::uniform_real_distribution<float> rnd(0,size);
				for(int k=0;k<numTestsPerThread;k++)
				{
					int rndIndex = rnd(rng);
					const Object obj = test.get(rndIndex);
					// NOTE(review): a throw escaping an OpenMP parallel region
					// is not defined behavior; kept as a hard-failure signal
					// since it should never fire.
					if(obj.getId()!=rndIndex)
					{
						throw std::invalid_argument("Error: index != data");
					}
				}
			}
		}
		// MB/s per hit ratio
		log1.push_back(Graph2D(hitRatio,(numThr*numTestsPerThread * sizeof(Object)/seconds)/1000000.0));

		// MB/s per data set size
		log2.push_back(Graph2D(size*sizeof(Object)/1000000.0,(numThr*numTestsPerThread * sizeof(Object)/seconds)/1000000.0));
	}

	// append both graphs as "x y" text lines
	std::ofstream logFile("logfileHitRatioVsBandwidth.txt", std::ios_base::app | std::ios_base::out);
	for(const auto& point:log1)
	{
		std::string line;
		line += std::to_string(point.x);
		line += " ";
		line += std::to_string(point.y);
		line += "\r\n";
		logFile<<line;
	}
	std::ofstream logFile2("logfileDataSetSizeVsBandwidth.txt", std::ios_base::app | std::ios_base::out);
	for(const auto& point:log2)
	{
		std::string line;
		line += std::to_string(point.x);
		line += " ";
		line += std::to_string(point.y);
		line += "\r\n";
		logFile2<<line;
	}
	return 0;
}

Sequential Access, 16 Threads, 2500 Elements Per Page, 4000 Elements Bulk Access (Partially Overlapped between accesses per thread)

  • Sequential access works better with bigger page size
  • Bulk access works better with bigger page size
  • LRU caching benefits multiple threads as long as number of LRU pages / access size is enough to hold data for all threads

Source Code:

#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"
#include "CpuBenchmarker.h"

// testing
#include <random>
#include <iostream>
#include "omp.h"
#include <fstream>
#include <cstring>

constexpr bool TEST_BANDWIDTH=true;
constexpr bool TEST_LATENCY=false;
constexpr bool testType = TEST_BANDWIDTH;

constexpr int TEST_OBJ_SIZE = 64;

/// Test payload occupying TEST_OBJ_SIZE bytes (data buffer + id).
/// The constructor stamps a marker byte at a position derived from id so
/// getId() can verify the payload survived a round trip through the
/// virtual array.
class Object
{
public:
	/// Sentinel object: id = -1 means "never written"; data stays uninitialized.
	Object():id(-1){}

	/// Tags the payload with id p and writes the marker byte.
	/// Negative ids skip the marker write — the original indexed
	/// data[id%size] with a negative remainder, which is out-of-bounds UB.
	Object(int p):id(p){
		constexpr int size = TEST_OBJ_SIZE-sizeof(int);
		if(id>=0)
			data[id%size]='A';
	}

	/// Returns id if the marker byte is intact, -1 otherwise.
	/// (Dropped the meaningless top-level const on the by-value return.)
	int getId() const {
		constexpr int size = TEST_OBJ_SIZE-sizeof(int);
		if(id<0)
			return -1; // default-constructed/invalid: never read data[-1]
		if(data[id%size]=='A')
			return id;
		else
			return -1;
	}

	Object(const Object& o)=default;
	Object(Object && o)=default;
	Object& operator = (Object && o)=default;
	Object& operator = (const Object& o)=default;

private:

	char data[TEST_OBJ_SIZE-sizeof(int)];
	int id;
};

// Sequential/bulk access benchmark: 16 threads each stream over the array in
// 4000-element mapped windows, advancing 2000 elements per step so consecutive
// windows half-overlap ("partially overlapped" accesses from the page text).
int main(int argc, const char * argv[])
{
	const size_t numThr = 16;
	std::vector<int> numLRU = {4,6,6}; // 4-6 OpenCL data channels and 4-6 LRU caches per physical GPU (development computer has gt1030 + k420 + k420)
	int totalLRUs = 0;
	for(const auto& e:numLRU)
		totalLRUs += e;
	const long long pageSize = 2500; // cache line size (elements)
	const int pagesPerLRU = 15; // cache lines
	int numElementsForAllLRUs = totalLRUs*pagesPerLRU * pageSize;
	const long long n = numElementsForAllLRUs*100; // total elements = 100x cache capacity
	std::cout<<"total array size = "<<n*sizeof(Object)<<" bytes"<<std::endl;
	std::cout<<"total cache size = "<<numElementsForAllLRUs*sizeof(Object)<<" bytes"<<std::endl;
	VirtualMultiArray<Object> test(n,GraphicsCardSupplyDepot().requestGpus(),pageSize,pagesPerLRU,numLRU);


	// init: write every element so the reads below can be validated
	std::cout<<"init..."<<std::endl;

	// simple (x,y) sample for the two output graphs
	struct Graph2D
	{
		float x;
		float y;
		Graph2D():x(0),y(0){}
		Graph2D(float a, float b):x(a),y(b){}
	};
	std::vector<Graph2D> log1; // hit ratio (%) vs bandwidth (MB/s)
	std::vector<Graph2D> log2; // data set size (MB) vs bandwidth (MB/s)
	#pragma omp parallel for num_threads(64)
	for(long long j=0;j<n;j++)
	{
			test.set(j,Object(j));
	}
	std::cout<<"...complete"<<std::endl;


	// benchmark: shrink the sequentially-scanned region by 20% per step
	double limit = n/10000.0;
	for(double size = n-4000; size>=limit; size*=0.8)
	{

		double hitRatio = (numElementsForAllLRUs / (double)size)*(100.0); // cache capacity as % of scanned region
		int numTestsPerThread = size/2000; // 2000-element steps per thread; used only for byte accounting below

		std::string benchString= std::string("cache size / data set size=")+std::to_string(hitRatio)+std::string("%");



		double seconds = 0;

		{
			// bytes = window(4000) * threads * steps; each thread scans the whole region
			CpuBenchmarker bench(4000*numThr*numTestsPerThread * sizeof(Object),benchString);
			bench.addTimeWriteTarget(&seconds);

			#pragma omp parallel for num_threads(numThr)
			for(int i=0;i<numThr;i++)
			{
				// page-aligned per-thread staging buffer handed to mappedReadWriteAccess
				alignas(4096)
				Object tmp[4000];
				{
						for(int seq = 0; seq<size; seq+=2000)
						{
							// NOTE(review): the callback indexes ptr[seq+j], which implies
							// ptr is offset so it is addressed by GLOBAL element index
							// (i.e. ptr = window - seq) — confirm against the
							// mappedReadWriteAccess contract in VirtualMultiArray.h.
							// The trailing flags (false,true,false) and tmp are presumably
							// pin/read/write toggles and a user-supplied buffer — TODO confirm.
							test.mappedReadWriteAccess(seq,4000,[&,seq](const Object * ptr){
								for(int j=0;j<4000;j++)
								{
									if(ptr[seq+j].getId()!=seq+j)
									{
										throw std::invalid_argument("Error: index != data");
									}
								}
							},false,true,false,tmp);
						}
				}
			}
		}
		// MB/s per hit ratio
		log1.push_back(Graph2D(hitRatio,(4000*numThr*numTestsPerThread * sizeof(Object)/seconds)/1000000.0));

		// MB/s per data set size
		log2.push_back(Graph2D(size*sizeof(Object)/1000000.0,(4000*numThr*numTestsPerThread * sizeof(Object)/seconds)/1000000.0));
	}

	// append both graphs as "x y" text lines
	std::ofstream logFile("logfileHitRatioVsBandwidth.txt", std::ios_base::app | std::ios_base::out);
	for(const auto& point:log1)
	{
		std::string line;
		line += std::to_string(point.x);
		line += std::string(" ");
		line += std::to_string(point.y);
		line += std::string("\r\n");
		logFile<< std::string(line);
	}
	std::ofstream logFile2("logfileDataSetSizeVsBandwidth.txt", std::ios_base::app | std::ios_base::out);
	for(const auto& point:log2)
	{
		std::string line;
		line += std::to_string(point.x);
		line += std::string(" ");
		line += std::to_string(point.y);
		line += std::string("\r\n");
		logFile2<< std::string(line);
	}
	return 0;
}