Skip to content

Coherent Multithreaded Single Level Cache Example

Hüseyin Tuğrul BÜYÜKIŞIK edited this page Oct 13, 2021 · 2 revisions
#include<fstream>
#include<iostream>
#include<vector>
#include"NWaySetAssociativeMultiThreadCache.h"
// or #include"DirectMappedMultiThreadCache.h"
int main()
{
	  int imageWidth, imageHeight, maxN;
	  double minR, maxR, minI, maxI;
	  const int tileSize = 8;

	  maxN = 512;
	  minR = -1.5;
	  maxR = 0.7;
	  minI = -1.0;
	  maxI = 1.0;

	  size_t cacheScaling = 1;
	  for (int cacheScalingIter = 0; cacheScalingIter < 7; cacheScalingIter++) {
		imageWidth = tileSize*4*cacheScaling;
		imageHeight = tileSize*4*cacheScaling;
	    cacheScaling *= 2;

	    const int cacheSize = 1024 * 512;

	    // multithreaded read/write to vector is ok as long as each key is guarded by its own lock (which NWaySetAssociativeMultiThreadCache does)
	    std::vector < int > backingStore(imageWidth*imageHeight);

            // first parameter is number of sets
            // second parameter is number of tags per set
            // total size = tags * sets
            // You can also use DirectMappedMultiThreadCache <int,int> (totalSize,funcMissRead,funcMissWrite)
	    NWaySetAssociativeMultiThreadCache <int, int > cache(1024,cacheSize/(1024),[ & ](int key) {
	    	// no two threads will read same key simultaneously here
	    	// safe
	        return backingStore[key];
	      },
	      [ & ](int key, int value) {
		    // no two threads will write to same key simultaneously here
	    	// safe
	        backingStore[key] = value;
	      });

	    std::ofstream output("output_image_scaling" + std::to_string(cacheScaling) + ".ppm");
	    output << "P6" << std::endl;
	    output << imageWidth << " " << imageHeight << std::endl;
	    output << "255" << std::endl;

	    for (int i = 0; i < imageHeight; i++) {
	      for (int j = 0; j < imageWidth; j++) {
	        double cr = mapToReal(j, imageWidth, minR, maxR);
	        double ci = mapToImaginary(i, imageHeight, minI, maxI);

	        cache.setThreadSafe(i + j * imageWidth, findMandelbrot(cr, ci, maxN));


	      }
	    }

	    // benchmark
	    {

	      const int nGauss = 9;
	      const int GaussianBlurKernel[nGauss] = {
	        1,        2,        1,
	        2,        4,        2,
	        1,        2,        1
	      };
	      size_t nRepeats = 5;
	      size_t nReadsPerIter = nGauss;
	      size_t nWritesPerIter = 1;
	      size_t totalLookups = nRepeats * (imageHeight - 2) * (imageWidth - 2) * (nReadsPerIter + nWritesPerIter);
	      CpuBenchmarker bench(totalLookups * sizeof(int), "cacheSize = " + std::to_string(cacheSize) +"  image size="+std::to_string(imageWidth)+"x"+std::to_string(imageHeight) + "   performance", totalLookups);

	      // image softening (may not be accurate)
	      // column-major iteration better for the L1 but doesn't matter for L2
	      // "nRepeats" iterations better for benchmarking
	      // tiled-computing to make even better cache-hit ratio
	      for (size_t k = 0; k < nRepeats; k++) {
			#pragma omp parallel for
	        for (int tiley = 0; tiley < imageHeight/tileSize; tiley++) {
	          for (int tilex = 0; tilex < imageWidth/tileSize; tilex++) {
	            for (int jj = 0; jj < tileSize; jj++) {
	              for (int ii = 0; ii < tileSize; ii++) {
	                const int i = tilex * tileSize + ii;
	                const int j = tiley * tileSize + jj;
	                if (j >= 1 && j <= imageHeight - 2 && i >= 1 && i <= imageWidth - 2) {
	                  unsigned int pixelAccumulator1 = 0;
	                  unsigned int pixelAccumulator2 = 0;
	                  unsigned int pixelAccumulator3 = 0;

	                  pixelAccumulator1 += cache.getThreadSafe(i - 1 + (j - 1) * imageWidth) * GaussianBlurKernel[-1 + 1 + (-1 + 1) * 3];
	                  pixelAccumulator2 += cache.getThreadSafe(i + 0 + (j - 1) * imageWidth) * GaussianBlurKernel[+0 + 1 + (-1 + 1) * 3];
	                  pixelAccumulator3 += cache.getThreadSafe(i + 1 + (j - 1) * imageWidth) * GaussianBlurKernel[+1 + 1 + (-1 + 1) * 3];

	                  pixelAccumulator1 += cache.getThreadSafe(i - 1 + (j - 0) * imageWidth) * GaussianBlurKernel[-1 + 1 + (-0 + 1) * 3];
	                  pixelAccumulator2 += cache.getThreadSafe(i + 0 + (j - 0) * imageWidth) * GaussianBlurKernel[+0 + 1 + (-0 + 1) * 3];
	                  pixelAccumulator3 += cache.getThreadSafe(i + 1 + (j - 0) * imageWidth) * GaussianBlurKernel[+1 + 1 + (-0 + 1) * 3];

	                  pixelAccumulator1 += cache.getThreadSafe(i - 1 + (j + 1) * imageWidth) * GaussianBlurKernel[-1 + 1 + (+1 + 1) * 3];
	                  pixelAccumulator2 += cache.getThreadSafe(i + 0 + (j + 1) * imageWidth) * GaussianBlurKernel[+0 + 1 + (+1 + 1) * 3];
	                  pixelAccumulator3 += cache.getThreadSafe(i + 1 + (j + 1) * imageWidth) * GaussianBlurKernel[+1 + 1 + (+1 + 1) * 3];

	                  const int n = (pixelAccumulator1 + pixelAccumulator2 + pixelAccumulator3) >> 4;
	                  cache.setThreadSafe(i + j * imageWidth, n);
	                }
	              }
	            }
	          }
	        }
	      }


	    }
	    for (int i = 0; i < imageHeight; i++) {
	      for (int j = 0; j < imageWidth; j++) {

	        int n = cache.getThreadSafe(i + j * imageWidth);

	        int r = ((int) sqrt(n) % 256);
	        int gr = (2 * n % 256);
	        int b = (n % 256);

	        output << (char) r << (char) gr << (char) b;
	      }

	    }
	    std::cout << "Finished!" << std::endl;

	    cache.flush(); // every thread needs to call flush on its own CacheThreader object after work is complete
	    output.flush();

	  }
	return 0;
}

Results on FX8150 (3.6 GHz):

cacheSize = 524288  image size=32x32   performance: 3199512 nanoseconds     (bandwidth = 56.26 MB/s)      (throughput = 71.10 nanoseconds per iteration) 
Finished!
cacheSize = 524288  image size=64x64   performance: 7966699 nanoseconds     (bandwidth = 96.50 MB/s)      (throughput = 41.45 nanoseconds per iteration) 
Finished!
cacheSize = 524288  image size=128x128   performance: 24354866 nanoseconds     (bandwidth = 130.37 MB/s)      (throughput = 30.68 nanoseconds per iteration) 
Finished!
cacheSize = 524288  image size=256x256   performance: 69921027 nanoseconds     (bandwidth = 184.54 MB/s)      (throughput = 21.68 nanoseconds per iteration) 
Finished!
cacheSize = 524288  image size=512x512   performance: 183688499 nanoseconds     (bandwidth = 283.20 MB/s)      (throughput = 14.12 nanoseconds per iteration) 
Finished!
cacheSize = 524288  image size=1024x1024   performance: 889042572 nanoseconds     (bandwidth = 234.97 MB/s)      (throughput = 17.02 nanoseconds per iteration) 
Finished!
cacheSize = 524288  image size=2048x2048   performance: 5144069803 nanoseconds     (bandwidth = 162.76 MB/s)      (throughput = 24.58 nanoseconds per iteration) 
Finished!

14 nanoseconds per pixel access = ~70 million lookups per second, coherently with writes and reads (but order of reads/writes are not taken into consideration here, only cache coherency tested, not image quality)