# Stencilgen Volume Comparisons

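Comparison of measured and predicted memory and L2 data volumes for 2D box stencils with varying stencil range and thread block shape.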

In [None]:
import sys
sys.path.insert(0, '..')
sys.path.insert(0, '../pystencils_notebooks/')
sys.path.insert(0, '../measutils/')
sys.path.insert(0, '../applications/')
sys.path.insert(0, '../warpspeed/')

import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import measure_metric.measureMetric as measureMetric

from stencilgen.stencil import *
import stencilgen.bench as stencilbench
from predict import *
from volumes_isl import *
from plot_utils import *

#% matplotlib notebook

In [None]:
# Enable IPython's autoreload extension so that edits to the project modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 1: before running a cell, reload only the modules registered via %aimport.
%autoreload 1
# Modules to be reloaded automatically when their source changes.
%aimport stencilgen.stencil
%aimport stencilgen.bench
%aimport predict
%aimport volumes_isl

In [None]:

# Sweep over stencil ranges and thread block shapes. For every combination the
# kernel is run while hardware metrics are recorded, and the corresponding
# volumes are predicted with the analytic model (and, for memory loads, with
# the ISL-based model as well). Results are printed and stored in the
# dictionaries below, keyed by (r, xblock, yblock, zblock).

# Memory load volumes: two predictions (analytic, ISL) and the measurement.
predMemLoadVolumes = {}
predMemLoadVolumesISL = {}
measMemLoadVolumes = {}

# Memory store volumes.
measMemStoreVolumes = {}
predMemStoreVolumes = {}

# L2 load volumes.
measL2LoadVolumes = {}
predL2LoadVolumes = {}

# L2 store volumes.
measL2StoreVolumes = {}
predL2StoreVolumes = {}

# Tick positions / labels (every fifth configuration) collected for plotting.
xticks = []
xtickLabels = []
xtickCounter = 0

# Candidate extents for both the x and the y dimension of the thread block.
blockSizes = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]

for r in range(0, 4):
    kernel = Kernel2DBoxStencil(stencil_range=r, l1only=False)

    print()
    print("r=" + str(r))
    print( "                 mem     mem      L2      L2")
    print( "                load   store    load   store       L1")
    print()

    for xblock in blockSizes:
        for yblock in blockSizes:
            threadsPerBlock = xblock * yblock
            # Only block shapes with 32 to 1024 threads are benchmarked.
            if threadsPerBlock > 1024 or threadsPerBlock < 32:
                continue
            block = (xblock, yblock, 1)

            grid = kernel.getGrid(1, block, 15000, 15000)
            # NOTE(review): 80 is presumably the number of SMs of the target
            # GPU and 32 a per-thread resource figure -- confirm against
            # getBlocksPerSM.
            concurrentGrid = getConcurrentGrid(getBlocksPerSM(block, 32) * 80, grid)
            # The L2 / L1 predictions use a concurrent grid clamped to at most
            # 4 blocks per dimension.
            truncatedConcurrentGrid = tuple(min(4, c) for c in concurrentGrid)

            # Run the kernel with metric collection active. The grid computed
            # above is reused for the launch instead of being recomputed.
            measureMetric.measureBandwidthStart()
            stencilbench.runKernel(kernel, grid, block)
            # Normalize every raw metric by the benchmark domain size. The
            # comprehension variable is deliberately not called `r`, so it
            # cannot be confused with the stencil range of the outer loop.
            result = [
                metric / stencilbench.h / stencilbench.w
                for metric in measureMetric.measureMetricStop()
            ]
            measMemLoad = result[0]
            measMemStore = result[1]
            # NOTE(review): the factor 32 presumably converts L2 transaction
            # counts to bytes (32 B each) -- confirm against measureMetric.
            measL2Load = result[2] * 32
            measL2Store = result[3] * 32

            # Model predictions, normalized per thread.
            L2LoadBlockVolume = getL2LoadBlockVolume(block, truncatedConcurrentGrid, kernel.genLoads()) / threadsPerBlock
            L2StoreBlockVolume = getL2StoreBlockVolume(block, truncatedConcurrentGrid, kernel.genStores()) / threadsPerBlock

            memLoadBlockVolume = getMemLoadBlockVolume(block, concurrentGrid, kernel.genLoads()) / threadsPerBlock
            memStoreBlockVolume = getMemStoreBlockVolume(block, concurrentGrid, kernel.genStores()) / threadsPerBlock

            memLoadBlockVolumeISL = getMemLoadBlockVolumeISL(
                block,
                concurrentGrid,
                grid,
                kernel.genLoadExprs(),
                kernel.getValidDomain(stencilbench.w, stencilbench.h),
            ) / threadsPerBlock

            L1Cycles = getL1Cycles(block, truncatedConcurrentGrid, {**kernel.genLoads(), **kernel.genStores()})

            # First row: measured values; second row: predicted values.
            print("{:12}   {:5.2f}   {:5.2f}   {:5.2f}   {:5.2f}".format(str(block), measMemLoad, measMemStore, measL2Load, measL2Store))
            print("        {:5.2f} / {:4.2f}   {:5.2f}   {:5.2f}   {:5.2f}   {:6.1f}".format(memLoadBlockVolume, memLoadBlockVolumeISL, memStoreBlockVolume, L2LoadBlockVolume, L2StoreBlockVolume, L1Cycles))

            print()

            key = (r, *block)

            measMemLoadVolumes[key] = measMemLoad
            predMemLoadVolumes[key] = memLoadBlockVolume
            predMemLoadVolumesISL[key] = memLoadBlockVolumeISL

            measMemStoreVolumes[key] = measMemStore
            predMemStoreVolumes[key] = memStoreBlockVolume

            measL2LoadVolumes[key] = measL2Load
            predL2LoadVolumes[key] = L2LoadBlockVolume

            measL2StoreVolumes[key] = measL2Store
            predL2StoreVolumes[key] = L2StoreBlockVolume

            # Record a tick label for every fifth benchmarked configuration.
            newLabel = "r={},{}x{}x{}".format(r, block[0], block[1], block[2])
            if xtickCounter % 5 == 0:
                xticks.append(xtickCounter)
                xtickLabels.append(newLabel)
            xtickCounter += 1

In [None]:
# Measured vs. analytically predicted memory load volume for every benchmarked
# (r, block) configuration. volumeScatterPlot is presumably provided by
# plot_utils (star import); the string looks like the plot title.
volumeScatterPlot(measMemLoadVolumes,  predMemLoadVolumes, "Memory Load Volume")

In [None]:

# Same measured memory load volumes, compared against the ISL-based prediction
# instead of the analytic one.
volumeScatterPlot(measMemLoadVolumes,  predMemLoadVolumesISL, "Memory Load Volume (ISL)")

In [None]:
# Measured vs. predicted memory store volume for every benchmarked
# (r, block) configuration.
volumeScatterPlot(measMemStoreVolumes, predMemStoreVolumes, "Memory Store Volume")

In [None]:
# Measured vs. predicted L2 load volume; the measured values were scaled by 32
# when they were stored in measL2LoadVolumes.
volumeScatterPlot(measL2LoadVolumes, predL2LoadVolumes, "L2 load Volume")


In [None]:
# Measured vs. predicted L2 store volume; the measured values were scaled by 32
# when they were stored in measL2StoreVolumes.
volumeScatterPlot(measL2StoreVolumes,predL2StoreVolumes, "L2 store volume")