# Pystencils 3D Star Stencil - Domain Size Scan

This notebook uses a simpler stencil and scans a range of domain shapes to see how the DRAM data volumes change.

In [None]:

import sys 
sys.path.append('../../pystencils')
sys.path.append('../pystencils')
sys.path.append('../measutils')
sys.path.append('../warpspeed')



%load_ext autoreload
%autoreload 1

%aimport predict
%aimport predict_metrics


In [None]:
#import pycuda.driver as drv

from pystencils_stencil_utils import PS3DStencil
from pystencilswarpspeedkernel import PyStencilsWarpSpeedKernel
from measured_metrics import MeasuredMetrics, ResultComparer
from predict_metrics import *

from plot_utils import *
import matplotlib.pyplot as plt

from meas_db import MeasDB
# Handle to the measurement database file; benchmarkSeries() queries it via getEntry()
# before running a kernel. NOTE(review): presumably a relative path resolved against the
# notebook's working directory - confirm.
meas_db = MeasDB("3dstencils.db")


In [None]:
# Target number of grid points per domain; benchmarkSeries() derives the z extent
# of each domain from this value and the current xy extent.
totalSize = 512 * 1024 * 1024
device = selectDevice("A100")

# One entry per (x, y, z, factors) combination, with x chosen so that x*y*z == 1024.
# The trailing (1, 1, 1) tuple is handed to the kernel generator as the fourth element
# (presumably per-dimension blocking factors - TODO confirm).
blockSizes = [
    (1024 // (z * y), y, z, (1, 1, 1))
    for z in (1, 4, 16, 64)
    for y in [1, 4, 8, 16, 32, 64]
    if z * y <= 1024
]
print(blockSizes)



In [None]:
def benchmarkSeries(stencilRange, xyStart, xyEnd, steps, blockSizes, measValues, predValues):
    """Measure and predict metrics for a geometric series of domain shapes.

    For `steps` xy extents spaced geometrically between `xyStart` and `xyEnd`
    (each rounded down to an even number), a domain of shape (xy, xy, z) is
    formed, where z is `totalSize // xy**2` clamped to [10, 50000]. For every
    block size the measurement database is consulted first; only on a miss is
    the kernel generated, run and measured.

    Parameters:
        stencilRange: range of the star stencil; also used to shrink the domain
            by 2*stencilRange per dimension for the database lookup.
        xyStart, xyEnd: first and last xy extent of the scan.
        steps: number of scan points. A single step evaluates `xyStart` only.
        blockSizes: iterable of (x, y, z, factors) tuples; the first three
            entries and the fourth are passed separately to the kernel helpers.
        measValues, predValues: dicts `blockSize -> {xy: value}` that are
            updated in place. Entries already present for an xy are reused
            instead of being recomputed.

    Returns:
        None. On exit the per-block-size dicts are replaced by copies whose
        keys are in ascending xy order.

    Relies on the notebook-level names `totalSize`, `device` and `meas_db`.
    Newly measured values are not written back to the database (the insert
    call was disabled in the original code).
    """
    # Local import so the function does not depend on `time` leaking in through
    # one of the notebook's star imports.
    import time

    # Geometric growth factor; with fewer than two steps there is no interval to
    # divide, so fall back to a factor of 1 instead of dividing by zero.
    base = pow(xyEnd / xyStart, 1 / (steps - 1)) if steps > 1 else 1.0

    for i in range(0, steps):
        xy = int(xyStart * base ** i)
        xy = int(xy // 2) * 2  # force an even extent

        size = (xy, xy, max(10, min(50000, totalSize // (xy) ** 2)))
        print("    ---    Domain Size: " + str(size))

        # The stencil helper is created lazily, once per domain size, and only
        # if at least one block size is missing from the database.
        SS = None

        for blockSize in blockSizes:
            t1 = time.process_time()
            print("  --  Block Size: " + str(blockSize))

            predForBlock = predValues.setdefault(blockSize, {})
            measForBlock = measValues.setdefault(blockSize, {})

            if xy not in measForBlock or xy not in predForBlock:
                lc, basic, meas = meas_db.getEntry(
                    stencilRange, blockSize[:3], blockSize[3],
                    [s - 2 * stencilRange for s in size], device)
                if meas is None or basic is None:
                    if SS is None:
                        SS = PS3DStencil(size, stencilRange)
                    kernel = SS.getStarKernel(blockSize[:3], stencilRange, blockSize[3])
                    runFunc = SS.getRunFunc(kernel)
                    wsKernel = PyStencilsWarpSpeedKernel(kernel.ast)
                    wsKernel.registers = kernel.num_regs
                    lc = LaunchConfig.compute(wsKernel, blockSize[:3], SS.size, blockSize[3], device)

                    meas = MeasuredMetrics.measure(runFunc, lc)
                    basic = BasicMetrics.compute(lc, device, wsKernel)
                else:
                    print("cached")
                # Built once for both the measured and the cached path.
                metrics = DerivedMetrics(lc, basic, device, meas)
            else:
                metrics = predForBlock[xy]
                meas = measForBlock[xy]

            predForBlock[xy] = metrics
            measForBlock[xy] = meas

            print(ResultComparer(meas, metrics))
            t2 = time.process_time()
            print("{:5.1f} ms".format((t2 - t1) * 1000))

    def sortedDict(d):
        return {k: d[k] for k in sorted(d.keys())}

    # .get() keeps this loop safe for block sizes that were never visited
    # (e.g. when steps == 0).
    for blockSize in blockSizes:
        predValues[blockSize] = sortedDict(predValues.get(blockSize, {}))
        measValues[blockSize] = sortedDict(measValues.get(blockSize, {}))

In [None]:
# Result stores for the range-1 scan, filled in place by benchmarkSeries():
# blockSize -> {xy extent -> value}.
measValues = dict()
predValues = dict()

In [None]:
# Range-1 star stencil: 30 xy extents from 400 to 1200 for every block size.
benchmarkSeries(
    stencilRange=1,
    xyStart=400,
    xyEnd=1200,
    steps=30,
    blockSizes=blockSizes,
    measValues=measValues,
    predValues=predValues,
)

In [None]:
fig, ax = plt.subplots(figsize=(13, 6))

# One colour per block size: solid line with circles for the measurement,
# dashed line with plus markers for the prediction.
i = 0
for blockSize in blockSizes:
    measSeries = measValues[blockSize]
    predSeries = predValues[blockSize]
    seriesColor = "C" + str(i)

    measuredLoad = [measSeries[v].memLoad for v in measSeries]
    predictedLoad = [predSeries[v].memLoadV2 for v in measSeries]

    ax.plot(measSeries.keys(), measuredLoad, "-o",
            label=str(blockSize) + " meas", color=seriesColor)
    ax.plot(measSeries.keys(), predictedLoad, "--+",
            label=str(blockSize) + " pred", color=seriesColor)
    i += 1

ax.grid()
ax.set_yticks([8, 12, 16, 20, 24])
# The legend is only drawn when there are few enough series to keep it readable.
if len(blockSizes) < 10:
    ax.legend()
ax.set_title("Varying Domain Shape - XY Plane vs DRAM Volume - 3D7pt Stencil")
ax.set_xlabel("XY plane dimensions")
ax.set_ylabel("DRAM Load Volume, B/Lup")
# Anchor the y axis at zero while keeping the automatically chosen upper limit.
upperLimit = ax.get_ylim()[1]
ax.set_ylim((0, upperLimit))
fig.savefig("sizescan_range1_dram_load.svg")
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(13,6))

# Measured (dashed, plus markers) vs. predicted (solid, circles) DRAM store volume,
# one colour per block size.
i = 0
for blockSize in blockSizes:
    ax.plot( measValues[blockSize].keys(), [measValues[blockSize][v].memStore for v in measValues[blockSize]], "--+", label = str(blockSize) + " meas", color="C" + str(i))
    ax.plot( measValues[blockSize].keys(), [predValues[blockSize][v].memStoreV1 for v in measValues[blockSize]], "-o", label = str(blockSize) + " pred", color="C" +str(i))
    i += 1


ax.grid()
# The legend is only drawn when there are few enough series to keep it readable.
if len(blockSizes) < 8:
    ax.legend()
ax.set_title("Varying Domain Shape - XY Plane vs DRAM Volume - 3D7pt Stencil")
ax.set_xlabel("XY plane dimensions")
# This figure shows store volumes (memStore / memStoreV1), so label it as such.
ax.set_ylabel("DRAM Store Volume, B/Lup")
fig.savefig("sizescan_range1_dram_store.svg")
plt.show()

In [None]:
# Hand-picked (x, y, z) thread block shapes for the range-4 scan; each is paired
# with the same (1, 1, 1) fourth element used for the range-1 block sizes.
blockSizes25 = [
    (x, y, z, (1, 1, 1))
    for (x, y, z) in [
        (512, 2, 1),
        (128, 4, 2),
        (32, 8, 4),
        (8, 16, 8),
        (8, 8, 16),
        (4, 8, 32),
    ]
]
print(blockSizes25)

In [None]:

# Result stores for the range-4 scan, pre-seeded with an independent empty dict
# per block size.
measValues25 = dict((b, {}) for b in blockSizes25)
predValues25 = dict((b, {}) for b in blockSizes25)

In [None]:
# Range-4 star stencil: 15 xy extents from 300 to 800 for the hand-picked block sizes.
benchmarkSeries(
    stencilRange=4,
    xyStart=300,
    xyEnd=800,
    steps=15,
    blockSizes=blockSizes25,
    measValues=measValues25,
    predValues=predValues25,
)

In [None]:
fig, ax = plt.subplots(figsize=(10, 5), dpi=300)

# One marker shape per block size; there must be at least as many markers as
# entries in blockSizes25.
markers = ["o", "D", "^", "s", "P", "v"]

i = 0
for blockSize in blockSizes25:
    meas = measValues25[blockSize]
    pred = predValues25[blockSize]
    threadBlock = blockSize[:3]
    lineColor = getColor(threadBlock)

    print(meas.keys())

    # Measurement: solid line in the block's colour.
    measuredLoad = [meas[v].memLoad for v in meas]
    ax.plot(meas.keys(), measuredLoad, "-",
            label=str(threadBlock) + " meas",
            color=lineColor, marker=markers[i])

    # Prediction: dashed line in the same colour with small gray markers.
    predictedLoad = [pred[v].memLoadV2 for v in meas]
    ax.plot(meas.keys(), predictedLoad, "--",
            label=str(threadBlock) + " pred", color=lineColor,
            markerfacecolor="gray", markeredgecolor="gray",
            marker=markers[i], markersize=4)

    i += 1

# Vertical reference line at xy = 572.
ax.axvline(572, linestyle="--", color="gray")

ax.grid()
ax.set_yticks([0, 8, 16, 24, 40, 72])
if len(blockSizes25) < 8:
    ax.legend()
ax.set_title("Varying Domain Shape - XY Plane vs DRAM Volume - 3D25pt Stencil")
ax.set_xlabel("XY plane dimensions")
ax.set_ylabel("DRAM Load Volume, B/Lup")
# Anchor the y axis at zero while keeping the automatically chosen upper limit.
upperLimit = ax.get_ylim()[1]
ax.set_ylim((0, upperLimit))
fig.savefig("sizescan_range4_dram_load.pdf")
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(15,7))

# Measured (dashed, plus markers) vs. predicted (solid, circles) DRAM store volume,
# one colour per block size.
i = 0
for blockSize in blockSizes25:
    meas = measValues25[blockSize]
    pred = predValues25[blockSize]
    ax.plot( meas.keys(), [meas[v].memStore for v in meas], "--+", label = str(blockSize) + " meas", color="C" + str(i))
    ax.plot( meas.keys(), [pred[v].memStoreV1  for v in meas], "-o", label = str(blockSize) + " pred", color="C" +str(i))

    i += 1


ax.grid()
ax.set_yticks([8, 9])
if len(blockSizes25) < 8:
    ax.legend()

ax.set_title("Varying Domain Shape - XY Plane vs DRAM Volume - 3D25pt Stencil")
ax.set_xlabel("XY plane dimensions")
# This figure shows store volumes (memStore / memStoreV1), so label it as such.
ax.set_ylabel("DRAM Store Volume, B/Lup")
fig.savefig("sizescan_range4_dram_store.svg")
plt.show()

In [None]:
def fitValues(measValues, predValues, a):
    """Scatter overlap hit rates against an oversubscription factor and fit a curve.

    For every (series, key) pair the hit rate is computed as

        (memLoadV1 - overlap hits of the *other* index - measured memLoad)
            / memLoadOverlap[a]

    and plotted against `basic.waveMemOld[a] / 20 MiB`. Points are kept only if
    the prediction exceeds the measurement by more than 0.01, the overlap volume
    is above 0.01, the hit rate lies in [-0.1, 1.0) and the x value is below 3.
    The kept points are averaged in buckets of width 0.15, the buckets below
    x = 0.6 are forced to 1.0, and `a * exp(b * exp(c * x))` is fitted to the
    bucket averages.

    Parameters:
        measValues: dict `series -> {key -> measured metrics}` (needs `.memLoad`).
        predValues: dict with the same keys holding predicted metrics (needs
            `.memLoadV1`, `.memLoadOverlap`, `.memLoadOverlapHit`, `.basic`, `.lc`).
        a: overlap index to evaluate, 0 or 1. NOTE(review): the callers save the
            results as r_overy / r_overz, so presumably 0 = Y and 1 = Z - confirm.

    Returns:
        (fig, ax) of the created matplotlib figure.
    """
    from scipy.optimize import curve_fit

    fig, ax = plt.subplots()
    fig.set_figwidth(4)
    fig.set_figheight(4)
    fig.set_dpi(140)

    # Normalisation constant for the x axis (20 MiB).
    referenceBytes = 20 * 1024 * 1024

    def hitRate(p, m):
        # Hits of the other overlap index are removed before attributing the
        # remaining prediction/measurement gap to index `a`.
        return (p.memLoadV1 - (p.memLoadOverlapHit[0] if a == 1 else 0)
                            - (p.memLoadOverlapHit[1] if a == 0 else 0)
                            - m.memLoad) / p.memLoadOverlap[a]

    def isUsable(p, m):
        # The checks run in this order so the division in hitRate() only happens
        # once the overlap volume is known to be above 0.01.
        if not (p.memLoadV1 - m.memLoad) > 0.01:
            return False
        if not p.memLoadOverlap[a] > 0.01:
            return False
        rate = hitRate(p, m)
        return rate < 1.0 and rate >= -0.1 and p.basic.waveMemOld[a] / referenceBytes < 3

    xdata = []
    ydata = []

    for s in measValues:
        meas = measValues[s]
        pred = predValues[s]
        keys = [k for k in meas if isUsable(pred[k], meas[k])]

        x = [min(3, pred[k].basic.waveMemOld[a] / referenceBytes) for k in keys]
        y = [max(0.0, min(1.0, hitRate(pred[k], meas[k]))) for k in keys]
        colors = [getColor(pred[k].lc.block) for k in keys]

        # Large faint halo plus a small solid dot per point.
        ax.scatter(x, y, alpha=0.1, s=[150]*len(x), c=colors, edgecolors="None")
        ax.scatter(x, y, alpha=0.8, s=[15]*len(x), c=colors, edgecolors="None")

        xdata.extend(x)
        ydata.extend(y)

    # Average the hit rates in fixed-width buckets over x in [0, 3).
    bucketWidth = 0.15
    buckets = [0] * int(3 // bucketWidth)
    bucketCounts = [0] * int(3 // bucketWidth)

    for x, y in zip(xdata, ydata):
        bucket = round(x/bucketWidth)
        if bucket >= 0 and bucket < len(buckets):
            buckets[bucket] += y
            bucketCounts[bucket] += 1

    for b in range(len(buckets)):
        buckets[b] /= max(1, bucketCounts[b])

    # Pin the fit to a hit rate of 1.0 for x below 0.6.
    b = 0
    while b*bucketWidth < 0.6:
        buckets[b] = 1.0
        b += 1

    def doubleExp(x, scale, outer, inner):
        return scale * np.exp(outer*np.exp(inner*x))

    # The tight bounds on the first parameter keep the prefactor at ~1.
    popt, pcov = curve_fit(doubleExp, np.arange(len(buckets))*bucketWidth, buckets,
                           bounds=([0.9999, -20, -np.inf], [1.0, 23, np.inf]), maxfev=20000)
    print(popt)
    print(pcov)

    # Evaluate the fit at the data points plus a regular grid so the curve is smooth.
    xdata = np.array([*list(xdata), *[i / 25 for i in range(1, 70)]])
    xdata.sort()

    # Raw string: the label contains TeX spacing commands (\; and \:) that are
    # not valid Python escape sequences.
    ax.plot(xdata, doubleExp(xdata, *popt), '-.', color="#777777",
            label=r"fit: ${:2.1f} \; e^ {{{:2.3f} \: e^ {{{:2.2f}\:O^{{L2}}}}}}$".format(* tuple(popt)))

    ax.axvline(1.0)

    ax.legend()
    ax.set_xlim(0, ax.get_xlim()[1])

    return fig, ax

    
# Load every stored range-4 record for the selected device. Each record is indexed
# as a[0], a[1], a[2] and passed on in the same positions benchmarkSeries() uses for
# (launch config, basic metrics, measurement).
allStencils = meas_db.getBasicMetricsRange(4, device)

allMeas = {}
allPred = {}

# Group the records by (blocking factors, block shape); within a group they are
# keyed by domain shape.
for a in allStencils:
    key = (*(a[0].blocking_factors), *(a[0].block))
    if key not in allMeas:
        allMeas[key] = {}
        allPred[key] = {}

    allMeas[key][tuple(a[0].domain)] = a[2]
    allPred[key][tuple(a[0].domain)] = DerivedMetrics(a[0], a[1], device, a[2])


# Overlap index 0, saved as the Y-direction figure.
fig, ax = fitValues(allMeas, allPred, 0)
ax.axvline(1, color="gray")
ax.set_xlabel("oversubscription factor $O^{L2}$")
ax.set_ylabel("hitrate $R_{hit}^{L2,over,Y}$")
fig.tight_layout()
fig.savefig("paperplots/r_overy.pdf")

# Overlap index 1, saved as the Z-direction figure; its label names Z rather than
# repeating the Y label of the first figure.
fig, ax = fitValues(allMeas, allPred, 1)
ax.axvline(1, color="gray")
ax.set_xlabel("oversubscription factor $O^{L2}$")
ax.set_ylabel("hitrate $R_{hit}^{L2,over,Z}$")
fig.tight_layout()
fig.savefig("paperplots/r_overz.pdf")