# Pystencils 3D Star Stencil Volume Comparison - NVIDIA Version

3D7pt stencil with different block sizes

In [None]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import sys 
sys.path.append('../../pystencils')
sys.path.append('../pystencils')
sys.path.append('../measutils')
sys.path.append('../warpspeed')


%load_ext autoreload
%autoreload 1

%aimport predict
%aimport plot_utils
%aimport predict_metrics



In [None]:
import cProfile
import re


import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import sys
from subprocess import run, PIPE



from predict_metrics import *
from plot_utils import *
from measured_metrics import MeasuredMetrics, ResultComparer


from meas_db import MeasDB

In [None]:
# Open the measurement database; it is queried further down through
# meas_db.getRangeKeys(...).
meas_db = MeasDB("stencils.db")


In [None]:
# Experiment configuration shared by the cells below.
# fieldSize[0] and fieldSize[1] are used as the "domainx"/"domainy" database
# filters; fieldSize[2] is not referenced by the code visible in this notebook.
fieldSize = (1026, 512, 200)
# Used (wrapped in literal double quotes) as the "datatype" database filter.
datatype = "double"
# device.name filters the database rows and is appended to every plot title.
# NOTE(review): the notebook title says "NVIDIA Version" while this selects
# DeviceRX6900XT - confirm which device is intended.
device = DeviceRX6900XT()

In [None]:
def getBlockSizes(threadCounts):
    """Enumerate power-of-two block shapes whose thread count is accepted.

    Candidate extents are the powers of two 1..1024 in x, 1..512 in y and
    1..64 in z. A shape ``(x, y, z)`` is kept when ``x * y * z`` is contained
    in ``threadCounts``.

    Returns a list of ``(x, y, z)`` tuples, ordered with x varying slowest
    and z varying fastest.
    """
    xExtents = [2 ** e for e in range(11)]  # 1 .. 1024
    yExtents = [2 ** e for e in range(10)]  # 1 .. 512
    zExtents = [2 ** e for e in range(7)]   # 1 .. 64
    return [
        (bx, by, bz)
        for bx in xExtents
        for by in yExtents
        for bz in zExtents
        if bx * by * bz in threadCounts
    ]


# Predicted (model) metrics and measured metrics, both keyed by the database
# row key. All plotting cells below read from these two dictionaries.
predValues = dict()
measValues = dict()

print(device.name)


# Fetch every row matching this domain size, datatype and device. String
# values are wrapped in literal double quotes before being handed to the query.
# The tuple names the key columns requested from the database; later cells
# index keys as k[0], k[1:4] and k[4:7] - presumably range, block size and
# thread folding in that order (TODO confirm against MeasDB.getRangeKeys).
results = meas_db.getRangeKeys({"domainx" : fieldSize[0],
                                "domainy" : fieldSize[1],
                                "datatype" : '"' + datatype + '"', 
                                "device": '"' + str(device.name) + '"'},
                               ("range", "blockx", "blocky", "blockz", "tfoldx", "tfoldy", "tfoldz"))

print(len(results), " Values")
for row in results:

    # NOTE(review): the row layout is defined by MeasDB.getRangeKeys, which is
    # not visible here; row[0] is used as the dictionary key and row[4] as the
    # measurement object.
    meas = row[4]
    key = row[0]
    print(key)
    print(meas)    
    # Build the model predictions for this configuration from row[2], row[3],
    # the device and the measurement.
    metrics = DerivedMetrics(row[2], row[3], device, meas)

    measValues[key] = meas
    predValues[key] = metrics
    # rc is not used by any code visible in this notebook.
    rc = ResultComparer(meas, metrics)
    

    

In [None]:

# Measured vs. predicted memory load volume, one scatter plot per model
# version. From V2 on, the previous version's prediction is passed along as a
# fifth tuple element.
volumeScatterPlot(
    [
        (key[1:4], measured.memLoad, predValues[key].memLoadV1, key[0])
        for key, measured in measValues.items()
    ],
    "Memory Load Volumes V1 " + str(device.name),
)
volumeScatterPlot(
    [
        (key[1:4], measured.memLoad, predValues[key].memLoadV2, key[0], predValues[key].memLoadV1)
        for key, measured in measValues.items()
    ],
    "Memory Load Volumes V2 " + str(device.name),
)
volumeScatterPlot(
    [
        (key[1:4], measured.memLoad, predValues[key].memLoadV3, key[0], predValues[key].memLoadV2)
        for key, measured in measValues.items()
    ],
    "Memory Load Volumes V3 " + str(device.name),
)
volumeScatterPlot(
    [
        (key[1:4], measured.memLoad, predValues[key].memLoadV4, key[0], predValues[key].memLoadV3)
        for key, measured in measValues.items()
    ],
    "Memory Load Volumes V4 " + str(device.name),
)


In [None]:
# Measured vs. predicted memory store volume for model versions V1 and V2;
# the V2 plot additionally receives the V1 prediction.
volumeScatterPlot(
    [
        (key[1:4], measured.memStore, predValues[key].memStoreV1, key[0])
        for key, measured in measValues.items()
    ],
    "Memory Store Volumes V1 " + str(device.name),
)
volumeScatterPlot(
    [
        (key[1:4], measured.memStore, predValues[key].memStoreV2, key[0], predValues[key].memStoreV1)
        for key, measured in measValues.items()
    ],
    "Memory Store Volumes V2 " + str(device.name),
)


In [None]:
# Measured (L2Load_tex) vs. predicted L2 load volume for model versions V1
# and V2; the V2 plot additionally receives the V1 prediction.
volumeScatterPlot(
    [
        (key[1:4], measured.L2Load_tex, predValues[key].L2LoadV1, key[0])
        for key, measured in measValues.items()
    ],
    "Stencil L2 Load Volumes V1  " + str(device.name),
)
volumeScatterPlot(
    [
        (key[1:4], measured.L2Load_tex, predValues[key].L2LoadV2, key[0], predValues[key].L2LoadV1)
        for key, measured in measValues.items()
    ],
    "Stencil L2 Load Volumes V2 " + str(device.name),
)

In [None]:
# Measured vs. predicted L2 store volume.
volumeScatterPlot(
    [
        (key[1:4], measured.L2Store, predValues[key].L2Store, key[0])
        for key, measured in measValues.items()
    ],
    "L2 Store Volumes V1  " + str(device.name),
)

In [None]:
# L1 cache cycle comparison.
# Measured wavefront counters are multiplied by 32 before being compared with
# the predicted cycle counts.
# NOTE(review): the factor 32 is presumably a per-wavefront cycle/lane count -
# confirm against the counter definition.

# Data pipe: measured vs. predicted.
fig, ax = volumeScatterPlot(
    [(k[1:4], measValues[k].L1DataPipeWavefronts*32, predValues[k].L1DataPipeCycles, k[0]) for k in measValues],
    "L1 Data Pipe Cycles  " + str(device.name))

# Tag stage: measured vs. predicted (title typo "Tage" fixed).
fig, ax = volumeScatterPlot(
    [(k[1:4], measValues[k].L1TagWavefronts*32, predValues[k].L1TagCycles, k[0]) for k in measValues],
    "L1 Tag Wavefront Cycles  " + str(device.name))

# Whichever of the two stages is larger: measured vs. predicted.
fig, ax = volumeScatterPlot(
    [(k[1:4],
      max(measValues[k].L1DataPipeWavefronts, measValues[k].L1TagWavefronts)*32,
      max(predValues[k].L1TagCycles, predValues[k].L1DataPipeCycles),
      k[0]) for k in measValues],
    "L1 Cache Cycles  " + str(device.name))

# Data pipe vs. tag stage, first for the measurements and then for the
# predictions. The two plots previously carried the same title and could not
# be told apart.
fig, ax = volumeScatterPlot(
    [(k[1:4], measValues[k].L1DataPipeWavefronts*32, measValues[k].L1TagWavefronts*32, k[0]) for k in measValues],
    "L1 Tag vs Datapipe Cycles (measured)  " + str(device.name))
fig, ax = volumeScatterPlot(
    [(k[1:4], predValues[k].L1DataPipeCycles, predValues[k].L1TagCycles, k[0]) for k in measValues],
    "L1 Tag vs Datapipe Cycles (predicted)  " + str(device.name))

In [None]:
# L1 cache capacity eviction rate: scatter the per-configuration ratio
# (measured L2 load - predicted L2 load V1) / (predicted L1 load - predicted
# L2 load V1) over predValues[k].L1coverage and fit a * exp(-b * exp(-c * x)).
fig, ax = plt.subplots()
fig.set_figwidth(4)
fig.set_figheight(4)
fig.set_dpi(150)

xdata = []
ydata = []

# One point series (and therefore one colour) per k[4:7] combination.
for r in [(1, 1, 1), (1, 1, 2), (1, 2, 1), (1, 2, 2)]:
    # Keep only configurations where numerator and denominator of the ratio
    # are both clearly positive.
    keys = [
        k
        for k in measValues
        if k[4:7] == r
        and measValues[k].L2Load_tex - predValues[k].L2LoadV1 > 0.01
        and (predValues[k].L1Load - predValues[k].L2LoadV1) > 0.001
    ]

    rxdata = [predValues[k].L1coverage for k in keys]
    rydata = [
        (measValues[k].L2Load_tex - predValues[k].L2LoadV1)
        / (predValues[k].L1Load - predValues[k].L2LoadV1)
        for k in keys
    ]

    ax.plot(rxdata, rydata, ".", alpha=0.2)
    xdata += rxdata
    ydata += rydata

# Placeholder points so that curve_fit below is never called with empty data.
if not ydata:
    ydata += [1, 2]
    xdata += [1, 2]

xdata = np.array(xdata)
ydata = np.array(ydata)

from functools import partial
from scipy.optimize import curve_fit


def func(x, a, b, c):
    """Model curve a * exp(-b * exp(-c * x))."""
    inner = np.exp(-c * x)
    return a * np.exp(-b * inner)


# Only the amplitude a is bounded (0.2 .. 1.0); b and c are unconstrained.
popt, pcov = curve_fit(
    func,
    xdata,
    ydata,
    bounds=([0.2, -np.inf, -np.inf], [1.0, np.inf, np.inf]),
    maxfev=20000,
)

print(popt)
print(pcov)

# Mean absolute error of the current model curve and of the new fit, scaled
# by 100 and reported as "pp".
errorsCurrent = np.abs(DerivedMetrics.L1rover(xdata) - ydata)
errorsNew = np.abs(func(xdata, *popt) - ydata)
maeCurrent = errorsCurrent.sum() / errorsCurrent.size * 100
maeNew = errorsNew.sum() / errorsNew.size * 100

print("Average Error Current: {:.1f} pp".format(maeCurrent))
print("Average Error new: {:.1f} pp".format(maeNew))


values = np.arange(0.0, 10.0, 0.1)

ax.plot(
    values,
    func(values, *popt),
    label=r"new fit: ${:.2f} * e^{{-{:.1f}*e^{{-{:.1f}*x}}}}$, {:.1f} pp".format(*popt, maeNew),
)

ax.plot(
    values,
    DerivedMetrics.L1rover(values),
    label=r"cur fit: ${:.2f} * e^{{-{:.1f}*e^{{-{:.1f}*x}}}}$, {:.1f} pp".format(*DerivedMetrics.popt_L1rover, maeCurrent),
)
# Previously tried curves, kept for reference:
#   0.43 * exp(-9.0 * exp(-0.65 * x))
#   0.25 * exp(-9.0 * exp(-0.5 * x))

ax.set_title("L1 cache capacity eviction rate")
ax.legend()
ax.axvline(1.0)

In [None]:
# L2 cache wave overlap: scatter the data and fit a * exp(-b * exp(-c * x)).
#   x: device.sizeL2 divided by
#      (waveMemOld[a] + waveMemLoadNew - waveMemLoadOverlap[a])
#   y: (predicted memory load V1 - measured memory load) divided by the
#      overlap volume per valid cell, further divided by blocking factors
#      [1] and [2]
fig, ax = plt.subplots()
fig.set_figwidth(4)
fig.set_figheight(4)
fig.set_dpi(150)

xdata = []
ydata = []
# a indexes the two entries of waveMemLoadOverlap / waveMemOld.
for a in [0,1]:
    # Keep configurations where the prediction exceeds the measurement by more
    # than 1.0 and the overlap per valid cell is larger than 4.0.
    keys = [k for k in measValues if (predValues[k].memLoadV1 - measValues[k].memLoad) > 1.0 and
        (predValues[k].basic.waveMemLoadOverlap[a] / predValues[k].basic.waveValidCells) > 4.0]
    x = [ device.sizeL2 / (predValues[k].basic.waveMemOld[a] + predValues[k].basic.waveMemLoadNew - predValues[k].basic.waveMemLoadOverlap[a]) for k in keys]
    y = [ (predValues[k].memLoadV1 - measValues[k].memLoad) / 
          (predValues[k].basic.waveMemLoadOverlap[a] / predValues[k].basic.waveValidCells / predValues[k].lc.blocking_factors[1] / predValues[k].lc.blocking_factors[2])  for k in keys]
    ax.plot(x, y, ".", alpha=0.2)
    xdata.extend(x)
    ydata.extend(y)

# curve_fit raises on empty input. Fall back to the same placeholder points
# the L1 eviction-rate cell uses, so the notebook keeps running when no
# configuration passes the filter above.
if len(ydata) == 0:
    ydata.extend([1,2])
    xdata.extend([1,2])

xdata = np.array(xdata)
ydata = np.array(ydata)

from functools import partial
from scipy.optimize import curve_fit

def func(x, a, b, c):
    """Model curve a * exp(-b * exp(-c * x))."""
    return a * np.exp(-b*np.exp(-c*x))

# Only the amplitude a is bounded (0.2 .. 1.0); b and c are unconstrained.
popt, pcov = curve_fit(func, xdata, ydata, bounds=([0.2, -np.inf, -np.inf], [1.0, np.inf, np.inf]), maxfev=20000)
print(popt)
print(pcov)

# Mean absolute error of the current model curve and of the new fit, scaled
# by 100 and reported as "pp".
# NOTE(review): the "current" curve is DerivedMetrics.L1rover although this
# cell is about the L2 wave overlap - confirm this is the intended reference.
errorsCurrent = np.abs(DerivedMetrics.L1rover(xdata) - ydata)
errorsNew = np.abs( partial(func, a=popt[0], b=popt[1], c=popt[2])(xdata) - ydata)
maeCurrent = np.sum(errorsCurrent) / len(errorsCurrent) * 100
maeNew = np.sum(errorsNew) / len(errorsNew) * 100

print("Average Error Current: {:.1f} pp".format(maeCurrent))
print("Average Error new: {:.1f} pp".format(maeNew))


values = np.arange(0.0, 10.0, 0.1)

ax.plot (values, func(values, *popt), label=r"new fit: ${:.2f} * e^{{-{:.1f}*e^{{-{:.1f}*x}}}}$, {:.1f} pp".format(*popt, maeNew))

ax.plot (values, DerivedMetrics.L1rover(values), label=r"cur fit: ${:.2f} * e^{{-{:.1f}*e^{{-{:.1f}*x}}}}$, {:.1f} pp".format(*DerivedMetrics.popt_L1rover, maeCurrent))
# Previously tried curves, kept for reference:
#   0.43 * exp(-9.0 * exp(-0.65 * x))
#   0.25 * exp(-9.0 * exp(-0.5 * x))

ax.set_title("L2 Cache Wave Overlap ")
ax.legend()
ax.axvline(1.0)

In [None]:
def fitValues(measValues, predValues):
    """Scatter the L2 wave-overlap data and fit ``a * exp(-b * exp(-c * x))``.

    For both overlap indices (0 and 1) every key of ``measValues`` is kept
    whose predicted V1 memory load exceeds the measured one by more than 2.0
    and whose overlap per valid cell is larger than 2.0. For those keys

    * x is ``device.sizeL2`` divided by
      ``waveMemOld[a] + waveMemLoadNew - waveMemLoadOverlap[a]``, clamped to
      [0, 13];
    * y is ``(memLoadV1 - measured memLoad)`` divided by the overlap per valid
      cell and blocking factors [1] and [2], clamped to [0, 1].

    The points are drawn into a new figure, the fitted parameters and their
    covariance are printed, and the fitted curve is added to the plot.

    Args:
        measValues: mapping from key to measured metrics (``memLoad`` is read).
        predValues: mapping from the same keys to predicted metrics
            (``memLoadV1``, ``basic`` and ``lc`` are read).

    Returns:
        None. If no data point passes the filter, only the empty figure is
        created and nothing is fitted.

    Note:
        Reads the module-level ``device`` for ``sizeL2``.
    """
    fig, ax = plt.subplots()
    fig.set_figwidth(5)
    fig.set_figheight(5)
    fig.set_dpi(140)

    xdata = []
    ydata = []

    meas = measValues
    pred = predValues
    for a in [0, 1]:
        keys = [k for k in meas if (pred[k].memLoadV1 - meas[k].memLoad) > 2.0 and
               (pred[k].basic.waveMemLoadOverlap[a] / pred[k].basic.waveValidCells) > 2.0]
        x = [ max(0.0, min(13.0, device.sizeL2 / (pred[k].basic.waveMemOld[a] + pred[k].basic.waveMemLoadNew - pred[k].basic.waveMemLoadOverlap[a]))) for k in keys]
        y = [ max(0.0, min(1.0, (pred[k].memLoadV1 - meas[k].memLoad) / (pred[k].basic.waveMemLoadOverlap[a] / pred[k].basic.waveValidCells / pred[k].lc.blocking_factors[1] / pred[k].lc.blocking_factors[2])))  for k in keys]
        ax.plot(x, y, ".", alpha=0.2)
        xdata.extend(x)
        ydata.extend(y)

    # Skip the fit only when *no* iteration produced points. Testing the last
    # `keys` list alone would drop the fit whenever a == 1 is empty, even
    # though a == 0 contributed data.
    if not xdata:
        return

    from scipy.optimize import curve_fit

    def func(x, a, b, c):
        return a * np.exp(-b*np.exp(-c*x))

    # a is bounded to 0.2 .. 1.0, b is kept non-negative, c is unconstrained.
    popt, pcov = curve_fit(func, xdata, ydata, bounds=([0.2, 0, -np.inf], [1.0, np.inf, np.inf]), maxfev=20000)
    print(popt)
    print(pcov)

    # Evaluate the curve at the data points plus a fine grid on (0, 1), so the
    # steep region of the curve is drawn smoothly.
    curveX = np.array([*xdata, *[i / 25 for i in range(1, 25)]])
    curveX.sort()

    # Draw on this function's own axes rather than on pyplot's current axes.
    ax.plot(curveX, func(curveX, *popt), 'r-',
            label='fit: a=%5.3f, \n     b=%5.3f,\n     c=%5.3f' % tuple(popt))

    ax.axvline(1.0)
    ax.legend()

# Run the fit on the measured/predicted dictionaries filled earlier in this
# notebook.
fitValues(measValues, predValues)

In [None]:
# Roofline-style performance comparison, one set of plots per stencil range.
# The limiter indices stored in predValues (limV1, limV2, ...) are translated
# to these labels and passed to volumeScatterPlot in place of k[0].
categories = ["Flops", "L1", "L2", "DRAM"]

for r in range(-2, 5):
    keys = [k for k in measValues if k[0] == r]
    if not keys:
        continue

    volumeScatterPlot([(k[1:4], measValues[k].lups, predValues[k].perfV1, categories[predValues[k].limV1]) for k in keys], "R" + str(r) + " Extended Roofline V1,  " + str(device.name))
    volumeScatterPlot([(k[1:4], measValues[k].lups, predValues[k].perfV2, categories[predValues[k].limV2], predValues[k].perfV1) for k in keys], "R" + str(r) + " Extended Roofline V2,  " + str(device.name))

    fig, ax = volumeScatterPlot([(k[1:4], measValues[k].lups, predValues[k].perfV3, categories[predValues[k].limV3], predValues[k].perfV2) for k in keys], "R" + str(r) + " Extended Roofline V3,  " + str(device.name))
    ax.set_xlabel("measured performance, GLup/s")
    ax.set_ylabel("predicted performance, GLup/s")
    # The file name carries the current range r. It used to be the constant 3,
    # so every range overwrote the same file.
    plt.savefig("./autoplots/extroofline_v3_range" + str(r) + "_" + device.name + ".svg")

    volumeScatterPlot([(k[1:4], measValues[k].lups, predValues[k].perfV4, categories[predValues[k].limV4], predValues[k].perfV3) for k in keys], "R" + str(r) + " Extended Roofline V4,  " + str(device.name))

    fig, ax = volumeScatterPlot([(k[1:4], measValues[k].lups, predValues[k].perfPheno, categories[predValues[k].limPheno], predValues[k].perfV3) for k in keys], "R" + str(r) + " Extended Roofline Pheno,  " + str(device.name))
    ax.set_xlabel("measured performance, GLup/s")
    ax.set_ylabel("predicted performance, GLup/s")
    plt.savefig("./autoplots/extroofline_pheno_range" + str(r) + "_" + device.name + ".svg")

    # NOTE(review): this plot pairs perf2LimV4 with lim2LimPheno under a
    # "Pheno" title - confirm the V4/Pheno mix is intended.
    volumeScatterPlot([(k[1:4], measValues[k].lups, predValues[k].perf2LimV4, categories[predValues[k].lim2LimPheno], predValues[k].perfPheno) for k in keys], "R" + str(r) + " Standard Roofline Pheno,  " + str(device.name))
    # NOTE(review): perfEPMV3 is scaled by a fixed 0.65 and coloured by limV4;
    # the origin of the factor is not visible here.
    volumeScatterPlot([(k[1:4], measValues[k].lups, predValues[k].perfEPMV3*0.65, categories[predValues[k].limV4], predValues[k].perfPheno) for k in keys], "R" + str(r) + " EPM Pheno,  " + str(device.name), linear=True)

In [None]:
# Rank the configurations whose key starts with -2 by predicted (perfV3) and
# by measured (lups) performance, and list the best 145 of each in ascending
# order. A "**" marks the entry whose configuration is the top entry of the
# other ranking.
predTop = [(predValues[k].perfV3, k[1:]) for k in measValues if k[0] == -2]
measTop = [(m.lups, k[1:]) for k, m in measValues.items() if k[0] == -2]

print(len(predTop))

predTop.sort()
measTop.sort()
predTop = predTop[-145:]
measTop = measTop[-145:]


print("Top Preds:")
for p in predTop:
    marker = "**" if p[1:] == measTop[-1][1:] else "  "
    print("{} {:5.1f} {!r:12}".format(marker, *p))

print()
print("Top Meas")
for p in measTop:
    marker = "**" if p[1:] == predTop[-1][1:] else "  "
    print("{} {:5.1f} {!r:12}".format(marker, *p))