# Pystencils 3D Star Stencil Volume Comparison - AMD Version

3D7pt stencil with different block sizes

In [None]:
# Inject a CSS rule so the notebook's `.container` element uses 90% of the page width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Extend the module search path with relative directories so the project-local
# modules imported by the following cells can be resolved. The paths are relative,
# so they depend on the directory the notebook is started from.
import sys
sys.path.append('../../pystencils')
sys.path.append('../pystencils')
sys.path.append('../measutils')
sys.path.append('../warpspeed')


%load_ext autoreload
%autoreload 1

%aimport predict
%aimport griditeration
%aimport volumes_isl
%aimport pystencils.astnodes
%aimport plot_utils
%aimport predict_metrics



In [None]:
import cProfile
import re


import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import sys
from subprocess import run, PIPE


import pystencils as ps
from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers
from pystencils.display_utils import show_code, get_code_str
from griditeration import *
from predict_metrics import *
from volumes_isl import *
from plot_utils import *
from measured_metrics import MeasuredMetrics, ResultComparer

from pystencils_stencil_utils import getStencilKernel
from pystencilswarpspeedkernel import PyStencilsWarpSpeedKernel


import sympy as sp


import timeit

from meas_db import MeasDB

In [None]:
# Handle to the measurement store "3dstencils2_AMD.db"; later cells use it through
# getEntry(), insertValue() and commit().
meas_db = MeasDB("3dstencils2_AMD.db")
# Disabled call to clearDB() (presumably discards the stored entries -- confirm before enabling).
#meas_db.clearDB()

In [None]:
# Field extent handed to getStencilKernel() and to the measurement DB lookups/inserts.
fieldSize = (1023, 512, 200)
# Device description object used for predictions, DB keys and plot titles.
# DeviceMI210 is not imported by name in this notebook, so it presumably arrives
# through one of the star imports -- TODO confirm which module provides it.
device = DeviceMI210()

In [None]:
# Build one reference kernel (range 0, "star", 32x32x1 block, blocking factors 1x1x1)
# and dump what the warpspeed wrapper extracted from it: the addresses of every load
# field, then the load expressions, a blank separator line, and the store expressions.
stencilSetup = getStencilKernel(0, "star", (32, 32, 1), (1, 1, 1), fieldSize)
kernel, domainSize, bufferSizeBytes, alignmentBytes = stencilSetup
wsKernel = PyStencilsWarpSpeedKernel(kernel)

for loadField in wsKernel.loadFields:
    for address in loadField.NDAddresses:
        print(address)

for loadKey in wsKernel.loadExprs:
    for loadExpr in wsKernel.loadExprs[loadKey]:
        print(loadExpr)

print()

for storeKey in wsKernel.storeExprs:
    for storeExpr in wsKernel.storeExprs[storeKey]:
        print(storeExpr)

# Show the generated kernel source, then the device the notebook is configured for.
show_code(kernel)

print(device.name)


In [None]:
# Result stores filled by the sweep further down in this cell; both are keyed by
# (r, blockX, blockY, blockZ, blockingFactors).
predValues = {}
measValues = {}

print(device.name)

def getBlockSizes(threadCounts):
    """Return every (x, y, z) block shape whose thread total is in *threadCounts*.

    Candidate extents are powers of two: x in 1..1024, y in 1..512, z in 1..64.
    The result is ordered with x varying slowest and z varying fastest.

    Args:
        threadCounts: container of accepted ``x * y * z`` products (anything
            supporting the ``in`` operator).

    Returns:
        List of ``(x, y, z)`` tuples.
    """
    xExtents = tuple(1 << e for e in range(11))  # 1, 2, ..., 1024
    yExtents = tuple(1 << e for e in range(10))  # 1, 2, ..., 512
    zExtents = tuple(1 << e for e in range(7))   # 1, 2, ..., 64
    return [
        (bx, by, bz)
        for bx in xExtents
        for by in yExtents
        for bz in zExtents
        if bx * by * bz in threadCounts
    ]




# time.process_time() is used for the per-configuration timing below. The notebook's
# import cell only imports `timeit`, so without this line the cell depends on a star
# import happening to expose `time`.
import time

# Sweep over stencil range `r`, blocking factors and thread-block shapes. For every
# configuration, fetch cached results from the measurement DB or compute/measure the
# missing parts, derive the predicted metrics, store both in measValues / predValues
# and render the HTML summaries.
for r in [4]:
    for blockingFactors in [(1, 1, 1)]:
        for blockSize in getBlockSizes([1024]):
            t1 = time.process_time()

            # Dictionary key: (range, blockX, blockY, blockZ, blockingFactors).
            key = (r, *blockSize, blockingFactors)

            print(key)

            lc, basic, meas = meas_db.getEntry(r, blockSize, blockingFactors, fieldSize, device)

            # Debug toggles: uncomment to ignore the cached values and recompute.
            #basic = None
            #meas = None

            if meas is None or basic is None:

                print("measured")
                # Non-negative r builds a "star" stencil of range r; a negative r
                # builds a "box" stencil of range -r.
                if r >= 0:
                    kernel, domainSize, bufferSizeBytes, alignmentBytes = getStencilKernel(r, "star", blockSize, blockingFactors, fieldSize)
                else:
                    kernel, domainSize, bufferSizeBytes, alignmentBytes = getStencilKernel(-r, "box", blockSize, blockingFactors, fieldSize)

                wsKernel = PyStencilsWarpSpeedKernel(kernel)

                # Register count is set by hand here rather than taken from the kernel.
                wsKernel.registers = 32

                # Two buffers of identical size are passed to the launch configuration.
                buffers = [bufferSizeBytes, bufferSizeBytes]
                lc = LaunchConfig.compute(wsKernel, blockSize, domainSize, blockingFactors, device, buffers, alignmentBytes)

                # Only fill in whichever part the DB did not provide.
                if basic is None:
                    basic = BasicMetrics.compute(lc, device, wsKernel)
                if meas is None:
                    meas = MeasuredMetrics.measure(kernel, lc)

                meas_db.insertValue(r, blockSize, blockingFactors, device, basic, meas, lc, fieldSize)
            else:
                print("cached")

            metrics = DerivedMetrics(lc, basic, device, meas)

            measValues[key] = meas
            predValues[key] = metrics

            rc = ResultComparer(meas, metrics)

            display(HTML(metrics.html()))
            display(HTML(rc.html()))

            # Commit after every configuration so an interrupted sweep keeps its results.
            meas_db.commit()
            t2 = time.process_time()
            # process_time() reports CPU time of this process, converted to ms here.
            print("{:5.1f} ms".format((t2 - t1) * 1000))
            print()

In [None]:

# Measured memory load volume against each prediction variant V1..V4.
# Every point carries the block shape (k[1:4]) and the blocking factors (k[4]);
# from V2 on, the previous variant's prediction is appended as a fifth element.
memLoadV1Points = [
    (k[1:4], measValues[k].memLoad, predValues[k].memLoadV1, k[4])
    for k in measValues
]
volumeScatterPlot(memLoadV1Points, "Memory Load Volumes V1 " + str(device.name))

memLoadV2Points = [
    (k[1:4], measValues[k].memLoad, predValues[k].memLoadV2, k[4], predValues[k].memLoadV1)
    for k in measValues
]
volumeScatterPlot(memLoadV2Points, "Memory Load Volumes V2 " + str(device.name))

memLoadV3Points = [
    (k[1:4], measValues[k].memLoad, predValues[k].memLoadV3, k[4], predValues[k].memLoadV2)
    for k in measValues
]
volumeScatterPlot(memLoadV3Points, "Memory Load Volumes V3 " + str(device.name))

memLoadV4Points = [
    (k[1:4], measValues[k].memLoad, predValues[k].memLoadV4, k[4], predValues[k].memLoadV3)
    for k in measValues
]
volumeScatterPlot(memLoadV4Points, "Memory Load Volumes V4 " + str(device.name))


In [None]:
# Measured memory store volume against prediction variants V1 and V2; the V2 plot
# additionally passes the V1 prediction as a fifth tuple element.
memStoreV1Points = [
    (k[1:4], measValues[k].memStore, predValues[k].memStoreV1, k[4])
    for k in measValues
]
volumeScatterPlot(memStoreV1Points, "Memory Store Volumes V1 " + str(device.name))

memStoreV2Points = [
    (k[1:4], measValues[k].memStore, predValues[k].memStoreV2, k[4], predValues[k].memStoreV1)
    for k in measValues
]
volumeScatterPlot(memStoreV2Points, "Memory Store Volumes V2 " + str(device.name))


In [None]:
# Measured L2 load volume (L2Load_tex) against prediction variants V1 and V2; the
# V2 plot additionally passes the V1 prediction as a fifth tuple element.
l2LoadV1Points = [
    (k[1:4], measValues[k].L2Load_tex, predValues[k].L2LoadV1, k[4])
    for k in measValues
]
volumeScatterPlot(l2LoadV1Points, "Stencil L2 Load Volumes V1  " + str(device.name))

l2LoadV2Points = [
    (k[1:4], measValues[k].L2Load_tex, predValues[k].L2LoadV2, k[4], predValues[k].L2LoadV1)
    for k in measValues
]
volumeScatterPlot(l2LoadV2Points, "Stencil L2 Load Volumes V2 " + str(device.name))

In [None]:
# Measured against predicted L2 store volume, one point per measured configuration.
l2StorePoints = [
    (k[1:4], measValues[k].L2Store, predValues[k].L2Store, k[4])
    for k in measValues
]
volumeScatterPlot(l2StorePoints, "L2 Store Volumes V1  " + str(device.name))

In [None]:

# Exploratory L1 cache / TLB plots. The first group is switched off by the
# `if False:` guard and never runs; the statements after it run unconditionally.
if False:
    # Measured L1Wavefronts_TD against predicted L1Cycles / 32 on linear axes,
    # saved as an SVG under ./autoplots/.
    fig, ax = volumeScatterPlot([(k[1:4], measValues[k].L1Wavefronts_TD, predValues[k].L1Cycles / 32 , k[4]) for k in measValues], "L1 Cache Cycles  " + str(device.name), linear=True)
    ax.set_xlabel("measured L1 Cache cycles / Lup")
    ax.set_ylabel("predicted L1 Cache cycles / Lup")
    plt.savefig("./autoplots/l1cycles_" + device.name +  ".svg")

    # Per-configuration comparison: predicted L1TLBPages / 2 as markers over bars of
    # measured UTCL1_miss * 1024; a row of colored dots at y = -0.02 encodes the block shape.
    fig, ax = plt.subplots(figsize=(8,6), dpi=150)
    ax.scatter(np.arange(len(measValues)) ,  [predValues[k].L1TLBPages / 2 for k in measValues], color="black", marker="_", label="predicted 512K pages / block")
    ax.bar(np.arange(len(measValues)),  [measValues[k].UTCL1_miss * 1024 for k in measValues], color=[getColor(k[1:4]) for k in measValues.keys()], width=1.0, zorder=-1, label="UTCL1 request misses")
    ax.scatter(np.arange(len(measValues)), [-0.02] * len(measValues), color=[getColor(k[1:4]) for k in measValues.keys()])
    #ax.scatter([predValues[k].L1TLBPages for k in measValues], [measValues[k].UTCL1_miss / measValues[k].UTCL1_requests for k in measValues], color=[getColor(k[1:4]) for k in measValues.keys()])
    ax.grid()
    ax.legend()
    #volumeScatterPlot([(k[1:4], measValues[k].L1Wavefronts_TA, measValues[k].L1Wavefronts_TD, k[4]) for k in measValues.keys()], "TA vs TD range" + str(r) )
    #volumeScatterPlot([(k[1:4], device.clock * device.smCount  / measValues[k].L1Wavefronts_TA, measValues[k].lups, k[4]) for k in measValues.keys()], "TA range" + str(r) )

    # Measured UTCL1_miss * 1024 against 0.02 * L1TLBPages squared.
    volumeScatterPlot([(k[1:4], measValues[k].UTCL1_miss * 1024, predValues[k].L1TLBPages*predValues[k].L1TLBPages*0.02, k[4]) for k in measValues], "TLB misses" + str(device.name))

#fig, ax = plt.subplots(figsize=(6,6), dpi=150)


# Measured lups against a hand-tuned expression combining measured L2 traffic,
# measured UTCL1 misses and predicted L1 cycles (constants 6000, 10, 1000, 1.5 are
# given without derivation in this file).
# NOTE(review): the title "TLBperf_" + device name is reused by three calls in this
# cell; if volumeScatterPlot derives an output file name from the title, later
# plots would overwrite earlier ones -- confirm.
volumeScatterPlot([(k[1:4],  measValues[k].lups, 6000/(10 + measValues[k].L2Store + measValues[k].L2Load + measValues[k].UTCL1_miss*1000 + predValues[k].L1Cycles*1.5), k[4]) for k in measValues], "TLBperf_" + str(device.name), linear=True)

#ax.scatter([measValues[k].lups for k in measValues], [1600/(measValues[k].L2Store + measValues[k].L2Load + measValues[k].UTCL1_miss*1000 + predValues[k].L1Cycles*0.1) for k in measValues], color=[getColor(k[1:4]) for k in measValues.keys()], marker="o", label="performance")
#ax.plot([measValues[k].L1Wavefronts_TD for k in measValues], [measValues[k].L1Wavefronts_TD for k in measValues] ) 

#ax.set_xlabel("L1Wavefronts_TD")
#ax.set_ylabel("pred cycles")
#ax.legend()

# Measured lups over predicted L1TLBPages, overlaid with the curve
# 90 / max(0.1, 1 + pages / 64) evaluated at the same x positions.
fig, ax = plt.subplots(figsize=(8,5), dpi=150)
ax.scatter([predValues[k].L1TLBPages for k in measValues],  [measValues[k].lups for k in measValues], color=[getColor(k[1:4]) for k in measValues.keys()], marker="o", label="performance")
ax.plot([predValues[k].L1TLBPages for k in measValues], 90 / np.maximum(0.1, 1 + np.array( [predValues[k].L1TLBPages / 64 for k in measValues])), "x", label="a/x fit with x=predicted 512k L1T LB pages", color="black")
ax.set_xlabel("predicted L1 TLB pages")
ax.set_ylabel("perf, Lup/s")
ax.legend()


# Measured lups against min(perfL1, <TLB-limited estimate>); the first call uses the
# measured UTCL1_miss, the second the predicted L1TLBPages / 1024.
volumeScatterPlot([(k[1:4],  measValues[k].lups, min(predValues[k].perfL1, 1000 / predValues[k].L1Load / (1/8 + measValues[k].UTCL1_miss)), k[4]) for k in measValues], "TLBperf_" + str(device.name))
volumeScatterPlot([(k[1:4],  measValues[k].lups, min(predValues[k].perfL1, 12 / (1/8 + predValues[k].L1TLBPages/1024)), k[4]) for k in measValues], "TLBperf_" + str(device.name))

# Same comparison with perfV3 as the other limit: measured misses first, predicted pages second.
volumeScatterPlot([(k[1:4],  measValues[k].lups, min(predValues[k].perfV3, 12 / (1/8 + measValues[k].UTCL1_miss)), k[4]) for k in measValues], "perf_pheno_tlb" + str(device.name))
volumeScatterPlot([(k[1:4],  measValues[k].lups, min(predValues[k].perfV3, 12 / (1/8 + predValues[k].L1TLBPages/1024)), k[4]) for k in measValues], "perfv3_tlb" + str(device.name))


In [None]:
# Scatter, per blocking-factor setting, of the L1 allocation relative to the device's
# L1 size (x) against the share of the gap between the V1 L2 load prediction and the
# predicted L1 load that shows up in the measured L2 load (y). A fixed
# a*exp(-b*exp(-c*x)) curve and a vertical line at x = 1 are drawn on top.
fig, ax = plt.subplots()
fig.set_figwidth(4)
fig.set_figheight(4)
fig.set_dpi(150)

for r in ((1, 1, 1), (1, 1, 2), (1, 2, 1), (1, 2, 2)):
    keys = [k for k in measValues if k[4] == r]
    l1AllocRatio = [predValues[k].smL1Alloc / device.sizeL1 for k in keys]
    # The denominator is clamped to 0.001 so the ratio stays finite.
    excessShare = [
        (measValues[k].L2Load_tex - predValues[k].L2LoadV1)
        / max(0.001, predValues[k].L1Load - predValues[k].L2LoadV1)
        for k in keys
    ]
    ax.plot(l1AllocRatio, excessShare, ".", alpha=0.2)

values = np.arange(0.0, 10.0, 0.1)

# Reference curve; the commented variants are alternative parameter sets.
referenceCurve = 0.45 * np.exp(-9.0 * np.exp(-0.8 * values))
ax.plot(values, referenceCurve)
#ax.plot (values, 0.43*np.exp(-9.0*np.exp(-0.65*values)))
#ax.plot (values, 0.25*np.exp(-9.0*np.exp(-0.5*values)))

ax.axvline(1.0)

In [None]:
def fitValues(measValues, predValues):
    """Scatter and curve-fit the share of predicted wave overlap that hits in L2.

    For overlap index ``a`` in (0, 1) the function selects the keys where the V1
    memory-load prediction exceeds the measurement by more than 0.1 and the
    per-cell overlap ``waveMemLoadOverlap[a] / waveValidCells`` exceeds 4.0.
    For those keys it plots

    * x: ``device.sizeL2`` divided by
      ``waveMemOld[a] + waveMemLoadNew - waveMemLoadOverlap[a]``, clamped to [0, 3];
    * y: ``memLoadV1 - memLoad`` divided by the per-cell overlap scaled down by
      blocking factors 1 and 2, clamped to [0, 1].

    The pooled points of both passes are then fitted with
    ``a * exp(-b * exp(-c * x))``, where ``a`` is bounded to [0.999999, 1.0]
    (effectively pinned to 1). The fitted parameters and covariance are printed
    and the curve is drawn on the same axes.

    Args:
        measValues: mapping from configuration key to measured metrics
            (``.memLoad`` is read).
        predValues: mapping from the same keys to predicted metrics
            (``.memLoadV1``, ``.basic.*`` and ``.lc.blocking_factors`` are read).

    Returns:
        None. Reads the module-level ``device`` and draws into a new figure.
    """
    fig, ax = plt.subplots()
    fig.set_figwidth(5)
    fig.set_figheight(5)
    fig.set_dpi(140)

    xdata = []
    ydata = []

    meas = measValues
    pred = predValues
    for a in [0, 1]:
        keys = [k for k in meas if (pred[k].memLoadV1 - meas[k].memLoad) > 0.1 and
               (pred[k].basic.waveMemLoadOverlap[a] / pred[k].basic.waveValidCells) > 4.0]
        x = [max(0.0, min(3.0, device.sizeL2 / (pred[k].basic.waveMemOld[a] + pred[k].basic.waveMemLoadNew - pred[k].basic.waveMemLoadOverlap[a]))) for k in keys]
        y = [max(0.0, min(1.0, (pred[k].memLoadV1 - meas[k].memLoad) / (pred[k].basic.waveMemLoadOverlap[a] / pred[k].basic.waveValidCells / pred[k].lc.blocking_factors[1] / pred[k].lc.blocking_factors[2]))) for k in keys]
        ax.plot(x, y, ".", alpha=0.2)
        xdata.extend(x)
        ydata.extend(y)

    # Guard on the pooled data, not on the key list of the last pass: otherwise
    # points collected for a == 0 are discarded whenever a == 1 selects nothing.
    if not xdata:
        return

    # Imported only when there is something to fit.
    from scipy.optimize import curve_fit

    def func(x, a, b, c):
        return a * np.exp(-b * np.exp(-c * x))

    popt, pcov = curve_fit(func, xdata, ydata, bounds=([0.999999, -np.inf, -np.inf], [1.0, np.inf, np.inf]), maxfev=20000)
    print(popt)
    print(pcov)

    # Add evenly spaced samples in (0, 1) so the drawn curve is smooth there even
    # when the data points are sparse.
    xdata = np.array([*xdata, *[i / 25 for i in range(1, 25)]])
    xdata.sort()

    # Draw on `ax` explicitly instead of relying on pyplot's current axes.
    ax.plot(xdata, func(xdata, *popt), 'r-',
            label='fit: a=%5.3f, \n     b=%5.3f,\n     c=%5.3f' % tuple(popt))

    ax.axvline(1.0)
    ax.legend()

# Run the fit on the measured and predicted metrics collected by the sweep cell.
fitValues(measValues, predValues)

In [None]:
# Labels looked up with the integer limiter fields (limV1 ... limPheno) of the predictions.
categories = ["L1", "L2", "DRAM"]

# One set of measured-vs-predicted performance plots per stencil range r that has
# measurements. Each later model variant also passes the previous variant's
# prediction as a fifth tuple element.
for r in range(-2, 5):
    keys = [k for k in measValues if k[0] == r]
    if not keys:
        continue

    volumeScatterPlot([(k[1:4], measValues[k].lups, predValues[k].perfEPMV3*0.65, categories[predValues[k].limV4], predValues[k].perfPheno) for k in keys], "R" + str(r) + " EPM Pheno,  " + str(device.name), linear=True)

    volumeScatterPlot([(k[1:4], measValues[k].lups, predValues[k].perfV1, categories[predValues[k].limV1]) for k in keys], "R" + str(r) + " Extended Roofline V1,  " + str(device.name))
    volumeScatterPlot([(k[1:4], measValues[k].lups, predValues[k].perfV2, categories[predValues[k].limV2], predValues[k].perfV1) for k in keys], "R" + str(r) + " Extended Roofline V2,  " + str(device.name))
    fig, ax = volumeScatterPlot([(k[1:4], measValues[k].lups, predValues[k].perfV3, categories[predValues[k].limV3], predValues[k].perfV2) for k in keys], "R" + str(r) + " Extended Roofline V3,  " + str(device.name))
    ax.set_xlabel("measured performance, GLup/s")
    ax.set_ylabel("predicted performance, GLup/s")
    # The file name carries the actual range r. It was hard-coded to 3, so every
    # range wrote to the same file and the name did not match the plotted data.
    plt.savefig("./autoplots/extroofline_v3_range" + str(r) + "_" + device.name + ".svg")
    volumeScatterPlot([(k[1:4], measValues[k].lups, predValues[k].perfV4, categories[predValues[k].limV4], predValues[k].perfV3) for k in keys], "R" + str(r) + " Extended Roofline V4,  " + str(device.name))
    fig, ax = volumeScatterPlot([(k[1:4], measValues[k].lups, predValues[k].perfPheno, categories[predValues[k].limPheno], predValues[k].perfV3) for k in keys], "R" + str(r) + " Extended Roofline Pheno,  " + str(device.name))
    ax.set_xlabel("measured performance, GLup/s")
    ax.set_ylabel("predicted performance, GLup/s")
    # Same fix as above: use r instead of the literal 3 in the file name.
    plt.savefig("./autoplots/extroofline_pheno_range" + str(r) + "_" + device.name + ".svg")

    # NOTE(review): this pairs perf2LimV4 with lim2LimPheno under a "Pheno" title;
    # confirm whether a matching perf2LimPheno value was intended.
    volumeScatterPlot([(k[1:4], measValues[k].lups, predValues[k].perf2LimV4, categories[predValues[k].lim2LimPheno], predValues[k].perfPheno) for k in keys], "R" + str(r) + " Standard Roofline Pheno,  " + str(device.name))


In [None]:
# Rank the range-4 configurations by predicted perfV3 and by measured lups, keep
# the best 145 of each (ascending, best last), and print both lists. A row is
# flagged with "**" when its (block shape, blocking factors) equals the best entry
# of the other list.
rangeFourKeys = [k for k in measValues if k[0] == 4]
predTop = [(predValues[k].perfV3, k[1:4], k[4]) for k in rangeFourKeys]
measTop = [(measValues[k].lups, k[1:4], k[4]) for k in rangeFourKeys]

print(len(predTop))

predTop.sort()
predTop = predTop[-145:]
measTop.sort()
measTop = measTop[-145:]

rowFormat = "{} {:5.1f} {!r:12} {!r:10}"

print("Top Preds:")
for p in predTop:
    flag = "**" if p[1:3] == measTop[-1][1:3] else "  "
    print(rowFormat.format(flag, *p))

print()
print("Top Meas")
for p in measTop:
    flag = "**" if p[1:3] == predTop[-1][1:3] else "  "
    print(rowFormat.format(flag, *p))

In [None]:
# List the range-0 configurations in order of increasing measured lups, printing
# the measured lups and the predicted perfL1, both scaled by
# 2 * 8 / device.smCount / device.clock (presumably a per-SM, per-cycle figure --
# confirm the units).
predTop = sorted(
    (k for k in measValues if k[0] == 0),
    key=lambda cfg: measValues[cfg].lups,
)

for entry in predTop:
    measuredScaled = measValues[entry].lups * 2*8 / device.smCount / device.clock
    predictedScaled = predValues[entry].perfL1 * 2*8 / device.smCount / device.clock
    print("{:30} {:5.1f}  {:5.1f}".format(str(entry), measuredScaled, predictedScaled))
    