# Pystencils 3D Star Stencil Volume Comparison

3D7pt stencil with different block sizes

In [None]:

import sys 

# Make the project-local helper packages importable from this notebook.
# The entries are appended in this order, after whatever is already on sys.path.
sys.path.extend(
    [
        '..',
        '../warpspeed',
        '../measutils',
        '../pystencils/',
        '../../pystencils/pystencils',
        '../../pystencils',
    ]
)


# IPython-only magics: enable the autoreload extension in mode 1, which
# re-imports the modules registered with %aimport before running a cell.
%load_ext autoreload
%autoreload 1

# Project modules registered for automatic reloading.
%aimport predict
%aimport griditeration
%aimport volumes_isl
%aimport plot_utils



In [None]:
import cProfile
import re


import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import sys
from subprocess import run, PIPE



from griditeration import *
from volumes_isl import *


from plot_utils import *
from measured_metrics import *
from pystencils_stencil_utils import *
from pystencilswarpspeedkernel import *



import sympy as sp


import timeit


In [None]:
# Grid extents handed to the stencil helper (presumably x, y, z order — TODO confirm).
size = (640, 512, 512)
# NOTE(review): the meaning of the second argument (4) is defined by
# PS3DStencil, which is not visible in this notebook — confirm there.
SS = PS3DStencil(size, 4)

In [None]:
# Build one star-stencil kernel for a (32, 4, 4) block; the second argument
# is the position where the measurement sweep passes its radius `r`.
kernel = SS.getStarKernel((32, 4, 4), 1)

# Dump the generated kernel AST for inspection.
print(kernel.ast)


In [None]:
# Results of the sweep: predicted and measured metrics per configuration key.
predValues, measValues = {}, {}
# Caches so that re-running the sweep does not rebuild kernels for a known key.
kernelCache, wsKernelCache = {}, {}

In [None]:
# Sweep over all thread-block shapes with exactly 1024 threads: for each one,
# predict the data volumes, measure them, benchmark the kernel and record both
# result sets in predValues / measValues under the key (r, xblock, yblock, zblock).

# Not used in this cell; kept in case other cells read them.
xticks = []
xtickLabels = []
xtickCounter = 0

# Column header for the per-configuration report printed below.
print()
print( "                      mem     mem      L2      L2")
print( "                     load   store    load   store       L1")
print()


for r in [1]:
    for xblock in [2, 4, 8, 16, 32, 64, 128]:
        for yblock in [1, 2, 4, 8, 16, 32, 64, 128]:
            for zblock in [1, 2, 4, 8, 16, 32, 64, 128]:
                # Only block shapes with exactly 1024 threads are examined.
                if xblock * yblock * zblock != 1024:
                    continue
                block = (xblock, yblock, zblock)
                key = (r, *block)

                # Reuse kernels built in a previous run of this cell.
                if key in kernelCache:
                    kernel = kernelCache[key]
                else:
                    kernel = SS.getStarKernel(block, r)
                    kernelCache[key] = kernel
                if key in wsKernelCache:
                    wsKernel = wsKernelCache[key]
                else:
                    wsKernel = PyStencilsWarpSpeedKernel(kernel.ast)
                    wsKernelCache[key] = wsKernel

                runFunc = SS.getRunFunc(kernel)

                # Number of blocks per dimension, at least one.
                grid = tuple(max(1, extent // blockDim) for extent, blockDim in zip(SS.size, block))

                print("block:" + str(block))
                predV = getVolumes(wsKernel, block, grid, (r, r, r, *SS.size))

                measV = measureMetrics(runFunc, SS.size)

                # Median of seven runs. The samples must be sorted first;
                # indexing the unsorted list would just pick the fourth run.
                times = [benchKernel(runFunc) for _ in range(7)]
                time = sorted(times)[len(times) // 2]
                # Interior lattice updates per unit time, scaled by 1e6
                # (printed as MLup/s; assumes benchKernel returns seconds — TODO confirm).
                measV["lups"] = (SS.size[0] - 2*r) * (SS.size[1] - 2*r) * (SS.size[2] - 2*r) / time / 1e6

                # First line: measured volumes; second line: predicted volumes.
                print("r={}  {:12}   {:5.2f}   {:5.2f}   {:5.2f}   {:5.2f}".format(r, str(block), measV["memLoad"], measV["memStore"], measV["L2Load"], measV["L2Store"] ))
                print("            {:5.2f} / {:4.2f}   {:5.2f}   {:5.2f}   {:5.2f}   {:6.1f}".format(predV["memLoad"], predV["memLoadISL"], predV["memStore"], predV["L2Load"], predV["L2Store"], predV["L1cycles"]))
                print(str(measV["lups"] ) + " MLup/s")

                predValues[key] = predV
                measValues[key] = measV

                print()

In [None]:

# Extend every prediction with an estimate of stores evicted from L2:
# adds "memStoreExt2", "memLoadISLext2" and "memTotal" to each result dict.
for key, results in predValues.items():
    r, xblock, yblock, zblock = key
    block = (xblock, yblock, zblock)

    threadsPerBlock = xblock * yblock * zblock
    # NOTE(review): 32, 1024 and 80 look like per-SM block/thread limits and
    # the SM count of the target GPU — confirm against the hardware used.
    concurrentBlocks = min(32, 1024 // threadsPerBlock) * 80
    sizeL2 = 6 * 1024 * 1024

    # Scale the per-update volumes by the number of concurrently running threads.
    vMemComplete, vMemStore, vL2Store, vL2Load = (
        results[metric] * concurrentBlocks * threadsPerBlock
        for metric in ("memLoadISL", "memStore", "L2Store", "L2Load")
    )
    vMem = vMemComplete

    vStoreEvicted = 0
    vMemEvicted = 0
    if vMemStore > 0:
        # Heuristic share of the L2 available to stores: 30 % of the cache,
        # weighted by both the store and the load fraction of the traffic.
        effectiveL2 = sizeL2 * 0.3
        effectiveL2 *= vMemStore / (vMemStore + vMemComplete)
        effectiveL2 *= vMemComplete / (vMemComplete + vMemStore)
        # Part of the L2-to-memory store gap, scaled by how far the store
        # volume exceeds that share (nothing if it fits).
        vStoreEvicted = (vL2Store - vMemStore) * max(0, (vMemStore - effectiveL2)) / vMemStore
        vMemEvicted += vStoreEvicted

    # Back to per-update volumes.
    results["memStoreExt2"] = (vMemStore + vStoreEvicted) / concurrentBlocks / threadsPerBlock
    results["memLoadISLext2"] = (vMem + vMemEvicted) / concurrentBlocks / threadsPerBlock

    # NOTE(review): this sums the "...ext"/"...Ext" entries that must already
    # be present in the prediction, not the "...2" values computed above — confirm intent.
    results["memTotal"] = results["memLoadISLext"] + results["memStoreExt"]

In [None]:
# One scatter plot per prediction variant, each against the measured "memLoad".
for _predKey, _plotTitle in (
    ("memLoad", "Memory Load Volumes"),
    ("memLoadISL", "Memory Load Volumes ISL"),
    ("memLoadISLext", "Memory Load Volumes ISL Ext"),
    ("memLoadISLext2", "Memory Load Volumes ISL Ext2"),
):
    volumeScatterPlot(
        {key: v["memLoad"] for key, v in measValues.items()},
        {key: v[_predKey] for key, v in predValues.items()},
        _plotTitle,
        (8.5, 96),
    )

In [None]:
# Measured "memStore" against the plain and the extended store prediction.
for _predKey, _plotTitle in (
    ("memStore", "Memory Store Volumes"),
    ("memStoreExt", "Memory Store Volumes Ext"),
):
    volumeScatterPlot(
        {key: v["memStore"] for key, v in measValues.items()},
        {key: v[_predKey] for key, v in predValues.items()},
        _plotTitle,
    )

In [None]:
#volumeScatterPlot({key: v["L2Load"] for key, v in measValues.items()}, {key: v["L2Load"] for key, v in predValues.items()}, "L2 Load Volumes", (24, 180) )
#volumeScatterPlot({key: v["L2Load"] for key, v in measValues.items()}, {key: v["L2LoadExt"] for key, v in predValues.items()}, "L2 Load Volumes Ext", (24, 180))
#volumeScatterPlot({key: v["L2Store"] for key, v in measValues.items()}, {key: v["L2Store"] for key, v in predValues.items()}, "L2 Store Volumes")


# Measured memory-load volumes in ascending order, each paired with its key.
print(sorted((v["memLoad"], key) for key, v in measValues.items()))


In [None]:
def differenceScatterPlot( measValues, predValues, measKey, predKey1, predKey2, title, lims = None ):
    """Plot how a second prediction moves relative to a first one.

    Only configurations whose two predictions differ by more than 10 %
    (relative to ``predKey1``) are drawn. Each is shown as a gray dot at
    (measured value, first prediction) with a vertical arrow to the second
    prediction. The figure is written to ``./autoplots/diff_<title>.svg``
    and then shown.

    Args:
        measValues: mapping from configuration key to a dict of measured metrics.
        predValues: mapping from configuration key to a dict of predicted metrics.
        measKey: metric read from ``measValues``; used as the x coordinate.
        predKey1: predicted metric at which each arrow starts.
        predKey2: predicted metric at which each arrow ends.
        title: plot title, also used in the output file name.
        lims: optional (low, high) pair applied to both axes; when ``None``
            the limits are derived from the plotted data.
    """
    # Imported locally because the notebook's visible import cell does not
    # import these modules explicitly.
    import os
    import random

    fig, ax = plt.subplots()
    fig.set_figwidth(4)
    fig.set_figheight(4)
    fig.set_dpi(150)

    # Keep configurations where the two predictions differ by more than 10 %.
    # Entries with a zero first prediction are skipped: the relative
    # difference is undefined there.
    keys = [
        key
        for key, v in predValues.items()
        if v[predKey1] != 0 and abs(v[predKey1] - v[predKey2]) / v[predKey1] > 0.1
    ]
    # Randomize the drawing order of the points and arrows.
    random.shuffle(keys)

    measured = [measValues[key][measKey] for key in keys]
    predicted1 = [predValues[key][predKey1] for key in keys]
    predicted2 = [predValues[key][predKey2] for key in keys]

    # Label the series with the first key component(s) (the measurement sweep
    # stores the radius r there) instead of relying on a global variable `r`.
    seriesLabel = ",".join(sorted({str(key[0]) for key in keys}))

    ax.scatter(
        measured, predicted1,
        s=[30] * len(keys), color="gray",
        marker="o", alpha=1,
        edgecolors="none", label=seriesLabel,
    )

    # Vertical arrow from the first to the second prediction; the head size
    # scales with the values so it looks similar across the log axes.
    for x, y1, y2 in zip(measured, predicted1, predicted2):
        ax.arrow(x, y1, 0, y2 - y1, linewidth=0.5, head_width=0.05*x, fill=False, head_length=0.05*y2, length_includes_head=True)

    ax.set_xscale("log", base=1.5)
    ax.set_yscale("log", base=1.5)

    # Remember the automatic limits before the fixed ticks are installed.
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    ticks = [0.1, 1.0, 2, 4, 8, 9, 12, 16, 20, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512]
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    formatter = matplotlib.ticker.FuncFormatter(lambda x, pos: "{0:g}".format(x))
    ax.get_xaxis().set_major_formatter(formatter)
    ax.get_yaxis().set_major_formatter(formatter)

    if lims is not None:
        ax.set_xlim(lims)
        ax.set_ylim(lims)
    else:
        # Same padded range on both axes; lower bound is never below 0.2.
        lim = (min(ylim[0], xlim[0]), max(ylim[1], xlim[1]))
        padded = (max(min(lim[0] * 0.9, lim[0] - 0.1), 0.2), max(1, lim[1] * 1.1, lim[1] + 0.1))
        ax.set_xlim(padded)
        ax.set_ylim(padded)

    ax.set_xlabel("actual Volume, B/Lup")
    ax.set_ylabel("predicted Volume, B/Lup")
    ax.set_title(title)

    fig.tight_layout()
    ax.grid()

    # savefig does not create missing directories.
    os.makedirs("./autoplots", exist_ok=True)
    plt.savefig( "./autoplots/diff_" + title + ".svg")
    plt.show()


    



In [None]:
# Arrow plots showing how each extended prediction shifts relative to its baseline.
differenceScatterPlot(
    measValues, predValues,
    "memLoad", "memLoad", "memLoadISLext",
    "memory load",
    lims=(8.5, 96),
)
differenceScatterPlot(
    measValues, predValues,
    "memLoad", "memLoadISL", "memLoadISLext2",
    "memory load isl ext",
    lims=(8.5, 96),
)
differenceScatterPlot(
    measValues, predValues,
    "L2Load", "L2Load", "L2LoadExt",
    "l2 load isl ext",
    lims=(24, 180),
)


In [None]:
# Throughput estimate per configuration: one limit each from the memory
# volume, the L2 volume and the L1 cycles; the prediction is the smallest.
for key in measValues:
    predEntry = predValues[key]

    memBalance = predEntry["memLoadISLext2"] + predEntry["memStoreExt2"]
    l2Balance = predEntry["L2LoadExt"]  # the L2 store volume is not added

    # NOTE(review): 800 and 2300 look like memory / L2 bandwidths, and
    # 80 * 1.38 * 32 like SM count, clock and lanes — confirm against the target GPU.
    predEntry["lupsMem"] = 800 / memBalance
    predEntry["lupsL2"] = 2300 / l2Balance
    predEntry["lupsL1"] = 80 * 1.38 * 32 / (predEntry["L1cycles"] * 18 / 17) * 0.95
    predEntry["lups"] = min(predEntry["lupsMem"], predEntry["lupsL2"], predEntry["lupsL1"])

# (rounded rate, block shape) pairs in ascending order; print the top four of each.
predLups = sorted((round(v["lups"], 2), key[1:]) for key, v in predValues.items())
measLups = sorted((round(v["lups"], 2), key[1:]) for key, v in measValues.items())

print(predLups[-4:])
print(measLups[-4:])


def _limiterCode(entry):
    """Return 1 if the memory limit sets the prediction, 2 for L2, otherwise 3."""
    if entry["lupsMem"] == entry["lups"]:
        return 1
    if entry["lupsL2"] == entry["lups"]:
        return 2
    return 3


limiters = {key: _limiterCode(predValues[key]) for key in measValues}

print(limiters)

# Same data as the "roofline" plot, but the first key component is replaced
# by the limiter code.
volumeScatterPlot(
    {(limiters[key], *key[1:]): v["lups"] for key, v in measValues.items()},
    {(limiters[key], *key[1:]): v["lups"] for key, v in predValues.items()},
    "roofline_limiters",
    (5.7, 48),
)

# Measured rate against the combined prediction and against each single limit.
for _predKey, _plotTitle, _extraArgs in (
    ("lups", "roofline", ((5.7, 48),)),
    ("lupsL1", "roofline_cycles", ()),
    ("lupsL2", "roofline_l2", ()),
    ("lupsMem", "roofline_mem", ()),
):
    volumeScatterPlot(
        {key: v["lups"] for key, v in measValues.items()},
        {key: v[_predKey] for key, v in predValues.items()},
        _plotTitle,
        *_extraArgs,
    )


