# Pystencils 2D Star Stencil Volume Comparison

Compares measured and predicted data volumes of a 2D star stencil for different thread block sizes.

In [None]:

import sys 
sys.path.append('../pystencils')
sys.path.append('../genpredict')

%load_ext autoreload
%autoreload 1
%aimport pystencils.warpspeed.warpspeed
%aimport predict
%aimport griditeration
%aimport volumes_isl
%aimport pystencils.astnodes
%aimport plot_utils



In [None]:
import cProfile
import re


import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import sys
from subprocess import run, PIPE


import pystencils as ps
from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers
from pystencils.warpspeed.warpspeed import PyStencilsWarpSpeedKernel, getFieldExprs, lambdifyExprs, simplifyExprs
from griditeration import *
from volumes_isl import *


from plot_utils import *
from meas_utils import *
from pystencils_stencil_utils import PS3DStencil, PS2DStencil



import sympy as sp

import pycuda
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import pycuda.driver as drv

import timeit


In [None]:
# Problem size shared by every kernel generated in this notebook.
domain_size = (1000, 1000)
# Stencil helper from pystencils_stencil_utils; it is used below to build the
# star kernels and their run functions.
# NOTE(review): the meaning of the second argument (1) is not visible here -- confirm against PS2DStencil.
SS = PS2DStencil(domain_size, 1)

In [None]:
# Inspect one representative configuration (block size 32x4, range 1):
# show its first assignment, the generated code, the AST and the SASS.
print(SS.getStarAssignments(1)[0])

# Build the example kernel once and reuse it for every inspection step below
# instead of regenerating the same kernel four times. Reusing a kernel object
# is what the measurement loop further down does as well (via kernelCache).
exampleKernel = SS.getStarKernel((32, 4), 1)

ps.show_code(exampleKernel)
print(exampleKernel.ast)
wsKernel = PyStencilsWarpSpeedKernel(exampleKernel.ast)

# The generated source relies on these two macros, so they are prepended
# before the code is handed to printSASS.
printSASS("#define FUNC_PREFIX __global__\n#define RESTRICT const __restrict__\n" + str(ps.get_code_obj(exampleKernel)))

In [None]:
# Results and caches live in their own cell so that re-running the sweep
# below can reuse what has already been generated or measured.
# All four dictionaries are keyed by (r, xblock, yblock, zblock).
predValues = {}     # predicted volumes per configuration
measValues = {}     # measured volumes per configuration
kernelCache = {}    # generated pystencils kernels
wsKernelCache = {}  # PyStencilsWarpSpeedKernel wrappers of those kernels

In [None]:
# Sweep over thread block shapes: for every shape, predict the data volumes
# with getVolumes, measure them with measureMetrics and print both side by
# side. Results are stored in predValues / measValues under the key
# (r, xblock, yblock, zblock).

# Not used in this cell.
# NOTE(review): possibly consumed by plotting code elsewhere -- confirm before removing.
xticks = []
xtickLabels = []
xtickCounter = 0

# Column header of the text table printed by the loop.
print()
print( "                      mem     mem      L2      L2")
print( "                     load   store    load   store       L1")
print()

# The z extent of every thread block is fixed to 1 for this 2D stencil.
zblock = 1

for r in [1]:
    # NOTE(review): 64 is missing from the x block sizes although it is
    # present in the y list, so (64, 4, 1) and (64, 8, 1) are never run --
    # confirm whether that is intentional.
    for xblock in [1, 2, 4, 8, 16, 32, 128, 256, 512, 1024]:
        for yblock in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]:
            # Only block shapes with 256 or 512 threads are evaluated.
            if not (xblock * yblock * zblock in (256, 512)):
                continue

            block = (xblock, yblock, zblock)
            key = (r,) + block

            # Generate the kernel and its warpspeed wrapper only on the first
            # visit of a configuration; later runs of this cell reuse them.
            if key not in kernelCache:
                kernelCache[key] = SS.getStarKernel((xblock, yblock), r)
            kernel = kernelCache[key]

            if key not in wsKernelCache:
                wsKernelCache[key] = PyStencilsWarpSpeedKernel(kernel.ast)
            wsKernel = wsKernelCache[key]

            runFunc = SS.getRunFunc(kernel)

            print(str(kernel.num_regs)  + " Registers")

            # One more block per dimension than size // block.
            # NOTE(review): this adds an extra block even when the size is an
            # exact multiple of the block extent -- confirm this is intended.
            grid = (SS.size[0] // xblock + 1, SS.size[1] // yblock + 1, 1)

            print("block:" + str(block))

            # The register count has to be set on the wrapper before the
            # prediction is computed.
            wsKernel.registers = kernel.num_regs
            # NOTE(review): the layout of the last tuple argument is defined
            # by getVolumes and is not visible here.
            predV = getVolumes(wsKernel, block, grid, (1, 1, 0, *SS.size, 1))

            # Measurements are reused when this configuration was measured before.
            measV = measValues[key] if key in measValues else measureMetrics(runFunc, SS.size)

            # First row: measured values. Second row: predicted values.
            print("r={}  {:12}   {:5.2f}   {:5.2f}   {:5.2f}   {:5.2f}".format(
                r,
                str(block),
                measV["memLoad"],
                measV["memStore"],
                measV["L2Load"],
                measV["L2Store"],
            ))
            print("            {:5.2f} / {:4.2f}   {:5.2f}   {:5.2f}   {:5.2f}   {:6.1f}".format(
                predV["memLoad"],
                predV["memLoadISL"],
                predV["memStore"],
                predV["L2Load"],
                predV["L2Store"],
                predV["L1cycles"],
            ))
            print()

            predValues[key] = predV
            measValues[key] = measV

In [None]:
# Extend every prediction with a heuristic estimate of stored data that does
# not stay in the L2 cache. Adds three entries to each result:
#   "memStoreExt"   - memStore plus the estimated evicted store volume
#   "memLoadISLext" - memLoadISL plus the same evicted volume
#   "memTotal"      - sum of the two extended values
for key, results in predValues.items():
    r, xblock, yblock, zblock = key

    # Only referenced by the disabled model variants kept in comments below.
    block = (xblock, yblock, zblock)

    threadsPerBlock = xblock*yblock*zblock
    # NOTE(review): 32, 1024 and 80 look like per-SM block/thread limits and
    # the SM count of the target GPU -- confirm against the hardware used.
    concurrentBlocks = min(32, 1024 // threadsPerBlock) * 80
    sizeL2 = 6 * 1024 * 1024

    # Scale the stored predictions by the number of concurrently running
    # threads (concurrentBlocks * threadsPerBlock).
    vMemComplete = results["memLoadISL"] * concurrentBlocks * threadsPerBlock
    vMemStore = results["memStore"] * concurrentBlocks * threadsPerBlock
    vL2Store = results["L2Store"] * concurrentBlocks * threadsPerBlock
    vL2Load = results["L2Load"] * concurrentBlocks * threadsPerBlock

    vMem = vMemComplete

    # Store volume that exceeds the "effective" L2 capacity is treated as
    # evicted. The effective capacity is 30% of the L2 size, scaled by both
    # the store share and the load share of the combined volume.
    if vMemStore > 0:
        effectiveL2 = (
            sizeL2*0.3
            * (vMemStore / (vMemStore + vMemComplete))
            * (vMemComplete / (vMemComplete + vMemStore))
        )
        vStoreEvicted = (vL2Store - vMemStore) * max(0, (vMemStore - effectiveL2)) / vMemStore
    else:
        vStoreEvicted = 0

    # Scale back to a per-thread value.
    results["memStoreExt"] = ( vMemStore + vStoreEvicted) / concurrentBlocks / threadsPerBlock

    # The same evicted volume is charged to the load side.
    vMemEvicted = 0
    if vMemStore > 0:
        vMemEvicted += vStoreEvicted

    # Disabled model variants, kept for reference:
    #   vMemEvicted += (vL2Store - vMemStore) * max(0, (vMemStore - sizeL2 * (vMemStore / (vMemStore + vMemComplete))  * min( 1,  (block[0] * concurrentBlocks) / 2000)  )) / vMemStore
    #   vMemEvicted += (vL2Load - vMemComplete)
    #if vMemComplete > 0:
    #    effectiveL2 = sizeL2
    #    effectiveL2 *= vMemComplete / (vMemComplete + vMemStore)
    #    effectiveL2 *= max(0.2, min( 1,  (block[0] * concurrentBlocks) / domain_size[0]))
    #    vMemEvicted += (vL2Load - vMemComplete) * 0.1*max(0, (vMemComplete / effectiveL2))

    results["memLoadISLext"] = (vMem + vMemEvicted) / concurrentBlocks / threadsPerBlock
    #print(results["memLoadISLext"])

    results["memTotal"] = results["memLoadISLext"] + results["memStoreExt"]
    #print(vL2Load / 80 / 1024)
    #print( ((measValues[key]["memLoad"] * threadsPerBlock*concurrentBlocks) - vMemComplete) / (vL2Load - vMemComplete) )

In [None]:
# Measured memory load volume (first argument) against three predictions
# (second argument): the plain estimate, the ISL estimate and the extended
# ISL estimate.
volumeScatterPlot(
    {cfg: meas["memLoad"] for cfg, meas in measValues.items()},
    {cfg: pred["memLoad"] for cfg, pred in predValues.items()},
    "2D Memory Load Volumes",
)
volumeScatterPlot(
    {cfg: meas["memLoad"] for cfg, meas in measValues.items()},
    {cfg: pred["memLoadISL"] for cfg, pred in predValues.items()},
    "2D Memory Load Volumes ISL",
)
volumeScatterPlot(
    {cfg: meas["memLoad"] for cfg, meas in measValues.items()},
    {cfg: pred["memLoadISLext"] for cfg, pred in predValues.items()},
    "2D Memory Load Volumes ISL Ext",
)

In [None]:
# Measured L2 load volume (first argument) against the prediction (second).
volumeScatterPlot(
    {cfg: meas["L2Load"] for cfg, meas in measValues.items()},
    {cfg: pred["L2Load"] for cfg, pred in predValues.items()},
    "2D L2 Load Volumes",
)

In [None]:
# Measured memory store volume (first argument) against the plain prediction (second).
volumeScatterPlot(
    {cfg: meas["memStore"] for cfg, meas in measValues.items()},
    {cfg: pred["memStore"] for cfg, pred in predValues.items()},
    "Memory Store Volumes",
)

In [None]:
# Measured memory store volume (first argument) against the extended
# prediction "memStoreExt" (second argument).
volumeScatterPlot(
    {cfg: meas["memStore"] for cfg, meas in measValues.items()},
    {cfg: pred["memStoreExt"] for cfg, pred in predValues.items()},
    "Memory Store Volumes Ext",
)

In [None]:
# Measured L2 store volume (first argument) against the prediction (second).
volumeScatterPlot(
    {cfg: meas["L2Store"] for cfg, meas in measValues.items()},
    {cfg: pred["L2Store"] for cfg, pred in predValues.items()},
    "L2 Store Volumes",
)