# Pystencils 3D Star Stencil Performance Comparison

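Benchmarks a 3D 7-point star stencil on the GPU for a range of thread-block sizes and compares the measured performance with the predicted performance.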

In [None]:

# Extend the module search path with the sibling directories ../pystencils
# and ../genpredict (presumably where pystencils and the predict /
# griditeration / volumes_isl modules live -- verify against the repo layout).
import sys 
sys.path.append('../pystencils')
sys.path.append('../genpredict')



# Enable the IPython autoreload extension in mode 1: only the modules
# registered with %aimport below are reloaded before a cell is executed,
# so edits to these modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 1
%aimport pystencils.warpspeed.warpspeed
%aimport predict
%aimport griditeration
%aimport volumes_isl
%aimport pystencils.astnodes

In [None]:
import cProfile
import re

import math

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import sys
from subprocess import run, PIPE

from predict import *

import pystencils as ps
from pystencils.slicing import add_ghost_layers, make_slice, remove_ghost_layers
from pystencils.warpspeed.warpspeed import PyStencilsWarpSpeedKernel, getFieldExprs, lambdifyExprs, simplifyExprs
from griditeration import *
from volumes_isl import *

import measure_metric.measureMetric as measureMetric


from pystencils.astnodes import (
    KernelFunction,
    LoopOverCoordinate,
    ResolvedFieldAccess,
    SympyAssignment,
)
from pystencils.field import get_layout_from_strides
from pystencils.transformations import filtered_tree_iteration

import sympy as sp

import pycuda
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import pycuda.driver as drv






In [None]:
# Cubic domain with 512 + 128 - 2 = 638 cells per edge.
# NOTE(review): the "- 2" presumably leaves room for one ghost layer on each
# side so that the padded arrays have 640 cells per edge -- confirm.
size = (512 + 128 - 2,) * 3

# Data handling that manages the CPU/GPU arrays; kernels target the GPU.
dh = ps.create_data_handling(size, default_target='gpu')

# Scalar destination and source fields of the stencil.
dst_field = dh.add_array("dst", values_per_cell=1)
src_field = dh.add_array("src", values_per_cell=1)

# Zero-initialise both arrays, ghost layers included, then upload them.
dh.fill("dst", 0.0, ghost_layers=True)
dh.fill("src", 0.0, ghost_layers=True)
dh.all_to_gpu()

def bench_kernel(kernel, repeats=4):
    """Return the average run time of ``kernel`` in milliseconds.

    The kernel is launched ``repeats`` times back to back through the
    notebook-level data handling ``dh``.  The launches are bracketed by two
    CUDA events and the time elapsed between the events is divided by the
    number of launches.

    Args:
        kernel: Compiled kernel accepted by ``dh.run_kernel``.
        repeats: Number of consecutive launches to time.  Defaults to 4,
            the value that used to be hard-coded.

    Returns:
        Elapsed time per launch in milliseconds.

    Raises:
        ValueError: If ``repeats`` is smaller than 1 (the average would be
            undefined).
    """
    if repeats < 1:
        raise ValueError("repeats must be at least 1")

    start = drv.Event()
    end = drv.Event()

    start.record()
    for _ in range(repeats):
        dh.run_kernel(kernel)
    end.record()

    # Wait for the end event before querying the elapsed time; without the
    # synchronisation the event may not have been reached yet.
    end.synchronize()
    return start.time_till(end) / repeats



In [None]:
# 3D 7-point star stencil: the centre cell plus its two neighbours along
# each of the three axes, read from src and scaled by 0.25, written to dst.
stencil3D_assignments = ps.Assignment(
    dst_field[0, 0, 0],
    0.25 * (
        src_field[0, 0, 0]
        + src_field[0, 0, -1]
        + src_field[0, 0, 1]
        + src_field[0, 1, 0]
        + src_field[0, -1, 0]
        + src_field[1, 0, 0]
        + src_field[-1, 0, 0]
    ),
)
# 2D variant kept for reference:
#stencil2D_assignments = ps.Assignment(dst_field[0,0],  0.25 * (src_field[0,1] + src_field[0,-1] + src_field[1,0] + src_field[-1,0]))

# Generate the GPU kernel once with a fixed (1, 1024, 1) thread block,
# compile it and display the generated code.
ast = ps.create_kernel(
    [stencil3D_assignments],
    target="gpu",
    gpu_indexing_params={"block_size": (1, 1024, 1)},
)

kernel = ast.compile()
ps.show_code(ast)


In [None]:
# Sweep over all thread-block shapes with power-of-two edge lengths and
# record, per block shape, the predicted and the measured performance.
# Both result dicts are keyed by the (x, y, z) block tuple.
xtickLabels = []
predValues = dict()
measValues = dict()
xticks = []
xtickCounter = 0

for xblock in [1, 2, 4, 8, 16, 32, 64, 128, 256]:
    for yblock in [1, 2, 4, 8, 16, 32, 64, 128, 256]:
        for zblock in [1, 2, 4, 8, 16, 32, 64, 128, 256]:
            # Only blocks with between 64 and 1024 threads are examined.
            if not 64 <= xblock * yblock * zblock <= 1024:
                continue
            block = (xblock, yblock, zblock)
            print(block)

            ast = ps.create_kernel(
                [stencil3D_assignments],
                target="gpu",
                gpu_indexing_params={
                    "permute_block_size_dependent_on_layout": False,
                    "block_size": block,
                },
            )
            kernel = ast.compile()

            wpKernel = PyStencilsWarpSpeedKernel(ast)

            # Third argument: blocks per dimension, i.e. ceil(size / block)
            # with a lower bound of one block.
            predLups = predict.predictPerformance(
                wpKernel,
                block,
                tuple(max(1, (s - 1) // b + 1) for s, b in zip(size, block)),
            )

            # NOTE(review): the factor 16 presumably converts lattice updates
            # to bytes (one 8-byte load plus one 8-byte store per update) so
            # the stored values are GB/s, matching the plot labels -- confirm.
            predValues[block] = predLups*16

            # Median of seven timings.  The samples must be sorted first;
            # indexing the middle of the unsorted list would merely return
            # the fourth sample.
            times = sorted(bench_kernel(kernel) for _ in range(7))
            time = times[len(times) // 2]
            # Cells / milliseconds / 1e6 = 1e9 cell updates per second.
            measLups = size[0] * size[1] * size[2] / time / 1e6

            measValues[block] = measLups*16
            newLabel = "{}x{}x{}".format(*block)
            xticks.append(xtickCounter)
            xtickLabels.append(newLabel)
            xtickCounter += 1

            print("{:7.1f} {:7.1f}".format(predLups*16, measLups*16))
            print()

In [None]:
def performanceScatterPlot(measuredValues, predictedValues):
    """Scatter-plot predicted against measured values on log-log axes.

    Every point is one block size.  Its colour is derived from the block
    shape: each of the three block edge lengths is mapped to one colour
    channel via log2(edge) / log2(256).  A diagonal line marks perfect
    agreement between prediction and measurement.

    Args:
        measuredValues: Dict mapping a block tuple to the measured value.
        predictedValues: Dict mapping a block tuple to the predicted value.
            Only blocks present in both dicts are plotted.
    """
    # Pair the points explicitly by key instead of relying on both dicts
    # having identical insertion order, and hand plain lists to matplotlib.
    blocks = [b for b in measuredValues if b in predictedValues]
    measured = [measuredValues[b] for b in blocks]
    predicted = [predictedValues[b] for b in blocks]

    colors = [tuple(math.log2(bc) / math.log2(256) for bc in b) for b in blocks]

    fig, ax = plt.subplots()
    fig.set_figwidth(4)
    fig.set_figheight(4)
    fig.set_dpi(150)

    # Large, almost transparent markers form a faint halo behind the small
    # opaque markers drawn on top of them.
    ax.scatter(measured, predicted, s=[200] * len(colors), c=colors, alpha=0.01, edgecolors="none")
    ax.scatter(measured, predicted, s=[10] * len(colors), c=colors, alpha=1, edgecolors="none")

    # Log axes without minor ticks.  The minor locator is set explicitly
    # because the former subsx/subsy keywords of set_xscale/set_yscale are
    # no longer accepted by current Matplotlib releases.
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.xaxis.set_minor_locator(matplotlib.ticker.NullLocator())
    ax.yaxis.set_minor_locator(matplotlib.ticker.NullLocator())

    # Remember the autoscaled limits before the diagonal changes them.
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # Diagonal: predicted == measured.
    ax.plot([7, 1150], [7, 1150], color="black", alpha=0.2)

    ticks = [10, 50, 100, 200, 400, 600, 800]
    ax.set_xticks(ticks)
    ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
    ax.set_yticks(ticks)
    ax.get_yaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())

    # Same range on both axes so that the diagonal runs corner to corner.
    lower = min(xlim[0], ylim[0])
    ax.set_xlim((lower, 900))
    ax.set_ylim((lower, 900))

    ax.set_xlabel("measured Memory Bandwidth, GB/s")
    ax.set_ylabel("predicted Memory Bandwidth, GB/s")

    ax.grid()
    # Run the layout only after ticks and labels exist so they are taken
    # into account.
    fig.tight_layout()
    plt.show()

# Plot the measured against the predicted values collected per block size.
performanceScatterPlot(measValues, predValues)