In [None]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import sys 
sys.path.append('../../../warpspeed/')
sys.path.append('../../../measutils/')
sys.path.append('../../../pystencils_notebooks')
sys.path.append('..')

%load_ext autoreload
%autoreload 1



In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from meas_db import MeasDB

from measured_metrics import MeasuredMetrics, ResultComparer
from predict_metrics import *
from plot_utils import *
import genconv
import json

In [None]:
# Handle to the measurement database that every later cell queries.
# The path is relative, so the notebook has to be run from its own directory.
meas_db = MeasDB("../bconv.db")
# Kept disabled on purpose. NOTE(review): presumably wipes all stored measurements - confirm before enabling.
#meas_db.clearDB()

In [None]:
# Problem size used to select measurements from the database.
# Input and output channel counts are deliberately tied to the same value.
width, height = 4096, 4096
input_channels = 64
output_channels = input_channels
batch_size = 1

In [None]:



# Devices to evaluate. The Device* classes are expected to come from the star imports above.
devices = [DeviceAmpereA40(), DeviceL40(),  DeviceAmpereA100_80GB(),  DeviceHopperH200(), DeviceMI210(), DeviceRX6900XT(), DeviceMI300(), DeviceMI300A()]

# Per device, restricted to the configured channel counts:
#   {(xblock, yblock, zblock, c_in_per_thread, x_per_thread): value}
devicePredValues = dict()
deviceMeasValues = dict()

# Per device, all channel counts:
#   {(xblock, yblock, zblock, c_in_per_thread, x_per_thread, input_channels, output_channels): value}
completeDevicePredValues = dict()
completeDeviceMeasValues = dict()

# Columns that make up the key tuple of every returned row (order matters for the key[...] indexing below).
rangeKeyColumns = ("xblock", "yblock", "zblock", "c_in_per_thread", "x_per_thread", "input_channels", "output_channels")

for d in devices:
    # Channel counts are intentionally not fixed in the query; they are part of the range key instead
    # so that the "complete" dictionaries can cover every channel configuration.
    deviceResults = meas_db.getRangeKeys({"width": width,
                                          "height": height,
                                          "batch_size": batch_size,
                                          "device": '"' + str(d.name) + '"'},
                                         rangeKeyColumns)
    deviceMeasValues[d] = {}
    devicePredValues[d] = {}
    completeDeviceMeasValues[d] = {}
    completeDevicePredValues[d] = {}

    for row in deviceResults:
        # NOTE(review): row layout is taken from usage here - row[0] is the key tuple, row[4] the
        # measurement object, row[2]/row[3] are passed through to DerivedMetrics; confirm against MeasDB.
        meas = row[4]
        key = row[0]
        metrics = DerivedMetrics(row[2], row[3], d, meas)

        # Statistics set: all channel counts, but only x block sizes of at least 16.
        if key[0] >= 16:
            completeDeviceMeasValues[d][key] = meas
            completeDevicePredValues[d][key] = metrics

        # Plot set: only the configured channel counts (previously a hard-coded 64 that could
        # silently drift from the configuration cell); the channel entries are stripped from the key.
        if key[0] >= 8 and key[5] == input_channels and key[6] == output_channels:
            deviceMeasValues[d][key[:-2]] = meas
            devicePredValues[d][key[:-2]] = metrics

    # Quick sanity output: device, number of plot-set entries, number of statistics-set entries.
    print(d)
    print(len(deviceMeasValues[d]))
    print( len(completeDeviceMeasValues[d]))
                    

In [None]:
# One table column per entry: (measured attribute, predicted attribute, LaTeX column header).
# Other predictor variants (e.g. memLoadV1/V2/V4, memStoreV1, L2LoadV1, L2total vs. L2totalV1/V2)
# can be evaluated by swapping the predicted attribute name in here.
propertyPairs = [
    ("L1TagWavefronts", "L1TagCycles", "L1 Tag"),
    ("L1DataPipeWavefronts", "L1DataPipeCycles", "L1 Data"),
    ("L2Load_tex", "L2LoadV2", "L2 Load"),
    ("L2Store", "L2Store", "L2 Store"),
    ("memLoad", "memLoadV3", "\\makecell{DRAM \\\\ Load}"),
    ("memStore", "memStoreV2", "\\makecell{DRAM \\\\ Store}"),

    #("tflops", "perfTFlopsV1", "performance"),
    #("tflops", "perfTFlopsV2", "performance"),
    #("tflops", "perfTFlopsV3", "performance"),
    #("tflops", "perfTFlopsV4", "performance"),
    #("tflops", "perfTFlopsPheno", "performance")
]


# Plain header row for the raw MAPE dump printed below.
for p in propertyPairs:
    print( "& " + p[2] , end="")
print("\\\\")


# Per device, one entry per propertyPairs row:
#   mapes: mean absolute percentage error (as a fraction), -1 if too few samples
#   mpes:  mean signed percentage error (positive = over-prediction)
#   taus:  Kendall rank correlation between measurement and prediction, -1 if too few samples
mapes = {}
taus = {}
mpes  = {}
for d in devices:
    mapes[d] = []
    taus[d] = []
    mpes[d] = []

    measByKey = completeDeviceMeasValues[d]
    predByKey = completeDevicePredValues[d]

    for measProp, predProp, name in propertyPairs:
        mape = 0
        mpe = 0
        count = 0

        for k in measByKey.keys():
            # Missing attributes fall back to negative sentinels so that they are skipped below.
            meas = getattr(measByKey[k], measProp, -10)
            pred = getattr(predByKey[k], predProp, -1)
            # Only measurements clearly above zero give a meaningful relative error.
            if meas > 0.01:
                mape += abs(meas - pred) / meas
                mpe += (pred - meas) / meas
                count += 1

        # NOTE(review): `stats` is not imported in this notebook's import cell; it is presumably
        # scipy.stats leaking in through one of the star imports - confirm.
        # The rank correlation is computed over all keys, including those skipped above.
        tau, _ = stats.kendalltau([ getattr(measByKey[k], measProp, -1) for k in measByKey.keys()],
                                  [ getattr(predByKey[k], predProp, -1) for k in measByKey.keys()])

        # Require more than 10 valid samples, otherwise mark the column as unavailable.
        if count > 10:
            mape /= count
            mpe /= count
        else:
            mape = -1
            tau = -1
        print("{:10.1f}".format(mape*100), end=" ")

        mapes[d].append(mape)
        taus[d].append(tau)
        mpes[d].append(mpe)
    print()

# LaTeX table: each property spans two columns (colored MAPE cell + bias arrow).
for p in propertyPairs:
    print( "& \\multicolumn{2}{c |}{\\textbf{" + p[2] + "}} ", end="")
print("\\\\")
for d in devices:
    print(" {:10}  ".format(d.displayName), end=" ")
    for m, mp in zip(mapes[d], mpes[d]):

        # Unavailable (negative sentinel) or implausibly large errors are shown as a dash.
        if m < 0 or m > 0.72:
            print("& - & ", end=" ")
            continue

        if m < 0.04:
            # Small errors: plain number, no cell color and no arrow.
            print(" &   ${:~>3.0f}\\%$ & ".format(  m*100), end=" ")
        else:
            # Arrow direction encodes the bias: +90 = always over-, -90 = always under-predicted.
            # Computed only here, where m >= 0.04, so a perfect prediction (m == 0) can no longer
            # trigger a division by zero.
            angle = (mp / m)*90
            print("  & \\cellcolor{{white!{}!c1}} ${:~>3.0f}\\%$ & \\rotArrow{{{:.0f}}} ".format( 105-int(m*102), m*100, angle), end=" ")
    print("\\\\")
    
    




In [None]:
from matplotlib.ticker import MaxNLocator

# Line colors for the highlighted configurations, indexed by position in hlKeys.
colors = ["#CC1343", "#349999", "#649903", "#FFAA11"]

def polarPlot(keys, hlKeys, measValues, predValues, device):
    """Radar plot of the predicted DRAM / L2 / L1 performance limits.

    Parameters
    ----------
    keys : iterable of configuration keys drawn as faint black background lines.
    hlKeys : sequence of 5-tuples ``(xblock, yblock, zblock, a, b)`` drawn as colored,
        labelled lines; the label is ``"(x,y,z) axb"``.
    measValues : unused at the moment (kept for interface compatibility).
    predValues : mapping from key to an object with ``perfTFlopsMemV3``, ``perfTFlopsL2V2``
        and ``perfTFlopsL1`` attributes.
    device : object providing ``peakFP32()``, ``getDisplayName()`` and ``name``.

    Keys that are missing from ``predValues`` are skipped. Returns the polar axes.
    Also prints the device name and its FP32 peak.
    """

    def closedLoop(k):
        # DRAM, L2, L1 limits; the first value is repeated to close the polygon.
        p = predValues[k]
        values = [p.perfTFlopsMemV3, p.perfTFlopsL2V2, p.perfTFlopsL1]
        return values + values[:1]

    data = [closedLoop(k) for k in keys if k in predValues]

    categories = 3
    label_loc = np.linspace(start=0, stop=2 * np.pi, num=categories + 1)

    plt.figure(figsize=(4, 4), facecolor="white", dpi=100)
    ax = plt.subplot(polar=True)

    # Background: every configuration as a nearly transparent black line.
    for series in data:
        ax.plot(label_loc, series, '.-', color="black", alpha=0.01, linewidth=1, markersize=8, markeredgewidth=0)

    # White halo underneath the FP32 peak circle drawn further below.
    # The interpolation step override makes the straight axhline render as a circle.
    ax.axhline(device.peakFP32()/1000, color="white", linewidth=4).get_path()._interpolation_steps = 180

    # Highlighted configurations. Label and color are tied to the key and its position in hlKeys,
    # so a key that is missing for this device no longer shifts the labels/colors of the others.
    for i, k in enumerate(hlKeys):
        if k not in predValues:
            continue
        ax.plot(label_loc, closedLoop(k), '-', label="({},{},{}) {}x{}".format(*k),
                color=colors[i % len(colors)], linewidth=2)

    # Device name in the upper left corner.
    ax.text(0.07, 0.85, device.getDisplayName(), fontsize=18, horizontalalignment='left', fontweight="bold",
            verticalalignment='center', color="k", transform = ax.transAxes, backgroundcolor="#FFFFFFAA")

    # Start at 12 o'clock and go clockwise.
    ax.set_theta_offset(np.pi / 2)
    ax.set_theta_direction(-1)

    # One spoke per memory level (the fourth label coincides with the first).
    ax.set_thetagrids(np.degrees(label_loc), ["DRAM", "L2", "L1", "DRAM"])

    # Radial tick labels between the first two spokes.
    ax.set_rlabel_position(180 / 4)

    ax.tick_params(axis='y', labelsize=11)
    ax.tick_params(axis='x', labelsize=12)
    ax.set_ylabel("TFlop/s")
    ax.spines['polar'].set_color('#999999')
    ax.set_facecolor('#FAFAFA')

    print(device.name)
    print(device.peakFP32())

    # Circle at the device's FP32 peak (peakFP32() is divided by 1000 to match the TFlop/s axis).
    ax.axhline(device.peakFP32() / 1000, color="black").get_path()._interpolation_steps = 180

    ax.legend(loc='lower center')

    # Fixed radial range so that the plots of all devices are directly comparable.
    ax.set_ylim((0, 140))

    return ax

#keys = [k for k in measValues.keys() if k[2] > 8 and k[4] < 32 and k[3] < 32]
#fig, ax = polarPlot(keys, [(64, 1, 4, 1, 1), (64, 1, 4, 2, 2), (64, 1, 4, 4, 4), (64, 1, 4, 8, 8)], measValues, predValues, device)

# One radar plot per device, saved as both PDF and SVG under ./plots (the directory must exist).
for d in devices:
    measForDevice = deviceMeasValues[d]

    # Background lines: every configuration with xblock > 1 and zblock > 0.
    keys = [k for k in measForDevice.keys() if k[0] > 1 and k[2] > 0 ]
    # Highlighted: 256x1x1 thread blocks with equal per-thread factors 1, 2, 4 and 8.
    highlighted = [(256, 1, 1, n, n) for n in (1, 2, 4, 8)]
    ax = polarPlot(keys, highlighted, measForDevice, devicePredValues[d], d)

    plt.tight_layout(pad=0)
    for extension in (".pdf", ".svg"):
        plt.savefig("./plots/polarplot_" + d.displayName + extension)
    plt.show()

In [None]:
#print(devicePredValues.keys())

# Select the device that all following cells work on (the last entry of `devices`).
# `device`, `predValues` and `measValues` are notebook-wide globals read by later cells
# and by the plotting helpers defined below.
device = devices[-1]
print(device)

predValues = devicePredValues[device]
measValues = deviceMeasValues[device]

# Arbitrary sample configuration for a spot check.
# NOTE(review): raises IndexError if this device has 100 or fewer entries.
key = list(predValues.keys())[100]


# Print the measurement and the measured-vs-predicted comparison for that configuration.
rc = ResultComparer(measValues[key], predValues[key])
print(key)
print(measValues[key])
print( rc )


In [None]:
# Compare the measured memLoad of the last device against the second-to-last device for every
# configuration with k[3] == k[4]. The keys are taken from devices[1].
# NOTE(review): a key that exists for devices[1] but not for the other two devices raises KeyError.
volumeScatterPlot([(k[0:3], deviceMeasValues[devices[-1]][k].memLoad, deviceMeasValues[devices[-2]][k].memLoad, str(k[3]) + "x" + str(k[4])) for k in deviceMeasValues[devices[1]].keys() if k[3]==k[4]])

In [None]:
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator, FixedLocator)

def plotLimiterBars(firstKeys, secondKeys = None, thirdKeys=None):
    """Plot stacked "limiter" bars (code balance / machine balance) per configuration.

    For every key, three stacked bars (DRAM, L2, L1) show the predicted per-field
    contributions, black dots show the measured values and a wide gray bar shows the
    measured performance, all expressed as a ratio to the FP32 peak (1 = peak).

    Reads the notebook globals ``predValues``, ``measValues`` and ``device`` and the
    helper ``perFlop``.

    Parameters
    ----------
    firstKeys : non-empty list of configuration keys for the first (or only) panel.
    secondKeys, thirdKeys : optional key lists. If ``secondKeys`` is given, one panel
        per provided list is drawn side by side with a shared y range.

    Returns
    -------
    (fig, ax) : ``ax`` is a single Axes for one panel, otherwise the array of panel Axes.

    Side effect: sets the global matplotlib rcParam ``hatch.color`` to white.
    """

    colors = ["#CC1343", "#349999", "#649903"]

    plt.rcParams.update({'hatch.color': 'w'})

    def plotKeys(ax, keys):
        """Draw bars, measured markers and labels for `keys` into `ax`."""
        xpos = np.arange(len(keys))
        # Running stack height per key for the [DRAM, L2, L1] bars.
        bottoms = [[0, 0, 0] for _ in keys]
        # Index into `colors`; one color per stacked field (load fields first, then store fields).
        hatch = 0

        # Load contributions, one stacked segment per field.
        for field in list(predValues[keys[0]].fieldL2LoadV2):
            if field == "total":
                continue

            fieldMem = [ predValues[k].fieldMemLoadV2[field] / predValues[k].flopsPerLup / device.memBW * device.peakFP32() for k in keys]
            fieldL2 = [predValues[k].fieldL2LoadV2[field] / predValues[k].flopsPerLup / device.L2BW * device.peakFP32() for k in keys]
            fieldL1 = [predValues[k].fieldL1Cycles[field][2] / predValues[k].flopsPerLup / (device.smCount * device.clock) * device.peakFP32() for k in keys]

            ax.bar(xpos - 0.29, fieldMem, width=0.26, bottom = [b[0] for b in bottoms], color=colors[hatch], label=field, zorder=3 )
            ax.bar(xpos - 0.0, fieldL2, width=0.26, bottom = [b[1] for b in bottoms], color=colors[hatch], zorder=3 )
            ax.bar(xpos + 0.29, fieldL1, width=0.26, bottom = [b[2] for b in bottoms], color=colors[hatch], zorder=3 )
            bottoms = [[bottoms[i][0] + fieldMem[i], bottoms[i][1] + fieldL2[i], bottoms[i][2] + fieldL1[i]] for i in range(len(bottoms))]
            hatch += 1

        # Store contributions.
        # NOTE(review): the DRAM part uses the total memStoreV1 for every store field; that is only
        # correct as long as there is a single stored field - confirm.
        for field in list(predValues[keys[0]].fieldL2Store):
            if field == "total":
                continue

            fieldMem = [predValues[k].memStoreV1 / predValues[k].flopsPerLup / device.memBW * device.peakFP32() for k in keys]
            fieldL2 = [predValues[k].fieldL2Store[field] / predValues[k].flopsPerLup / device.L2BW * device.peakFP32() for k in keys]
            fieldL1 = [predValues[k].fieldL1Cycles[field][2] / predValues[k].flopsPerLup / (device.smCount * device.clock) * device.peakFP32() for k in keys]

            ax.bar(xpos - 0.29, fieldMem, width=0.26, bottom = [b[0] for b in bottoms], color=colors[hatch], label=field, zorder=3 )
            ax.bar(xpos - 0.0, fieldL2, width=0.26, bottom = [b[1] for b in bottoms],  color=colors[hatch], zorder=3 )
            ax.bar(xpos + 0.29, fieldL1, width=0.26, bottom = [b[2] for b in bottoms],  color=colors[hatch], zorder=3)
            bottoms = [[bottoms[i][0] + fieldMem[i], bottoms[i][1] + fieldL2[i], bottoms[i][2] + fieldL1[i]] for i in range(len(bottoms))]
            hatch += 1

        # Black outline around each complete stack.
        ax.bar(xpos - 0.29, [b[0] for b in bottoms], width=0.26, color="None", edgecolor="k", linewidth=2, zorder=2 )
        ax.bar(xpos - 0.0,  [b[1] for b in bottoms], width=0.26, color="None", edgecolor="k", linewidth=2, zorder=2 )
        ax.bar(xpos + 0.29, [b[2] for b in bottoms], width=0.26, color="None", edgecolor="k", linewidth=2, zorder=2)

        # Measured balances as black dots on top of the predicted stacks.
        ax.plot(xpos - 0.29, [ (perFlop(measValues[k], "memLoad") + perFlop(measValues[k], "memStore")) / device.memBW * device.peakFP32() for k in keys], "o", markersize=3, color="black", zorder=5)
        ax.plot(xpos - 0.0, [ (perFlop(measValues[k], "L2Load") + perFlop(measValues[k], "L2Store")) / device.L2BW * device.peakFP32() for k in keys], "o", markersize=3, color="black" , zorder=5)
        # The measured L1 marker is skipped on MI210.
        if device.name != "MI210":
            # Normalized with the device's own SM count, like the predicted L1 bars above
            # (previously a hard-coded 108, which is only right for a 108-SM device).
            ax.plot(xpos + 0.29, [ perFlop(measValues[k], "L1TagWavefronts") / (device.smCount * device.clock) * device.peakFP32() for k in keys], "o", markersize=3, color="black", label="measured balance" , zorder=5)

        # Measured performance as a wide gray background bar (peak / measured, so 1 = peak).
        ax.bar(xpos, [ device.peakFP32() / measValues[k].tflops / 1000   for k in keys], width=0.89, label="measured performance",  color="#DDDDDD", edgecolor="#AAAAAA" , zorder=1)

        # "DRAM / L2 / L1" captions inside the bars, only where all three bars are tall enough.
        top = ax.get_ylim()[1]
        for i in range(len(keys)):
            if bottoms[i][0] / top > 0.12 and  bottoms[i][1] / top > 0.05 and bottoms[i][2] / top > 0.05:
                ax.text(-0.28 + i, 0.024 * top, "DRAM", color="w", rotation=90, horizontalalignment="center")
                ax.text(0.01 + i, 0.024 * top, "L2", color="w", rotation=90, horizontalalignment="center")
                ax.text(0.30 + i, 0.024 * top, "L1", color="w", rotation=90, horizontalalignment="center")

        # Ratio 1: code balance equals machine balance.
        ax.axhline(1, color="black")

        ax.set_xticks(xpos)
        ax.set_xticklabels([ "{3}x{4}\n{0},{1},{2}".format(*k) for k in keys],  rotation = 30)

        ax.set_xlim([-0.6,len(keys)-0.4])
        ax.set_ylim([0, ax.get_ylim()[1]])

    def r2f(r):
        """Convert a balance ratio into percent of FP32 peak."""
        return 100 / r

    def ticksWithOne(axis):
        # get_yticks() returns a NumPy array, so `+ [1]` would shift every tick by one
        # instead of adding a tick; union1d inserts the tick at 1 (sorted, no duplicate).
        return np.union1d(axis.get_yticks(), [1.0])

    def percentLabels(ticks):
        # Ratios below 1 (in particular 0) have no meaningful "percent of peak".
        return [ "{:.0f}%".format(r2f(l)) if l >= 1 else "" for l in ticks]

    def separatorsAndGrid(axis, **gridArgs):
        # Minor x ticks halfway between the bar groups act as vertical separators.
        axis.minorticks_on()
        axis.xaxis.set_minor_locator(FixedLocator( np.array(axis.get_xticks())+0.5 ))
        axis.grid(True, "major","y", **gridArgs)
        axis.grid(True, "minor","x", **gridArgs)
        axis.tick_params(axis='y', which='minor', left=False)
        axis.set_axisbelow(True)

    if secondKeys is None:
        fig, ax = plt.subplots()

        plotKeys(ax, firstKeys)
        yticks = ticksWithOne(ax)
        ax.set_ylabel("Ratio: Code Balance / Machine Balance")
        ax.set_yticks(yticks)

        # Right-hand axis shows the same ticks as percent of FP32 peak.
        ax2 = ax.twinx()
        ax2.set_ylim(ax.get_ylim())
        ax2.set_ylabel("Percentage of FP32 Peak")
        ax2.set_yticks(yticks)
        ax2.set_yticklabels(percentLabels(yticks))

        separatorsAndGrid(ax, zorder=-30)

        ax.legend()
    else:
        # One panel per provided key list; panel widths are proportional to the number of keys.
        panelKeys = [k for k in (firstKeys, secondKeys, thirdKeys) if k is not None]
        fig, ax = plt.subplots(1, len(panelKeys),  gridspec_kw={'width_ratios': [len(k) for k in panelKeys]})
        for panel, keys in zip(ax, panelKeys):
            plotKeys(panel, keys)

        # Shared y range: 90% of the largest automatic upper limit.
        ylim = max(panel.get_ylim()[1] for panel in ax) * 0.9
        for panel in ax:
            panel.set_ylim(0, ylim)
        for panel in ax:
            panel.set_yticks(ticksWithOne(panel))

        for panel in ax:
            # Every panel uses its own x ticks for the separators (the third panel
            # previously reused the second panel's ticks).
            separatorsAndGrid(panel)

        ax[0].set_ylabel("Ratio: Code Balance / Machine Balance")

        # Percent-of-peak axis on the right of every panel, labelled only on the last one.
        for panel in ax:
            ax2 = panel.twinx()
            ax2.set_ylim(panel.get_ylim())
            ax2.set_yticks(panel.get_yticks())
            ax2.set_yticklabels(percentLabels(ax2.get_yticks()))
        ax2.set_ylabel("Percentage of FP32 Peak")

    fig.tight_layout()

    return fig,ax
    
    

In [None]:
# Sweep over key entry 4 for 256x1 thread blocks with key entry 3 fixed to 1.
# NOTE(review): the axis label says "y per thread" while the file name says "xperthread" and the
# database column behind k[4] is "x_per_thread" - confirm which name is intended.
keys = []
for k in measValues.keys():
    if k[0] == 256 and k[1] == 1 and k[3] == 1 and k[4] < 64:
        keys.append(k)

fig, ax = plotLimiterBars(keys)
ax.set_xticklabels([str(k[4]) for k in keys], rotation=0)
ax.set_xlabel("y per thread")
fig.tight_layout(pad=0)
for extension in (".svg", ".pdf"):
    plt.savefig("./plots/limbars_xperthread_" + device.name + extension)



#plt.show()
#plt.close()

In [None]:
# Sweep over key entry 3 for 256x1 thread blocks with key entry 4 fixed to 1.
# NOTE(review): the axis label says "c_out per thread" while the file name says "cinperthread" and
# the database column behind k[3] is "c_in_per_thread" - confirm which name is intended.
keys = []
for k in measValues.keys():
    if k[0] == 256 and k[1] == 1 and k[4] == 1 and k[3] < 64:
        keys.append(k)

fig, ax = plotLimiterBars(keys)
ax.set_xticklabels([str(k[3]) for k in keys], rotation=0)
ax.set_xlabel("c_out per thread")
fig.tight_layout(pad=0)
for extension in (".svg", ".pdf"):
    plt.savefig("./plots/limbars_cinperthread_" + device.name + extension)

In [None]:
# Three-panel limiter figure for the currently selected `device`:
#   panel 0: different thread block shapes at per-thread factors (2, 1), plus a 16x16x1 block at (1, 2)
#   panel 1: 256x1x1 blocks, sweeping the last key entry from 1 to 32
#   panel 2: 256x1x1 blocks, sweeping the fourth key entry from 1 to 32, plus the combined (8, 8) case
# Every listed key must exist in measValues/predValues, otherwise plotLimiterBars raises KeyError.
keysBoth =   [(16, 16, 1, 1, 2)] +[ (*k, 2, 1) for k in [(32, 8, 1), (32, 4, 2), (32, 2, 4), (32, 1, 8)]]
print(keysBoth)
keysBlocks1 = [(256,1,1, *k) for k in [(1,1), (1,2), (1,4), (1,8), (1,16), (1,32)]]
keysBlocks2 = [(256,1,1, *k) for k in [(1,1), (2,1), (4,1), (8,1), (16,1), (32,1), (8,8)]]

fig,ax = plotLimiterBars( keysBoth, keysBlocks1, keysBlocks2 )
fig.set_size_inches(14, 5)

#ax.set_ylim(ax.get_ylim()[0], ax.get_ylim()[1] - 4)
#ax.set_xticklabels([ str(k[0:3]) for k in keys],  rotation = 0)
#ax.set_xlabel("c_out per thread")
ax[0].legend()

# Shared style of the annotation boxes.
bbox_props = dict(boxstyle="round", fc="#649903", alpha=0.8, ec="k", lw=1, pad=0.4)
font_props = dict(size = 12, color = "w", fontweight = "bold")

# The annotation positions below are in data coordinates and were placed by hand;
# they may need adjusting when the y range of the panels changes.
ax[0].text(3, 4.5, "thread block\n shape", ha="center", va="center", rotation=0,
            bbox=bbox_props, **font_props)


# From here on the boxes are drawn as right-pointing arrows to indicate the sweep direction.
bbox_props["boxstyle"] = "rarrow"
ax[1].text(2.8, 6, "Y_PER_THREAD", ha="center", va="center", rotation=0,
            bbox=bbox_props, **font_props)


ax[2].text(3.5, 5, "C_OUT_PER_THREAD", ha="center", va="center", rotation=0,
            bbox=bbox_props, **font_props)

# Rotated arrow above the last bar group of panel 2, i.e. the (8, 8) configuration.
ax[2].text(6, 3, "both", ha="center", va="center", rotation=-90,
            bbox=bbox_props, **font_props)

# Device name in axes coordinates (upper right of the last panel).
ax[2].text(0.7, 0.9, device.getDisplayName(), fontsize=24, horizontalalignment='left', fontweight="bold",
     verticalalignment='center', color="k", transform = ax[2].transAxes, backgroundcolor="w")




fig.tight_layout(pad=0)
plt.savefig("./plots/limbars_both_" + device.name +  ".svg")
plt.savefig("./plots/limbars_both_" + device.name +  ".pdf")

In [None]:
# Measured vs. predicted DRAM volume (load + store) per Flop, scaled by 1000 to mB/Flop,
# for all configurations with k[3] == k[4].
# Tuple layout handed to volumeScatterPlot: (block shape, measured, predicted, label, alternative prediction).
# NOTE(review): the alternative prediction divides by `.lc.flopsPerLup`, while plotLimiterBars uses
# `.flopsPerLup` directly on the same objects - confirm both attributes exist and agree.
fig, ax = volumeScatterPlot([((k[0], k[1], k[2]),
                              (perFlop(measValues[k], "memLoad") + perFlop(measValues[k], "memStore"))*1000,
                              (perFlop(predValues[k], "memLoadV3") + perFlop(predValues[k], "memStoreV2"))*1000,
                              "{}x{}".format(k[3], k[4]),
                              (predValues[k].memLoadV2 + predValues[k].memStoreV1) / predValues[k].lc.flopsPerLup *1000)
                             for k in measValues if k[3] == k[4]],
                            "DRAM Volume " + str(device.displayName), unit="mB/Flop", lims=(5,800))

ax.set_xlabel("Measured Volume, mB/Flop")
ax.set_ylabel("Predicted Volume, mB/Flop")
fig.tight_layout(pad=0)
# The ./plots directory must already exist.
fig.savefig("./plots/dram_volume_" + device.displayName + ".pdf")


In [None]:
# Measured vs. predicted DRAM store volume (mB/Flop) for the V1 store model,
# restricted to configurations with k[3] == k[4].
volumeScatterPlot([(k[0:3],
                    perFlop(measValues[k], "memStore")*1000,
                    perFlop(predValues[k], "memStoreV1")*1000,
                    str(k[3]) + "x" + str(k[4])) for k in measValues if k[3] == k[4]],
                  "Memory Store Volumes V1 " + str(device.displayName), unit="mB/Flop")

# Same comparison for the V2 store model; the V1 prediction is passed as the fifth tuple entry.
# NOTE(review): how volumeScatterPlot renders the fifth entry is not visible here - presumably as a
# secondary/reference prediction.
volumeScatterPlot([(k[0:3],
                    perFlop(measValues[k], "memStore")*1000,
                    perFlop(predValues[k], "memStoreV2")*1000,
                    str(k[3]) + "x" + str(k[4]),
                    perFlop(predValues[k], "memStoreV1")*1000 ) for k in measValues if k[3] == k[4]],
                 "Memory Store Volumes V2 " + str(device.displayName), unit="mB/Flop")

In [None]:
# Measured vs. predicted L2 volume (load + store) per Flop in mB/Flop for configurations with
# k[3] == k[4]. The fifth tuple entry is the alternative prediction L2LoadV1 - L2LoadOverlap + L2Store.
fig, ax = volumeScatterPlot([(k[0:3],
                             (perFlop(measValues[k], "L2Load_tex") + perFlop(measValues[k], "L2Store"))*1000,
                              (perFlop(predValues[k], "L2LoadV2")  + perFlop(predValues[k], "L2Store"))*1000,
                              str(k[3]) + "x" + str(k[4]),
                              (perFlop(predValues[k], "L2LoadV1")  - perFlop(predValues[k], "L2LoadOverlap") + perFlop(predValues[k], "L2Store"))*1000) for k in measValues if k[3] == k[4]],
                            "L2 Cache Volume " + str(device.displayName), unit="mB/Flop", lims=(12,1100))

# The tuples are (key, measured, predicted, ...) exactly as in the DRAM and performance plots,
# which label x as measured and y as predicted; the labels here were swapped.
# NOTE(review): assumes volumeScatterPlot puts the second tuple entry on the x axis - confirm.
ax.set_xlabel("Measured Volume, mB/Flop")
ax.set_ylabel("Predicted Volume, mB/Flop")
fig.tight_layout(pad=0)
fig.savefig("plots/l2_cache_" + device.displayName + ".pdf")

# L2 store volume only.
volumeScatterPlot([(k[0:3],
                    perFlop(measValues[k], "L2Store")*1000,
                    perFlop(predValues[k], "L2Store")*1000,
                    str(k[3]) + "x" + str(k[4]) ) for k in measValues if k[3] == k[4]],
                  "Stencil L2 Store Volumes " + str(device.displayName), unit="mB/Flop")

In [None]:
#fig, ax = volumeScatterPlot([(k[0:3], perFlop(measValues[k], "L1DataPipeWavefronts")*1000, 1000*perFlop(predValues[k], "L1DataPipeCycles") + perFlop(predValues[k], "smL1Alloc") / 512, str(k[3]) + "x" + str(k[4])) for k in measValues if k[3] == k[4]], "L1 Data Pipe Cycles  " + str(device.displayName))
#fig, ax = volumeScatterPlot([(k[0:3], perFlop(measValues[k], "L1TagWavefronts")*1000, 1000*perFlop(predValues[k], "L1TagCycles") ,  str(k[3]) + "x" + str(k[4])) for k in measValues if k[3] == k[4]], "L1 Tage Wavefront Cycles  " + str(device.displayName))

# Measured vs. predicted L1 cycles per Flop for configurations with k[3] == k[4].
# Both sides take the larger of the tag-stage and data-pipe counts and scale it by 32.
fig, ax = volumeScatterPlot([(k[0:3],
                              32*max( perFlop(measValues[k], "L1DataPipeWavefronts"), perFlop(measValues[k], "L1TagWavefronts")),
                              32*max( perFlop(predValues[k], "L1TagCycles"), perFlop(predValues[k],"L1DataPipeCycles")),
                              str(k[3]) + "x" + str(k[4]))
                             for k in measValues if k[3] == k[4]],
                            "L1 Cache Cycles  " + str(device.displayName), unit="Cy/Flop")
# The tuples are (key, measured, predicted, ...) exactly as in the DRAM and performance plots,
# which label x as measured and y as predicted; the labels here were swapped and used two
# different unit spellings.
# NOTE(review): assumes volumeScatterPlot puts the second tuple entry on the x axis - confirm.
ax.set_xlabel("Measured L1 cycles, cy/Flop")
ax.set_ylabel("Predicted L1 cycles, cy/Flop")
fig.tight_layout(pad=0)
fig.savefig("./plots/L1_cycles_" + device.displayName + ".pdf")

# Print the configuration with the lowest measured L1 count:
# key, measured tag / data-pipe wavefronts : predicted tag cycles, data-pipe cycles,
# and data-pipe cycles plus smL1Alloc / 512.
for k in sorted(list(predValues), key = lambda p : max( measValues[p].L1TagWavefronts, measValues[p].L1DataPipeWavefronts))[:1]:
    print("{:17s}  {:7.2f} {:7.2f} : {:7.2f} {:7.2f} {:7.2f}".format( str(k), measValues[k].L1TagWavefronts, measValues[k].L1DataPipeWavefronts, predValues[k].L1TagCycles, predValues[k].L1DataPipeCycles, predValues[k].L1DataPipeCycles + predValues[k].smL1Alloc / 512 ))

In [None]:
# Names of the performance limiters; limV3 / limPheno index into this list.
categories = ["flops", "L1", "L2", "DRAM" ]
# `r` is only echoed by the top-configuration listing further down.
r = 0
keys = measValues.keys()


# Measured vs. predicted performance (V3 model), colored by the predicted limiter.
# The phenomenological prediction is passed as the fifth tuple entry.
fig, ax = volumeScatterPlot([( k[0:3],
                              measValues[k].tflops,
                              predValues[k].perfTFlopsV3,
                              categories[predValues[k].limV3],
                              predValues[k].perfTFlopsPheno)
                             for k in keys if k[3] == k[4]],
                            "Performance " + str(device.displayName),
                            categories=categories,
                           lims=(1, 130))

# The plotted quantities are `.tflops` / `perfTFlops*`, i.e. TFlop/s (the axes were labelled GFlop/s).
ax.set_xlabel("Measured Performance, TFlop/s")
ax.set_ylabel("Predicted Performance, TFlop/s")
fig.tight_layout(pad=0)
plt.savefig("./plots/performance_" + device.displayName +  ".pdf")


# Same plot for the phenomenological model, with the V3 prediction as the fifth tuple entry.
fig, ax = performanceScatterPlot([(k[0:3],
                                   measValues[k].tflops,
                                   predValues[k].perfTFlopsPheno,
                                   categories[predValues[k].limPheno],
                                   predValues[k].perfTFlopsV3) for k in keys  if k[3] == k[4]],
                                 "Performance Phenomenological " + str(device.displayName),
                                 categories=categories,
                                lims=(1,130))
ax.set_xlabel("Measured Performance, TFlop/s")
ax.set_ylabel("Predicted Performance, TFlop/s")
fig.tight_layout(pad=0)
plt.savefig("./plots/performance_pheno_" + device.displayName +  ".pdf")


In [None]:
# For each selected device, list the configurations ranked best by the prediction and by the
# measurement, and show where each side's winner ends up in the other ranking.
for d in devices[:1]:

    # Values of the device being iterated. The ranking previously read the notebook-wide
    # `predValues` / `measValues`, which belong to the separately selected `device`, so keys of
    # device `d` were looked up in another device's data.
    predKeys = devicePredValues[d]
    measKeys = deviceMeasValues[d]

    # Rank by predicted overall performance; L2 and DRAM limits act as tie breakers.
    predTopKeys = sorted( predKeys, key = lambda k : -(predKeys[k].perfTFlopsV3 + predKeys[k].perfTFlopsL2V2*0.01 + predKeys[k].perfTFlopsMemV3*0.001))
    # Rank by measured performance.
    measTopKeys = sorted( measKeys, key = lambda k : -measKeys[k].tflops )

    print()
    print("range = ", r)
    print("Top Preds:")

    num = 3
    # "**" marks the configuration that is also the measured optimum.
    for p in predTopKeys[:num]:
        print("{} {:3} {:5.4f} {:5.1f} {!r:12} {:5.3f}".format("**" if p == measTopKeys[0] else "  ", predTopKeys.index(p), predKeys[p].perfV3, predKeys[p].perfTFlopsV3, p, predKeys[p].memLoadV3))
        print(predKeys[p])
    # If the measured optimum is not among the top predictions, show its predicted rank as well.
    if measTopKeys[0] not in predTopKeys[:num]:
        p = measTopKeys[0]
        print("{} {:3} {:5.4f} {:5.1f} {!r:12} {:5.3f}".format("**" if p == measTopKeys[0] else "  ", predTopKeys.index(p), predKeys[p].perfV3, predKeys[p].perfTFlopsV3, p,  predKeys[p].memLoadV3))
        print(predKeys[p])
    print("Top Meas")

    # "**" marks the configuration that is also the predicted optimum.
    for p in measTopKeys[:num]:
        print("{} {:3} {:5.4f} {:5.1f} {!r:12} {:5.3f}".format("**" if p == predTopKeys[0] else "  ", measTopKeys.index(p), measKeys[p].lups, measKeys[p].tflops, p,  measKeys[p].memLoad))
        print(measKeys[p])

    # If the predicted optimum is not among the top measurements, show its measured rank as well.
    if predTopKeys[0] not in measTopKeys[:num]:
        p = predTopKeys[0]
        print("{} {:3} {:5.4f} {:5.1f} {!r:12} {:5.3f} ".format("**" if p == predTopKeys[0] else "  ", measTopKeys.index(p), measKeys[p].lups, measKeys[p].tflops, p, measKeys[p].memLoad))
        print(measKeys[p])
        

In [None]:
fig, ax = plt.subplots(figsize=(5, 5), dpi=200)

# One list of arithmetic intensities per memory level (DRAM, L2, L1), in measValues key order.
balances = []

pMaxMax = 0

def plotLimiter(q1, q2, color):
    """Draw one roofline (slope + plateau) and the measured points for one memory level.

    q1: names of the measured volume properties that are summed for this level.
    q2: name of the device attribute holding this level's bandwidth.
    color: color of the slope and the scatter points.

    Uses the notebook globals `device` and `measValues`, draws into the current axes and
    returns the arithmetic intensity (1 / volume per Flop) of every measurement.
    """
    peakTFlops = device.peakFP32() / 1000
    # NOTE(review): the 0.8 presumably derates the ridge point relative to the nominal bandwidth - confirm.
    machineBalance = device.peakFP32() / getattr(device, q2) *0.8
    codeBalance = [ 1 / sum([perFlop(measValues[k], m) for m in q1]) for k in measValues]
    print(machineBalance)

    # Plateau from the ridge point out to the highest measured intensity.
    p2 = max(codeBalance)
    plt.plot( (  machineBalance, p2), (peakTFlops, peakTFlops), color="gray" )
    # Bandwidth slope from the origin up to the ridge point.
    plt.plot(  (0, machineBalance), (0, peakTFlops), color=color , solid_capstyle="round" )
    # A function attribute counts the calls so that each level gets its own marker shape.
    plt.scatter( codeBalance, [measValues[k].tflops for k in measValues], color=color, marker=["v", "s", "^"][plotLimiter.marker], alpha=0.6 )
    plotLimiter.marker += 1
    return codeBalance

plotLimiter.marker = 0

balances.append( plotLimiter( ["memLoad", "memStore"], "memBW", "C0") )
balances.append( plotLimiter(["L2Load_tex", "L2Store"], "L2BW", "C1") )


# L1 roofline, expressed per cycle with a peak rate of 64 per SM and cycle.
peakRate = device.smCount * device.clock * 64
machineBalance = device.peakFP32() / peakRate

# NOTE(review): unlike the DRAM and L2 levels, the L1 intensity is taken from the prediction
# (predValues), not from the measurement - confirm this is intended.
codeBalance = [1 /  perFlop(predValues[k], "L1Cycles") / 64 for k in measValues]
p1 = min([measValues[k].tflops for k in measValues]) / peakRate * 1000
p2 = max(codeBalance)

print(p1)

# The L1 plateau is drawn up to a fixed intensity of 20 rather than up to p2.
plt.plot( (  machineBalance, 20), (device.peakFP32() / 1000, device.peakFP32() / 1000), color="gray", solid_capstyle="butt" )
plt.plot(  (0, machineBalance), (0 * peakRate  / 1000, device.peakFP32() / 1000), color="C2" , solid_capstyle="round"  )
plt.scatter( codeBalance, [measValues[k].tflops for k in measValues], color="C2", alpha=0.6  )

balances.append( codeBalance )


# Connect the three points (DRAM, L2, L1) that belong to the same configuration with a faint
# horizontal line. The key list is built once instead of four times per iteration.
rooflineKeys = list(measValues.keys())
for k in range(len(balances[0])):
    cfg = rooflineKeys[k]
    perf = [ measValues[cfg].tflops ] * 2
    plt.plot( (balances[0][k], balances[1][k]), perf, color=getColor(cfg[0:3]), zorder=-1, alpha=0.2  )
    plt.plot( (balances[1][k], balances[2][k]), perf, color=getColor(cfg[0:3]), zorder=-1, alpha=0.2  )

ax.set_xscale("log")
ax.set_yscale("log")

ax.set_ylim([ 0.1, 26])
ax.set_xlim([ 0.05, 206])

ax.set_xlabel("arithmetic Intensity, Flop/B or Flop/cycle")
# The y values are `.tflops` and peakFP32() / 1000, i.e. TFlop/s (the axis was labelled GFlop/s).
ax.set_ylabel("Performance, TFlop/s")

fig.tight_layout(pad=0)
fig.savefig("plots/bconv_roofline_1x1_" + device.name + ".pdf")


In [None]:
from math import log, exp, log1p


def _softplus(z):
    """Return log(1 + exp(z)) without overflowing for large ``z``."""
    # For z > 0, log(1 + exp(z)) == z + log(1 + exp(-z)), so exp() only ever sees
    # non-positive arguments and cannot raise OverflowError.
    return max(z, 0.0) + log1p(exp(-abs(z)))


def L2Latency(wdev, payload, clock):
    """Empirical L2 latency model as a function of the load ``wdev * payload``.

    The load is scaled by 1 / (8 * 32) and fed through a smooth ramp
    (175 + 440 * softplus), with a lower bound of 237. ``clock`` is accepted for
    interface compatibility but not used by the current fit.
    """
    x = wdev * payload / 8 / 32
    return max(237, 175 + 440 * _softplus((x - 1000) * 0.0002))


def memLatency(wdev, payload, clock):
    """Empirical DRAM latency model as a function of the load ``wdev * payload``.

    Same shape as ``L2Latency`` with different fit constants (1300 * softplus, lower
    bound 210). ``clock`` is accepted for interface compatibility but not used by
    the current fit.
    """
    x = wdev * payload / 8 / 32
    return max(210, 0 + 1300 * _softplus((x - 1000) * 0.00017))


# Print, for scale factors 1..107, the fitted DRAM latency next to a simple
# bandwidth-based estimate (same load, 1600e9 and 1.41e9 as constants, lower bound 210).
# NOTE(review): 2048, 100, 1600e9 and 1.41e9 look like threads per SM, bytes per thread,
# B/s and Hz of one specific GPU - confirm.
for scale in range(1, 108):
    load = scale * 2048
    fitted = memLatency(load, 100, 1.41)
    estimate = max(210, load * 100 / 1600e9 * 1.41e9)
    print(fitted)
    print(estimate)
    print()


In [None]:
from random import *
from math import sqrt

# Experimental clustering of the configurations in the (predicted, measured) performance plane.
# Cluster centers are always actual data points (medoids); the result depends on the unseeded
# random number generator and therefore changes from run to run.
# NOTE(review): nclusters is derived from all keys, but only keys with k[3] == k[4] are clustered;
# with fewer filtered keys than nclusters the initial center selection raises IndexError.
nclusters = min(2, len(measValues.keys() ) // 2)

keys = [k for k in measValues.keys() if k[3] == k[4]]
shuffle(keys)


clusters = [[] for n in range(nclusters)]
# Initial centers: the first keys of the shuffled list.
clusterCenters = [keys[i] for i in range(nclusters)]

# Coordinates of every configuration: dim1 from the prediction, dim2 from the measurement.
dim1 = { key : perFlop(predValues[key], "perfTFlopsV3") for key in keys}
dim2 = { key : perFlop(measValues[key], "tflops") for key in keys}



def distance(key1, key2):
    # Euclidean distance between two configurations in the (dim1, dim2) plane.
    return sqrt( (dim1[key1] - dim1[key2])**2 +
                 (dim2[key1] - dim2[key2]) **2 ) 
    

# Fixed number of refinement rounds.
for i in range(0,20):

    # Re-seed a center with a random key if its cluster is tiny (at most 4 members) or, with a
    # probability that shrinks as i grows, at random. This also replaces a -1 center left over
    # from an empty cluster before it can be used in distance().
    for c in range(nclusters):
        print(len(clusters[c]), end=" ")
        if len(clusters[c]) <= 4 or 3*i - randint(0, len(clusters[c])) < 0:
            clusterCenters[c] = choice(keys)
            print(".")
    print()
    # Assignment step: every key goes to the cluster with the nearest center.
    clusters = [[] for n in range(nclusters)]    
    for key in keys:
        # -1 means "no candidate yet".
        minDistance = -1
        minCluster = -1
        for c in range(nclusters):            
            d = distance(key, clusterCenters[c])

            if d < minDistance or minDistance < 0:
                minDistance = d
                minCluster = c
        clusters[minCluster].append(key)
            
    # Update step: the new center is the member with the smallest sum of squared distances to
    # all other members. The shuffle randomizes which member wins a tie.
    for c in range(nclusters):
        shuffle(clusters[c])
        lowestDistanceSum = -1
        lowestKey = -1
        for k1 in clusters[c]:
            distanceSum = 0
            for k2 in clusters[c]:
                distanceSum += distance(k1, k2)**2
            if distanceSum < lowestDistanceSum or lowestDistanceSum < 0:
                lowestDistanceSum = distanceSum
                lowestKey = k1
        # Always true for c in range(nclusters); an empty cluster ends up with center -1.
        if c != -1:
            clusterCenters[c] = lowestKey
        
            
          
    print(clusterCenters)
      
    # Map every key to the index of its cluster (rebuilt every round; the last round is plotted).
    keyClusters = {}
    for k in keys:
        for c in range(nclusters):
            if k in clusters[c]:
                keyClusters[k] = c


# Scatter plot of all clustered keys; the cluster index is encoded in the first entry of a
# dummy "block shape" and passed again as the label.
fig, ax = volumeScatterPlot([ ([keyClusters[k]+1, 2, 8], dim1[k], dim2[k], keyClusters[k]) for k in keys], "test_" + str(device.name))    

# Mark the cluster centers with a "+" and annotate them with their key
# (small random vertical offset so that overlapping labels stay readable).
for c in range(nclusters):
#    ax.scatter( [dim1[k] for k in clusters[c] ], [dim2[k] for k in clusters[c]])
    if clusterCenters[c] != -1:
        ax.plot( dim1[clusterCenters[c]],  dim2[clusterCenters[c]], "+", markersize=15, color="black" )           
        ax.text(dim1[clusterCenters[c]] - 0.01,  dim2[clusterCenters[c]] + (random())/100 , str( clusterCenters[c]), fontsize=7)
plt.show()