In [None]:
# Set the notebook container width to 90% of the browser window.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

# Extend the module search path with project directories; all paths are
# relative to the directory this notebook is run from.
import sys 
sys.path.append('../../../warpspeed/')
sys.path.append('../../../measutils/')
sys.path.append('../../../pystencils_notebooks')
sys.path.append('..')

# NOTE(review): in mode 1 the autoreload extension only reloads modules that
# were registered with %aimport, and no %aimport is visible in this notebook.
# Confirm whether `%autoreload 2` (reload everything) was intended.
%load_ext autoreload
%autoreload 1



In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from meas_db import MeasDB

from measured_metrics import MeasuredMetrics, ResultComparer
from predict_metrics import *
from plot_utils import *
import genconv
import json

In [None]:
# Device description used throughout the notebook: its name filters the
# measurement query, and its attributes (memBW, L2BW, clock, smCount,
# peakFP32()) feed the limiter and roofline plots.
device = DeviceMI210()

In [None]:
# Handle to the measurement database queried by the cells below
# (path is relative to the notebook's working directory).
meas_db = MeasDB("../bconv.db")
# Left disabled on purpose; presumably wipes the stored measurements - verify
# against MeasDB before enabling.
#meas_db.clearDB()

In [None]:
# Problem size: width, height and batch_size are used as equality filters in
# the measurement query below.
width = 4096 
height = 4096
# NOTE(review): the variables input_channels and output_channels are not read
# by any cell visible here (the query only uses the string "input_channels" as
# a range-key column name) - confirm whether they are still needed.
input_channels = 64
output_channels = 64
batch_size = 1

In [None]:
# Both dictionaries are keyed by the range-key tuple returned by the database
# (row[0]): measValues holds the measured metrics, predValues the derived
# (predicted) metrics for the same configuration.
measValues = {}
predValues = {}

print(device.name)

# Equality filters for the query; the device name is wrapped in double quotes
# before it is handed to the database layer.
queryFilter = {
    "width": width,
    "height": height,
    "batch_size": batch_size,
    "device": '"' + str(device.name) + '"',
}
# Columns that together form the key of every returned row.
rangeColumns = ("input_channels", "xblock", "yblock", "zblock", "x_per_thread", "c_in_per_thread")

results = meas_db.getRangeKeys(queryFilter, rangeColumns)

for row in results:
    print(row[1])

    # row[0] is the configuration key, row[4] the measurement object;
    # row[2] and row[3] are passed through to DerivedMetrics unchanged.
    key, meas = row[0], row[4]
    measValues[key] = meas
    print(key, meas)

    metrics = DerivedMetrics(row[2], row[3], device, meas)
    predValues[key] = metrics
    rc = ResultComparer(meas, metrics)







In [None]:
# Per-configuration quantities, all in measValues iteration order.
# threadCount is the product of the three launch-grid dimensions.
threadCount = np.array([predValues[k].lc.grid[0] * predValues[k].lc.grid[1] * predValues[k].lc.grid[2] for k in measValues.keys()])
energyPerThread = np.array([measValues[k].power * (predValues[k].lc.lupCount / measValues[k].lups / 1e9) for k in measValues.keys()]) / threadCount
energy = [measValues[k].power * measValues[k].time for k in measValues.keys()]
power = [measValues[k].power for k in measValues.keys()]
flops = [measValues[k].tflops for k in measValues.keys()]
clocks = [measValues[k].clock for k in measValues.keys()]

# Every scatter plot is coloured by the measured DRAM load volume per flop,
# so compute it once instead of once per figure.
memLoadPerFlop = [perFlop(measValues[k], "memLoad") for k in measValues.keys()]

# The original cell drew "flops vs power" twice with identical data; the
# repeated figure is dropped and the "power cs clocks" title typo is fixed.
for xs, ys, title in (
    (flops, power, "flops vs power"),
    (power, clocks, "power vs clocks"),
    (flops, clocks, "flops vs clock"),
):
    plt.scatter(xs, ys, c=memLoadPerFlop)
    plt.title(title)
    plt.show()



In [None]:
 from scipy.optimize import curve_fit



def func(X, energyPerInstruction, energyPerL1Cycle, energyPerL2Byte, energyPerDramByte, perThread, basePower):
    """Linear power model: a constant base plus one weighted term per rate.

    Parameters
    ----------
    X : sequence of five items
        ``(instructions, L1Cycles, L2Bytes, dramBytes, threads)`` in exactly
        this order; each item may be a scalar or an array.
    energyPerInstruction, energyPerL1Cycle, energyPerL2Byte, energyPerDramByte, perThread
        Coefficients multiplied with the matching entry of ``X``.
    basePower
        Constant offset.

    Returns
    -------
    The modelled value, with the same shape as the entries of ``X``.
    """
    instructions, L1Cycles, L2Bytes, dramBytes, threads = X

    # The terms are accumulated in the same order as the original expression
    # so floating-point results stay bit-identical.
    total = basePower
    total = total + energyPerInstruction * instructions
    total = total + energyPerL2Byte * L2Bytes
    total = total + energyPerL1Cycle * L1Cycles
    total = total + threads * perThread
    total = total + energyPerDramByte * dramBytes
    return total

def funcBasePower(X, basePower, perThread):  
    """Same linear model as ``func`` with only ``basePower`` and ``perThread`` as parameters.

    ``X`` is unpacked as ``(instructions, L1Cycles, L2Bytes, dramBytes, threads)``.

    NOTE(review): energyPerInstruction, energyPerL2Byte, energyPerL1Cycle and
    energyPerDramByte are not parameters here, so they are looked up as
    globals at call time. No visible cell defines them, and no visible cell
    calls this function; calling it as-is would raise NameError unless those
    names have been assigned elsewhere - confirm the intended source (e.g. the
    ``popt`` of a previous fit).
    """
    instructions, L1Cycles, L2Bytes, dramBytes, threads = X
    return  (basePower + 
            energyPerInstruction * instructions +
            energyPerL2Byte * L2Bytes +
            energyPerL1Cycle * L1Cycles +
            threads * perThread + energyPerDramByte * dramBytes)



# Grid size scaled by 256 (presumably the threads per block - TODO confirm),
# one entry per configuration in measValues iteration order.
threadCount = np.array([256 * predValues[k].lc.grid[0] * predValues[k].lc.grid[1] * predValues[k].lc.grid[2] for k in measValues.keys()])

time = np.array([measValues[k].time for k in measValues.keys()])

# Per-thread counters scaled by the thread count and divided by the measured
# time, i.e. event rates that serve as regressors for the power model.
instructions = threadCount * np.array([measValues[k].valuInsts for k in measValues.keys()]) / time
dramBytes = threadCount * np.array([measValues[k].memLoad + measValues[k].memStore for k in measValues.keys()]) / time
L2Bytes = threadCount * np.array([measValues[k].L2Load + measValues[k].L2Store for k in measValues.keys()]) / time
L1Cycles = threadCount * np.array([measValues[k].L1DataPipeWavefronts for k in measValues.keys()]) / time
clock = np.array([measValues[k].clock for k in measValues.keys()])
threads = threadCount / time

# Single source of truth for the column order that func() unpacks:
# (instructions, L1Cycles, L2Bytes, dramBytes, threads).
# The original built the per-sample tuples with dramBytes and L2Bytes swapped,
# so the predictions applied the fitted L2 coefficient to DRAM traffic and
# vice versa, although the fit itself used the correct order.
fitInputs = (instructions, L1Cycles, L2Bytes, dramBytes, threads)
xdata = list(zip(*fitInputs))
ydata = [measValues[k].power for k in measValues.keys()]

popt, pcov = curve_fit(func, fitInputs, ydata)
print(popt)
print(pcov)

# Evaluate the fitted model sample by sample and compare against measured power.
predYdata = [func(X, *popt) for X in xdata]
volumeScatterPlot(list(zip([(k[0], k[4], k[5]+1) for k in measValues], ydata, predYdata)), "Energy Prediction " + str(device.name))
plt.show()






In [None]:

# Line colours for the DRAM, L2, L1 and instruction limits (in that order).
colors = ["#CC1343", "#349999", "#649903", "#FFAA11"]
# Maps the first key component (labelled "input/output channel count" on the
# x axis below) to the configuration key with the highest measured tflops.
sizes = dict()

for k in measValues.keys():
    if k[0] not in sizes:
        sizes[k[0]] = k
        continue
    if measValues[k].tflops > measValues[sizes[k[0]]].tflops:
        sizes[k[0]] = k

# Show the selected configuration and its measured/predicted comparison.
for s in sizes:
    print(sizes[s])
    print( ResultComparer(measValues[sizes[s]], predValues[sizes[s]]) )


fig, ax = plt.subplots()




#ax.plot( np.arange(len(sizes)), [measValues[sizes[s]].tflops for s in sizes], "-o")
#ax2.plot( np.arange(len(sizes)), [ (measValues[sizes[s]].memLoad + measValues[sizes[s]].memStore) / predValues[sizes[s]].lc.flops / device.memBW * device.peakFP32() for s in sizes], "-o")
#ax2.plot( np.arange(len(sizes)), [ (measValues[sizes[s]].L2Load + measValues[sizes[s]].L2Store) / predValues[sizes[s]].lc.flops / device.L2BW * device.peakFP32() for s in sizes], "-o")
# One line per limiter; the legend lists them in the order they are plotted.
# The L2, L1, instruction and FP limits are rescaled with the measured clock,
# the two DRAM limits are not.
ax.plot( sizes.keys(), [ predValues[sizes[s]].perfMemV4 * predValues[sizes[s]].lc.flops / 1000 for s in sizes], "-o", label="Limit: pred. DRAM", color=colors[0])
npkeys = np.array( list(sizes.keys()))
# Closed-form DRAM limit as a function of the channel count alone.
ax.plot(npkeys, device.memBW / 1000 * (npkeys * npkeys) * 9 * 2 / (npkeys+npkeys) / 4, "--", color=colors[0], label="Limit: min. DRAM", )
ax.plot( sizes.keys(), [ predValues[sizes[s]].perfL2V2/ device.clock * measValues[sizes[s]].clock/1000 * predValues[sizes[s]].lc.flops / 1000 for s in sizes], "-o", label="Limit: L2 cache", color=colors[1])
ax.plot( sizes.keys(), [ predValues[sizes[s]].perfL1 / device.clock * measValues[sizes[s]].clock/1000 * predValues[sizes[s]].lc.flops / 1000 for s in sizes], "-o", label="Limit: L1 cache", color=colors[2])
ax.plot( sizes.keys(), [  device.smCount * measValues[sizes[s]].clock / 1000 * 128 / 1000 /  measValues[sizes[s]].valuInsts * predValues[sizes[s]].lc.flops  for s in sizes], "x-", color=colors[3], label="Limit: Instructions")
ax.plot( sizes.keys(), [  device.peakFP32() / 1000 / 1000 /  device.clock * measValues[sizes[s]].clock for s in sizes], "-", linewidth=2, color="k", label="Limit: FP, clock adjusted")

# Measured performance of the best configuration per channel count.
ax.plot( sizes.keys(), [  measValues[sizes[s]].tflops for s in sizes], "-o", color="gray", label="performance")





ax.legend(loc="lower center")

#ax2.plot( np.arange(len(sizes)), [ perFlop(measValues[sizes[s]], "L2Load") for s in sizes], "-o")
#ax2.plot( np.arange(len(sizes)), [ perFlop(measValues[sizes[s]], "L1Wavefronts") for s in sizes], "-o")




# Log-scaled y axis with plain-number tick labels; one tick is placed at the
# device's FP32 peak (peakFP32() floored to a multiple of 100, divided by 1000).
ax.set_yscale("log")
ax.get_yaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
ax.set_yticks([3, 10, device.peakFP32() // 100 / 10, 30, 50, 100])
# Hide the minor ticks that the log scale would otherwise add.
ax.get_yaxis().set_tick_params(which='minor', size=0)
ax.get_yaxis().set_tick_params(which='minor', width=0)
ax.set_ylim([1.5, 160])

# Log-scaled x axis with ticks exactly at the channel counts present in the data.
ax.set_xscale("log")
ax.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
ax.set_xticks( list(sizes.keys()))
ax.get_xaxis().set_tick_params(which='minor', size=0)
ax.get_xaxis().set_tick_params(which='minor', width=0) 

ax.grid()
ax.set_xlabel("input/output channel count")
ax.set_ylabel("performance, TFlop/s")

# Device label, positioned in axes coordinates (lower right).
ax.text(0.8, 0.1, device.getDisplayName(), fontsize=18, horizontalalignment='left', fontweight="bold",
     verticalalignment='center', color="k", transform = ax.transAxes, backgroundcolor="w")


#ax.plot( [m[0]*4/9 for m in flops_mi210[5:]], [m[1] / 1000 for m in flops_mi210[5:]] , "-o")


# The ./autoplots directory must already exist; savefig does not create it.
fig.tight_layout()
plt.savefig("./autoplots/channels_" + device.name +  ".svg")
plt.savefig("./autoplots/channels_" + device.name +  ".pdf")




In [None]:

# Measured vs predicted DRAM load volume per flop, one figure per model
# version V1..V4. Each tuple is (k[1..3], measured, predicted, k[0]); from V2
# on, the previous version's prediction is appended as a fifth element.
volumeScatterPlot([((k[1], k[2], k[3]), perFlop(measValues[k], "memLoad"), perFlop(predValues[k], "memLoadV1"), k[0]) for k in measValues], "Memory Load Volumes V1 " + str(device.name))
volumeScatterPlot([((k[1], k[2], k[3]), perFlop(measValues[k], "memLoad"), perFlop(predValues[k], "memLoadV2"), k[0], perFlop(predValues[k],"memLoadV1")) for k in measValues], "Memory Load Volumes V2 " + str(device.name))
volumeScatterPlot([((k[1], k[2], k[3]), perFlop(measValues[k], "memLoad"), perFlop(predValues[k], "memLoadV3"), k[0], perFlop(predValues[k],"memLoadV2")) for k in measValues], "Memory Load Volumes V3 " + str(device.name))
volumeScatterPlot([((k[1], k[2], k[3]), perFlop(measValues[k], "memLoad"), perFlop(predValues[k], "memLoadV4"), k[0], perFlop(predValues[k],"memLoadV3")) for k in measValues], "Memory Load Volumes V4 " + str(device.name))


In [None]:
# Measured vs predicted DRAM store volume per flop for model versions V1 and
# V2; the V2 tuples carry the V1 prediction as an extra fifth element.
volumeScatterPlot([(k[1:4], perFlop(measValues[k], "memStore"), perFlop(predValues[k], "memStoreV1"), k[0]) for k in measValues], "Memory Store Volumes V1 " + str(device.name))
volumeScatterPlot([(k[1:4], perFlop(measValues[k], "memStore"), perFlop(predValues[k], "memStoreV2"), k[0], perFlop(predValues[k], "memStoreV1"), ) for k in measValues], "Memory Store Volumes V2 " + str(device.name))


In [None]:
# Measured vs predicted L2 traffic per flop. Loads compare the measured
# "L2Load_tex" metric against prediction versions V1 and V2 (V2 carries the V1
# prediction as a fifth element); stores have a single prediction.
volumeScatterPlot([(k[1:4], perFlop(measValues[k], "L2Load_tex"), perFlop(predValues[k], "L2LoadV1"), k[0]) for k in measValues], "Convolution L2 Load Volumes V1  " + str(device.name))
volumeScatterPlot([(k[1:4], perFlop(measValues[k], "L2Load_tex"), perFlop(predValues[k], "L2LoadV2"), k[0], perFlop(predValues[k], "L2LoadV1")) for k in measValues], "Convolution L2 Load Volumes V2 " + str(device.name))
volumeScatterPlot([(k[1:4], perFlop(measValues[k], "L2Store"), perFlop(predValues[k], "L2Store"), k[0] ) for k in measValues], "Convolution L2 Store Volumes " + str(device.name))

In [None]:
# Measured L1 wavefront counts vs predicted L1 cycles, per flop: data pipe,
# tag stage, and the larger of the two.
fig, ax = volumeScatterPlot([(k[1:4], perFlop(measValues[k], "L1DataPipeWavefronts"), perFlop(predValues[k], "L1DataPipeCycles"), k[0]) for k in measValues], "L1 Data Pipe Cycles  " + str(device.name))
fig, ax = volumeScatterPlot([(k[1:4], perFlop(measValues[k], "L1TagWavefronts"), perFlop(predValues[k], "L1TagCycles") , k[0]) for k in measValues], "L1 Tage Wavefront Cycles  " + str(device.name))
fig, ax = volumeScatterPlot([(k[1:4], max( perFlop(measValues[k], "L1DataPipeWavefronts"), perFlop(measValues[k], "L1TagWavefronts")),
                                      max( perFlop(predValues[k], "L1TagCycles"), perFlop(predValues[k],"L1DataPipeCycles")), k[0]) for k in measValues], "L1 Cache Cycles  " + str(device.name))
# Measured-only comparison: data-pipe wavefronts against tag wavefronts
# (raw values, not normalised per flop).
fig, ax = volumeScatterPlot([(k[1:4], measValues[k].L1DataPipeWavefronts, measValues[k].L1TagWavefronts  , k[0]) for k in measValues], "L1 Tag vs Datapipe Cycles  " + str(device.name))
# `ax` was rebound by every call above, so these labels only affect the last figure.
ax.set_xlabel("dataPipeWavefronts")
ax.set_ylabel("tagWavefronts")
# Tabulate the last 23 keys: measured tag/data-pipe wavefronts, then predicted
# tag/data-pipe cycles.
for k in list(measValues.keys())[-23:]:
    print("{:17s}  {:7.2f} {:7.2f} : {:7.2f} {:7.2f}".format( str(k), measValues[k].L1TagWavefronts, measValues[k].L1DataPipeWavefronts, predValues[k].L1TagCycles, predValues[k].L1DataPipeCycles))

In [None]:
# Names for the limiter index stored in predValues[k].limV1 / limV2 / limV3 / limPheno.
categories = ["flops", "L1", "L2", "DRAM" ]
# Only used to build the "R<r>" prefix of some titles below.
r = 0
keys = measValues.keys()
# Measured vs predicted performance for successive model versions; from V2 on,
# the previous version's prediction is passed as a fifth tuple element.
volumeScatterPlot([((k[2]*4, k[4]*2, k[5]), measValues[k].tflops, predValues[k].perfTFlopsV1, categories[predValues[k].limV1]) for k in keys], " Extended Roofline V1,  " + str(device.name))

volumeScatterPlot([((k[2]*4, k[4]*2, k[5]), measValues[k].tflops, predValues[k].perfTFlopsV2, categories[predValues[k].limV2], predValues[k].perfTFlopsV1) for k in keys], "R  Extended Roofline V2,  " + str(device.name))
fig, ax = volumeScatterPlot([( (k[2]*4, k[4]*2, k[5]), measValues[k].tflops, predValues[k].perfTFlopsV3, categories[predValues[k].limV3], predValues[k].perfTFlopsV2) for k in keys], " Extended Roofline V3,  " + str(device.name))
# NOTE(review): the labels say GFlop/s while the plotted attributes are named
# tflops / perfTFlops* - confirm the unit.
ax.set_xlabel("measured performance, GFlop/s")
ax.set_ylabel("predicted performance, GFlop/s")
# NOTE(review): the file name hard-codes str(3) instead of using `r` - confirm.
plt.savefig("./autoplots/extroofline_v3_range" + str(3) + "_" + device.name +  ".svg")
#volumeScatterPlot([(k[0:3], measValues[k].tflops, predValues[k].perfTFlopsV4, categories[predValues[k].limV4], predValues[k].perfTFlopsV3) for k in keys], " Extended Roofline V4,  " + str(device.name))



# "Pheno" prediction, shown against the V2 prediction as reference.
fig, ax = volumeScatterPlot([((k[2]*4, k[4]*2, k[5]), measValues[k].tflops, predValues[k].perfTFlopsPheno, categories[predValues[k].limPheno], predValues[k].perfTFlopsV2) for k in keys], "R" + str(r) + " Extended Roofline Pheno,  " + str(device.name))
ax.set_xlabel("measured performance, GFlops/s")
ax.set_ylabel("predicted performance, GFLops/s")
plt.savefig("./autoplots/extroofline_pheno_range" + str(3) + "_" + device.name +  ".svg")

    
# "EPM Pheno" prediction, shown against the Pheno prediction as reference.
volumeScatterPlot([((k[2]*4, k[4]*2, k[5]), measValues[k].tflops, predValues[k].perfTFlopsEPMPheno, categories[predValues[k].limPheno], predValues[k].perfTFlopsPheno) for k in keys], "R" + str(r) + " EPM Pheno,  " + str(device.name))
#volumeScatterPlot([(k[0:3], measValues[k].tflops, predValues[k].perfTFlops2LimV4, categories[predValues[k].lim2LimPheno], predValues[k].perfTFlopsPheno) for k in keys], "R" + str(r) + " Standard Roofline Pheno,  " + str(device.name))


In [None]:
def _configColumns(key):
    """Columns printed after the throughput value: (key[1:4], key[0], key[4], key[5])."""
    return (key[1:4], key[0], key[4], key[5])


def _printRanking(heading, ranking, other):
    """Print `ranking` in ascending order, flagging rows whose configuration
    equals the best (last) entry of `other`."""
    print(heading)
    for entry in ranking:
        flag = "**" if entry[1:] == other[-1][1:] else "  "
        print("{} {:5.2f} {!r:12} {!r:4} {!r:4} {!r:4}".format(flag, *entry))


# One row per configuration, led by the value it is ranked on: the V4 model
# prediction and the measured throughput respectively.
predTop = [(predValues[key].perfTFlopsV4,) + _configColumns(key) for key in measValues]
measTop = [(measValues[key].tflops,) + _configColumns(key) for key in measValues]

print(len(predTop))

# Keep the 216 highest entries of each ranking; the best one ends up last.
predTop = sorted(predTop)[-216:]
measTop = sorted(measTop)[-216:]

_printRanking("Top Preds:", predTop, measTop)
print()
_printRanking("Top Meas", measTop, predTop)

In [None]:
# Square, high-resolution figure for the combined roofline plot.
fig, ax = plt.subplots(figsize=(5, 5), dpi=200)



# One list of code balances per limiter (DRAM, L2, L1), appended in that order
# by the code below and later joined with connector lines.
balances = []

# NOTE(review): not read by any code visible in this notebook.
pMaxMax = 0

def plotLimiter(q1, q2, color):
    """Draw one bandwidth limiter of the roofline plot plus the measured points.

    Parameters
    ----------
    q1 : list of str
        Names of measured metrics; their per-flop values are summed and the
        reciprocal of that sum is the code balance of a configuration.
    q2 : str
        Name of the ``device`` attribute that holds the limiter's bandwidth.
    color
        Matplotlib colour for the sloped line and the scatter points.

    Returns
    -------
    list
        The code balance of every configuration, in ``measValues`` order.

    Notes
    -----
    The marker shape is chosen from a call counter stored on the function
    itself (``plotLimiter.marker``). The counter now wraps around instead of
    raising IndexError on the fourth call, and it starts at 0 if the attribute
    was never initialised.
    """
    markers = ["v", "s", "^"]

    # Ridge point of this limiter; the 0.8 factor scales the nominal bandwidth.
    machineBalance = device.peakFP32() / getattr(device, q2) * 0.8
    codeBalance = [1 / sum([perFlop(measValues[k], m) for m in q1]) for k in measValues]
    print(machineBalance)

    p2 = max(codeBalance)
    # Flat part of the roof: from the ridge point to the largest code balance.
    plt.plot((machineBalance, p2), (device.peakFP32() / 1000, device.peakFP32() / 1000), color="gray")
    # Sloped part of the roof: from the origin up to the ridge point.
    plt.plot((0, machineBalance), (0 * getattr(device, q2) / 1000, device.peakFP32() / 1000), color=color, solid_capstyle="round")

    markerIndex = getattr(plotLimiter, "marker", 0)
    plt.scatter(codeBalance, [measValues[k].tflops for k in measValues], color=color, marker=markers[markerIndex % len(markers)], alpha=0.6)
    plotLimiter.marker = markerIndex + 1
    return codeBalance

# Restart the marker cycle so re-running the cell gives the same shapes.
plotLimiter.marker = 0

# DRAM and L2 limiters; each call also returns the per-configuration code balance.
balances.append(plotLimiter(["memLoad", "memStore"], "memBW", "C0"))
balances.append(plotLimiter(["L2Load_tex", "L2Store"], "L2BW", "C1"))


# L1 limiter, drawn by hand: its rate is smCount * clock * 64 and its code
# balance comes from the predicted (not the measured) L1 cycles.
peakRate = device.smCount * device.clock * 64
machineBalance = device.peakFP32() / peakRate

codeBalance = [1 / perFlop(predValues[k], "L1Cycles") / 64 for k in measValues]
# Left end of the sloped line: the balance at which this limiter would yield
# the lowest measured performance.
p1 = min([measValues[k].tflops for k in measValues]) / peakRate * 1000
p2 = max(codeBalance)

print(p1)

plt.plot((machineBalance, 20), (device.peakFP32() / 1000, device.peakFP32() / 1000), color="gray", solid_capstyle="butt")
plt.plot((p1, machineBalance), (p1 * peakRate / 1000, device.peakFP32() / 1000), color="C2", solid_capstyle="round")
plt.scatter(codeBalance, [measValues[k].tflops for k in measValues], color="C2", alpha=0.6)

balances.append(codeBalance)


# Connect the three balance points of every configuration with faint
# horizontal lines. The key list is materialised once; the original rebuilt
# list(measValues.keys()) four times per iteration.
keyList = list(measValues.keys())
for idx in range(len(balances[0])):
    key = keyList[idx]
    tflopsPair = [measValues[key].tflops] * 2
    lineColor = getColor(key[0:3])
    plt.plot((balances[0][idx], balances[1][idx]), tflopsPair, color=lineColor, zorder=-1, alpha=0.2)
    plt.plot((balances[1][idx], balances[2][idx]), tflopsPair, color=lineColor, zorder=-1, alpha=0.2)

ax.set_xscale("log")
ax.set_yscale("log")

ax.set_xlabel("arithmetic Intensity, Flop/B or Flop/cycle")
ax.set_ylabel("Performance, GFlop/s")

ax.set_ylim([0.1, 90])
ax.set_xlim([0.05, 206])

# The autoplots directory must already exist; savefig does not create it.
fig.tight_layout()
fig.savefig("autoplots/bconv_roofline_1x1_" + device.name + ".pdf")


In [None]:
from random import *
from math import sqrt

# At most four clusters, and never more than half the number of configurations.
nclusters = min(4, len(measValues.keys() ) // 2)

# NOTE(review): the random generator is not seeded anywhere in this cell, so
# the shuffle and every later random draw differ between runs - consider
# calling seed(...) first if reproducible clusters are needed.
keys = list(measValues.keys())
shuffle(keys)


# Start with empty clusters; the first nclusters shuffled keys are the initial centers.
clusters = [[] for n in range(nclusters)]
clusterCenters = [keys[i] for i in range(nclusters)]


# Random 2-D points per key; only referenced by commented-out debugging code below.
fakeValues = {}
for key  in measValues.keys():
    fakeValues[key] = [randint(0, 20), randint(0, 20)]  


def distance(key1, key2):
    """Euclidean distance between two configurations.

    The two coordinates are the measured per-flop "memLoad" volume divided by
    ``device.memBW`` and the measured per-flop "L2Load_tex" volume divided by
    ``device.L2BW``.
    """
    #return ((fakeValues[key1][0] - fakeValues[key2][0])**2 + (fakeValues[key1][1] - fakeValues[key2][1])**2)
    dramDelta = perFlop(measValues[key1], "memLoad") - perFlop(measValues[key2], "memLoad")
    l2Delta = perFlop(measValues[key1], "L2Load_tex") - perFlop(measValues[key2], "L2Load_tex")

    dramTerm = (dramDelta / device.memBW) ** 2
    l2Term = (l2Delta / device.L2BW) ** 2
    return sqrt(dramTerm + l2Term)
    

# Ten rounds of: re-seed degenerate centers, assign every key to its nearest
# center, move each center to the member with the smallest summed distance to
# its cluster, then plot the current state.
for i in range(0,10):

    # Clusters with at most one member always get a new random center; any
    # other cluster is re-seeded with probability 1/2001.
    for c in range(nclusters):
        print(len(clusters[c]))
        if len(clusters[c]) <= 1 or randint(0,2000) == 0:
            clusterCenters[c] = choice(keys)

    # Assignment step. minDistance < 0 marks "no candidate yet"; ties keep the
    # lower cluster index because the comparison is strict.
    clusters = [[] for n in range(nclusters)]    
    for key in keys:
        minDistance = -1
        minCluster = -1
        for c in range(nclusters):            
            d = distance(key, clusterCenters[c])

            if d < minDistance or minDistance < 0:
                minDistance = d
                minCluster = c
        clusters[minCluster].append(key)
            
    # Update step. An empty cluster leaves lowestKey at -1, which is stored as
    # a sentinel center; it is replaced at the top of the next round because
    # the cluster then has <= 1 members.
    for c in range(nclusters):
        shuffle(clusters[c])
        lowestDistanceSum = -1
        lowestKey = -1
        for k1 in clusters[c]:
            distanceSum = 0
            for k2 in clusters[c]:
                distanceSum += distance(k1, k2)
            if distanceSum < lowestDistanceSum or lowestDistanceSum < 0:
                lowestDistanceSum = distanceSum
                lowestKey = k1
        # c comes from range(nclusters), so this condition always holds.
        if c != -1:
            clusterCenters[c] = lowestKey
        
            
  
        
    print(clusterCenters)
          
    # Reverse lookup key -> cluster index; only consumed by the commented-out
    # volumeScatterPlot call below.
    keyClusters = {}
    for k in keys:
        for c in range(nclusters):
            if k in clusters[c]:
                keyClusters[k] = c

    fig, ax = plt.subplots(figsize=[5, 5], dpi=200)
    #volumeScatterPlot([(k[0:3], perFlop(measValues[k], "memLoad"), perFlop(measValues[k], "L2Load_tex"), keyClusters[k]) for k in keys], "test_" + str(device.name))    
    # One scatter series per cluster, with a black "+" on its center (skipped
    # for the -1 sentinel of an empty cluster).
    for c in range(nclusters):
        ax.scatter( [perFlop(measValues[k], "memLoad") / device.memBW for k in clusters[c]], [perFlop(measValues[k], "L2Load_tex") / device.L2BW for k in clusters[c]])
        if clusterCenters[c] != -1:
            ax.plot( perFlop(measValues[clusterCenters[c]], "memLoad") / device.memBW,  perFlop(measValues[clusterCenters[c]], "L2Load_tex") / device.L2BW, "+", markersize=15, color="black" )
        #ax.scatter( [fakeValues[k][0] for k in clusters[c]], [fakeValues[k][1] for k in clusters[c]])        

    # Give both axes the same range so the two coordinates are visually comparable.
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    
    ax.set_xlim(min(xlim[0], ylim[0]), max(xlim[1], ylim[1])  )
    ax.set_ylim(min(xlim[0], ylim[0]), max(xlim[1], ylim[1])  )
        
    # Close each figure after showing it so ten rounds do not accumulate open figures.
    plt.show()
    plt.close(fig)



