In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from prettytable import PrettyTable

In [3]:
# constants

# The benchmark uses 5 input graphs: rmat, ny, 2d, soc, co.
#
# Per-graph properties (numVertices, numEdges, dAvg, dMax, d32, d512, diameter):
#   rmat: 4194304, 65660814, 15.7, 3687, 12.4%, 0.045%, 19
#   ny:   264346,  730100,   2.8,  8,    0.0%,  0.000%, 721
#   2d:   1048576, 4190208,  4.0,  4,    0.0%,  0.000%, 2047
#   soc:  4847571, 85702474, 17.7, 20333, 14.0%, 0.125%, 20
#   co:   540486,  30491458, 56.4, 3299, 52.5%, 0.092%, 24

# Graph name tokens (each keeps its trailing underscore).
graphs = ["rmat_", "ny_", "2d_", "soc_", "co_"]

# Column names of the graph properties, in the order used by graphProper.
properColName = ["numv", "nume", "dAvg", "dMax", "d>=32", "d>=512", "diameter"]

# All style tokens, listed mostly as opposing pairs.
styles = [
    "_V", "_E",
    "_Topo", "_Data",
    "_Dup", "_NonDup",
    "_Push", "_Pull",
    "_ReadWrite", "_ReadModifyWrite",
    "_Determ", "_NonDeterm",
    "_Persist", "_NonPersist",
    "_Thread", "_Warp", "_Block_",
    "_GlobalAdd_", "_BlockAdd_", "_Reduction_",
]

# graphProper[j][g] is the value of property properColName[j] for graph token g.
# The table is written one row per graph (percentages stored as fractions) and
# transposed with zip() into one dictionary per property.
graphProper = [
    dict(zip(graphs, perGraphValues))
    for perGraphValues in zip(
        # numv,   nume,     dAvg, dMax,  d>=32, d>=512,  diameter
        (4194304, 65660814, 15.7, 3687,  0.124, 0.00045, 19),    # rmat
        (264346,  730100,   2.8,  8,     0.000, 0.00000, 721),   # ny
        (1048576, 4190208,  4.0,  4,     0.000, 0.00000, 2047),  # 2d
        (4847571, 85702474, 17.7, 20333, 0.140, 0.00125, 20),    # soc
        (540486,  30491458, 56.4, 3299,  0.525, 0.00092, 24),    # co
    )
]

# # print graph properties
# for i in range(len(graphs)):
#   graphName = graphs[i]
#   print(graphs[i])
#   for j in range(len(properColName)):
#     print((graphProper[j][graphs[i]]))
#   print("\n")

# CSV file holding the cuda results
csvFile = "cuda.csv"

In [4]:
# compute the correlation between (style, graph-properties)
def computeCorrelation(df, styles, properColName):
  """Tabulate the correlation between 'cuda' and each graph property, per style.

  Args:
    df: DataFrame with a 'filename' column, a 'cuda' column and one column per
      entry of properColName; all value columns must be convertible to float.
    styles: style tokens to look for in 'filename' (e.g. "_Determ", "_Block_").
    properColName: names of the graph-property columns to correlate against.

  Returns:
    A PrettyTable with one row per style: the style name with underscores
    removed, followed by one correlation (Series.corr, Pearson by default),
    rounded to 5 decimals, per property column.
  """
  tb = PrettyTable(["Styles"] + properColName)
  for styleToken in styles:
    styleName = styleToken.replace('_', '')
    # Filter on the full token. The underscores are what keep "_Determ" from
    # also matching "_NonDeterm" and "_Block_" from matching "_BlockAdd_";
    # the stripped name is only used as the row label.
    hasStyle = df['filename'].str.contains(styleToken, regex=False, na=False)
    styleDF = df[hasStyle].reset_index(drop=True)
    cudaVals = styleDF["cuda"].astype(float)  # loop-invariant, cast once per style
    newRow = [styleName]
    for properName in properColName:
      cor = cudaVals.corr(styleDF[properName].astype(float))
      # round() works for both numpy and plain Python floats (including NaN).
      newRow.append(round(cor, 5))
    tb.add_row(newRow)
  return tb

# add graph properties values
def addGraphProp(df, graphs, properColName, graphProper, verbose=True):
  """Return a copy of df with one graph-property column per entry of properColName.

  Args:
    df: DataFrame with a 'filename' column; it is not modified.
    graphs: graph name tokens; a row belongs to a graph when its filename
      contains the token (literal substring match).
    properColName: names of the property columns to add.
    graphProper: list parallel to properColName; graphProper[j][graphName] is
      the value of property j for that graph.
    verbose: when True (default) print the frame before and after filling,
      as the function has always done; pass False to suppress the dumps.

  Returns:
    A new DataFrame. Rows whose filename matches no graph keep NaN in the
    property columns.
  """
  newDF = df.copy()
  # Add the property columns directly instead of concatenating an empty
  # DataFrame (concat with empty entries is deprecated). Object dtype keeps
  # integer properties as integers once they are filled in.
  for colName in properColName:
    if colName not in newDF.columns:
      newDF[colName] = pd.Series(float('nan'), index=newDF.index, dtype=object)
  if verbose:
    print(newDF)
  for graphName in graphs:
    # The row mask depends only on the graph, so compute it once per graph.
    rowMask = newDF['filename'].str.contains(graphName, regex=False, na=False)
    for j, colName in enumerate(properColName):
      newDF.loc[rowMask, colName] = graphProper[j][graphName]
  if verbose:
    print(newDF)
  return newDF

# add styles columns
def addStylesCol(df, colNames, styleX):
  """Select the styleX rows of df and add one 'cuda' column per style in colNames.

  Args:
    df: DataFrame with 'filename' and 'cuda' columns; it is not modified.
    colNames: style tokens; each becomes a new column of the result.
    styleX: style token that selects the rows to keep (literal substring of
      'filename').

  Returns:
    A new DataFrame holding the rows of df whose filename contains styleX,
    re-indexed from 0. Column styleY holds the row's 'cuda' value when the
    filename also contains styleY, and NaN otherwise.
  """
  hasX = df['filename'].str.contains(styleX, regex=False, na=False)
  newDF = df[hasX].reset_index(drop=True)
  for styleY in colNames:
    # A masked copy of 'cuda' keeps every styleX row. The previous repeated
    # inner merge dropped all rows lacking styleY, so mutually exclusive
    # styles (e.g. _Push/_Pull) emptied the frame.
    hasY = newDF['filename'].str.contains(styleY, regex=False, na=False)
    newDF[styleY] = newDF['cuda'].where(hasY)
  return newDF


In [7]:
from scipy.stats.mstats import gmean

rawDF = pd.read_csv(csvFile, usecols = ["filename", "cuda"])
# Column headers of the result tables, in the same order as `styles`.
styleColName = ["V", "E", "Topo", "Data", "Dup", "NonDup", "Push", "Pull", "RW", "RMW", 
          "Determ", "NonDeterm", "Persist", "NonPersist", "Thread", "Warp", "Block_", "GlobalAdd", "BlockAdd", "Reduction"]

# remove slow CudaAtomic results
df = (rawDF[rawDF['filename'].str.contains('CudaAtomic') == False]).reset_index()

# One row per base style; each row is [label, value for styles[0], value for styles[1], ...]
meanTB = []
medianTB = []
corrTB = []

for baseStyle in styles:
  # all runs whose filename contains the base style
  styleDF = ((df[df['filename'].str.contains(baseStyle)]).reset_index())[['filename', 'cuda']]

  rowLabel = baseStyle.replace('_', '')
  meanRow = [rowLabel]
  medianRow = [rowLabel]
  corrRow = [rowLabel]
  for colStyle in styles:
    # Split the base-style runs once into those with and without colStyle.
    hasCol = styleDF['filename'].str.contains(colStyle)

    # baseline: base-style runs that do NOT use colStyle
    baseDF = styleDF[~hasCol]
    baseMean = baseDF['cuda'].mean()
    baseMedian = baseDF['cuda'].median()

    # runs that use both baseStyle and colStyle
    tmpDF = styleDF[hasCol].reset_index().rename({'cuda': colStyle}, axis='columns')
    meanVal = tmpDF[colStyle].mean()
    medianVal = tmpDF[colStyle].median()

    # ratio of the "with colStyle" statistic to the "without colStyle" statistic;
    # NaN when either group is empty
    meanDiff = meanVal / baseMean
    medianDiff = medianVal / baseMedian
    # NOTE(review): Series.corr pairs values by index label. baseDF keeps
    # styleDF's labels while tmpDF is renumbered from 0, and the two hold
    # disjoint runs, so this pairs unrelated rows - confirm the intended pairing.
    corrVal = (baseDF['cuda'].astype(float)).corr(tmpDF[colStyle].astype(float))

    # round() handles numpy and plain Python floats alike (including NaN)
    meanRow.append(round(meanDiff, 4))
    medianRow.append(round(medianDiff, 4))
    corrRow.append(round(corrVal, 4))

  meanTB.append(meanRow)
  medianTB.append(medianRow)
  corrTB.append(corrRow)

# Flat column lists: wrapping the list in another list would build
# single-level MultiIndex columns, making medianDF['V'] a DataFrame rather than a Series.
meanDF = pd.DataFrame(meanTB, columns=["Styles (mean)"] + styleColName)
medianDF = pd.DataFrame(medianTB, columns=["Styles (median)"] + styleColName)
corrDF = pd.DataFrame(corrTB, columns=["Styles (correlation)"] + styleColName)

# A Styler is only rendered when it is the last expression of a cell, so this line shows nothing here.
corrDF.style.bar(subset=['V'], align='zero', color=['#BA4B2F', '#7AAFe0'])

######## Figure 15 in the paper, ratio of the median throughputs
print(medianDF)







    Styles (median)       V       E    Topo    Data     Dup  NonDup    Push  \
0                 V     NaN     NaN  0.8698  1.1045  0.6607  0.6844  1.6300   
1                 E     NaN     NaN  1.5135  0.3491     NaN  0.1993  0.1715   
2              Topo  0.6126  1.6323     NaN     NaN     NaN  1.5064  1.3136   
3              Data  2.0022  0.4995     NaN     NaN  0.6599  0.2944  1.3044   
4               Dup     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
5            NonDup  1.7064  0.5860  2.5648  0.3899     NaN     NaN  1.6371   
6              Push  3.0168  0.4502  1.3745  1.2397  0.6798  0.7356     NaN   
7              Pull  2.1211     NaN  1.3609  1.4441     NaN  0.6956     NaN   
8         ReadWrite  0.9470  1.0560  1.2421  0.8051     NaN  0.8290  2.7708   
9   ReadModifyWrite  1.7430  0.5737  0.8024  1.2462  1.1980  1.1230     NaN   
10           Determ  1.7827  0.8532  1.3857  0.9528  0.7310  0.4826  1.4146   
11        NonDeterm  2.1686  1.0140  1.0997  1.4611 

In [None]:
import pandas as pd
from google.colab import files

# Scratch demo of pd.to_numeric and Styler.bar on a toy frame.
# The frame gets its own name (woeDF): binding it to `df` would overwrite the
# benchmark results that later cells filter by df['filename'], breaking a
# top-to-bottom run of the notebook.
woeDF = pd.DataFrame({'col1': list('ABCDEFG'),
                      'col2': list('xywerwr'),
                      'WOE': [-.08, -.03, .01, .09, .15, .24, '']})
print(woeDF['WOE'])
# the empty string becomes NaN and the column becomes float64
woeDF['WOE'] = pd.to_numeric(woeDF['WOE'])
print(woeDF['WOE'])
# A Styler is only rendered when it is the last expression of a cell, so this line shows nothing here.
woeDF.style.bar(subset=['WOE'], align='mid', color=['#BA4B2F', '#7AAFe0'])
# meanDF.to_csv('style-mean7.csv') 
# medianDF.to_csv('style-median7.csv') 
woeDF.to_csv('df.csv') 
# files.download('style-mean7.csv')
# files.download('style-median7.csv')
files.download('df.csv')

0   -0.08
1   -0.03
2    0.01
3    0.09
4    0.15
5    0.24
6        
Name: WOE, dtype: object
0   -0.08
1   -0.03
2    0.01
3    0.09
4    0.15
5    0.24
6     NaN
Name: WOE, dtype: float64


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files

# Write the correlation table to a CSV file and hand it to the browser for download.
# The mean/median exports are kept below but disabled:
#   meanDF.to_csv('style-mean7.csv'); files.download('style-mean7.csv')
#   medianDF.to_csv('style-median7.csv'); files.download('style-median7.csv')
corrDF.to_csv(path_or_buf='style-corr7.csv')
files.download(filename='style-corr7.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# new dataframe: add graph property columns
# newDF = addGraphProp(df, graphs, properColName, graphProper)
# print(newDF)

# calculate correlations with graph property
# graphTB = computeCorrelation(newDF, styles, properColName)
# print(graphTB)

# Style tokens for this cell: styleX picks the rows to analyse; style1/style2
# are only referenced by the commented-out experiments below.
styleX, style1, style2 = '_V', '_Push', '_Pull'

# Rows of df whose filename contains styleX, renumbered from 0 and reduced to
# the two columns used by the rest of the cell.
styleDF = (
    df.loc[df['filename'].str.contains(styleX)]
      .reset_index()
      .loc[:, ['filename', 'cuda']]
)

# print(styleDF.median())
# print(styleDF.std())
# x = (((styleDF[styleDF['filename'].str.contains(style1)])).reset_index())[['cuda']]
# x = x.rename({'cuda': style1}, axis='columns')
# # x = x.sort_values(by=[style1]) 
# print(x.median())
# print(x.std())
# y = (((styleDF[styleDF['filename'].str.contains(style2)])).reset_index())[['cuda']]
# y = y.rename({'cuda': style2}, axis='columns') 
# # y = y.sort_values(by=[style2]) 
# print(y.median())
# print(y.std())

# result = pd.concat([x, y], axis=1)
# # print(result)
# cor = result[style1].corr(result[style2])
# print(cor)
# print(x)
# print(y)
# x.corr(y)
# print(styleDF)

# Baseline statistics over every run selected into styleDF.
# float() accepts both numpy scalars and plain Python floats, unlike .astype(float).
meanBase = float(styleDF['cuda'].mean())
medianBase = float(styleDF['cuda'].median())

# For each style, the 'cuda' values of the styleDF rows whose filename contains
# it, as a Series named after the style (original row labels are kept).
cudaByStyle = {
  styleY: styleDF.loc[styleDF['filename'].str.contains(styleY), 'cuda'].rename(styleY)
  for styleY in styles
}

# styleDF plus one column per style (NaN where the row lacks that style).
# Built with a single concat instead of re-concatenating inside the loop.
result = pd.concat([styleDF] + list(cudaByStyle.values()), axis=1)

# print(meanBase)
for styleY, cudaY in cudaByStyle.items():
  print(styleY)
  meanY = cudaY.mean()
  medianY = cudaY.median()
  # print(meanY)
  # relative difference of the style's mean / median to the baseline
  diff1 = (meanY - meanBase) / meanBase
  diff2 = (medianY - medianBase) / medianBase
  print(diff1)
  print(diff2)
  # print(diff1)
  # dif2 = (dfY.median() - medianBase) // medianBase
  # print(dif1)
  # print(dif2)
# print(result)
# result.boxplot(column=styles)  
 
  # print(styleX)
  # print(styleY)
  # result["cuda"] = result["cuda"].astype(float)
  # result[styleY] = result[styleY].astype(float)
  # cor = result["cuda"].corr(result[styleY])
  # print(cor)
# print(result)
# result["_Push"] = result["_Push"].astype(float)
# result['_Pull'] = result['_Pull'].astype(float)
# x = result["_Push"].reset_index()
# print(x)
# x = x.astype(float)
# y = result['_Pull'].reset_index()
# print(y)
# y = y.astype(float)
# cor = x.corr(y)
# print(result["_Push"])

# print(styleDF)

# # add graph properties columns, set default values to null
# newDF = pd.concat([df, pd.DataFrame(columns = properColName)])
# print(newDF)

# for i in range(len(graphs)):
#   graphName = graphs[i]
#   for j in range(len(properColName)):
#      newDF.loc[newDF['filename'].str.contains(graphName), properColName[j]] = graphProper[j][graphName]
# print(newDF)

# # calculate correlations with graph properties
# tb = computeCorrelation(df, styles, properColName)
# print(tb)

                                               filename      cuda     numv  \
0     rmat_3090_BFS_CUDA_E_Data_Push_Determ_IntType_...  0.399387  4194304   
3     rmat_3090_BFS_CUDA_E_Data_Push_Determ_IntType_...  0.375048  4194304   
5     rmat_3090_BFS_CUDA_E_Data_Push_NonDeterm_IntTy...  0.840212  4194304   
6     rmat_3090_BFS_CUDA_E_Data_Push_NonDeterm_IntTy...  0.833692  4194304   
10    rmat_3090_BFS_CUDA_E_Data_Push_NonDeterm_IntTy...  0.879925  4194304   
...                                                 ...       ...      ...   
7481  16_ny_PR_CUDA_Pull_FloatType_NonPersist_Thread...  0.015549   264346   
7482  16_ny_PR_CUDA_Pull_FloatType_Persist_Block_Det...  0.012117   264346   
7483  16_ny_PR_CUDA_Pull_FloatType_NonPersist_Warp_D...  0.027253   264346   
7484  16_ny_PR_CUDA_Pull_FloatType_Persist_Thread_De...  0.014077   264346   
7485  16_ny_PR_CUDA_Pull_FloatType_Persist_Warp_Dete...  0.041279   264346   

          nume  dAvg  dMax  d>=32   d>=512 diameter  
0     656

In [None]:


# Select on the full '_V' token, as in `styles` and the other cells; a bare 'V'
# would also select any filename that merely contains a capital V.
newDF = addStylesCol(df, styles, '_V')
print(newDF)
      # newDF = pd.concat([newDF, dfY(columns = colNames)])
      # newDF.loc[newDF['filename'].str.contains(graphName), properColName[j]] = graphProper[j][graphName]
