In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.utils import Bunch

def _get_mtxfeature(data):
    '''Return the feature matrix of ``data`` as a NumPy array.

    Every column except the matrix name (``mtx_name``) and the label
    (``OP_SpMM``) is kept, in its original column order.
    '''
    non_feature_columns = ['mtx_name', 'OP_SpMM']
    return data.drop(columns=non_feature_columns).values


def _get_label(data):
    '''Return the ``OP_SpMM`` column of ``data`` as a NumPy array.'''
    return data['OP_SpMM'].values


def _get_feature_names(data):
    '''Return the names of the feature columns of ``data`` as a NumPy array.

    The names are obtained by removing ``mtx_name`` and ``OP_SpMM`` by label,
    the same selection ``_get_mtxfeature`` applies, so entry ``i`` always names
    column ``i`` of the feature matrix regardless of where those two columns
    sit in the frame.

    Raises:
        KeyError: if ``mtx_name`` or ``OP_SpMM`` is not a column of ``data``.
    '''
    return data.columns.drop(['mtx_name', 'OP_SpMM']).values


def _get_target_names(data):
    '''Return the values of the first row of ``data`` as a NumPy array.

    NOTE(review): the result is used as the SpMM method names; this presumes
    the first row of the benchmark table holds them -- confirm against the CSV.
    '''
    first_row = data.iloc[0]
    return first_row.values

def load_mtx(csv_path="../ParamSpMM-log/", mask=None, dim=16):
    '''Load the training and benchmark tables for ``dim`` into a ``Bunch``.

    Args:
        csv_path: Prefix prepended verbatim to the file names
            ``dim<dim>_OP_SpMM.csv`` (training table) and ``dim<dim>.csv``
            (benchmark table); it must therefore end with a path separator.
        mask: Optional row selector applied to both tables via ``frame[mask]``.
        dim: Value used to pick the pair of CSV files.

    Returns:
        A ``Bunch`` with ``feature`` and ``label`` (NumPy arrays taken from the
        training table), ``fnames`` (feature column names) and ``tnames``
        (values of the first row of the benchmark table).
    '''
    prefix = csv_path + "dim" + str(dim)
    train_frame = pd.read_csv(prefix + "_OP_SpMM.csv")
    bench_frame = pd.read_csv(prefix + ".csv")
    if mask is not None:
        train_frame, bench_frame = train_frame[mask], bench_frame[mask]
    return Bunch(
        feature=_get_mtxfeature(train_frame),
        label=_get_label(train_frame),
        fnames=_get_feature_names(train_frame),
        tnames=_get_target_names(bench_frame),
    )

def performance_spd(rnd_tree, dim, log_dir="../ParamSpMM-log/"):
    '''Print and save the speedup of ParamSpMM (with SpMM-decider) and the
    baseline libraries over cuSPARSE.

    Args:
        rnd_tree: Fitted model exposing ``predict``. Its predictions are used
            as integer column positions into the ``dim<dim>.csv`` table, one
            per row.
        dim: Value selecting the per-dimension CSV files and the column
            ``str(dim)`` of the GNNA-SpMM / DA-SpMM tables.
        log_dir: Prefix prepended verbatim to every file name read or written;
            it must end with a path separator.

    Side effects:
        Prints the mean speedup of ParamSpMM over cuSPARSE, GE-SpMM, GNNA-SpMM
        and DA-SpMM, and writes the per-row speedups over cuSPARSE to
        ``<log_dir>spd/ParamSpMM<dim>_speedup.csv`` (the ``spd`` directory is
        created when missing).
    '''
    mtx_data = load_mtx(csv_path=log_dir, dim=dim)
    tp = pd.read_csv(log_dir + "dim" + str(dim) + ".csv")
    predict = rnd_tree.predict(mtx_data.feature)
    # Throughput of the configuration picked by the model: row i, column predict[i].
    predicted_tp = tp.values[np.arange(predict.shape[0]), predict]

    # Throughput of cuSPARSE / GE-SpMM, GNNA-SpMM and DA-SpMM.
    baseline = pd.read_csv(log_dir + "dim" + str(dim) + "_baseline.csv")
    gnna_op = pd.read_csv(log_dir + "baseline_SpMM/GNNASpMM_op.csv")
    da_spmm = pd.read_csv(log_dir + "baseline_SpMM/DASpMM.csv")

    # Speedup of every method over cuSPARSE.
    cusparse = baseline["cusparse"]
    speedup = {
        "gespmm": baseline["gespmm"] / cusparse,
        "GNNASpMM_op": gnna_op[str(dim)] / cusparse,
        "DASpMM": da_spmm[str(dim)] / cusparse,
        "ParamSpMM": predicted_tp / cusparse,
    }

    # Dividing two speedups over cuSPARSE gives the speedup of ParamSpMM over
    # the other baseline. Plain arrays are used so that NaN values propagate
    # into the mean instead of being skipped.
    param_spd = speedup["ParamSpMM"].to_numpy()
    print("Speedup of ParamSpMM over baselines:")
    print("CuSPARSE: ", param_spd.mean())
    print("GESpMM: ", (param_spd / speedup["gespmm"].to_numpy()).mean())
    print("GNNASpMM: ", (param_spd / speedup["GNNASpMM_op"].to_numpy()).mean())
    print("DASpMM: ", (param_spd / speedup["DASpMM"].to_numpy()).mean())

    out_path = log_dir + "spd/ParamSpMM" + str(dim) + "_speedup.csv"
    # to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    pd.DataFrame(speedup).to_csv(out_path, index=False)

In [None]:
# Evaluate the saved model of every dim in 16, 32, ..., 256.
dim_list = list(range(16, 257, 16))
for dim in dim_list:
    # NOTE: unpickling can execute arbitrary code; only load model files you trust.
    model_path = "./rnd_tree_" + str(dim) + ".pkl"
    rnd_tree = pd.read_pickle(model_path)
    print("dim: ", dim)
    performance_spd(rnd_tree, dim)