In [1]:
import pandas as pd
import numpy as np


In [2]:
def _get_mtxfeature(data):
    '''Get the matrix feature '''
    # print(data.columns)
    feature = data.drop(['mtx_name', 'OP_SpMM'], axis=1)
    np_feature = feature.values
    # print(np_feature.shape)
    return np_feature


def _get_label(data):
    '''Get the label of the data'''
    label = data['OP_SpMM']
    np_label = label.values
    return np_label


def _get_feature_names(data):
    '''feature name'''
    np_fnames = data.columns[1:-1].values
    return np_fnames


def _get_target_names(data):
    '''SpMM method name'''
    tnames = data.iloc[0].values
    return tnames

In [3]:
from sklearn.utils import Bunch


def load_mtx(csv_path="../ParamSpMM-log/", mask=None, dim=16):
    """Load the training and benchmark CSVs for one ``dim`` into a ``Bunch``.

    Two files are read: ``<csv_path>dim<dim>_OP_SpMM.csv`` (features and
    label) and ``<csv_path>dim<dim>.csv`` (benchmark table). ``csv_path`` is
    used as a plain string prefix, so it must end with a path separator.

    Args:
        csv_path: prefix prepended to both file names.
        mask: optional row selector applied to both frames via ``frame[mask]``.
        dim: value inserted into the file names.

    Returns:
        A ``Bunch`` with ``feature``, ``label``, ``fnames`` and ``tnames``.
    """
    file_prefix = csv_path + "dim" + str(dim)
    train_frame = pd.read_csv(file_prefix + "_OP_SpMM.csv")
    bench_frame = pd.read_csv(file_prefix + ".csv")
    if mask is not None:
        # Keep only the selected rows in both tables.
        train_frame = train_frame[mask]
        bench_frame = bench_frame[mask]
    return Bunch(
        feature=_get_mtxfeature(train_frame),
        label=_get_label(train_frame),
        fnames=_get_feature_names(train_frame),
        tnames=_get_target_names(bench_frame),
    )

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz


def rnd_tree_model(dim, n_estimators=30, random_state=17, importance=False):
    """Train a random forest on the data returned by ``load_mtx(dim=dim)``.

    The samples are split 80/20 with ``train_test_split``; the forest is
    fitted on the larger part and its score on the held-out part is printed.

    Args:
        dim: forwarded to ``load_mtx`` to select the CSV files.
        n_estimators: number of trees in the forest.
        random_state: seed used for both the split and the forest.
        importance: when true, also print every feature importance (rounded
            to two decimals) followed by the feature name.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    dataset = load_mtx(dim=dim)
    features_train, features_test, labels_train, labels_test = train_test_split(
        dataset.feature, dataset.label, test_size=0.2, random_state=random_state
    )
    forest = RandomForestClassifier(
        n_estimators=n_estimators, n_jobs=-1, random_state=random_state
    )
    forest.fit(features_train, labels_train)
    print("Test score:", forest.score(features_test, labels_test))
    if importance:
        for weight, feature_name in zip(forest.feature_importances_, dataset.fnames):
            print(round(weight, 2), feature_name)
    return forest

In [5]:
import math


def performance_loss(rnd_tree, csv_path, dim, random_state=17):
    """Print how much performance is lost on the held-out rows by mispredictions.

    The held-out rows are recovered by repeating the 80/20 ``train_test_split``
    over the row indices with the same ``random_state`` that was used for
    training, so ``random_state`` must equal the seed passed to
    ``rnd_tree_model``. The number of rows is taken from the benchmark CSV
    instead of being hard-coded.

    For every held-out row whose prediction differs from the label, the
    relative gap between the best value in the row and the value in the
    predicted column is computed. The gaps are summed and divided by the
    number of held-out rows; ``1 - that value`` is printed as the
    "Average normalized performance".

    Args:
        rnd_tree: fitted classifier exposing ``predict``.
        csv_path: string prefix of the CSV files (see ``load_mtx``).
        dim: value inserted into the CSV file names.
        random_state: seed of the train/test split to reproduce.

    Returns:
        None; the results are printed.
    """
    tp_all = pd.read_csv(csv_path + "dim" + str(dim) + ".csv")
    n_samples = len(tp_all)

    # Re-run the split on the row indices only; the dummy feature column just
    # gives train_test_split something of the right length to split.
    _, _, _, test_ids = train_test_split(
        np.ones((n_samples, 1)),
        np.arange(n_samples),
        test_size=0.2,
        random_state=random_state,
    )
    test_mask = np.zeros(n_samples, dtype=bool)
    test_mask[test_ids] = True

    mtx_data = load_mtx(csv_path=csv_path, dim=dim, mask=test_mask)
    tp = tp_all[test_mask]
    predict = rnd_tree.predict(mtx_data.feature)
    label = mtx_data.label
    wrong = predict != label
    error_num = wrong.sum()

    # Best value per row (numeric columns only), restricted to mispredicted rows.
    best_tp = pd.DataFrame(tp.max(axis=1, numeric_only=True)).values[wrong]
    # Value found in the predicted column of each mispredicted row; the
    # predicted class is used directly as a column position of the table.
    wrong_rows = tp.values[wrong]
    predict_tp = wrong_rows[np.arange(error_num), predict[wrong]].reshape(-1, 1)

    loss = (best_tp - predict_tp) / best_tp
    # Correct predictions contribute zero loss, so average over all test rows.
    test_size = label.shape[0]
    loss = loss.sum() / test_size
    print("dim{} Wrong number{} in {}".format(dim, error_num, test_size))
    print("Average normalized performance: ", 1 - loss)


def eval_performance_loss(dim, csv_path="./", random_state=17):
    """Load the pickled forest matching ``dim`` and run ``performance_loss`` on it.

    A ``dim`` that is a multiple of 32 selects the model of the same size.
    Any other ``dim`` selects the odd multiple of 16 inside its 32-wide
    bucket (e.g. 40 -> 48, 20 -> 16). The model is read from
    ``../../rnd_tree_<approx_dim>.pkl``.

    Args:
        dim: value forwarded to ``performance_loss``.
        csv_path: string prefix of the CSV files, forwarded to
            ``performance_loss``.
        random_state: seed forwarded to ``performance_loss``; it should equal
            the seed the loaded model was trained with so that the same
            held-out rows are evaluated.

    Returns:
        None; ``performance_loss`` prints the results.
    """
    import pickle  # local import: the notebook only imports pickle in a later cell

    if dim % 32 == 0:
        approx_dim = dim
    else:
        approx_dim = (math.ceil(dim / 32) * 2 - 1) * 16
    pickle_path = "../../" + "rnd_tree_" + str(approx_dim) + ".pkl"
    # NOTE: unpickling can execute arbitrary code; only load trusted model files.
    with open(pickle_path, "rb") as model_file:
        rnd_tree = pickle.load(model_file)
    performance_loss(rnd_tree, csv_path, dim, random_state=random_state)

In [6]:
# speedup of ParamSpMM(with SpMM-decider) and baseline libraries over cusparse
def performance_spd(rnd_tree, dim):
    """Print the average speedup of the predicted configuration over cuSPARSE.

    For every row of ``../ParamSpMM-log/dim<dim>.csv`` the value in the
    column selected by the model's prediction is divided by the ``cusparse``
    column of ``../ParamSpMM-log/dim<dim>_baseline.csv``; the mean of these
    ratios is printed. All rows are used, not only the held-out ones.

    Args:
        rnd_tree: fitted classifier exposing ``predict``.
        dim: value inserted into the CSV file names.

    Returns:
        None; the result is printed.
    """
    mtx_data = load_mtx(dim=dim)
    tp = pd.read_csv("../ParamSpMM-log/dim" + str(dim) + ".csv")
    predict = rnd_tree.predict(mtx_data.feature)
    # Row i, column predict[i]: the predicted class is used directly as a
    # column position of the table.
    predicted_tp = tp.values[np.arange(predict.shape[0]), predict]
    # Baseline measurements; only the cusparse column is used here.
    baseline = pd.read_csv("../ParamSpMM-log/dim" + str(dim) + "_baseline.csv")
    param_speedup = predicted_tp / baseline["cusparse"]

    average_speedup = pd.DataFrame(param_speedup).mean()
    print("Average speedup of ParamSpMM over cusparse: ", average_speedup.values[0])
    # TODO: the geometric-mean speedup is not computed yet.
    

In [7]:
import pickle  # public stdlib module; it uses the C accelerator automatically when available

# Train the dim-16 forest, report speedup and held-out loss, then save the model.
pkl_rnd_tree_16_file = "./rnd_tree_16.pkl"
rnd_tree_16 = rnd_tree_model(dim=16, n_estimators=30, random_state=17)
performance_spd(rnd_tree_16, dim=16)
# performance_loss rebuilds the test split from its random_state; its default
# (17) equals the training seed above.
performance_loss(rnd_tree_16, csv_path="../ParamSpMM-log/", dim=16)
with open(pkl_rnd_tree_16_file, "wb") as f:
    pickle.dump(rnd_tree_16, f)


Test score: 0.7804878048780488
Average speedup of ParamSpMM over cusparse:  2.6751854127874926
dim16 Wrong number9 in 41
Average normalized performance:  0.9883858115447461


In [8]:

# Train the dim-32 forest, report speedup and held-out loss, then save the model.
pkl_rnd_tree_32_file = "./rnd_tree_32.pkl"
rnd_tree_32 = rnd_tree_model(dim=32, n_estimators=50, random_state=7)
performance_spd(rnd_tree_32, dim=32)
# performance_loss rebuilds the test split from random_state, so it gets the
# same seed (7) as the training call above.
performance_loss(rnd_tree_32, csv_path="../ParamSpMM-log/", dim=32, random_state=7)
with open(pkl_rnd_tree_32_file, "wb") as f:
    pickle.dump(rnd_tree_32, f)

Test score: 0.6341463414634146
Average speedup of ParamSpMM over cusparse:  2.0669385560292755
dim32 Wrong number15 in 41
Average normalized performance:  0.9851043390686124


In [27]:
# Train the dim-48 forest, report speedup and held-out loss, then save the model.
pkl_rnd_tree_48_file = "./rnd_tree_48.pkl"
rnd_tree_48 = rnd_tree_model(dim=48, n_estimators=80, random_state=7)
performance_spd(rnd_tree_48, dim=48)
# performance_loss rebuilds the test split from random_state, so it gets the
# same seed (7) as the training call above.
performance_loss(rnd_tree_48, csv_path="../ParamSpMM-log/", dim=48, random_state=7)
with open(pkl_rnd_tree_48_file, "wb") as f:
    pickle.dump(rnd_tree_48, f)

Test score: 0.7560975609756098
Average speedup of ParamSpMM over cusparse:  1.7331437411639568
dim48 Wrong number10 in 41
Average normalized performance:  0.9982753081816599


In [10]:
# Train the dim-64 forest, report speedup and held-out loss, then save the model.
pkl_rnd_tree_64_file = "./rnd_tree_64.pkl"
# NOTE(review): the original line carried a trailing "#50" — presumably an
# earlier n_estimators value; confirm before relying on it.
rnd_tree_64 = rnd_tree_model(dim=64, n_estimators=80, random_state=17)  # 50
performance_spd(rnd_tree_64, dim=64)
# performance_loss rebuilds the test split from its random_state; its default
# (17) equals the training seed above.
performance_loss(rnd_tree_64, csv_path="../ParamSpMM-log/", dim=64)
with open(pkl_rnd_tree_64_file, "wb") as f:
    pickle.dump(rnd_tree_64, f)

Test score: 0.7073170731707317
Average speedup of ParamSpMM over cusparse:  1.8114762867942442
dim64 Wrong number12 in 41
Average normalized performance:  0.9824215278847933


In [11]:
# Train the dim-80 forest, report speedup and held-out loss, then save the model.
pkl_rnd_tree_80_file = "./rnd_tree_80.pkl"
rnd_tree_80 = rnd_tree_model(dim=80, n_estimators=80, random_state=17)
performance_spd(rnd_tree_80, dim=80)
# performance_loss rebuilds the test split from its random_state; its default
# (17) equals the training seed above.
performance_loss(rnd_tree_80, csv_path="../ParamSpMM-log/", dim=80)
with open(pkl_rnd_tree_80_file, "wb") as f:
    pickle.dump(rnd_tree_80, f)

Test score: 0.8780487804878049
Average speedup of ParamSpMM over cusparse:  1.767991611198476
dim80 Wrong number5 in 41
Average normalized performance:  0.9997360101304388


In [12]:
# Train the dim-96 forest, report speedup and held-out loss, then save the model.
pkl_rnd_tree_96_file = "./rnd_tree_96.pkl"
rnd_tree_96 = rnd_tree_model(dim=96, n_estimators=80, random_state=17)
performance_spd(rnd_tree_96, dim=96)
# performance_loss rebuilds the test split from its random_state; its default
# (17) equals the training seed above.
performance_loss(rnd_tree_96, csv_path="../ParamSpMM-log/", dim=96)
with open(pkl_rnd_tree_96_file, "wb") as f:
    pickle.dump(rnd_tree_96, f)

Test score: 0.8536585365853658
Average speedup of ParamSpMM over cusparse:  1.8975904847192455
dim96 Wrong number6 in 41
Average normalized performance:  0.9955133267107091


In [13]:
# Train the dim-112 forest, report speedup and held-out loss, then save the model.
pkl_rnd_tree_112_file = "./rnd_tree_112.pkl"
rnd_tree_112 = rnd_tree_model(dim=112, n_estimators=80, random_state=17)
performance_spd(rnd_tree_112, dim=112)
# performance_loss rebuilds the test split from its random_state; its default
# (17) equals the training seed above.
performance_loss(rnd_tree_112, csv_path="../ParamSpMM-log/", dim=112)
with open(pkl_rnd_tree_112_file, "wb") as f:
    pickle.dump(rnd_tree_112, f)

Test score: 0.7073170731707317
Average speedup of ParamSpMM over cusparse:  1.843144783902165
dim112 Wrong number12 in 41
Average normalized performance:  0.9931178123315184


In [14]:
# Train the dim-128 forest, report speedup and held-out loss, then save the model.
pkl_rnd_tree_128_file = "./rnd_tree_128.pkl"
rnd_tree_128 = rnd_tree_model(dim=128, n_estimators=30, random_state=17)
performance_spd(rnd_tree_128, dim=128)
# performance_loss rebuilds the test split from its random_state; its default
# (17) equals the training seed above.
performance_loss(rnd_tree_128, csv_path="../ParamSpMM-log/", dim=128)
with open(pkl_rnd_tree_128_file, "wb") as f:
    pickle.dump(rnd_tree_128, f)

Test score: 0.7073170731707317
Average speedup of ParamSpMM over cusparse:  1.9671087158319827
dim128 Wrong number12 in 41
Average normalized performance:  0.9929905770908214


In [15]:
# Train the dim-144 forest, report speedup and held-out loss, then save the model.
pkl_rnd_tree_144_file = "./rnd_tree_144.pkl"
rnd_tree_144 = rnd_tree_model(dim=144, n_estimators=30, random_state=17)
performance_spd(rnd_tree_144, dim=144)
# performance_loss rebuilds the test split from its random_state; its default
# (17) equals the training seed above.
performance_loss(rnd_tree_144, csv_path="../ParamSpMM-log/", dim=144)
with open(pkl_rnd_tree_144_file, "wb") as f:
    pickle.dump(rnd_tree_144, f)

Test score: 0.8780487804878049
Average speedup of ParamSpMM over cusparse:  2.003422847450495
dim144 Wrong number5 in 41
Average normalized performance:  0.9928413813979304


In [16]:
# Train the dim-160 forest, report speedup and held-out loss, then save the model.
pkl_rnd_tree_160_file = "./rnd_tree_160.pkl"
rnd_tree_160 = rnd_tree_model(dim=160, n_estimators=50, random_state=17)
performance_spd(rnd_tree_160, dim=160)
# performance_loss rebuilds the test split from its random_state; its default
# (17) equals the training seed above.
performance_loss(rnd_tree_160, csv_path="../ParamSpMM-log/", dim=160)
with open(pkl_rnd_tree_160_file, "wb") as f:
    pickle.dump(rnd_tree_160, f)

Test score: 0.8780487804878049
Average speedup of ParamSpMM over cusparse:  1.9033393393599127
dim160 Wrong number5 in 41
Average normalized performance:  0.9973749845796686


In [17]:
# Train the dim-176 forest, report speedup and held-out loss, then save the model.
pkl_rnd_tree_176_file = "./rnd_tree_176.pkl"
rnd_tree_176 = rnd_tree_model(dim=176, n_estimators=30, random_state=17)
performance_spd(rnd_tree_176, dim=176)
# performance_loss rebuilds the test split from its random_state; its default
# (17) equals the training seed above.
performance_loss(rnd_tree_176, csv_path="../ParamSpMM-log/", dim=176)
with open(pkl_rnd_tree_176_file, "wb") as f:
    pickle.dump(rnd_tree_176, f)

Test score: 0.7317073170731707
Average speedup of ParamSpMM over cusparse:  1.877301333678809
dim176 Wrong number11 in 41
Average normalized performance:  0.9920806916796939


In [18]:
# Train the dim-192 forest, report speedup and held-out loss, then save the model.
pkl_rnd_tree_192_file = "./rnd_tree_192.pkl"
rnd_tree_192 = rnd_tree_model(dim=192, n_estimators=30, random_state=17)
performance_spd(rnd_tree_192, dim=192)
# performance_loss rebuilds the test split from its random_state; its default
# (17) equals the training seed above.
performance_loss(rnd_tree_192, csv_path="../ParamSpMM-log/", dim=192)
with open(pkl_rnd_tree_192_file, "wb") as f:
    pickle.dump(rnd_tree_192, f)

Test score: 0.7073170731707317
Average speedup of ParamSpMM over cusparse:  1.8732940336041044
dim192 Wrong number12 in 41
Average normalized performance:  0.9896369488751348


In [19]:
# Train the dim-208 forest, report speedup and held-out loss, then save the model.
pkl_rnd_tree_208_file = "./rnd_tree_208.pkl"
rnd_tree_208 = rnd_tree_model(dim=208, n_estimators=80, random_state=17)
performance_spd(rnd_tree_208, dim=208)
# performance_loss rebuilds the test split from its random_state; its default
# (17) equals the training seed above.
performance_loss(rnd_tree_208, csv_path="../ParamSpMM-log/", dim=208)
with open(pkl_rnd_tree_208_file, "wb") as f:
    pickle.dump(rnd_tree_208, f)

Test score: 0.9024390243902439
Average speedup of ParamSpMM over cusparse:  1.7694193201579493
dim208 Wrong number4 in 41
Average normalized performance:  0.9929091574405505


In [20]:
# Train the dim-224 forest, report speedup and held-out loss, then save the model.
pkl_rnd_tree_224_file = "./rnd_tree_224.pkl"
rnd_tree_224 = rnd_tree_model(dim=224, n_estimators=80, random_state=17)
performance_spd(rnd_tree_224, dim=224)
# performance_loss rebuilds the test split from its random_state; its default
# (17) equals the training seed above.
performance_loss(rnd_tree_224, csv_path="../ParamSpMM-log/", dim=224)
with open(pkl_rnd_tree_224_file, "wb") as f:
    pickle.dump(rnd_tree_224, f)

Test score: 0.8048780487804879
Average speedup of ParamSpMM over cusparse:  1.7529066170207892
dim224 Wrong number8 in 41
Average normalized performance:  0.9967410365842951


In [21]:
# Train the dim-240 forest, report speedup and held-out loss, then save the model.
pkl_rnd_tree_240_file = "./rnd_tree_240.pkl"
rnd_tree_240 = rnd_tree_model(dim=240, n_estimators=30, random_state=17)
performance_spd(rnd_tree_240, dim=240)
# performance_loss rebuilds the test split from its random_state; its default
# (17) equals the training seed above.
performance_loss(rnd_tree_240, csv_path="../ParamSpMM-log/", dim=240)
with open(pkl_rnd_tree_240_file, "wb") as f:
    pickle.dump(rnd_tree_240, f)

Test score: 0.7317073170731707
Average speedup of ParamSpMM over cusparse:  1.874424849659235
dim240 Wrong number11 in 41
Average normalized performance:  0.9919023974157288


In [28]:
# Train the dim-256 forest, report speedup and held-out loss, then save the model.
pkl_rnd_tree_256_file = "./rnd_tree_256.pkl"
rnd_tree_256 = rnd_tree_model(dim=256, n_estimators=80, random_state=17)
performance_spd(rnd_tree_256, dim=256)
# performance_loss rebuilds the test split from its random_state; its default
# (17) equals the training seed above.
performance_loss(rnd_tree_256, csv_path="../ParamSpMM-log/", dim=256)
with open(pkl_rnd_tree_256_file, "wb") as f:
    pickle.dump(rnd_tree_256, f)

Test score: 0.7073170731707317
Average speedup of ParamSpMM over cusparse:  1.9311552764911122
dim256 Wrong number12 in 41
Average normalized performance:  0.9875003393276511
