In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import ReLU
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Directory that holds the train/test CSV splits. It defaults to the location
# that was originally hard-coded in this cell; set the SPMM_DATA_DIR environment
# variable to read the same files from another directory without editing the notebook.
DATA_DIR = Path(os.environ.get(
    'SPMM_DATA_DIR',
    '/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/d-optimal/spmm-latency-traintest/train-test-csv',
))

# Load the two splits (file names are unchanged from the original cell).
train = pd.read_csv(DATA_DIR / 'nonsquare-train-1035-from-spmm-contain-todense-over-3s-1293.csv')
test = pd.read_csv(DATA_DIR / 'nonsquare-test-258-from-spmm-contain-todense-over-3s-1293.csv')

In [2]:
# Feature columns fed to both regressors; defined once so that the train and
# test frames are guaranteed to select the same columns in the same order.
FEATURE_COLUMNS = ['lr', 'lc', 'rc', 'ld', 'rd', 'lnnz', 'rnnz', 'lr*lc', 'lc*rc', 'lr*rc']

# Train: feature frame plus the two target columns.
X_train = train[FEATURE_COLUMNS]
sp_smdm_y_train, bz_smsm_y_train = train['sp_smdm'], train['bz_smsm']

# Test: same layout as the training split.
X_test = test[FEATURE_COLUMNS]
sp_smdm_y_test, bz_smsm_y_test = test['sp_smdm'], test['bz_smsm']

In [3]:
from sklearn.metrics import mean_squared_error

def mape_error(y_test, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` against ``y_test``.

    Both inputs are converted to NumPy arrays, each error is divided by the
    corresponding ``y_test`` value, and the mean of the absolute ratios is
    scaled to a percentage. There is no guard against zeros in ``y_test``.
    """
    actual = np.array(y_test)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.abs(relative_error).mean() * 100

def rmse_error(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    Both inputs are converted to float NumPy arrays before subtracting, the
    same way ``mape_error`` converts its inputs. This lets plain Python lists
    be passed (list - list would raise ``TypeError``) and makes the comparison
    positional, so two pandas Series are not re-aligned by index label.

    Parameters
    ----------
    y_true : array-like of numbers
        Reference values.
    y_pred : array-like of numbers
        Predicted values, same length as ``y_true``.

    Returns
    -------
    numpy.float64
        sqrt(mean((y_pred - y_true) ** 2)).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    rmse = np.sqrt(np.mean(np.square(y_pred - y_true)))
    return rmse

# sp_smdm

In [4]:
import xgboost as xgb

# Use the best hyperparameters obtained from train + validation cross-validation.
sp_smdm_params = dict(
    objective='reg:squarederror',
    max_depth=14,
    learning_rate=0.03506040707378152,
    # The tuned value is fractional; int() truncates it to 125 trees.
    n_estimators=int(125.55712369504641),
    subsample=0.618897607560714,
    reg_lambda=0.9083177560930208,
    min_child_weight=0.3293211705027276,
    n_jobs=-1,
)
sp_smdm_xgbregressor_model = xgb.XGBRegressor(**sp_smdm_params)

# Train the model on the sp_smdm target.
sp_smdm_xgbregressor_model.fit(X_train, sp_smdm_y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.03506040707378152, max_delta_step=0, max_depth=14,
             min_child_weight=0.3293211705027276, missing=nan,
             monotone_constraints='()', n_estimators=125, n_jobs=-1,
             num_parallel_tree=1, objective='reg:squarederror', random_state=0,
             reg_alpha=0, reg_lambda=0.9083177560930208, scale_pos_weight=1,
             subsample=0.618897607560714, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [5]:
# Build the final sp_smdm y_pred array: predict on the test features, then
# cast the predictions to int64.
sp_smdm_y_pred = sp_smdm_xgbregressor_model.predict(X_test).astype(np.int64)

# bz_smsm

In [6]:
import xgboost as xgb

# Use the best hyperparameters obtained from train + validation cross-validation.
bz_smsm_params = dict(
    objective='reg:squarederror',
    # int() truncates (it does not round): 16.99... becomes a depth of 16.
    max_depth=int(16.992869637868704),
    learning_rate=0.04507011754186229,
    # Likewise truncated, to 62 trees.
    n_estimators=int(62.21252826172613),
    subsample=0.8545270296912297,
    reg_lambda=0.9671716237976958,
    colsample_bytree=0.9784517810866844,
    n_jobs=-1,
)
bz_smsm_xgbregressor_model = xgb.XGBRegressor(**bz_smsm_params)

# Train the model on the bz_smsm target.
bz_smsm_xgbregressor_model.fit(X_train, bz_smsm_y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9784517810866844, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.04507011754186229, max_delta_step=0, max_depth=16,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=62, n_jobs=-1, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=0.9671716237976958, scale_pos_weight=1,
             subsample=0.8545270296912297, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [7]:
# Build the final bz_smsm y_pred array: predict on the test features, then
# cast the predictions to int64.
bz_smsm_y_pred = bz_smsm_xgbregressor_model.predict(X_test).astype(np.int64)

### sp_smdm_y_pred와 bz_smsm_y_pred를 사용해, y_pred_label 생성

In [8]:
# Label every test row with the position of the smaller predicted value:
# 0 -> sp_smdm, 1 -> bz_smsm. Label 1 requires bz_smsm to be strictly smaller,
# so a tie resolves to 0.
y_pred_label = [
    1 if bz_value < sp_value else 0
    for sp_value, bz_value in zip(sp_smdm_y_pred, bz_smsm_y_pred)
]

### 실제 sp_smdm과 bz_smsm을 통해, y_real_label 생성

In [9]:
# Label every test row with the position of the smaller measured value:
# 0 -> sp_smdm, 1 -> bz_smsm. Label 1 requires bz_smsm to be strictly smaller,
# so a tie resolves to 0.
y_real_label = [
    1 if bz_value < sp_value else 0
    for sp_value, bz_value in zip(test['sp_smdm'], test['bz_smsm'])
]

### y_pred_label과 y_real_label 간 accuracy 측정

In [10]:
from sklearn.metrics import accuracy_score

# accuracy_score's signature is (y_true, y_pred): pass the ground-truth labels
# first. The score itself is the same in either order.
accuracy_score(y_real_label, y_pred_label)

0.9031007751937985

### 결과 dataframe 생성

In [11]:
# Test-set inputs and measured sp_smdm / bz_smsm values, shown side by side
# with the predicted values and both label columns.
temp = test[['lr','lc','rc','ld','rd','lnnz','rnnz','sp_smdm','bz_smsm']]
temp.assign(
    y_pred_label=y_pred_label,
    sp_smdm_y_pred=sp_smdm_y_pred,
    bz_smsm_y_pred=bz_smsm_y_pred,
    y_real_label=y_real_label,
)

Unnamed: 0,lr,lc,rc,ld,rd,lnnz,rnnz,sp_smdm,bz_smsm,y_pred_label,sp_smdm_y_pred,bz_smsm_y_pred,y_real_label
0,78717,99615,6291,0.000153,0.05,1199504,31338809,20561,33460,0,20584,38067,0
1,126899,52210,12948,0.001825,0.07,12092788,47322584,538814,309445,1,534198,275405,1
2,131403,43733,7722,0.000084,0.10,484397,33774444,29621,34386,0,27482,36325,0
3,122724,61616,2796,0.000319,0.30,2415325,51698106,37265,65907,0,30020,60143,0
4,11006,60966,19826,0.000134,0.03,89670,36262371,16860,4244,1,17630,5540,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
253,105338,18403,7308,0.000021,0.25,40477,33625627,19112,5371,1,18110,4537,1
254,113424,5936,9464,0.000823,0.13,554023,7304292,18685,45741,0,18484,52323,0
255,73228,31311,712,0.001825,0.03,4184969,669062,5549,4353,1,7869,6413,1
256,78441,27920,14057,0.009521,0.13,20851937,51025114,520580,434655,1,535774,419398,1
