In [1]:
import pandas as pd
import numpy as np

train = pd.read_csv('/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/dataset/spmm-latency-dataset/extract-dataset-using-d-optimal/dataset/nonsquare-train-1035-from-spmm-contain-todense-over-3s-1293.csv')
test = pd.read_csv('/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/dataset/spmm-latency-dataset/extract-dataset-using-d-optimal/dataset/nonsquare-test-258-from-spmm-contain-todense-over-3s-1293.csv')

In [2]:
X_train = train[['lr','lc','rc','ld','rd','lnnz','rnnz','lr*lc','lc*rc','lr*rc','ld*rd']] 
y_train = train['bz_smsm']

X_test = test[['lr','lc','rc','ld','rd','lnnz','rnnz','lr*lc','lc*rc','lr*rc','ld*rd']] 
y_test = test['bz_smsm']

In [3]:
# X_train = train[['lr','lc','rc','ld','rd','lnnz','rnnz']] 
# y_train = train['bz_smsm']

# # Test
# X_test = test[['lr','lc','rc','ld','rd','lnnz','rnnz']] 
# y_test = test['bz_smsm']

In [4]:
from sklearn.metrics import r2_score, mean_squared_error

# Metric
def mean_absolute_percentage_error(y_test, y_pred):
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

def custom_scoring(real, pred):
    rmse = np.sqrt(mean_squared_error(real, pred))
    r2 = r2_score(real, pred)
    mape = mean_absolute_percentage_error(real, pred)
    return mape

In [5]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer

# 탐색 대상 함수 (XGBRegressor)
def XGB(
max_depth,
learning_rate, 
n_estimators, 
#min_child_weight, 
subsample,
_lambda,
#gamma ,
colsample_bytree, 
#_alpha,
silent=True, 
n_jobs=-1):
    
    # 모델 정의
    model = xgb.XGBRegressor( 
objective = 'reg:squarederror',
max_depth=int(max_depth),
learning_rate=learning_rate,
n_estimators=int(n_estimators),
#min_child_weight=min_child_weight,
subsample=subsample,
reg_lambda=_lambda,        
#gamma=gamma,
colsample_bytree=colsample_bytree, 
#reg_alpha=_alpha
n_jobs=n_jobs        
                              )
    
    # bayesian optimization을 통해 파라미터를 받아
    # Train을 Train + Validation으로 나눠 cross-validation 성능 확인
    kfold = KFold(n_splits=5, shuffle = True, random_state=0)

    # cross-validation 평균 성능 성능 확인
    score = cross_val_score(model,
                            X_train, 
                            y_train, 
                            cv=kfold,
                            scoring=make_scorer(custom_scoring,greater_is_better=False),
                            n_jobs=-1
                           ).mean()
    
    return score

In [6]:
from bayes_opt import BayesianOptimization

# 하이퍼파라미터 정의
pbounds = {
'max_depth': (15, 23), 
'learning_rate': (0.01, 0.3),
'n_estimators': (50, 150),
#'min_child_weight': (0.1, 1),
'subsample': (0.5, 1), 
'_lambda' : (0.1,1),    
#'gamma': (0, 0.3),     
'colsample_bytree' :(0.5, 1)
#'_alpha' : (0,1)           
                      }

# Bayesian optimization 객체 생성
bo=BayesianOptimization(f=XGB, pbounds=pbounds, verbose=2, random_state=1 )    

# 메소드를 이용해 최대화 과정 수행 (파라미터 넣고 목적함수 값 출력하고)
bo.maximize(init_points=2, n_iter=50, acq='ei', xi=0.01)

# 뽑힌 최적의 하이퍼파라미터 값 확인
print("{}\n".format(bo.max))

|   iter    |  target   |  _lambda  | colsam... | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-48.0    [0m | [0m 0.4753  [0m | [0m 0.8602  [0m | [0m 0.01003 [0m | [0m 17.42   [0m | [0m 64.68   [0m | [0m 0.5462  [0m |
| [95m 2       [0m | [95m-32.01   [0m | [95m 0.2676  [0m | [95m 0.6728  [0m | [95m 0.1251  [0m | [95m 19.31   [0m | [95m 91.92   [0m | [95m 0.8426  [0m |
| [95m 3       [0m | [95m-28.41   [0m | [95m 0.4741  [0m | [95m 0.7286  [0m | [95m 0.07077 [0m | [95m 19.09   [0m | [95m 92.1    [0m | [95m 0.7691  [0m |
| [0m 4       [0m | [0m-31.01   [0m | [0m 0.6379  [0m | [0m 0.6592  [0m | [0m 0.2023  [0m | [0m 18.59   [0m | [0m 67.22   [0m | [0m 0.6515  [0m |
| [0m 5       [0m | [0m-67.5    [0m | [0m 0.5339  [0m | [0m 0.5366  [0m | [0m 0.2357  [0m | [0m 18.73   [0m | [0m 67.37   [0m 

| [0m 51      [0m | [0m-32.18   [0m | [0m 0.1991  [0m | [0m 0.6673  [0m | [0m 0.2264  [0m | [0m 22.55   [0m | [0m 141.3   [0m | [0m 0.8711  [0m |
| [0m 52      [0m | [0m-30.93   [0m | [0m 0.1305  [0m | [0m 0.6525  [0m | [0m 0.0258  [0m | [0m 19.78   [0m | [0m 128.4   [0m | [0m 0.633   [0m |
{'target': -24.880260281082748, 'params': {'_lambda': 0.9671716237976958, 'colsample_bytree': 0.9784517810866844, 'learning_rate': 0.04507011754186229, 'max_depth': 16.992869637868704, 'n_estimators': 62.21252826172613, 'subsample': 0.8545270296912297}}



In [7]:
# import xgboost as xgb

# # Train + Valid cross-validation을 거친, 최적의 하이퍼파라미터를 사용
# best_model = xgb.XGBRegressor(
# objective = 'reg:squarederror',
# max_depth=int(bo.max['params']['max_depth']),
# learning_rate=bo.max['params']['learning_rate'],
# n_estimators=int(bo.max['params']['n_estimators']),
# #min_child_weight=bo.max['params']['min_child_weight'],
# subsample=bo.max['params']['subsample'],
# reg_lambda = bo.max['params']['_lambda'],    
# #gamma=bo.max['params']['gamma'],
# colsample_bytree=bo.max['params']['colsample_bytree'],
# #reg_alpha = bo.max['params']['_alpha'],
# n_jobs=-1
#                              )
# # 모델 훈련
# best_model.fit(X_train, y_train)

In [8]:
# # 최적의 하이퍼파라미터 값 확인
# print("{}\n".format(bo.max))

# # 훈련데이터 예측
# y_train_pred = best_model.predict(X_train)
# print("-------- 훈련데이터 예측 --------------------------")
# print("rmse : {}".format(np.sqrt(mean_squared_error(y_train, y_train_pred))))
# print("mape : {}%".format(mean_absolute_percentage_error(y_train, y_train_pred)))
# print("\n")

# # 검증데이터 예측
# print("-------- 검증데이터 예측 --------------------------")
# print("mape : {}%".format(-bo.max['target']))
# print("\n")

# # 테스트데이터 예측
# y_pred = best_model.predict(X_test)
# print("-------- 테스트데이터 예측 -------------------------")
# print("rmse : {}".format(np.sqrt(mean_squared_error(y_test, y_pred))))
# print("mape : {}%".format(mean_absolute_percentage_error(y_test, y_pred)))
# print("\n")

### 반복을 통해 MAPE 결과 수집

In [9]:
from sklearn.model_selection import train_test_split
    
mape_list = []

for i in range(10):

    X_train = train[['lr','lc','rc','ld','rd','lnnz','rnnz','lr*lc','lc*rc','lr*rc','ld*rd']] 
    y_train = train['bz_smsm']

    X_test = test[['lr','lc','rc','ld','rd','lnnz','rnnz','lr*lc','lc*rc','lr*rc','ld*rd']] 
    y_test = test['bz_smsm']
    
    X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.1)

    # Train + Valid cross-validation을 거친, 최적의 하이퍼파라미터를 사용
    best_model = xgb.XGBRegressor(
    objective = 'reg:squarederror',
    max_depth=int(bo.max['params']['max_depth']),
    learning_rate=bo.max['params']['learning_rate'],
    n_estimators=int(bo.max['params']['n_estimators']),
    #min_child_weight=bo.max['params']['min_child_weight'],
    subsample=bo.max['params']['subsample'],
    reg_lambda = bo.max['params']['_lambda'],    
    #gamma=bo.max['params']['gamma'],
    colsample_bytree=bo.max['params']['colsample_bytree'],
    #reg_alpha = bo.max['params']['_alpha'],
    n_jobs=-1
                                 )
    # 모델 훈련
    best_model.fit(X_train, y_train)

    # 테스트데이터 예측
    y_pred = best_model.predict(X_test)

    # MAPE 결과 추가
    mape_list.append(mean_absolute_percentage_error(y_test, y_pred))

mape_list = np.array(mape_list)
print(mape_list)
print("median : " , np.median(mape_list))
print("min : " , np.min(mape_list))
print("max : " , np.max(mape_list))

[18.98385344 21.25032731 20.98132174 19.31500564 19.17592758 19.57358014
 20.05220494 20.02417353 19.80080423 20.03103586]
median :  19.912488882279092
min :  18.983853444661335
max :  21.25032731464962


In [10]:
# mape_list = {}
# # 예측값, 실제값을 확인하며 mape 계산 후 mape_list에 삽입 
# for idx,value in enumerate(y_test):
#     mape_temp = {}
#     predicate = int(y_pred[idx])
#     mape = abs((value - predicate) / value) * 100
#     mape_temp['pred'] = predicate
#     mape_temp['real'] = value
#     mape_temp['mape'] = mape
#     mape_list[idx] = mape_temp
# mape_list_sort = sorted(mape_list.values(), key=lambda x:(x['mape']), reverse=True)
# mape_list_sort  