In [1]:
import pandas as pd
import numpy as np

# Train / test splits. The file names suggest 1035 training rows and 258 test rows
# (presumably out of 1293 samples in total -- verify against the CSVs).
# The directory is kept in one place so it can be repointed without editing each path.
DATA_DIR = '/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/d-optimal/spmm-latency-traintest/train-test-csv'

train = pd.read_csv(DATA_DIR + '/nonsquare-train-1035-from-spmm-contain-todense-over-3s-1293.csv')
test = pd.read_csv(DATA_DIR + '/nonsquare-test-258-from-spmm-contain-todense-over-3s-1293.csv')

In [2]:
# Predictor columns, shared by both splits so train and test can never drift apart.
# Names such as 'lr*lc' are literal CSV column headers; nothing is multiplied here.
FEATURE_COLUMNS = [
    'lr', 'lc', 'rc', 'ld', 'rd', 'lnnz', 'rnnz',
    'lr*lc', 'lc*rc', 'lr*rc', 'lr*lc*rc',
    'ld*rd', 'lr*rc*ld*rd', 'lr*lc*rc*ld*rd', 'lnnz*rnnz',
]
# Regression target (presumably a measured latency -- confirm against the data source).
TARGET_COLUMN = 'sp_smdm'

# Train + Valid (this split is later divided further by cross-validation)
X_train = train[FEATURE_COLUMNS]
y_train = train[TARGET_COLUMN]

# Test
X_test = test[FEATURE_COLUMNS]
y_test = test[TARGET_COLUMN]

In [3]:
from sklearn.metrics import r2_score, mean_squared_error

# Metric
def mean_absolute_percentage_error(y_test, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Parameters
    ----------
    y_test : array-like
        Ground-truth values; used as the denominator of the relative error.
    y_pred : array-like
        Predicted values, element-wise aligned with ``y_test``.

    Returns
    -------
    float
        ``mean(|y_test - y_pred| / |y_test|) * 100``.

    Notes
    -----
    No guard is applied for zero entries in ``y_test``; the division then
    follows NumPy semantics (inf / nan).
    """
    actual = np.array(y_test)
    predicted = np.array(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return np.mean(relative_error) * 100

def custom_scoring(real, pred):
    """Score predictions for hyperparameter search: returns the MAPE in percent.

    Parameters
    ----------
    real : array-like
        Ground-truth target values.
    pred : array-like
        Model predictions aligned with ``real``.

    Returns
    -------
    float
        Mean absolute percentage error of ``pred`` against ``real``
        (lower is better).
    """
    # Only MAPE is used as the objective, so nothing else is computed here;
    # this function runs once per cross-validation fold.
    return mean_absolute_percentage_error(real, pred)

In [4]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer

# 탐색 대상 함수 (XGBRegressor)
def XGB(
    max_depth,
    learning_rate,
    n_estimators,
    min_child_weight,
    subsample,
    _lambda,
    gamma,
    colsample_bytree,
    _alpha,
    silent=True,
    n_jobs=-1,
):
    """Objective function for the hyperparameter search over an XGBRegressor.

    Builds an ``xgb.XGBRegressor`` from the given hyperparameters and evaluates
    it on the module-level ``X_train`` / ``y_train`` with shuffled 12-fold
    cross-validation.

    Parameters
    ----------
    max_depth, n_estimators : float
        Truncated to ``int`` before being passed to the regressor.
    learning_rate, min_child_weight, subsample, gamma, colsample_bytree : float
        Forwarded unchanged to the regressor.
    _lambda, _alpha : float
        Forwarded as ``reg_lambda`` and ``reg_alpha`` respectively.
    silent : bool
        Accepted but not used by this function.
    n_jobs : int
        Forwarded to the regressor as ``n_jobs``.

    Returns
    -------
    float
        Mean cross-validation score. The scorer wraps ``custom_scoring`` (MAPE)
        with ``greater_is_better=False``, so the value is the negated MAPE and
        larger is better.
    """
    # Model built from the candidate hyperparameters.
    regressor = xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        min_child_weight=min_child_weight,
        gamma=gamma,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=_alpha,
        reg_lambda=_lambda,
        n_jobs=n_jobs,
    )

    # Split the training data into train + validation folds (fixed seed so every
    # candidate is scored on the same folds).
    splitter = KFold(n_splits=12, shuffle=True, random_state=0)
    neg_mape_scorer = make_scorer(custom_scoring, greater_is_better=False)

    # Average validation score across the folds.
    fold_scores = cross_val_score(
        regressor,
        X_train,
        y_train,
        cv=splitter,
        scoring=neg_mape_scorer,
        n_jobs=-1,
    )
    return fold_scores.mean()

In [5]:
from bayes_opt import BayesianOptimization

# # 하이퍼파라미터 정의
# pbounds = {
# 'max_depth': (12, 16), 
# 'learning_rate': (0.01, 0.3),
# 'n_estimators': (50, 150),
# 'min_child_weight': (0.1, 1),
# 'subsample': (0.5, 1), 
# '_lambda' : (0.1,1),    
# #'gamma': (0, 0.3),     
# #'colsample_bytree' :(0.5, 1)
# #'_alpha' : (0,1)           
#                       }

# Search space: (lower, upper) bound for each argument of XGB.
pbounds = dict(
    max_depth=(5, 10),
    learning_rate=(0.01, 0.3),
    n_estimators=(6000, 7000),
    min_child_weight=(0.1, 1),
    subsample=(0.5, 1),
    _lambda=(0.1, 1),
    gamma=(0, 0.6),
    colsample_bytree=(0.5, 1),
    _alpha=(0, 0.1),
)

# Create the Bayesian optimization object (seeded for repeatable sampling).
bo = BayesianOptimization(
    f=XGB,
    pbounds=pbounds,
    verbose=2,
    random_state=1,
)

# Run the maximization: 2 initial points followed by 50 guided iterations.
# XGB returns the negated MAPE, so maximizing it minimizes the MAPE.
bo.maximize(init_points=2, n_iter=50, acq='ei', xi=0.01)

# Show the best hyperparameters found.
print("{}\n".format(bo.max))

|   iter    |  target   |  _alpha   |  _lambda  | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-24.14   [0m | [0m 0.0417  [0m | [0m 0.7483  [0m | [0m 0.5001  [0m | [0m 0.1814  [0m | [0m 0.05256 [0m | [0m 5.462   [0m | [0m 0.2676  [0m | [0m 6.346e+0[0m | [0m 0.6984  [0m |
| [0m 2       [0m | [0m-24.54   [0m | [0m 0.05388 [0m | [0m 0.4773  [0m | [0m 0.8426  [0m | [0m 0.1227  [0m | [0m 0.2647  [0m | [0m 5.137   [0m | [0m 0.7034  [0m | [0m 6.417e+0[0m | [0m 0.7793  [0m |
| [95m 3       [0m | [95m-20.36   [0m | [95m 0.01747 [0m | [95m 1.0     [0m | [95m 0.5     [0m | [95m 0.358   [0m | [95m 0.01    [0m | [95m 6.257   [0m | [95m 0.1     [0m | [95m 6.132e+0[0m | [95m 0.533   [0m |
| [0m 4       [0m | [0m-22.44   [0m | [0m 0.08453 



| [0m 8       [0m | [0m-20.26   [0m | [0m 0.09268 [0m | [0m 0.9652  [0m | [0m 0.6934  [0m | [0m 0.4552  [0m | [0m 0.1024  [0m | [0m 9.089   [0m | [0m 0.7103  [0m | [0m 6.313e+0[0m | [0m 0.5961  [0m |
| [0m 9       [0m | [0m-23.27   [0m | [0m 0.09514 [0m | [0m 0.6014  [0m | [0m 0.9215  [0m | [0m 0.1036  [0m | [0m 0.2447  [0m | [0m 6.712   [0m | [0m 0.7741  [0m | [0m 6.835e+0[0m | [0m 0.6578  [0m |
| [95m 10      [0m | [95m-19.25   [0m | [95m 0.06104 [0m | [95m 0.1406  [0m | [95m 0.7181  [0m | [95m 0.2825  [0m | [95m 0.04649 [0m | [95m 6.577   [0m | [95m 0.3175  [0m | [95m 6.84e+03[0m | [95m 0.9259  [0m |
| [0m 11      [0m | [0m-22.11   [0m | [0m 0.00288 [0m | [0m 0.2896  [0m | [0m 0.8429  [0m | [0m 0.391   [0m | [0m 0.2904  [0m | [0m 8.684   [0m | [0m 0.1916  [0m | [0m 6.932e+0[0m | [0m 0.7577  [0m |
| [0m 12      [0m | [0m-20.29   [0m | [0m 0.01901 [0m | [0m 0.4127  [0m | [0m 0.9913  [0m |

| [0m 45      [0m | [0m-21.49   [0m | [0m 0.02863 [0m | [0m 0.2351  [0m | [0m 0.8934  [0m | [0m 0.04734 [0m | [0m 0.1661  [0m | [0m 6.295   [0m | [0m 0.9108  [0m | [0m 6.077e+0[0m | [0m 0.9761  [0m |
| [0m 46      [0m | [0m-21.66   [0m | [0m 0.0185  [0m | [0m 0.4518  [0m | [0m 0.8934  [0m | [0m 0.3766  [0m | [0m 0.1525  [0m | [0m 7.865   [0m | [0m 0.2252  [0m | [0m 6.133e+0[0m | [0m 0.522   [0m |
| [0m 47      [0m | [0m-20.5    [0m | [0m 0.02351 [0m | [0m 0.241   [0m | [0m 0.6965  [0m | [0m 0.2244  [0m | [0m 0.07838 [0m | [0m 9.64    [0m | [0m 0.6006  [0m | [0m 6.201e+0[0m | [0m 0.9675  [0m |
| [0m 48      [0m | [0m-18.0    [0m | [0m 0.02913 [0m | [0m 0.5073  [0m | [0m 0.9795  [0m | [0m 0.5367  [0m | [0m 0.05994 [0m | [0m 7.494   [0m | [0m 0.5586  [0m | [0m 6.655e+0[0m | [0m 0.7248  [0m |
| [0m 49      [0m | [0m-19.26   [0m | [0m 0.03599 [0m | [0m 0.5022  [0m | [0m 0.7268  [0m | [0m 0.335

KeyboardInterrupt: 

In [None]:
import xgboost as xgb

# Rebuild the regressor from the best hyperparameters found by the
# cross-validated Bayesian search.
best_params = bo.max['params']

best_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=int(best_params['n_estimators']),
    max_depth=int(best_params['max_depth']),
    learning_rate=best_params['learning_rate'],
    min_child_weight=best_params['min_child_weight'],
    gamma=best_params['gamma'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    reg_alpha=best_params['_alpha'],
    reg_lambda=best_params['_lambda'],
    n_jobs=-1,
)

# Train on the full training split.
best_model.fit(X_train, y_train)

In [None]:
# Show the best hyperparameters found.
print("{}\n".format(bo.max))

# Training-set predictions.
y_train_pred = best_model.predict(X_train)
print("-------- 훈련데이터 예측 --------------------------")
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
print("rmse : {}".format(train_rmse))
train_mape = mean_absolute_percentage_error(y_train, y_train_pred)
print("mape : {}%".format(train_mape))
print("\n")

# Validation performance: the optimizer's best target is the negated
# cross-validation MAPE, so flip the sign back for reporting.
print("-------- 검증데이터 예측 --------------------------")
validation_mape = -bo.max['target']
print("mape : {}%".format(validation_mape))
print("\n")

# Test-set predictions.
y_pred = best_model.predict(X_test)
print("-------- 테스트데이터 예측 -------------------------")
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("rmse : {}".format(test_rmse))
test_mape = mean_absolute_percentage_error(y_test, y_pred)
print("mape : {}%".format(test_mape))
print("\n")

In [None]:
# mape_list = {}
# # 예측값, 실제값을 확인하며 mape 계산 후 mape_list에 삽입 
# for idx,value in enumerate(y_test):
#     mape_temp = {}
#     predicate = int(y_pred[idx])
#     mape = abs((value - predicate) / value) * 100
#     mape_temp['pred'] = predicate
#     mape_temp['real'] = value
#     mape_temp['mape'] = mape
#     mape_list[idx] = mape_temp
# mape_list_sort = sorted(mape_list.values(), key=lambda x:(x['mape']), reverse=True)
# mape_list_sort  

In [None]:
# Plot the feature importances of the fitted best model.
xgb.plot_importance(booster=best_model)