In [1]:
import pandas as pd
import numpy as np

# Directory that holds the train/test CSV files. Defined once so the notebook
# can be pointed at a different copy of the data by editing a single line.
# NOTE(review): this is still an absolute, machine-specific path; consider a
# path relative to the repository instead.
DATA_DIR = '/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/d-optimal/d-optimal-of-spmm/train-test-csv'

# Load the training and test splits.
train = pd.read_csv(DATA_DIR + '/1164-train-from-1293-spmm-over-3s-and-add-feature.csv')
test = pd.read_csv(DATA_DIR + '/129-test-from-1293-spmm-over-3s-and-add-feature.csv')

In [2]:
# Columns fed to the model as inputs; shared by the train and test selections
# so the two cannot drift apart.
FEATURE_COLUMNS = [
    'lr', 'lc', 'rc', 'ld', 'rd', 'lnnz', 'rnnz',
    'lr*lc', 'lc*rc', 'lr*rc', 'lr*lc*rc',
    'ld*rd', 'lr*rc*ld*rd', 'lr*lc*rc*ld*rd',
]
# Column used as the regression target.
TARGET_COLUMN = 'sp_smdm'

# Train + Valid (this frame is later split again by cross-validation)
X_train = train[FEATURE_COLUMNS]
y_train = train[TARGET_COLUMN]

# Test
X_test = test[FEATURE_COLUMNS]
y_test = test[TARGET_COLUMN]

In [3]:
from sklearn.metrics import r2_score, mean_squared_error

# Metric
def mean_absolute_percentage_error(y_test, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Args:
        y_test: Array-like of true target values.
        y_pred: Array-like of predicted values.

    Returns:
        ``mean(|y_test - y_pred| / max(|y_test|, eps)) * 100`` where ``eps`` is
        the float machine epsilon.
    """
    # Flatten both inputs to 1-D floats so that an (n, 1) prediction column
    # cannot silently broadcast against an (n,) target into an (n, n) matrix.
    y_test = np.asarray(y_test, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    # Clamp the denominator at machine epsilon: a true value of exactly zero
    # then yields a large but finite error instead of inf/nan.
    denominator = np.maximum(np.abs(y_test), np.finfo(float).eps)
    return np.mean(np.abs(y_test - y_pred) / denominator) * 100

def custom_scoring(real, pred):
    """Score predictions by their MAPE, in percent (lower is better).

    Args:
        real: True target values.
        pred: Predicted values.

    Returns:
        The value of ``mean_absolute_percentage_error(real, pred)``.
    """
    # Only the MAPE is returned to the caller, so no other metric is computed
    # here; RMSE is reported separately where it is actually printed.
    return mean_absolute_percentage_error(real, pred)

In [4]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer

# Objective function for the hyperparameter search (XGBRegressor)
def XGB(
    max_depth,
    learning_rate,
    n_estimators,
    min_child_weight,
    subsample,
    _lambda,
    # gamma,
    # colsample_bytree,
    # _alpha,
    silent=True,
    n_jobs=-1,
):
    """Cross-validate an XGBRegressor for one hyperparameter candidate.

    The function reads the module-level ``X_train`` and ``y_train``.

    Args:
        max_depth: Tree depth; truncated to an integer with ``int()``.
        learning_rate: Passed through to the regressor unchanged.
        n_estimators: Number of trees; truncated to an integer with ``int()``.
        min_child_weight: Passed through to the regressor unchanged.
        subsample: Passed through to the regressor unchanged.
        _lambda: Forwarded to the regressor as ``reg_lambda``.
        silent: Accepted but not used inside this function.
        n_jobs: Forwarded to the regressor as ``n_jobs``.

    Returns:
        The mean over the 9 folds of the scorer output. The scorer wraps
        ``custom_scoring`` with ``greater_is_better=False``, so the value is
        sign-flipped and a larger (less negative) result is better.
    """
    # Model definition
    regressor_kwargs = dict(
        objective='reg:squarederror',
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        min_child_weight=min_child_weight,
        subsample=subsample,
        reg_lambda=_lambda,
        # gamma=gamma,
        # colsample_bytree=colsample_bytree,
        # reg_alpha=_alpha,
        n_jobs=n_jobs,
    )
    model = xgb.XGBRegressor(**regressor_kwargs)

    # With the parameters proposed by the Bayesian optimizer, split the
    # training data into train + validation folds for cross-validation.
    folds = KFold(n_splits=9, shuffle=True, random_state=0)
    negated_mape = make_scorer(custom_scoring, greater_is_better=False)

    # Average cross-validation score across the folds.
    # NOTE(review): the regressor and cross_val_score both receive n_jobs=-1
    # by default; whether that oversubscribes the CPU has not been measured.
    fold_scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=folds,
        scoring=negated_mape,
        n_jobs=-1,
    )
    return fold_scores.mean()

In [5]:
from bayes_opt import BayesianOptimization

# Hyperparameter search space: parameter name -> (lower bound, upper bound)
# NOTE(review): the saved output of this cell lists max_depth values between
# 15 and about 21.7, which lie outside the (11, 14) range below, so that output
# appears to come from a run with different bounds. Re-run to refresh it.
pbounds = {
    'max_depth': (11, 14),
    'learning_rate': (0.01, 0.3),
    'n_estimators': (50, 100),
    'min_child_weight': (0.1, 1),
    'subsample': (0.5, 1),
    '_lambda': (0.1, 1),
    # 'gamma': (0, 0.3),
    # 'colsample_bytree': (0.5, 1),
    # '_alpha': (0, 1),
}

# Create the Bayesian optimization object
bo = BayesianOptimization(
    f=XGB,
    pbounds=pbounds,
    verbose=2,
    random_state=1,
)

# Run the maximization (evaluate the objective for each proposed parameter set)
bo.maximize(
    init_points=2,
    n_iter=50,
    acq='ei',
    xi=0.01,
)

# Show the best hyperparameters found
print("{}\n".format(bo.max))

|   iter    |  target   |  _lambda  | learni... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-23.3    [0m | [0m 0.4753  [0m | [0m 0.2189  [0m | [0m 15.0    [0m | [0m 0.3721  [0m | [0m 57.34   [0m | [0m 0.5462  [0m |
| [95m 2       [0m | [95m-20.09   [0m | [95m 0.2676  [0m | [95m 0.1102  [0m | [95m 18.97   [0m | [95m 0.5849  [0m | [95m 70.96   [0m | [95m 0.8426  [0m |
| [0m 3       [0m | [0m-20.19   [0m | [0m 0.3008  [0m | [0m 0.1009  [0m | [0m 18.94   [0m | [0m 0.3342  [0m | [0m 71.19   [0m | [0m 0.7811  [0m |
| [0m 4       [0m | [0m-24.03   [0m | [0m 0.1     [0m | [0m 0.3     [0m | [0m 21.66   [0m | [0m 1.0     [0m | [0m 68.57   [0m | [0m 1.0     [0m |
| [0m 5       [0m | [0m-22.88   [0m | [0m 0.1793  [0m | [0m 0.06113 [0m | [0m 17.73   [0m | [0m 1.0     [0m | [0m 70.98   [0m | [0m 1

| [0m 51      [0m | [0m-22.03   [0m | [0m 0.2373  [0m | [0m 0.1692  [0m | [0m 18.24   [0m | [0m 0.1974  [0m | [0m 71.29   [0m | [0m 0.9836  [0m |
| [0m 52      [0m | [0m-20.45   [0m | [0m 0.1305  [0m | [0m 0.09845 [0m | [0m 15.54   [0m | [0m 0.6377  [0m | [0m 89.21   [0m | [0m 0.633   [0m |
{'target': -19.089683174022092, 'params': {'_lambda': 0.9998490383930189, 'learning_rate': 0.03405736231340852, 'max_depth': 18.09981371031183, 'min_child_weight': 0.37607617264488746, 'n_estimators': 66.38685400108382, 'subsample': 0.6258819254429664}}



In [6]:
import xgboost as xgb

# Use the best hyperparameters found by the cross-validated search on Train + Valid
best_params = bo.max['params']

best_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    # max_depth and n_estimators are searched as floats, so truncate with
    # int() exactly as the XGB objective function does.
    max_depth=int(best_params['max_depth']),
    learning_rate=best_params['learning_rate'],
    n_estimators=int(best_params['n_estimators']),
    min_child_weight=best_params['min_child_weight'],
    subsample=best_params['subsample'],
    reg_lambda=best_params['_lambda'],
    # gamma=best_params['gamma'],
    # colsample_bytree=best_params['colsample_bytree'],
    # reg_alpha=best_params['_alpha'],
    n_jobs=-1,
)

# Train the model on the full training set
best_model.fit(X_train, y_train)

XGBRegressor(learning_rate=0.03405736231340852, max_depth=18,
             min_child_weight=0.37607617264488746, n_estimators=66, n_jobs=-1,
             objective='reg:squarederror', reg_lambda=0.9998490383930189,
             subsample=0.6258819254429664)

In [7]:
def _print_errors(header, actual, predicted):
    """Print a header line followed by the RMSE and MAPE of ``predicted`` vs ``actual``."""
    print(header)
    print("rmse : {}".format(np.sqrt(mean_squared_error(actual, predicted))))
    print("mape : {}%".format(mean_absolute_percentage_error(actual, predicted)))
    print("\n")


# Show the best hyperparameters
print("{}\n".format(bo.max))

# Train -> Train error
y_train_pred = best_model.predict(X_train)
_print_errors("-------- train -> train -------------------------", y_train, y_train_pred)

# Train -> dev (cross-validation) error; the optimizer target is the negated
# MAPE, so flip the sign back before printing.
print("-------- train -> dev (cross-validation) --------")
print("mape : {}%".format(-bo.max['target']))
print("\n")

# Train -> Test error
y_pred = best_model.predict(X_test)
_print_errors("-------- train -> test --------------------------", y_test, y_pred)

{'target': -19.089683174022092, 'params': {'_lambda': 0.9998490383930189, 'learning_rate': 0.03405736231340852, 'max_depth': 18.09981371031183, 'min_child_weight': 0.37607617264488746, 'n_estimators': 66.38685400108382, 'subsample': 0.6258819254429664}}

-------- train -> train -------------------------
rmse : 24209.57388391213
mape : 11.49255574114531%


-------- train -> dev (cross-validation) --------
mape : 19.089683174022092%


-------- train -> test --------------------------
rmse : 19251.545947353585
mape : 17.044703043818103%




In [9]:
# mape_list = {}
# # Compare each prediction with its actual value, compute the per-sample MAPE, and insert it into mape_list
# for idx,value in enumerate(y_test):
#     mape_temp = {}
#     predicate = int(y_pred[idx])
#     mape = abs((value - predicate) / value) * 100
#     mape_temp['pred'] = predicate
#     mape_temp['real'] = value
#     mape_temp['mape'] = mape
#     mape_list[idx] = mape_temp
# mape_list_sort = sorted(mape_list.values(), key=lambda x:(x['mape']), reverse=True)
# mape_list_sort  