In [1]:
import pandas as pd
import numpy as np

# Directory containing the train and test CSV files.
# NOTE(review): this is a machine-specific absolute path; change DATA_DIR
# (and only DATA_DIR) when running the notebook on another machine.
DATA_DIR = '/Users/bdlab/Desktop/sparse-matrix-multiplication/scenario-extraction/d-optimal/d-optimal-of-spmm/train-test-csv'

train = pd.read_csv(DATA_DIR + '/1164-train-from-1293-spmm-over-3s-and-add-feature.csv')
test = pd.read_csv(DATA_DIR + '/129-test-from-1293-spmm-over-3s-and-add-feature.csv')

In [2]:
# Column names are defined once so the train and test selections can never
# drift apart.
FEATURE_COLUMNS = [
    'lr', 'lc', 'rc', 'ld', 'rd', 'lnnz', 'rnnz',
    'lr*lc', 'lc*rc', 'lr*rc', 'lr*lc*rc',
    'ld*rd', 'lr*rc*ld*rd', 'lr*lc*rc*ld*rd',
]
TARGET_COLUMN = 'bz_smsm'

# Train + Valid (split into folds later by cross-validation)
X_train = train[FEATURE_COLUMNS]
y_train = train[TARGET_COLUMN]

# Test (held out; only used for the final error report)
X_test = test[FEATURE_COLUMNS]
y_test = test[TARGET_COLUMN]

In [3]:
from sklearn.metrics import r2_score, mean_squared_error

# Metric
def mean_absolute_percentage_error(y_test, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Parameters
    ----------
    y_test : array-like
        Actual target values.
    y_pred : array-like
        Predicted values, one per actual value.

    Returns
    -------
    float
        mean(|y_test - y_pred| / |y_test|) * 100.

    Notes
    -----
    Both inputs are flattened to 1-D before the comparison. Without that, a
    column-shaped ``(n, 1)`` target paired with an ``(n,)`` prediction would
    broadcast to an ``(n, n)`` matrix and silently yield a wrong score.

    The denominator is floored at machine epsilon, so a zero actual value
    gives a very large but finite error (or 0 when the prediction is also
    exactly 0) instead of ``inf``/``nan``.
    """
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    # Guard against division by zero for zero-valued targets.
    denominator = np.maximum(np.abs(actual), np.finfo(float).eps)

    return np.mean(np.abs(actual - predicted) / denominator) * 100

def custom_scoring(real, pred):
    """Score predictions by MAPE (in percent); lower is better.

    This is the metric function handed to ``make_scorer`` with
    ``greater_is_better=False`` in the cross-validation objective.

    The RMSE and R^2 values that used to be computed here were never
    returned or used, so they have been dropped to avoid wasted work on
    every cross-validation fold.

    Parameters
    ----------
    real : array-like
        Actual target values.
    pred : array-like
        Predicted values.

    Returns
    -------
    float
        Mean absolute percentage error of ``pred`` against ``real``.
    """
    return mean_absolute_percentage_error(real, pred)

In [4]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer

# Objective function for the hyperparameter search (XGBRegressor)
def XGB(
    max_depth,
    learning_rate,
    n_estimators,
    subsample,
    _lambda,
    colsample_bytree,
    silent=True,
    n_jobs=-1,
):
    """Objective for Bayesian optimisation: cross-validated score of an XGBRegressor.

    Parameters
    ----------
    max_depth, n_estimators : float
        Proposed by the optimiser as floats; truncated to int before use.
    learning_rate, subsample, colsample_bytree : float
        Passed to ``XGBRegressor`` unchanged.
    _lambda : float
        Passed to ``XGBRegressor`` as ``reg_lambda``.
    silent : bool
        Accepted but not used by this function.
    n_jobs : int
        Passed to ``XGBRegressor`` as ``n_jobs``.

    Returns
    -------
    float
        Mean over 5 shuffled folds of the scorer built from ``custom_scoring``
        with ``greater_is_better=False`` (i.e. the negated metric, so that
        maximising this value minimises the error).
    """
    # min_child_weight, gamma and reg_alpha are not tuned here; they were
    # commented out of the search and stay at the library defaults.
    hyperparams = dict(
        objective='reg:squarederror',
        max_depth=int(max_depth),
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        subsample=subsample,
        reg_lambda=_lambda,
        colsample_bytree=colsample_bytree,
        n_jobs=n_jobs,
    )
    regressor = xgb.XGBRegressor(**hyperparams)

    # The training set is split into train/validation folds here; the test
    # set is never touched during the search.
    folds = KFold(n_splits=5, shuffle=True, random_state=0)
    negated_mape = make_scorer(custom_scoring, greater_is_better=False)

    fold_scores = cross_val_score(
        regressor,
        X_train,
        y_train,
        cv=folds,
        scoring=negated_mape,
        n_jobs=-1,
    )

    # Average performance across the folds.
    return fold_scores.mean()

In [5]:
from bayes_opt import BayesianOptimization

# Search space: (lower, upper) bound for every tuned hyperparameter.
# min_child_weight, gamma and _alpha are excluded from the search.
# NOTE(review): the saved output below shows learning_rate=0.005022 and
# max_depth=25.0, both outside these bounds -- the output appears to come
# from a run with different bounds; re-run to refresh it.
pbounds = dict(
    max_depth=(15, 23),
    learning_rate=(0.01, 0.3),
    n_estimators=(50, 150),
    subsample=(0.5, 1),
    _lambda=(0.1, 1),
    colsample_bytree=(0.5, 1),
)

# Create the Bayesian optimisation object around the XGB objective.
bo = BayesianOptimization(f=XGB, pbounds=pbounds, verbose=2, random_state=1)

# Run the maximisation: 2 initial points, then 50 iterations using the
# 'ei' acquisition function with xi=0.01.
bo.maximize(init_points=2, n_iter=50, acq='ei', xi=0.01)

# Show the best target value and the hyperparameters that produced it.
print("{}\n".format(bo.max))

|   iter    |  target   |  _lambda  | colsam... | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-70.15   [0m | [0m 0.4753  [0m | [0m 0.8602  [0m | [0m 0.005022[0m | [0m 18.02   [0m | [0m 64.68   [0m | [0m 0.5462  [0m |
| [95m 2       [0m | [95m-25.9    [0m | [95m 0.2676  [0m | [95m 0.6728  [0m | [95m 0.08237 [0m | [95m 20.39   [0m | [95m 91.92   [0m | [95m 0.8426  [0m |
| [0m 3       [0m | [0m-28.58   [0m | [0m 0.8332  [0m | [0m 0.5081  [0m | [0m 0.06893 [0m | [0m 20.44   [0m | [0m 92.38   [0m | [0m 0.7349  [0m |
| [95m 4       [0m | [95m-24.14   [0m | [95m 0.1     [0m | [95m 1.0     [0m | [95m 0.2     [0m | [95m 19.76   [0m | [95m 87.13   [0m | [95m 1.0     [0m |
| [95m 5       [0m | [95m-24.14   [0m | [95m 0.1     [0m | [95m 1.0     [0m | [95m 0.2     [0m | [95m 25.0    [0m | [95m 88.35 

| [0m 51      [0m | [0m-21.95   [0m | [0m 0.9122  [0m | [0m 0.7443  [0m | [0m 0.08548 [0m | [0m 20.94   [0m | [0m 139.2   [0m | [0m 0.9138  [0m |
| [0m 52      [0m | [0m-24.09   [0m | [0m 0.7889  [0m | [0m 0.829   [0m | [0m 0.1165  [0m | [0m 20.8    [0m | [0m 139.5   [0m | [0m 0.7793  [0m |
{'target': -20.29669549816437, 'params': {'_lambda': 0.17442799795460104, 'colsample_bytree': 0.9879164037585435, 'learning_rate': 0.03746438801250545, 'max_depth': 20.89859289179659, 'n_estimators': 138.4961753113434, 'subsample': 0.720612232127243}}



In [6]:
import xgboost as xgb

# Rebuild the regressor with the best hyperparameters found by the
# cross-validated search. max_depth and n_estimators are truncated to int,
# exactly as in the search objective. min_child_weight, gamma and reg_alpha
# were not part of the search and are left at their defaults.
best_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_jobs=-1,
    max_depth=int(bo.max['params']['max_depth']),
    n_estimators=int(bo.max['params']['n_estimators']),
    learning_rate=bo.max['params']['learning_rate'],
    subsample=bo.max['params']['subsample'],
    colsample_bytree=bo.max['params']['colsample_bytree'],
    reg_lambda=bo.max['params']['_lambda'],
)

# Train on the full training set (the fitted model is the cell output).
best_model.fit(X_train, y_train)

XGBRegressor(colsample_bytree=0.9879164037585435,
             learning_rate=0.03746438801250545, max_depth=20, n_estimators=138,
             n_jobs=-1, objective='reg:squarederror',
             reg_lambda=0.17442799795460104, subsample=0.720612232127243)

In [7]:
# Predict once per split up front; everything below is reporting only.
y_train_pred = best_model.predict(X_train)
y_pred = best_model.predict(X_test)

# Best hyperparameters found by the search.
print("{}\n".format(bo.max))

# Train -> Train error
print("-------- train -> train -------------------------")
print("rmse : {}".format(np.sqrt(mean_squared_error(y_train, y_train_pred))))
print("mape : {}%".format(mean_absolute_percentage_error(y_train, y_train_pred)))
print("\n")

# Train -> validation error: the search target is the negated
# cross-validated MAPE, so flip the sign back.
print("-------- train -> validation (cross-validation) --------")
print("mape : {}%".format(-bo.max['target']))
print("\n")

# Train -> Test error
print("-------- train -> test --------------------------")
print("rmse : {}".format(np.sqrt(mean_squared_error(y_test, y_pred))))
print("mape : {}%".format(mean_absolute_percentage_error(y_test, y_pred)))
print("\n")

{'target': -20.29669549816437, 'params': {'_lambda': 0.17442799795460104, 'colsample_bytree': 0.9879164037585435, 'learning_rate': 0.03746438801250545, 'max_depth': 20.89859289179659, 'n_estimators': 138.4961753113434, 'subsample': 0.720612232127243}}

-------- train -> train -------------------------
rmse : 1643.7977739688044
mape : 0.8212506339569652%


-------- train -> validation (cross-validation) --------
mape : 20.29669549816437%


-------- train -> test --------------------------
rmse : 14547.41831542676
mape : 13.761242330567674%




In [8]:
# mape_list = {}
# # Compare each prediction with its actual value, compute the MAPE, and store it in mape_list
# for idx,value in enumerate(y_test):
#     mape_temp = {}
#     predicate = int(y_pred[idx])
#     mape = abs((value - predicate) / value) * 100
#     mape_temp['pred'] = predicate
#     mape_temp['real'] = value
#     mape_temp['mape'] = mape
#     mape_list[idx] = mape_temp
# mape_list_sort = sorted(mape_list.values(), key=lambda x:(x['mape']), reverse=True)
# mape_list_sort  