In [1]:
import pandas as pd
import numpy as np

# Locations of the training and test CSV files (relative to the working directory).
TRAIN_CSV = './spmm-square-nonsquare-data-doptimal-1800.csv'
TEST_CSV = './spmm-square-nonsquare-data-doptimal-200.csv'

# Read each file into its own DataFrame.
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

In [2]:
# Predictor columns used for both splits, and the name of the regression target column.
FEATURE_COLUMNS = [
    'lr', 'lc', 'rc', 'ld', 'rd', 'lnnz', 'rnnz',
    'lr*lc', 'lc*rc', 'lr*rc', 'lr*lc*rc',
    'ld*rd', 'lr*rc*ld*rd', 'lr*lc*rc*ld*rd',
]
TARGET_COLUMN = 'bz_smsm'

# Training split.
# NOTE(review): the original comment said "900 rows", but the CSV file name says 1800 — confirm.
X_train = train[FEATURE_COLUMNS]
y_train = train[TARGET_COLUMN]

# Test split.
# NOTE(review): the original comment said "100 rows", but the CSV file name says 200 — confirm.
X_test = test[FEATURE_COLUMNS]
y_test = test[TARGET_COLUMN]

In [3]:
from sklearn.metrics import mean_squared_error

def mean_absolute_percentage_error(y_test, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Parameters
    ----------
    y_test : array-like
        Observed target values; each value is used as the denominator of its
        own relative error.
    y_pred : array-like
        Predicted values, element-wise comparable with ``y_test``.

    Returns
    -------
    float
        ``mean(|(y_test - y_pred) / y_test|) * 100``.
    """
    actual = np.array(y_test)
    predicted = np.array(y_pred)
    # Signed error of each prediction, relative to the observed value.
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error)) * 100

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, element-wise comparable with ``y_true``.

    Returns
    -------
    float
        ``sqrt(mean((y_pred - y_true) ** 2))``.
    """
    # Convert up front, as mean_absolute_percentage_error does, so that:
    #  * plain Python lists are accepted (list - list would raise TypeError),
    #  * pandas Series are compared positionally instead of being aligned by index label,
    #  * integer inputs are squared in floating point rather than in fixed-width integers.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(np.square(predicted - actual)))

def custom_scoring(y_test, y_pred):
    """Score predictions by their mean absolute percentage error (in percent).

    The returned value is an error, so lower is better; wrap it with
    ``make_scorer(..., greater_is_better=False)`` when passing it to a
    scikit-learn model-selection utility.

    Parameters
    ----------
    y_test : array-like
        Observed target values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
        The MAPE of ``y_pred`` against ``y_test``.
    """
    # Only the MAPE is returned, so no other metric is computed here.
    return mean_absolute_percentage_error(y_test, y_pred)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer

# Seed shared by the estimator and the cross-validation splitter.
SEED = 30

# Hyper-parameter search space: 6 * 4 * 3 * 4 * 3 * 1 * 5 * 6 = 25,920 candidates.
# NOTE(review): 'ls'/'lad' (loss) and 'mse'/'mae' (criterion) are legacy scikit-learn
# spellings; newer releases rename or remove them — confirm against the installed version.
grid = {
    'max_depth' : [6,8,10,12,14,16],
    'loss' : ['ls', 'lad', 'huber', 'quantile'],
    'criterion' : ['friedman_mse', 'mse', 'mae'],
    'min_samples_leaf' : [1,2,4,8],
    'min_samples_split' : [2,4,8],
    'max_features' : ['sqrt'],
    'n_estimators' : [100,200,400,600,800],
    'learning_rate' : [0.01, 0.05, 0.1, 0.2, 0.25, 0.3],
}

# max_features='sqrt' makes every fit pick candidate features at random, so the estimator is
# seeded as well; otherwise candidates are compared with seed noise and a re-run can select
# different "best" parameters.
regressor = GradientBoostingRegressor(random_state=SEED)

# 10-fold cross-validation with shuffled, seeded splits.
kf = KFold(random_state=SEED,
           n_splits=10,
           shuffle=True
          )

# Exhaustive search scored by MAPE. greater_is_better=False makes scikit-learn negate the
# score, so the candidate with the smallest MAPE wins.
# NOTE(review): with 10 folds this is 259,200 fits; consider a smaller grid or a randomized
# search if the run time is not acceptable.
regressor_grid = GridSearchCV(regressor,
                              param_grid = grid,
                              scoring = make_scorer(custom_scoring, greater_is_better=False),
                              cv=kf,
                              n_jobs=-1,
                              verbose=3)

regressor_grid.fit(X_train, y_train)

print("최고의 파라미터 :", regressor_grid.best_params_)
# Optional: best cross-validated MAPE. best_score_ is the negated MAPE (not an accuracy),
# hence the sign flip:
# print("best CV MAPE : {}".format(-regressor_grid.best_score_))

Fitting 10 folds for each of 25920 candidates, totalling 259200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   30.9s
[Parallel(n_jobs=-1)]: Done 416 tasks      | elapsed:   55.3s
[Parallel(n_jobs=-1)]: Done 704 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1056 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1472 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 1952 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 2496 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 3104 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 3776 tasks      | elapsed: 37.1min
[Parallel(n_jobs=-1)]: Done 4512 tasks      | elapsed: 98.2min
[Parallel(n_jobs=-1)]: Done 5312 tasks      | elapsed: 102.0min
[Parallel(n_jobs=-1)]: Done 6176 tasks      | elapsed: 107.5min
[Parallel(n_jobs=-1)]: Done 7104 tasks      | elapsed: 117.4min
[Parallel(n_jobs=-1)]: Done 8096 tasks    

In [None]:
# regressor = GradientBoostingRegressor(
# learning_rate=0.05,
# max_depth=8,
# n_estimators=600    
# )
# regressor.fit(X_train, y_train)

# y_pred_train = regressor.predict(X_train)
# y_pred = regressor.predict(X_test)

In [None]:
# # Training-set error
# print("--------train 에러율--------")
# print("rmse : {}".format(rmse(y_train, y_pred_train)))
# print("mape : {}".format(mean_absolute_percentage_error(y_train, y_pred_train)))

# # Test-set error
# print("--------test 에러율--------")
# print("rmse : {}".format(rmse(y_test,y_pred)))
# print("mape : {}".format(mean_absolute_percentage_error(y_test,y_pred)))

In [None]:
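# Recorded output of an earlier run.
# NOTE: the train rmse is identical to the test rmse, which indicates it was computed on
# the test split rather than the training split; re-run to obtain the real training RMSE.
# --------train 에러율--------
# rmse : 23418.322014128913
# mape : 3.284169705358724
# --------test 에러율--------
# rmse : 23418.322014128913
# mape : 32.723544718729144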