In [None]:
import os
import time

import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
# Read sheet 'MNPs_11' and split it into a feature matrix and a target vector.
feat_no = 11
data_input = pd.read_excel(
    '../Data/MNPs_dataset.xlsx',
    sheet_name='MNPs_11',
)
# Keep the first feat_no + 1 columns: feat_no feature columns followed by
# the column that is used as the regression target.
dataset = data_input.iloc[:, :feat_no + 1]
y = dataset.iloc[:, -1]
X = dataset.iloc[:, :feat_no]

In [None]:
# Objective function maximized by the Bayesian hyperparameter search.
def lgbmr_cv(n_estimators, learning_rate, max_depth):
    """Score one LightGBM hyperparameter combination by cross-validation.

    The function takes only the hyperparameters as arguments; the training
    data is read from the notebook-level globals ``X_train`` and ``y_train``,
    which must be assigned before this function is called.

    Parameters
    ----------
    n_estimators : number
        Number of boosting rounds; truncated to an integer with ``int()``.
    learning_rate : float
        Learning rate, passed to the model unchanged.
    max_depth : number
        Maximum tree depth; truncated to an integer with ``int()``.

    Returns
    -------
    float
        Mean of the ``neg_root_mean_squared_error`` scores obtained with
        ``cv=10``. The score is the negated RMSE, so larger is better.
    """
    hyperparams = {
        'n_estimators': int(n_estimators),
        'learning_rate': learning_rate,
        'max_depth': int(max_depth),
    }
    fold_scores = cross_val_score(
        LGBMRegressor(**hyperparams),
        X_train,
        y_train,
        scoring='neg_root_mean_squared_error',
        cv=10,
    )
    return fold_scores.mean()

# Random seeds for the train/test splits: the whole
# tune -> fit -> evaluate pipeline is repeated once per seed.
rs = np.arange(50)

# Per-split scores of the final model on its training and held-out data.
metric = {
    'r2_train':[],'rmse_train':[],'mae_train':[],
    'r2_test':[],'rmse_test':[],'mae_test':[]
}

# Hyperparameters selected by the Bayesian search for each split.
best_params = {
    'n_estimators':[],
    'learning_rate':[],
    'max_depth':[]
}

start_t = time.time()

for i in rs:
    # 80/20 split seeded by i. These names must remain notebook-level
    # globals: lgbmr_cv reads X_train / y_train from the global scope.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=i)

    # The optimizer seed is fixed, so only the data split varies between runs.
    lgbmr_bo = BayesianOptimization(
        lgbmr_cv,
        pbounds={
            "n_estimators": (100,500),
            "learning_rate": (0.1,0.3),
            "max_depth": (3,9)
        },
        random_state=42,
        verbose=0
    )

    # init_points is not passed, so the library default is used for it.
    lgbmr_bo.maximize(n_iter=25)

    # Read the optimum once and derive the final hyperparameters from it.
    # Integer parameters are truncated the same way lgbmr_cv truncates them;
    # the learning rate is rounded to two decimals before the final fit.
    bo_best_params = lgbmr_bo.max['params']
    params = {
        'n_estimators': int(bo_best_params['n_estimators']),
        'learning_rate': round(bo_best_params['learning_rate'],2),
        'max_depth': int(bo_best_params['max_depth'])
    }

    # Refit on the full training split with the selected hyperparameters.
    model = LGBMRegressor(**params)
    model.fit(X_train, y_train)

    # Training-set scores.
    y_pred_train = model.predict(X_train)
    r2_train = r2_score(y_train,y_pred_train)
    rmse_train = mean_squared_error(y_train,y_pred_train)**0.5
    mae_train = mean_absolute_error(y_train,y_pred_train)

    # Held-out test-set scores.
    y_pred_test = model.predict(X_test)
    r2_test = r2_score(y_test,y_pred_test)
    rmse_test = mean_squared_error(y_test,y_pred_test)**0.5
    mae_test = mean_absolute_error(y_test,y_pred_test)

    metric['r2_train'].append(r2_train)
    metric['rmse_train'].append(rmse_train)
    metric['mae_train'].append(mae_train)
    metric['r2_test'].append(r2_test)
    metric['rmse_test'].append(rmse_test)
    metric['mae_test'].append(mae_test)

    # Record exactly the values the model was fitted with, instead of
    # re-deriving them from the optimizer a second time.
    best_params['n_estimators'].append(params['n_estimators'])
    best_params['learning_rate'].append(params['learning_rate'])
    best_params['max_depth'].append(params['max_depth'])

end_t = time.time()

print('Training time', round(end_t - start_t, 2), 's')

In [None]:
# Collect the per-split results into tables and write them to Excel.
metric_df = pd.DataFrame(metric)
best_params_df = pd.DataFrame(best_params)
# to_excel does not create missing parent directories; make sure the output
# folder exists so the results of the long optimization run are not lost.
os.makedirs('../Data_saved', exist_ok=True)
metric_df.to_excel('../Data_saved/lgbm_metrics.xlsx')
best_params_df.to_excel('../Data_saved/lgbm_params.xlsx')

In [None]:
# Mean test-set R2 over all splits (unweighted).
np.mean(metric['r2_test'])

In [None]:
# Summary statistics of every metric column across the splits.
metric_df.describe()