In [1]:
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Load dataset
df = pd.read_csv("for_regr_with_descrip.csv")

# Load embeddings
blomap_embeddings = np.load("blomap_regr.npy")
fingerprints_embeddings = np.load("fingerprints_regr.npy")
protbert_embeddings = np.load("protbert_regr.npy")

# Apply PCA to Blomap for XGBoost
pca_blomap = PCA(n_components=10, random_state=42)
blomap_pca = pca_blomap.fit_transform(blomap_embeddings)

# Select numerical features
selected_features = [
    "MW", "GRAVY", "pI", "Charge", "Charge_Density", "Aromaticity",
    "Flexibility", "Aliphatic_Index", "Boman_Index", "Hydrophobic_AA",
    "Polar_AA", "Positive_AA", "Negative_AA", "MolWt", "LogP",
    "TPSA", "HBD", "HBA", "RotBonds", "Rings", "Fsp3"
]
X_numerical = df[selected_features].copy()

# One-hot encoding for cell_line
if "cell_line" in df.columns:
    enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    cell_line_encoded = enc.fit_transform(df[['cell_line']])
    cell_line_feature_names = enc.get_feature_names_out(["cell_line"])
    X_cell_line = pd.DataFrame(cell_line_encoded, columns=cell_line_feature_names)
else:
    X_cell_line = pd.DataFrame()

# Prepare feature matrices
X_xgb = np.hstack([X_numerical, blomap_pca, fingerprints_embeddings, protbert_embeddings])
if not X_cell_line.empty:
    X_xgb = np.hstack([X_xgb, X_cell_line])

X_lgbm = np.hstack([X_numerical, blomap_embeddings, fingerprints_embeddings, protbert_embeddings])
if not X_cell_line.empty:
    X_lgbm = np.hstack([X_lgbm, X_cell_line])

y = df["id_uptake"].values
valid_idx = ~np.isnan(y)
X_xgb, X_lgbm, y = X_xgb[valid_idx], X_lgbm[valid_idx], y[valid_idx]

# Handle missing values
imputer = SimpleImputer(strategy="mean")
X_xgb, X_lgbm = imputer.fit_transform(X_xgb), imputer.fit_transform(X_lgbm)

# Log-transform target variable
y = np.log1p(y)

# Train-test split
X_train_xgb, X_test_xgb, y_train, y_test = train_test_split(X_xgb, y, test_size=0.2, random_state=42)
X_train_lgbm, X_test_lgbm, _, _ = train_test_split(X_lgbm, y, test_size=0.2, random_state=42)

# Train XGBoost
xgb_model = XGBRegressor(n_estimators=754, max_depth=6, learning_rate=0.054886325307314195,
                         subsample=0.9967873263465272, colsample_bytree=0.8645926672674225,
                         random_state=42)
xgb_model.fit(X_train_xgb, y_train)
xgb_pred = np.expm1(xgb_model.predict(X_test_xgb))

# Train LightGBM
lgbm_model = LGBMRegressor(n_estimators=629, learning_rate=0.0114315426267485, num_leaves=77, 
                            min_data_in_leaf=9, max_depth=7, colsample_bytree=0.7, random_state=42)
lgbm_model.fit(X_train_lgbm, y_train)
lgbm_pred = np.expm1(lgbm_model.predict(X_test_lgbm))

# Ensemble predictions (90% XGBoost, 10% LightGBM)
ensemble_pred = (0.9 * xgb_pred + 0.1 * lgbm_pred)

# Evaluate model
print(f"MAE XGBoost: {mean_absolute_error(np.expm1(y_test), xgb_pred):.4f}")
print(f"MAE LightGBM: {mean_absolute_error(np.expm1(y_test), lgbm_pred):.4f}")
print(f"MAE Ensemble (90% XGBoost, 10% LightGBM): {mean_absolute_error(np.expm1(y_test), ensemble_pred):.4f}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008583 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6410
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 936
[LightGBM] [Info] Start training from score 6.308943
MAE XGBoost: 262.8491
MAE LightGBM: 297.7806
MAE Ensemble (90% XGBoost, 10% LightGBM): 264.9488


In [2]:
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Load dataset
df = pd.read_csv("for_regr_with_descrip.csv")

# Load embeddings
blomap_embeddings = np.load("blomap_regr.npy")
fingerprints_embeddings = np.load("fingerprints_regr.npy")
protbert_embeddings = np.load("protbert_regr.npy")

# Apply PCA to Blomap for XGBoost
pca_blomap = PCA(n_components=10, random_state=42)
blomap_pca = pca_blomap.fit_transform(blomap_embeddings)

# Select numerical features
selected_features = [
    "MW", "GRAVY", "pI", "Charge", "Charge_Density", "Aromaticity",
    "Flexibility", "Aliphatic_Index", "Boman_Index", "Hydrophobic_AA",
    "Polar_AA", "Positive_AA", "Negative_AA", "MolWt", "LogP",
    "TPSA", "HBD", "HBA", "RotBonds", "Rings", "Fsp3"
]
X_numerical = df[selected_features].copy()

# One-hot encoding for cell_line
if "cell_line" in df.columns:
    enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    cell_line_encoded = enc.fit_transform(df[['cell_line']])
    cell_line_feature_names = enc.get_feature_names_out(["cell_line"])
    X_cell_line = pd.DataFrame(cell_line_encoded, columns=cell_line_feature_names)
else:
    X_cell_line = pd.DataFrame()

# Prepare feature matrices
X_xgb = np.hstack([X_numerical, blomap_pca, fingerprints_embeddings, protbert_embeddings])
if not X_cell_line.empty:
    X_xgb = np.hstack([X_xgb, X_cell_line])

X_lgbm = np.hstack([X_numerical, blomap_embeddings, fingerprints_embeddings, protbert_embeddings])
if not X_cell_line.empty:
    X_lgbm = np.hstack([X_lgbm, X_cell_line])

y = df["id_uptake"].values
valid_idx = ~np.isnan(y)
X_xgb, X_lgbm, y = X_xgb[valid_idx], X_lgbm[valid_idx], y[valid_idx]

# Handle missing values
imputer = SimpleImputer(strategy="mean")
X_xgb, X_lgbm = imputer.fit_transform(X_xgb), imputer.fit_transform(X_lgbm)

# Log-transform target variable
y = np.log1p(y)

# Train-test split
X_train_xgb, X_test_xgb, y_train, y_test = train_test_split(X_xgb, y, test_size=0.2, random_state=42)
X_train_lgbm, X_test_lgbm, _, _ = train_test_split(X_lgbm, y, test_size=0.2, random_state=42)

# Optimize XGBoost with Optuna
def objective_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'random_state': 42
    }
    model = XGBRegressor(**params)
    model.fit(X_train_xgb, y_train)
    pred = model.predict(X_test_xgb)
    return mean_absolute_error(y_test, pred)

study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(objective_xgb, n_trials=50)
best_params_xgb = study_xgb.best_params
xgb_model = XGBRegressor(**best_params_xgb)
xgb_model.fit(X_train_xgb, y_train)
xgb_pred = np.expm1(xgb_model.predict(X_test_xgb))

# Optimize LightGBM with Optuna
def objective_lgbm(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 50),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'random_state': 42
    }
    model = LGBMRegressor(**params)
    model.fit(X_train_lgbm, y_train)
    pred = model.predict(X_test_lgbm)
    return mean_absolute_error(y_test, pred)

study_lgbm = optuna.create_study(direction='minimize')
study_lgbm.optimize(objective_lgbm, n_trials=50)
best_params_lgbm = study_lgbm.best_params
lgbm_model = LGBMRegressor(**best_params_lgbm)
lgbm_model.fit(X_train_lgbm, y_train)
lgbm_pred = np.expm1(lgbm_model.predict(X_test_lgbm))

# Ensemble predictions (90% XGBoost, 10% LightGBM)
ensemble_pred = (0.9 * xgb_pred + 0.1 * lgbm_pred)

# Evaluate model
print(f"MAE XGBoost: {mean_absolute_error(np.expm1(y_test), xgb_pred):.4f}")
print(f"MAE LightGBM: {mean_absolute_error(np.expm1(y_test), lgbm_pred):.4f}")
print(f"MAE Ensemble (90% XGBoost, 10% LightGBM): {mean_absolute_error(np.expm1(y_test), ensemble_pred):.4f}")


[I 2025-02-22 19:51:59,749] A new study created in memory with name: no-name-509af9dd-e1c0-43c1-b161-4e0078dbe7a5
[I 2025-02-22 19:52:07,755] Trial 0 finished with value: 0.3678559743507389 and parameters: {'n_estimators': 663, 'max_depth': 9, 'learning_rate': 0.012020505304461544, 'subsample': 0.6153433394154513, 'colsample_bytree': 0.8475731470334467}. Best is trial 0 with value: 0.3678559743507389.
[I 2025-02-22 19:52:08,615] Trial 1 finished with value: 0.4228379720613616 and parameters: {'n_estimators': 105, 'max_depth': 7, 'learning_rate': 0.03677254430219553, 'subsample': 0.6924651421529586, 'colsample_bytree': 0.5361630635887492}. Best is trial 0 with value: 0.3678559743507389.
[I 2025-02-22 19:52:10,928] Trial 2 finished with value: 0.3587238823970908 and parameters: {'n_estimators': 231, 'max_depth': 9, 'learning_rate': 0.044542031021204666, 'subsample': 0.8544368403214837, 'colsample_bytree': 0.6319880810911952}. Best is trial 2 with value: 0.3587238823970908.
[I 2025-02-22 

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004109 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4142
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 333
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:17,907] Trial 0 finished with value: 0.457321890011527 and parameters: {'n_estimators': 484, 'learning_rate': 0.25850016128294084, 'num_leaves': 32, 'min_data_in_leaf': 28, 'max_depth': 8, 'colsample_bytree': 0.5343305348968448}. Best is trial 0 with value: 0.457321890011527.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004409 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4083
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 312
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:18,449] Trial 1 finished with value: 0.503497082425003 and parameters: {'n_estimators': 859, 'learning_rate': 0.014442381410496141, 'num_leaves': 41, 'min_data_in_leaf': 34, 'max_depth': 5, 'colsample_bytree': 0.9280399846185863}. Best is trial 0 with value: 0.457321890011527.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007810 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4142
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 333
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:18,713] Trial 2 finished with value: 0.5313677320894044 and parameters: {'n_estimators': 176, 'learning_rate': 0.02129713341676084, 'num_leaves': 88, 'min_data_in_leaf': 28, 'max_depth': 9, 'colsample_bytree': 0.9461421531196128}. Best is trial 0 with value: 0.457321890011527.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005279 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4303
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 372
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:19,689] Trial 3 finished with value: 0.4022493872812349 and parameters: {'n_estimators': 807, 'learning_rate': 0.024704373656250604, 'num_leaves': 50, 'min_data_in_leaf': 17, 'max_depth': 9, 'colsample_bytree': 0.8959071213577137}. Best is trial 3 with value: 0.4022493872812349.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004897 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4252
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 356
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:20,046] Trial 4 finished with value: 0.4857955876101563 and parameters: {'n_estimators': 595, 'learning_rate': 0.2787362823165131, 'num_leaves': 32, 'min_data_in_leaf': 21, 'max_depth': 4, 'colsample_bytree': 0.7102693550544708}. Best is trial 3 with value: 0.4022493872812349.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003983 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4399
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 402
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:21,344] Trial 5 finished with value: 0.3871452454338274 and parameters: {'n_estimators': 875, 'learning_rate': 0.1283792083986569, 'num_leaves': 66, 'min_data_in_leaf': 12, 'max_depth': 7, 'colsample_bytree': 0.5705100073179097}. Best is trial 5 with value: 0.3871452454338274.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4319
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 375
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:21,698] Trial 6 finished with value: 0.3968227066536716 and parameters: {'n_estimators': 254, 'learning_rate': 0.28327573232388953, 'num_leaves': 88, 'min_data_in_leaf': 16, 'max_depth': 5, 'colsample_bytree': 0.5916708340444654}. Best is trial 5 with value: 0.3871452454338274.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4199
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 343
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:21,947] Trial 7 finished with value: 0.5258360779937049 and parameters: {'n_estimators': 389, 'learning_rate': 0.019367080098209512, 'num_leaves': 20, 'min_data_in_leaf': 25, 'max_depth': 4, 'colsample_bytree': 0.9404354987665102}. Best is trial 5 with value: 0.3871452454338274.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004983 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4215
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 346
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:22,631] Trial 8 finished with value: 0.4440375456433406 and parameters: {'n_estimators': 664, 'learning_rate': 0.0385875314375423, 'num_leaves': 23, 'min_data_in_leaf': 24, 'max_depth': 8, 'colsample_bytree': 0.6631385101797157}. Best is trial 5 with value: 0.3871452454338274.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004385 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4032
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 299
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:23,064] Trial 9 finished with value: 0.49178172604610587 and parameters: {'n_estimators': 390, 'learning_rate': 0.03238451694488863, 'num_leaves': 78, 'min_data_in_leaf': 39, 'max_depth': 10, 'colsample_bytree': 0.9104258436574858}. Best is trial 5 with value: 0.3871452454338274.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013705 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8244
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1421
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:24,644] Trial 10 finished with value: 0.35727311994199923 and parameters: {'n_estimators': 986, 'learning_rate': 0.10882404680327565, 'num_leaves': 66, 'min_data_in_leaf': 7, 'max_depth': 6, 'colsample_bytree': 0.8023661963971713}. Best is trial 10 with value: 0.35727311994199923.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014067 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8429
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1477
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:26,464] Trial 11 finished with value: 0.359354991403389 and parameters: {'n_estimators': 984, 'learning_rate': 0.10045877115513908, 'num_leaves': 66, 'min_data_in_leaf': 5, 'max_depth': 6, 'colsample_bytree': 0.8054801228536501}. Best is trial 10 with value: 0.35727311994199923.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057132 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8429
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1477
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:28,455] Trial 12 finished with value: 0.37363147959752274 and parameters: {'n_estimators': 990, 'learning_rate': 0.0790278175212086, 'num_leaves': 62, 'min_data_in_leaf': 5, 'max_depth': 6, 'colsample_bytree': 0.8084303852903281}. Best is trial 10 with value: 0.35727311994199923.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004201 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3956
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 276
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:29,137] Trial 13 finished with value: 0.4910671468666497 and parameters: {'n_estimators': 955, 'learning_rate': 0.10464499098543764, 'num_leaves': 68, 'min_data_in_leaf': 50, 'max_depth': 6, 'colsample_bytree': 0.8027832881645867}. Best is trial 10 with value: 0.35727311994199923.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014385 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8429
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1477
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:29,754] Trial 14 finished with value: 0.3830691629375131 and parameters: {'n_estimators': 725, 'learning_rate': 0.05930103026437993, 'num_leaves': 99, 'min_data_in_leaf': 5, 'max_depth': 3, 'colsample_bytree': 0.7886986939754279}. Best is trial 10 with value: 0.35727311994199923.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005322 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4423
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 409
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:30,835] Trial 15 finished with value: 0.39075899467236347 and parameters: {'n_estimators': 759, 'learning_rate': 0.1505621632794886, 'num_leaves': 53, 'min_data_in_leaf': 11, 'max_depth': 7, 'colsample_bytree': 0.8414183017553317}. Best is trial 10 with value: 0.35727311994199923.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006554 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5593
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 706
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:31,712] Trial 16 finished with value: 0.3833340077818787 and parameters: {'n_estimators': 1000, 'learning_rate': 0.16945115999606641, 'num_leaves': 75, 'min_data_in_leaf': 10, 'max_depth': 5, 'colsample_bytree': 0.7232395618769949}. Best is trial 10 with value: 0.35727311994199923.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014301 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8429
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1477
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:33,302] Trial 17 finished with value: 0.3639941847879997 and parameters: {'n_estimators': 886, 'learning_rate': 0.0773796055870976, 'num_leaves': 50, 'min_data_in_leaf': 5, 'max_depth': 6, 'colsample_bytree': 0.8552927578746438}. Best is trial 10 with value: 0.35727311994199923.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004372 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3987
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 285
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:33,894] Trial 18 finished with value: 0.4802974720668656 and parameters: {'n_estimators': 672, 'learning_rate': 0.08648291870612057, 'num_leaves': 76, 'min_data_in_leaf': 45, 'max_depth': 7, 'colsample_bytree': 0.6800319791828398}. Best is trial 10 with value: 0.35727311994199923.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005173 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4303
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 372
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:34,191] Trial 19 finished with value: 0.42448861956125544 and parameters: {'n_estimators': 524, 'learning_rate': 0.04988177774305707, 'num_leaves': 58, 'min_data_in_leaf': 17, 'max_depth': 3, 'colsample_bytree': 0.7723451384262717}. Best is trial 10 with value: 0.35727311994199923.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007646 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5593
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 706
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:35,594] Trial 20 finished with value: 0.39338033216073887 and parameters: {'n_estimators': 915, 'learning_rate': 0.17851536227238599, 'num_leaves': 42, 'min_data_in_leaf': 10, 'max_depth': 8, 'colsample_bytree': 0.9984125984843956}. Best is trial 10 with value: 0.35727311994199923.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014376 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8429
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1477
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:37,059] Trial 21 finished with value: 0.35389896152342154 and parameters: {'n_estimators': 837, 'learning_rate': 0.06765973754237954, 'num_leaves': 49, 'min_data_in_leaf': 5, 'max_depth': 6, 'colsample_bytree': 0.8531741692068897}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015188 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7307
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1182
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:38,315] Trial 22 finished with value: 0.3795945855550224 and parameters: {'n_estimators': 808, 'learning_rate': 0.06296441775434199, 'num_leaves': 57, 'min_data_in_leaf': 8, 'max_depth': 6, 'colsample_bytree': 0.8575314395497464}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008923 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4359
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 390
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:39,039] Trial 23 finished with value: 0.38265978498979025 and parameters: {'n_estimators': 933, 'learning_rate': 0.11478354181963901, 'num_leaves': 69, 'min_data_in_leaf': 14, 'max_depth': 5, 'colsample_bytree': 0.7484575024357087}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011295 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7307
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1182
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:39,787] Trial 24 finished with value: 0.3835908208372531 and parameters: {'n_estimators': 779, 'learning_rate': 0.04453055048179919, 'num_leaves': 43, 'min_data_in_leaf': 8, 'max_depth': 4, 'colsample_bytree': 0.8139389030697061}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004961 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4260
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 359
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:40,689] Trial 25 finished with value: 0.42874440546901305 and parameters: {'n_estimators': 883, 'learning_rate': 0.085308514443947, 'num_leaves': 64, 'min_data_in_leaf': 20, 'max_depth': 7, 'colsample_bytree': 0.8729314615379267}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005873 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4087
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 314
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:41,551] Trial 26 finished with value: 0.4701388867291838 and parameters: {'n_estimators': 836, 'learning_rate': 0.0665185378810505, 'num_leaves': 84, 'min_data_in_leaf': 33, 'max_depth': 6, 'colsample_bytree': 0.7531344742862165}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8244
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1421
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:42,761] Trial 27 finished with value: 0.39365151477728494 and parameters: {'n_estimators': 994, 'learning_rate': 0.18675105688738738, 'num_leaves': 72, 'min_data_in_leaf': 7, 'max_depth': 5, 'colsample_bytree': 0.6438702999603211}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005295 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4359
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 390
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:43,900] Trial 28 finished with value: 0.38211621235676047 and parameters: {'n_estimators': 716, 'learning_rate': 0.10505880593509685, 'num_leaves': 55, 'min_data_in_leaf': 14, 'max_depth': 7, 'colsample_bytree': 0.9737720466819659}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006208 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4267
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 362
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:44,568] Trial 29 finished with value: 0.4304845704955571 and parameters: {'n_estimators': 607, 'learning_rate': 0.21762620104317562, 'num_leaves': 35, 'min_data_in_leaf': 19, 'max_depth': 8, 'colsample_bytree': 0.8229202150717032}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005236 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4373
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 395
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:45,399] Trial 30 finished with value: 0.3956415418414553 and parameters: {'n_estimators': 928, 'learning_rate': 0.12820728169923068, 'num_leaves': 47, 'min_data_in_leaf': 13, 'max_depth': 6, 'colsample_bytree': 0.8836708677343362}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015185 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8429
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1477
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:46,905] Trial 31 finished with value: 0.368224868658223 and parameters: {'n_estimators': 889, 'learning_rate': 0.0806140412596048, 'num_leaves': 48, 'min_data_in_leaf': 5, 'max_depth': 6, 'colsample_bytree': 0.8463167027979767}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012279 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7307
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1182
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:47,838] Trial 32 finished with value: 0.4082483691524127 and parameters: {'n_estimators': 845, 'learning_rate': 0.011281846455561784, 'num_leaves': 59, 'min_data_in_leaf': 8, 'max_depth': 5, 'colsample_bytree': 0.7704154110703942}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014765 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8429
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1477
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:49,376] Trial 33 finished with value: 0.363264648716971 and parameters: {'n_estimators': 947, 'learning_rate': 0.06994656089793157, 'num_leaves': 39, 'min_data_in_leaf': 5, 'max_depth': 6, 'colsample_bytree': 0.8363262496825447}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009619 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6410
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 936
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:50,781] Trial 34 finished with value: 0.3661056526677113 and parameters: {'n_estimators': 951, 'learning_rate': 0.03296543803638355, 'num_leaves': 38, 'min_data_in_leaf': 9, 'max_depth': 7, 'colsample_bytree': 0.9156201998119038}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005494 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4423
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 409
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:51,511] Trial 35 finished with value: 0.3852521677902066 and parameters: {'n_estimators': 951, 'learning_rate': 0.0467908014962687, 'num_leaves': 82, 'min_data_in_leaf': 11, 'max_depth': 5, 'colsample_bytree': 0.7216926635967962}. Best is trial 21 with value: 0.35389896152342154.




[I 2025-02-22 19:54:51,664] Trial 36 finished with value: 0.5250138672399772 and parameters: {'n_estimators': 119, 'learning_rate': 0.0968282810128679, 'num_leaves': 31, 'min_data_in_leaf': 30, 'max_depth': 4, 'colsample_bytree': 0.8255599612335895}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004586 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4106
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 323
[LightGBM] [Info] Start training from score 6.308943
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014998 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8244
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1421
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:52,911] Trial 37 finished with value: 0.36199427595265443 and parameters: {'n_estimators': 785, 'learning_rate': 0.13774189237145465, 'num_leaves': 26, 'min_data_in_leaf': 7, 'max_depth': 6, 'colsample_bytree': 0.8969984880180883}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003678 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4319
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 375
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:53,888] Trial 38 finished with value: 0.40835615965503713 and parameters: {'n_estimators': 818, 'learning_rate': 0.14167912241048447, 'num_leaves': 29, 'min_data_in_leaf': 16, 'max_depth': 9, 'colsample_bytree': 0.9651123779094023}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004738 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4219
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 348
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:55,104] Trial 39 finished with value: 0.4611246966478822 and parameters: {'n_estimators': 766, 'learning_rate': 0.22207501163586738, 'num_leaves': 28, 'min_data_in_leaf': 23, 'max_depth': 7, 'colsample_bytree': 0.5153283707362082}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005281 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4373
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 395
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:55,492] Trial 40 finished with value: 0.40257978819882806 and parameters: {'n_estimators': 402, 'learning_rate': 0.11468665519595775, 'num_leaves': 24, 'min_data_in_leaf': 13, 'max_depth': 5, 'colsample_bytree': 0.8926416110640627}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013771 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8244
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1421
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:57,051] Trial 41 finished with value: 0.36537526845793616 and parameters: {'n_estimators': 850, 'learning_rate': 0.07136876191820074, 'num_leaves': 38, 'min_data_in_leaf': 7, 'max_depth': 6, 'colsample_bytree': 0.9301453437689335}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013179 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8244
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1421
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:54:58,557] Trial 42 finished with value: 0.36426406202884015 and parameters: {'n_estimators': 969, 'learning_rate': 0.09429929524481266, 'num_leaves': 44, 'min_data_in_leaf': 7, 'max_depth': 6, 'colsample_bytree': 0.872278475567511}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014524 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8429
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1477
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:55:00,043] Trial 43 finished with value: 0.37090843026486997 and parameters: {'n_estimators': 885, 'learning_rate': 0.1357948644185415, 'num_leaves': 62, 'min_data_in_leaf': 5, 'max_depth': 6, 'colsample_bytree': 0.7910275200482807}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025527 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5593
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 706
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:55:00,962] Trial 44 finished with value: 0.3893545196316264 and parameters: {'n_estimators': 926, 'learning_rate': 0.054768047255055685, 'num_leaves': 25, 'min_data_in_leaf': 10, 'max_depth': 5, 'colsample_bytree': 0.8357265048102568}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4032
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 299
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:55:01,274] Trial 45 finished with value: 0.5205834531790838 and parameters: {'n_estimators': 273, 'learning_rate': 0.11778972901413927, 'num_leaves': 52, 'min_data_in_leaf': 40, 'max_depth': 7, 'colsample_bytree': 0.9004640024875612}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013226 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8244
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1421
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:55:03,044] Trial 46 finished with value: 0.38710660561005733 and parameters: {'n_estimators': 611, 'learning_rate': 0.03807403719415024, 'num_leaves': 20, 'min_data_in_leaf': 7, 'max_depth': 4, 'colsample_bytree': 0.7773700978747744}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005637 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4399
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 402
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:55:03,899] Trial 47 finished with value: 0.3947102550437297 and parameters: {'n_estimators': 712, 'learning_rate': 0.15376238718941154, 'num_leaves': 36, 'min_data_in_leaf': 12, 'max_depth': 6, 'colsample_bytree': 0.7489379711542083}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4349
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 386
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:55:05,389] Trial 48 finished with value: 0.39161781371084103 and parameters: {'n_estimators': 785, 'learning_rate': 0.05721693488440889, 'num_leaves': 71, 'min_data_in_leaf': 15, 'max_depth': 8, 'colsample_bytree': 0.8637510010686398}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027360 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8429
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1477
[LightGBM] [Info] Start training from score 6.308943


[I 2025-02-22 19:55:07,922] Trial 49 finished with value: 0.36421361723670886 and parameters: {'n_estimators': 997, 'learning_rate': 0.07178547354203263, 'num_leaves': 46, 'min_data_in_leaf': 5, 'max_depth': 7, 'colsample_bytree': 0.8056092298581912}. Best is trial 21 with value: 0.35389896152342154.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048219 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8429
[LightGBM] [Info] Number of data points in the train set: 701, number of used features: 1477
[LightGBM] [Info] Start training from score 6.308943
MAE XGBoost: 271.2844
MAE LightGBM: 265.4718
MAE Ensemble (90% XGBoost, 10% LightGBM): 268.3830
