In [1]:
import pandas as pd
import optuna
import numpy as np
import xgboost
from sklearn.linear_model import LinearRegression, Ridge
from tqdm import tqdm
from pprint import pprint
from utils.experiments import get_stratified_fold_indices, PROC_SOURCE, form_factor, crest_factor
from utils.metrics import calculate_metrics
import matplotlib.pyplot as plt


# Show every column when a DataFrame is rendered (None lifts the column cap).
pd.options.display.max_columns = None



  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def _build_material_features(mat_df, n_folds=4, n_samples=1024):
    """Turn one material's raw rows into a scalar feature table.

    The ``B_t_0 .. B_t_{n_samples-1}`` columns are read as a 2-D array and
    condensed into per-row summary features; afterwards the raw ``B_t_*``
    columns and the ``material`` column are dropped. A ``kfold`` column with
    the labels returned by ``get_stratified_fold_indices`` is added as well.

    Parameters
    ----------
    mat_df : pandas.DataFrame
        Rows of a single material. Must contain ``freq``, ``material`` and the
        ``B_t_*`` columns.
    n_folds : int
        Passed to ``get_stratified_fold_indices``.
    n_samples : int
        Number of ``B_t_*`` columns to read.

    Returns
    -------
    pandas.DataFrame
        Frame with a fresh 0..n-1 index, the remaining original columns, the
        engineered features and ``kfold``.
    """
    full_b = mat_df.loc[:, [f"B_t_{k}" for k in range(n_samples)]].to_numpy()
    # first-order difference between consecutive B samples
    dbdt = full_b[:, 1:] - full_b[:, :-1]
    # computed once, used both linearly and in log10 form
    peak2peak = full_b.max(axis=1) - full_b.min(axis=1)

    mat_df = mat_df.reset_index(drop=True)
    kfold_lbls = get_stratified_fold_indices(mat_df, n_folds)

    # further features are imaginable (crest factor, median dB/dt, spike count, ...)
    return mat_df.assign(
        kfold=kfold_lbls,
        log_freq=np.log10(mat_df.loc[:, "freq"]),
        b_fft_mean=np.mean(np.abs(np.fft.fft(full_b)), axis=1),
        b_peak2peak=peak2peak,
        log_peak2peak=np.log10(peak2peak),
        max_dbdt=np.max(dbdt, axis=1),
        min_dbdt=np.min(dbdt, axis=1),
        mean_abs_dbdt=np.mean(np.abs(dbdt), axis=1),
        form_fac=form_factor(full_b),
    ).drop(
        columns=[c for c in mat_df if c.startswith("B_t_")] + ["material"]
    )  # drop B curve


def _cross_validated_predictions(mat_df_proc, xgb_params):
    """Fit one XGBoost regressor per fold and collect out-of-fold predictions.

    Parameters
    ----------
    mat_df_proc : pandas.DataFrame
        Output of ``_build_material_features`` (needs ``ploss`` and ``kfold``).
    xgb_params : dict
        Keyword arguments forwarded to ``xgboost.XGBRegressor``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ploss``, ``kfold`` and ``pred``; ``pred`` holds, for every
        row, the prediction of the model that did not see that row's fold.
    """
    # `pred` starts as float so that writing float predictions into it does not
    # depend on pandas silently upcasting an int column (deprecated in pandas 2.1+).
    results_df = mat_df_proc.loc[:, ["ploss", "kfold"]].assign(pred=0.0)
    x_cols = [c for c in mat_df_proc if c not in ["ploss", "kfold"]]

    for kfold_lbl, test_fold_df in mat_df_proc.groupby("kfold"):
        train_fold_df = (
            mat_df_proc.loc[mat_df_proc["kfold"] != kfold_lbl]
            .reset_index(drop=True)
            .drop(columns="kfold")
        )
        assert len(train_fold_df) > 0, "empty dataframe error"
        y = train_fold_df.pop("ploss")
        X = train_fold_df.loc[:, x_cols]

        gbm = xgboost.XGBRegressor(**xgb_params)
        gbm.fit(X, y)
        # groupby keeps the original row order inside each group, so the
        # predictions line up with the boolean mask on `results_df`.
        results_df.loc[results_df.kfold == kfold_lbl, "pred"] = gbm.predict(
            test_fold_df.loc[:, x_cols]
        )
    return results_df


def objective(trial):
    """Optuna objective: cross-validated XGBoost error averaged over materials.

    For every material in ``ten_materials.pkl.gz`` the B-curve is condensed
    into scalar features, an ``XGBRegressor`` is trained per fold to predict
    ``ploss``, and ``calculate_metrics`` is evaluated on the out-of-fold
    predictions.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies ``max_depth`` (int, 9..12, log-scaled sampling),
        ``n_estimators`` (int, 400..1000) and ``learning_rate``
        (float, 0.05..0.15).

    Returns
    -------
    float
        Mean of the per-material ``"avg-abs-rel-err"`` metric (lower is better).
    """
    ds = pd.read_pickle(PROC_SOURCE / "ten_materials.pkl.gz")

    # hyperparameters tuned by the study
    max_depth = trial.suggest_int("max_depth", 9, 12, log=True)
    n_estimators = trial.suggest_int("n_estimators", 400, 1000)
    learning_rate = trial.suggest_float("learning_rate", 0.05, 0.15)

    # remaining XGBoost settings are held fixed for every trial
    xgb_params = dict(
        max_depth=max_depth,
        gamma=0.05822,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample=0.89965,
        colsample_bytree=0.76261,
        objective="reg:squarederror",
    )

    # drop H curve, we only take power loss as target
    ds = ds.drop(columns=[c for c in ds if c.startswith("H_t")])

    exp_log = {}
    for material_lbl, mat_df in tqdm(
        ds.groupby("material"), desc="Train across materials"
    ):
        mat_df_proc = _build_material_features(mat_df)
        results_df = _cross_validated_predictions(mat_df_proc, xgb_params)

        # book keeping: one metrics entry per material
        exp_log[material_lbl] = calculate_metrics(
            results_df.loc[:, "pred"], results_df.loc[:, "ploss"]
        )

    results = pd.DataFrame(exp_log).T
    return np.mean(results.loc[:, "avg-abs-rel-err"].to_numpy())



In [6]:
# Seed the sampler so the sequence of suggested hyperparameters is identical on
# every "Restart Kernel -> Run All". TPESampler is what Optuna uses by default
# for single-objective studies, so only the seeding changes.
# NOTE(review): full reproducibility also requires the objective itself to be
# deterministic (fold assignment, XGBoost subsampling) - confirm.
SAMPLER_SEED = 42
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)
study.optimize(objective, n_trials=20)

[I 2023-08-08 15:38:42,733] A new study created in memory with name: no-name-f3777323-e283-4489-8ef2-4dc6baabfbef
Train across materials: 100%|██████████| 10/10 [03:59<00:00, 23.92s/it]
[I 2023-08-08 15:42:54,335] Trial 0 finished with value: 0.043035884764537 and parameters: {'max_depth': 9, 'n_estimators': 946, 'learning_rate': 0.10333693807931968}. Best is trial 0 with value: 0.043035884764537.
Train across materials: 100%|██████████| 10/10 [03:16<00:00, 19.61s/it]
[I 2023-08-08 15:46:23,516] Trial 1 finished with value: 0.05008844093133533 and parameters: {'max_depth': 12, 'n_estimators': 556, 'learning_rate': 0.13164113745593725}. Best is trial 0 with value: 0.043035884764537.
Train across materials: 100%|██████████| 10/10 [03:39<00:00, 21.94s/it]
[I 2023-08-08 15:50:16,313] Trial 2 finished with value: 0.04561040169472873 and parameters: {'max_depth': 11, 'n_estimators': 692, 'learning_rate': 0.09910209601788832}. Best is trial 0 with value: 0.043035884764537.
Train across materi

In [7]:
# Display the trial with the lowest objective value found by the study
# (bare last expression, so the notebook renders its repr as the cell output).
study.best_trial

FrozenTrial(number=0, state=1, values=[0.043035884764537], datetime_start=datetime.datetime(2023, 8, 8, 15, 38, 42, 734472), datetime_complete=datetime.datetime(2023, 8, 8, 15, 42, 54, 335176), params={'max_depth': 9, 'n_estimators': 946, 'learning_rate': 0.10333693807931968}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=12, log=True, low=9, step=1), 'n_estimators': IntDistribution(high=1000, log=False, low=400, step=1), 'learning_rate': FloatDistribution(high=0.15, log=False, low=0.05, step=None)}, trial_id=0, value=None)