# Gradient boosting regression on the magnet challenge dataset

In [1]:
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.integrate import trapezoid
from scipy.interpolate import UnivariateSpline
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Read Dataset

In [2]:
# Directory that holds the pre-processed dataset. The MC_UPB_DATA_DIR
# environment variable, when set, overrides the machine-specific default so
# the notebook can run on another machine without editing this cell.
filepath = os.environ.get(
    'MC_UPB_DATA_DIR',
    '/home/nikolasf/Dokumente/01_git/30_Python/MC_UPB/data/input/processed',
)
material_name = 'ten_materials'

# Load the gzip-compressed pickle '<filepath>/<material_name>.pkl.gz'.
# NOTE(review): unpickling can execute arbitrary code - only load files that
# come from a trusted source.
ds = pd.read_pickle(os.path.join(filepath, f'{material_name}.pkl.gz'))

# Initial dataset preparation

In [3]:

# Saturation flux density per material, taken from the datasheets
# (the column name suggests values at 25 degC - TODO confirm units/conditions).
B_SAT_25_BY_MATERIAL = {
    '3C90': 0.47,
    '3C94': 0.47,
    '3E6': 0.46,
    '3F4': 0.41,
    '77': 0.51,
    '78': 0.48,
    'N27': 0.50,
    'N30': 0.38,
    'N49': 0.49,
    'N87': 0.49,
}

# Drop every column whose name starts with "H_t"; they are not used below.
ds = ds.drop(columns=[c for c in ds if c.startswith("H_t")])
# Keep only rows with temp == 25. The explicit copy makes the filtered frame
# independent of its parent, so the column assignments below are unambiguous.
ds = ds.query('temp == 25').copy()

# Attach the saturation flux density of each row's material. astype(float)
# guarantees a plain float column even if 'material' is a categorical.
ds['b_sat_25'] = ds['material'].map(B_SAT_25_BY_MATERIAL).astype(float)
unknown_materials = sorted(ds.loc[ds['b_sat_25'].isna(), 'material'].astype(str).unique())
# Fail here rather than later with NaN features inside the model fit.
assert not unknown_materials, f"no b_sat_25 value for materials: {unknown_materials}"

# Train on log(ploss); the objective maps predictions back with np.exp.
# NOTE: this overwrites 'ploss' in place, so re-running this cell without
# reloading `ds` first would apply the logarithm twice.
print(ds['ploss'])
ds['ploss'] = np.log(ds['ploss'])
print(ds['ploss'])

0           2319.444340
1           3191.235893
2           4341.086142
3           5795.359190
4           7813.691725
              ...      
156279     61790.145402
156280     78810.500010
156281    100717.381914
156282    129153.194834
156283    165612.702450
Name: ploss, Length: 46915, dtype: float64
0          7.749083
1          8.068164
2          8.375880
3          8.664813
4          8.963633
            ...    
156279    11.031499
156280    11.274802
156281    11.520074
156282    11.768755
156283    12.017407
Name: ploss, Length: 46915, dtype: float64


### Adapted Wilhelm example

In [4]:
"""Run linear regression with regularization training"""
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor, VotingRegressor, ExtraTreesRegressor
from tqdm import tqdm
from pprint import pprint
from utils.experiments import get_stratified_fold_indices, PROC_SOURCE
from utils.metrics import calculate_metrics

# Lift the column-count limit so printed DataFrames show every column.
pd.set_option("display.max_columns", None)


def _abs_second_derivative_integral(b_waveform, freq):
    """Integrate |d^2 b / dt^2| over one period of a sampled waveform.

    The samples are placed on an evenly spaced time axis from 0 to 1/freq,
    interpolated with a degree-4 spline (s=0, i.e. passing through every
    sample), differentiated twice and integrated with the trapezoidal rule.

    Derivative approach according to
    https://im-coder.com/zweite-ableitung-in-python-scipy-numpy-pandas.html

    Parameters
    ----------
    b_waveform : 1-D array-like of float
        Waveform samples covering one period.
    freq : float
        Frequency of the waveform; 1/freq is used as the period length.

    Returns
    -------
    float
        Trapezoidal integral of the absolute second derivative.
    """
    time_s = np.linspace(0, 1 / freq, len(b_waveform))
    fitted_function = UnivariateSpline(time_s, b_waveform, s=0, k=4)
    second_derivative = fitted_function.derivative(n=2)
    return trapezoid(np.abs(second_derivative(time_s)), time_s)


def objective(trial):
    """Optuna objective: cross-validated gradient boosting per material.

    Reads the notebook-level DataFrame ``ds`` (one row per measurement with
    columns ``material``, ``freq``, ``temp``, ``b_sat_25``, ``ploss`` holding
    log-losses, and ``B_t_0`` ... ``B_t_1023``). For every material it builds
    features from the waveform, runs a 4-fold cross validation with a
    ``GradientBoostingRegressor`` and scores the out-of-fold predictions with
    ``calculate_metrics`` after mapping them back with ``np.exp``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to suggest hyperparameters. Only ``criterion`` is
        suggested at the moment; ``n_estimators`` and ``learning_rate`` are
        fixed.

    Returns
    -------
    float
        Sum over all materials of the ``avg-abs-rel-err`` metric.

    Raises
    ------
    ValueError
        If ``ds`` yields no material group, so nothing could be scored.
    """
    n_estimators = 429  # trial.suggest_int('n_estimators', 10, 500)
    print(f'{n_estimators = }')
    criterion = trial.suggest_categorical('criterion', ['friedman_mse'])
    print(f"{criterion = }")
    learning_rate = 0.12  # trial.suggest_float('learning_rate', 0.01, 1)
    print(f"{learning_rate = }")
    # Fixed seed so repeated runs of the same trial give the same score.
    random_state = 0

    b_cols = [f"B_t_{k}" for k in range(1024)]
    exp_log = {}
    mdl = None
    for material_lbl, mat_df in tqdm(ds.groupby("material"), desc="Train across materials"):
        mat_df = mat_df.reset_index(drop=True)
        full_b = mat_df.loc[:, b_cols].to_numpy()

        # peak-to-peak value of each waveform
        mat_df["delta_b"] = full_b.max(axis=1) - full_b.min(axis=1)

        # integral of |second derivative| per waveform, normalised by delta_b
        integrated_function = np.array(
            [
                _abs_second_derivative_integral(b_row, freq)
                for b_row, freq in zip(full_b, mat_df["freq"].to_numpy())
            ]
        )
        mat_df["integral_part"] = integrated_function / mat_df["delta_b"]

        # cross validation 'kfold'
        # NOTE(review): get_stratified_fold_indices is a project helper; it is
        # assumed not to rely on temporary columns of mat_df - verify.
        kfold_lbls = get_stratified_fold_indices(mat_df, 4)
        db_bsat = mat_df['delta_b'] / mat_df['b_sat_25']
        mat_df_proc = mat_df.assign(
            kfold=kfold_lbls,
            # powers 1..6 of the peak-to-peak value relative to saturation
            **{f"db_bsat_{power}": db_bsat ** power for power in range(1, 7)},
            # t0/t1 are constants; t2..t4 are built from negative powers of
            # integral_part (presumably series-expansion terms - TODO confirm)
            t0=1,
            t1=0,
            t2=(mat_df['integral_part'] ** (-1)) / 2,
            t3=-(mat_df['integral_part'] ** (-2)) / 6,
            t4=(3 * mat_df['integral_part'] ** (-2) + 2 * mat_df['integral_part'] ** (-3)) / 24,
            # more features imaginable (count of spikes e.g.)
        ).drop(
            columns=[c for c in mat_df if c.startswith("B_t_")] + ["material", "temp", "b_sat_25"]
        )  # drop B curve and non-feature columns

        # training result container; float column so predictions can be
        # written without an int -> float dtype change
        results_df = mat_df_proc.loc[:, ["ploss", "kfold"]].assign(pred=0.0)
        x_cols = [c for c in mat_df_proc if c not in ["ploss", "kfold"]]
        print(x_cols)
        for kfold_lbl, test_fold_df in mat_df_proc.groupby("kfold"):
            train_fold_df = (
                mat_df_proc.query("kfold != @kfold_lbl")
                .reset_index(drop=True)
                .drop(columns="kfold")
            )
            assert len(train_fold_df) > 0, "empty dataframe error"
            y = train_fold_df.pop("ploss")
            X = train_fold_df.loc[:, x_cols]

            mdl = GradientBoostingRegressor(
                n_estimators=n_estimators,
                criterion=criterion,
                learning_rate=learning_rate,
                random_state=random_state,
            )  # alternatives tried: HistGradientBoostingRegressor(), RandomForestRegressor(n_estimators=100), LinearRegression(), Ridge()
            mdl.fit(X.to_numpy(), y.to_numpy())
            pred = mdl.predict(test_fold_df.loc[:, x_cols].to_numpy())
            results_df.loc[results_df.kfold == kfold_lbl, "pred"] = pred

        # book keeping: score in the original (non-log) loss domain
        exp_log[material_lbl] = calculate_metrics(
            np.exp(results_df.loc[:, "pred"]), np.exp(results_df.loc[:, "ploss"])
        )

    if not exp_log:
        raise ValueError("ds contains no material groups; nothing to train on")

    print(mdl.__class__.__name__)
    print(f"{n_estimators = }")
    print(f"{learning_rate = }")
    # print(mdl.feature_importances_)

    print("Overall Score")
    score = pd.DataFrame(exp_log).T
    print(score)

    added_avg_abs_rel_err = score['avg-abs-rel-err'].sum()
    print('sum error')
    print(added_avg_abs_rel_err)

    return added_avg_abs_rel_err

In [5]:
# print(mdl.feature_importances_)  # NOTE: `mdl` is local to objective(); expose the fitted model before enabling this


### Run the objective function once

In [6]:
# objective()

### Run objective function in a hyperparameter optimization loop

In [7]:
import optuna

import sklearn.datasets
import sklearn.ensemble
import sklearn.model_selection
import sklearn.svm


# FYI: Objective functions can take additional arguments
# (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).



# Build a study that searches for the smallest objective value, then evaluate
# `objective` for a single trial without parallel jobs.
study = optuna.create_study(direction="minimize")
study.optimize(func=objective, n_trials=1, n_jobs=1)
# print(study.best_trial)

[I 2023-09-01 08:50:36,506] A new study created in memory with name: no-name-e0f808de-0d87-4c0c-aecd-f35deaa82db9


n_estimators = 429
criterion = 'friedman_mse'
learning_rate = 0.12


Train across materials:   0%|          | 0/10 [00:00<?, ?it/s]

['freq', 'delta_b', 'integral_part', 'db_bsat_1', 'db_bsat_2', 'db_bsat_3', 'db_bsat_4', 'db_bsat_5', 'db_bsat_6', 't0', 't1', 't2', 't3', 't4']


Train across materials:  10%|█         | 1/10 [01:13<11:01, 73.44s/it]

['freq', 'delta_b', 'integral_part', 'db_bsat_1', 'db_bsat_2', 'db_bsat_3', 'db_bsat_4', 'db_bsat_5', 'db_bsat_6', 't0', 't1', 't2', 't3', 't4']


Train across materials:  20%|██        | 2/10 [02:28<09:52, 74.11s/it]

['freq', 'delta_b', 'integral_part', 'db_bsat_1', 'db_bsat_2', 'db_bsat_3', 'db_bsat_4', 'db_bsat_5', 'db_bsat_6', 't0', 't1', 't2', 't3', 't4']


Train across materials:  30%|███       | 3/10 [02:41<05:24, 46.30s/it]

['freq', 'delta_b', 'integral_part', 'db_bsat_1', 'db_bsat_2', 'db_bsat_3', 'db_bsat_4', 'db_bsat_5', 'db_bsat_6', 't0', 't1', 't2', 't3', 't4']


Train across materials:  40%|████      | 4/10 [02:55<03:21, 33.52s/it]

['freq', 'delta_b', 'integral_part', 'db_bsat_1', 'db_bsat_2', 'db_bsat_3', 'db_bsat_4', 'db_bsat_5', 'db_bsat_6', 't0', 't1', 't2', 't3', 't4']


Train across materials:  50%|█████     | 5/10 [03:15<02:23, 28.78s/it]

['freq', 'delta_b', 'integral_part', 'db_bsat_1', 'db_bsat_2', 'db_bsat_3', 'db_bsat_4', 'db_bsat_5', 'db_bsat_6', 't0', 't1', 't2', 't3', 't4']


Train across materials:  60%|██████    | 6/10 [03:43<01:53, 28.39s/it]

['freq', 'delta_b', 'integral_part', 'db_bsat_1', 'db_bsat_2', 'db_bsat_3', 'db_bsat_4', 'db_bsat_5', 'db_bsat_6', 't0', 't1', 't2', 't3', 't4']


Train across materials:  70%|███████   | 7/10 [04:03<01:17, 25.90s/it]

['freq', 'delta_b', 'integral_part', 'db_bsat_1', 'db_bsat_2', 'db_bsat_3', 'db_bsat_4', 'db_bsat_5', 'db_bsat_6', 't0', 't1', 't2', 't3', 't4']


Train across materials:  80%|████████  | 8/10 [04:20<00:45, 22.78s/it]

['freq', 'delta_b', 'integral_part', 'db_bsat_1', 'db_bsat_2', 'db_bsat_3', 'db_bsat_4', 'db_bsat_5', 'db_bsat_6', 't0', 't1', 't2', 't3', 't4']


Train across materials:  90%|█████████ | 9/10 [04:35<00:20, 20.34s/it]

['freq', 'delta_b', 'integral_part', 'db_bsat_1', 'db_bsat_2', 'db_bsat_3', 'db_bsat_4', 'db_bsat_5', 'db_bsat_6', 't0', 't1', 't2', 't3', 't4']


Train across materials: 100%|██████████| 10/10 [05:46<00:00, 34.65s/it]
[I 2023-09-01 08:56:23,194] Trial 0 finished with value: 0.8605990165875625 and parameters: {'criterion': 'friedman_mse'}. Best is trial 0 with value: 0.8605990165875625.


GradientBoostingRegressor
n_estimators = 429
learning_rate = 0.12
Overall Score
               mse           mae  avg-abs-rel-err  percentile_5_rel_err  \
3C90  7.046213e+08  12222.947832         0.092755              0.005449   
3C94  2.006546e+09  20709.790009         0.092965              0.006054   
3E6   4.597660e+09  33245.532382         0.085069              0.005876   
3F4   7.293612e+08   9697.432630         0.058616              0.004035   
77    2.444026e+09  22798.598061         0.097164              0.006124   
78    1.703914e+09  18989.606688         0.097450              0.006583   
N27   2.313452e+09  21663.224465         0.075091              0.004891   
N30   5.115155e+09  31634.673170         0.085142              0.006145   
N49   3.939435e+09  30377.704650         0.099155              0.005876   
N87   5.543226e+08  11180.969596         0.077192              0.005165   

      percentile_95_rel_err       l_infty   l_infty_over  l_infty_under  
3C90               0