# Ordinary Least Squares on magnet challenge dataset

In [1]:
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
from scipy.interpolate import UnivariateSpline
from scipy.integrate import trapezoid

# Read Dataset

In [2]:
filepath = '/home/nikolasf/Dokumente/01_git/30_Python/MC_UPB/data/input/processed'
material_name = 'ten_materials'
ds = pd.read_pickle(f'{filepath}/{material_name}.pkl.gz')

print(pd.unique(ds["material"]))

['3C90' '3F4' '78' '77' 'N49' '3E6' 'N27' 'N30' '3C94' 'N87']


# Initial edit dataset

In [3]:

ds = ds.drop(columns=[c for c in ds if c.startswith("H_t")])
#ds = ds.query('temp == 25')

# add the saturation flux density. Data from datasheets.
ds.loc[ds['material'] == '3C90', 'b_sat_25'] = 0.47
ds.loc[ds['material'] == '3C94', 'b_sat_25'] = 0.47
ds.loc[ds['material'] == '3E6', 'b_sat_25'] = 0.46
ds.loc[ds['material'] == '3F4', 'b_sat_25'] = 0.41
ds.loc[ds['material'] == '77', 'b_sat_25'] = 0.51
ds.loc[ds['material'] == '78', 'b_sat_25'] = 0.48
ds.loc[ds['material'] == 'N27', 'b_sat_25'] = 0.50
ds.loc[ds['material'] == 'N30', 'b_sat_25'] = 0.38
ds.loc[ds['material'] == 'N49', 'b_sat_25'] = 0.49
ds.loc[ds['material'] == 'N87', 'b_sat_25'] = 0.49


print(ds['ploss'])
ds['ploss'] = np.log(ds['ploss'])
ds['freq'] = np.log(ds['freq'])

print(ds['ploss'])

# print(ds)

0           2319.444340
1           3191.235893
2           4341.086142
3           5795.359190
4           7813.691725
              ...      
186742     68121.464932
186743     85924.743654
186744    108411.839417
186745    135853.384296
186746    171849.522231
Name: ploss, Length: 186747, dtype: float64
0          7.749083
1          8.068164
2          8.375880
3          8.664813
4          8.963633
            ...    
186742    11.129048
186743    11.361227
186744    11.593693
186745    11.819332
186746    12.054375
Name: ploss, Length: 186747, dtype: float64


### Adapted Wilhelm example

In [4]:
"""Run linear regression with regularization training"""
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor, VotingRegressor, ExtraTreesRegressor
from tqdm import tqdm
from pprint import pprint
from utils.experiments import get_stratified_fold_indices, PROC_SOURCE
from utils.metrics import calculate_metrics
import xgboost as xgb

pd.set_option("display.max_columns", None)


def objective(trial):



    n_estimators = trial.suggest_int('n_estimators', 100, 800)
    # 429 is best
    print(f'{n_estimators = }')
    criterion = trial.suggest_categorical('criterion', ['squared_error'])
    print(f"{criterion = }")
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.5)
    # 0.12 is best
    print(f"{learning_rate = }")
    
    exp_log = {}
    for material_lbl, mat_df in tqdm(ds.groupby("material"), desc="Train across materials"):

        full_b = mat_df.loc[:, [f"B_t_{k}" for k in range(1024)]].to_numpy()

        dbdt = full_b[:, 1:] - full_b[:, :-1]
        mat_df = mat_df.reset_index(drop=True)

        x_vec = np.linspace(0, 1023, 1024)
        b_vec = []
        for value in x_vec:
            b_vec.append(f'B_t_{int(value)}')
        mat_df["b"] = mat_df[b_vec].values.tolist()
        x_vec = None
        b_vec = None
        mat_df['delta_b'] = mat_df['b'].map(lambda x: np.max(x) - np.min(x))

        # figure out integral_part 
        mat_df["time_s"] = mat_df["freq"].map(lambda x: np.linspace(0, 1/x, 1024))

        # derivation
        # according to https://im-coder.com/zweite-ableitung-in-python-scipy-numpy-pandas.html
        mat_df["fitted_function"] = mat_df.apply(lambda x: UnivariateSpline(x["time_s"], x["b"], s=0, k=4), axis=1)
        mat_df["amplitude_2nd_derivation"] = mat_df["fitted_function"].apply(lambda x: x.derivative(n=2))
        mat_df["integrated_function"] = mat_df.apply(lambda x: trapezoid(np.abs(x["amplitude_2nd_derivation"](x["time_s"])), x["time_s"]), axis=1)     

        mat_df["integral_part"] = mat_df["integrated_function"] / mat_df["delta_b"]

        
        
        mat_df["rms"] = np.apply_along_axis(lambda x: np.sqrt(np.mean(np.gradient(np.gradient(x))**2)), 1, full_b)
        mat_df["rqm"] = np.apply_along_axis(lambda x: np.sqrt(np.mean(np.gradient(np.gradient(x))**4)), 1, full_b)
        mat_df["peak"] = np.apply_along_axis(lambda x: max(np.gradient(np.gradient(x))), 1, full_b)
        
        # old, non-working examples
        #mat_df["rms"]=mat_df.apply(lambda x: np.sqrt(np.mean(np.gradient(np.gradient(x["full_b"]))**2)))
        #mat_df["rqm"]=mat_df.apply(lambda x: np.sqrt(np.mean(np.gradient(np.gradient(x["full_b"]))**4)))
        #mat_df["peak"] = mat_df.apply(lambda x: max(np.gradient(np.gradient(x["full_b"]))))

        

        # cross validation 'kfold'
        kfold_lbls = get_stratified_fold_indices(mat_df, 4)
        mat_df_proc = mat_df.assign(
            kfold=kfold_lbls,
            # integral_part=mat_df['integral_part'],
            #delta_b=mat_df['delta_b'],
            #b_sat=mat_df['b_sat_25'],
            db_bsat_1 = mat_df['delta_b'] / mat_df['b_sat_25'],
            db_bsat_2 = (mat_df['delta_b'] / mat_df['b_sat_25']) ** 2,
            db_bsat_3 = (mat_df['delta_b'] / mat_df['b_sat_25']) ** 3,
            db_bsat_4 = (mat_df['delta_b'] / mat_df['b_sat_25']) ** 4,
            db_bsat_5 = (mat_df['delta_b'] / mat_df['b_sat_25']) ** 5,
            db_bsat_6 = (mat_df['delta_b'] / mat_df['b_sat_25']) ** 6,
            temp_square = mat_df['temp'] ** 2,
            t2 = (mat_df['integral_part'] ** (-1)) / 2,
            #t3 = -(mat_df['integral_part'] ** (-2)) / 6,
            #t4 = ( 3 * mat_df['integral_part'] ** (-2) + 2 * mat_df['integral_part'] ** (-3)) / 24,
            #mean_abs_dbdt=np.mean(np.abs(mat_df["delta_b"])),
            rms = mat_df["rms"],
            rmq = mat_df["rqm"],
            peak = mat_df["peak"],
            # more features imaginable (count of spikes e.g.)
        ).drop(
            columns=[c for c in mat_df if c.startswith("B_t_")] + ["material"] + ['b'] + ['time_s'] + ['fitted_function'] +['amplitude_2nd_derivation'] + ['integrated_function'] + ['b_sat_25']
        )  # drop B curve

        # training result container
        results_df = mat_df_proc.loc[:, ["ploss", "kfold"]].assign(pred=0)
        x_cols = [c for c in mat_df_proc if c not in ["ploss", "kfold"]]
        print(x_cols)
        for kfold_lbl, test_fold_df in mat_df_proc.groupby("kfold"):
            train_fold_df = (
                mat_df_proc.query("kfold != @kfold_lbl")
                .reset_index(drop=True)
                .drop(columns="kfold")
            )
            assert len(train_fold_df) > 0, "empty dataframe error"
            y = train_fold_df.pop("ploss")
            X = train_fold_df.loc[:, x_cols]

            #mdl = GradientBoostingRegressor(n_estimators=n_estimators, criterion=criterion, learning_rate=learning_rate) # GradientBoostingRegressor() # HistGradientBoostingRegressor() # RandomForestRegressor(n_estimators = 100) #LinearRegression() # Ridge()  # 
            mdl = xgb.XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate) #, max_depth=max_depth)
            mdl.fit(X.to_numpy(), y.to_numpy())
            pred = mdl.predict(test_fold_df.loc[:, x_cols].to_numpy())
            results_df.loc[results_df.kfold == kfold_lbl, "pred"] = pred

        # book keeping
        exp_log[material_lbl] = calculate_metrics(
            np.exp(results_df.loc[:, "pred"]), np.exp(results_df.loc[:, "ploss"])
        )
    print(mdl.__class__.__name__)
    print(f"{n_estimators = }")
    print(f"{learning_rate = }")
    print(mdl.feature_importances_)
    
    print("Overall Score")
    score = pd.DataFrame(exp_log).T
    print(score)

    added_avg_abs_rel_err = score['avg-abs-rel-err'].sum()
    print('sum error')
    print(added_avg_abs_rel_err)
    
    # feature importances
    feat_importances = pd.Series(mdl.feature_importances_, index=x_cols)
    feat_importances.nlargest(20).plot(kind='barh')
    
    return added_avg_abs_rel_err

In [5]:
# print(mdl.features_importances_)


### Run objective function single

In [6]:
import numpy as np
import pandas as pd

# create a Numpy array
my_array = np.array([[1, 2, 3, 4, 5], [6,7,8,9,10]])

# create a Pandas data frame from the dictionary
my_df = pd.DataFrame()

my_df["test"] = my_array.tolist()

print(my_df)
print(f"{my_df.shape = }")
print(f"{my_array.shape = }")

my_df.to_numpy()


               test
0   [1, 2, 3, 4, 5]
1  [6, 7, 8, 9, 10]
my_df.shape = (2, 1)
my_array.shape = (2, 5)


array([[list([1, 2, 3, 4, 5])],
       [list([6, 7, 8, 9, 10])]], dtype=object)

In [7]:
# objective()

### Run objective function in a hyperparameter optimization loop

In [8]:
import optuna

import sklearn.datasets
import sklearn.ensemble
import sklearn.model_selection
import sklearn.svm

# FYI: Objective functions can take additional arguments
# (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).



study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=3, n_jobs=1)
#print(study.best_trial)

[I 2023-09-27 21:47:22,826] A new study created in memory with name: no-name-7ad6bf75-8f6f-4cf0-bcb6-0b5d216b62ec


n_estimators = 463
criterion = 'squared_error'
learning_rate = 0.22698166594174812


Train across materials:   0%|          | 0/10 [00:00<?, ?it/s]

['freq', 'temp', 'delta_b', 'integral_part', 'rms', 'rqm', 'peak', 'db_bsat_1', 'db_bsat_2', 'db_bsat_3', 'db_bsat_4', 'db_bsat_5', 'db_bsat_6', 'temp_square', 't2', 'rmq']


Train across materials:  10%|█         | 1/10 [04:07<37:06, 247.34s/it]
[W 2023-09-27 21:51:30,197] Trial 0 failed with parameters: {'n_estimators': 463, 'criterion': 'squared_error', 'learning_rate': 0.22698166594174812} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/nikolasf/Dokumente/01_git/30_Python/MC_UPB/.venv/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_1156/1418518545.py", line 60, in objective
    mat_df["peak"] = np.apply_along_axis(lambda x: max(np.gradient(np.gradient(x))), 1, full_b)
  File "<__array_function__ internals>", line 200, in apply_along_axis
  File "/home/nikolasf/Dokumente/01_git/30_Python/MC_UPB/.venv/lib/python3.10/site-packages/numpy/lib/shape_base.py", line 402, in apply_along_axis
    buff[ind] = asanyarray(func1d(inarr_view[ind], *args, **kwargs))
  File "/tmp/ipykernel_1156/1418518545.py", line 60, in <lambd

KeyboardInterrupt: 