In [1]:
import ast
import pickle
import re
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import StandardScaler

from run1.lib.classes_ml import DataHandler
from run1.lib.directory import get_directory
from run1.lib.optuna_ml import OptunaUtil
from run1.lib.shap import ShapUtil

In [2]:
CURRENT_DIR = Path.cwd()  # Current directory of the running file
# Get data directory
directory = get_directory(CURRENT_DIR, verbose=True)
DATA_PATH = directory["DATA_PATH"]
STUDY_ML_PATH = directory["STUDY_ML_PATH"]

Code is running in a Jupyter environment.
ROOT_DIR: c:\Users\admin\Coding\research\weld-ml
DATA_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P03_MF_2\T01_af_features
DATA_PATH: c:\Users\admin\Coding\research\weld-ml\run1\P03_MF_2\T01_af_features\S01_combined_data.xlsx
STUDY_ML_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P03_MF_2\T02_optuna
STUDY_ML_PATH: c:\Users\admin\Coding\research\weld-ml\run1\P03_MF_2\T02_optuna\S02_combine_study.xlsx
STUDY_TABPFN_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P03_MF_2\T11_tabPFN
STUDY_TABPFN_PATH: c:\Users\admin\Coding\research\weld-ml\run1\P03_MF_2\T11_tabPFN\S01_calculate_performance.xlsx


In [3]:
# Load study info
study_info = pd.read_excel(STUDY_ML_PATH)
study_info["model_params"] = study_info["model_params"].apply(ast.literal_eval)

# Load experimental data
_df = pd.read_excel(DATA_PATH)
print(f"df.shape: {_df.shape}")


df.shape: (378, 61)


In [4]:
# Select columns for features and targets
colsY = [c for c in _df.columns if re.search(r"stress_value", c)]
colsX = [c for c in _df.columns if c not in ["sample_no", "location", *colsY]]

# Select feature columns based on predefined names
colsY = [c for c in colsY if c in ["stress_value_center"]]

_dfY = _df[colsY]
_dfX = _df[colsX]
print("Selected feature columns:", colsX)
print("Selected target columns:", colsY)
print(f"dfX.shape: {_dfX.shape}")
print(f"dfY.shape: {_dfY.shape}")

Selected feature columns: ['position', 'R', 'W', 'D', 'Fx_location', 'Fy_location', 'Fz_location', 'Mz_location', 'Fx__dwell__fft_coefficient__attr_"abs"__coeff_11', 'Fx__dwell__ar_coefficient__coeff_0__k_10', 'Fx__dwell__quantile__q_0.7', 'Fx__dwell__fft_coefficient__attr_"real"__coeff_62', 'Fx__dwell__partial_autocorrelation__lag_6', 'Fx__dwell__change_quantiles__f_agg_"mean"__isabs_True__qh_0.8__ql_0.2', 'Fy__dwell__fft_coefficient__attr_"real"__coeff_71', 'Fy__dwell__fft_coefficient__attr_"abs"__coeff_59', 'Fy__dwell__last_location_of_minimum', 'Fy__dwell__first_location_of_minimum', 'Fy__dwell__change_quantiles__f_agg_"mean"__isabs_True__qh_0.8__ql_0.2', 'Fy__dwell__autocorrelation__lag_3', 'Fz__dwell__fft_coefficient__attr_"angle"__coeff_4', 'Fz__dwell__fft_coefficient__attr_"real"__coeff_4', 'Fz__dwell__fft_coefficient__attr_"angle"__coeff_68', 'Fz__dwell__change_quantiles__f_agg_"mean"__isabs_True__qh_0.4__ql_0.0', 'Fz__dwell__approximate_entropy__m_2__r_0.7', 'Fz__dwell__appro

In [5]:
# %% Extract features and targets
_X = _dfX.values
_Y = _dfY.values
print(f"_X.shape: {_X.shape}")
print(f"_Y.shape: {_Y.shape}")

_X.shape: (378, 56)
_Y.shape: (378, 1)


In [6]:
# Create DataHandler instance
data_handler = DataHandler(
    _X=_X,
    _Y=_Y,
    scalerX=StandardScaler(),
    scalerY=StandardScaler(),
    colsX=colsX,
    colsY=colsY,
)

In [7]:
study_info

Unnamed: 0,study_name,dt,model,n_trials,random_state,test_size,best_param,best_value,total_trial,mse_mean,mse_std,mape_mean,mape_std,r2_mean,r2_std,model_params
0,study_DTR_RS-1_TS-0_3,2026-01-29_17-14,DTR,1,1,0.3,"{'criterion': 'friedman_mse', 'splitter': 'bes...",-0.017166,1,0.966295,0.048622,1.621212,0.429574,0.017166,0.006515,"{'criterion': 'friedman_mse', 'splitter': 'bes..."
1,study_DTR_RS-2_TS-0_3,2026-01-29_17-14,DTR,1,2,0.3,"{'criterion': 'friedman_mse', 'splitter': 'bes...",-0.012265,1,0.968677,0.108174,1.805419,0.332943,0.012265,0.07171,"{'criterion': 'friedman_mse', 'splitter': 'bes..."
2,study_DTR_RS-3_TS-0_3,2026-01-29_17-14,DTR,1,3,0.3,"{'criterion': 'friedman_mse', 'splitter': 'bes...",0.027131,1,1.004886,0.115896,1.744065,0.090434,-0.027131,0.044853,"{'criterion': 'friedman_mse', 'splitter': 'bes..."
3,study_DTR_RS-4_TS-0_3,2026-01-29_17-14,DTR,1,4,0.3,"{'criterion': 'friedman_mse', 'splitter': 'bes...",-0.022317,1,0.962534,0.074204,1.743513,0.169511,0.022317,0.037264,"{'criterion': 'friedman_mse', 'splitter': 'bes..."
4,study_DTR_RS-5_TS-0_3,2026-01-29_17-14,DTR,1,5,0.3,"{'criterion': 'friedman_mse', 'splitter': 'bes...",-0.009735,1,0.968498,0.074488,19.07546,17.72396,0.009735,0.046996,"{'criterion': 'friedman_mse', 'splitter': 'bes..."
5,study_EN_RS-1_TS-0_3,2026-01-29_17-14,EN,1,1,0.3,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ...",0.050521,1,1.032262,0.069856,1.585923,0.390798,-0.050521,0.050102,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ..."
6,study_EN_RS-2_TS-0_3,2026-01-29_17-14,EN,1,2,0.3,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ...",0.076614,1,1.058368,0.117748,1.908094,0.271009,-0.076614,0.045717,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ..."
7,study_EN_RS-3_TS-0_3,2026-01-29_17-14,EN,1,3,0.3,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ...",0.071113,1,1.036521,0.079935,1.912367,0.347258,-0.071113,0.07183,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ..."
8,study_EN_RS-4_TS-0_3,2026-01-29_17-14,EN,1,4,0.3,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ...",0.03453,1,1.013405,0.041367,2.594135,1.165559,-0.03453,0.089661,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ..."
9,study_EN_RS-5_TS-0_3,2026-01-29_17-14,EN,1,5,0.3,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ...",-0.025992,1,0.9596,0.057493,12.964309,11.432652,0.025992,0.056964,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ..."


In [8]:
# Get the best study info for each model based on r2_mean
study_info_best = study_info.groupby("model").apply(
    lambda df: df.sort_values("r2_mean", ascending=False).head(1), include_groups=False
)
study_info_best = study_info_best.reset_index().drop(columns=["level_1"])
study_info_best

Unnamed: 0,model,study_name,dt,n_trials,random_state,test_size,best_param,best_value,total_trial,mse_mean,mse_std,mape_mean,mape_std,r2_mean,r2_std,model_params
0,DTR,study_DTR_RS-4_TS-0_3,2026-01-29_17-14,1,4,0.3,"{'criterion': 'friedman_mse', 'splitter': 'bes...",-0.022317,1,0.962534,0.074204,1.743513,0.169511,0.022317,0.037264,"{'criterion': 'friedman_mse', 'splitter': 'bes..."
1,EN,study_EN_RS-5_TS-0_3,2026-01-29_17-14,1,5,0.3,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ...",-0.025992,1,0.9596,0.057493,12.964309,11.432652,0.025992,0.056964,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ..."
2,GBR,study_GBR_RS-5_TS-0_3,2026-01-29_17-14,1,5,0.3,"{'n_estimators': 118, 'learning_rate': 0.63512...",0.243537,1,1.20587,0.043977,40.365125,54.169215,-0.243537,0.030373,"{'n_estimators': 118, 'learning_rate': 0.63512..."
3,KNR,study_KNR_RS-5_TS-0_3,2026-01-29_17-14,1,5,0.3,"{'n_neighbors': 19, 'weights': 'uniform', 'alg...",-0.069198,1,0.9158,0.047114,13.513571,14.246245,0.069198,0.038995,"{'n_neighbors': 19, 'weights': 'uniform', 'alg..."
4,RFR,study_RFR_RS-1_TS-0_3,2026-01-29_17-14,1,1,0.3,"{'n_estimators': 118, 'max_depth': 106, 'min_s...",-0.178945,1,0.808052,0.070217,1.608309,0.453406,0.178945,0.03487,"{'n_estimators': 118, 'max_depth': 106, 'min_s..."
5,SVR,study_SVR_RS-4_TS-0_3,2026-01-29_17-14,1,4,0.3,"{'kernel': 'linear', 'C': 7.451156502282094e-0...",-0.005248,1,0.987033,0.073066,1.109646,0.116544,0.005248,0.001448,"{'kernel': 'linear', 'C': 7.451156502282094e-0..."
6,XGBR,study_XGBR_RS-5_TS-0_3,2026-01-29_17-14,1,5,0.3,"{'n_estimators': 118, 'max_depth': 12, 'learni...",-0.083821,1,0.891025,0.029258,24.775407,36.73552,0.083821,0.050994,"{'n_estimators': 118, 'max_depth': 12, 'learni..."


In [9]:
shaps = []
infos = []
for idx, study in study_info_best.iterrows():
    # idx = 1
    # study = study_info_best.iloc[idx]
    model = study["model"]
    model_params = study["model_params"]

    # These parameters are used so that I can recover the same StandardScaler and train-test split as in the Optuna study
    random_state = study["random_state"]
    test_size = study["test_size"]

    print(
        f"Processing study {idx + 1}/{len(study_info)}: model={model}, random_state={random_state}, test_size={test_size}"
    )
    data_handler.split_and_scale(random_state=random_state, test_size=test_size)
    df_X_train, df_Y_train = data_handler.get_train(as_dataframe=True)
    df_X_test, df_Y_test = data_handler.get_test(as_dataframe=True)

    # Combine train and test sets
    df_X_comb = pd.concat([df_X_train, df_X_test], axis=0)
    df_Y_comb = pd.concat([df_Y_train, df_Y_test], axis=0)
    reg = OptunaUtil.get_model(model_name=model, **model_params)

    # Fit the model on the combined dataset for SHAP analysis. Use the underlying estimator instead of the MultiOutputRegressor
    estimator = reg.estimator
    estimator.fit(df_X_comb, df_Y_comb.values.ravel())
    explainer = ShapUtil.get_shap_explainer(
        model_name=model, estimator=estimator, X=df_X_comb
    )
    shap_values = explainer(df_X_comb)

    # Save info
    info = dict(
        model=model,
        model_params=model_params,
        test_size=test_size,
        random_state=random_state,
    )
    infos.append(info)
    
    # Store SHAP values
    shaps.append(
        dict(
            **info,
            shap_values=shap_values,
        )
    )

Processing study 1/35: model=DTR, random_state=4, test_size=0.3
Processing study 2/35: model=EN, random_state=5, test_size=0.3


PermutationExplainer explainer: 379it [00:19, 14.48it/s]                         


Processing study 3/35: model=GBR, random_state=5, test_size=0.3
Processing study 4/35: model=KNR, random_state=5, test_size=0.3


[WinError 2] The system cannot find the file specified
  File "c:\Users\admin\Coding\research\weld-ml\.venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Python312\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python312\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Python312\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
PermutationExplainer explainer: 379it [01:25,  3.94it/s]                         


Processing study 5/35: model=RFR, random_state=1, test_size=0.3
Processing study 6/35: model=SVR, random_state=4, test_size=0.3


PermutationExplainer explainer: 379it [03:43,  1.56it/s]                         


Processing study 7/35: model=XGBR, random_state=5, test_size=0.3


In [10]:
with open("S01_shap_calc.pkl", "wb") as f:
    pickle.dump(dict(shaps=shaps, infos=infos), f)