In [1]:
import ast
import pickle
import re
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import StandardScaler

from run1.lib.classes_ml import DataHandler
from run1.lib.directory import get_directory
from run1.lib.optuna_ml import OptunaUtil
from run1.lib.shap import ShapUtil

In [2]:
# Working directory of the running notebook; used to resolve the project's paths
CURRENT_DIR = Path.cwd()

# get_directory returns a mapping of named locations; verbose=True also prints them
directory = get_directory(CURRENT_DIR, verbose=True)

# Only the experimental data file and the combined Optuna study summary are needed here
DATA_PATH, STUDY_ML_PATH = (
    directory[key] for key in ("DATA_PATH", "STUDY_ML_PATH")
)

Code is running in a Jupyter environment.
ROOT_DIR: c:\Users\admin\Coding\research\weld-ml
DATA_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P03_MF_2\T01_af_features
DATA_PATH: c:\Users\admin\Coding\research\weld-ml\run1\P03_MF_2\T01_af_features\S01_combined_data.xlsx
STUDY_ML_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P03_MF_2\T02_optuna
STUDY_ML_PATH: c:\Users\admin\Coding\research\weld-ml\run1\P03_MF_2\T02_optuna\S02_combine_study.xlsx
STUDY_TABPFN_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P03_MF_2\T11_tabPFN
STUDY_TABPFN_PATH: c:\Users\admin\Coding\research\weld-ml\run1\P03_MF_2\T11_tabPFN\S01_calculate_performance.xlsx


In [3]:
# Study summary: `model_params` is stored in the spreadsheet as the text form of
# a dict, so it is parsed back into a real Python object right after loading
study_info = pd.read_excel(STUDY_ML_PATH).assign(
    model_params=lambda frame: frame["model_params"].apply(ast.literal_eval)
)

# Experimental data (features and targets in one sheet)
_df = pd.read_excel(DATA_PATH)
print(f"df.shape: {_df.shape}")


df.shape: (378, 61)


In [4]:
# Every column whose name contains "stress_value" is a candidate target
stress_cols = [c for c in _df.columns if re.search(r"stress_value", c)]

# Features are all remaining columns except `sample_no` and `location`.
# All stress columns are excluded here (not only the modelled one) so that no
# other stress measurement ends up among the inputs.
non_feature_cols = {"sample_no", "location", *stress_cols}
colsX = [c for c in _df.columns if c not in non_feature_cols]

# Narrow the targets down to the one(s) modelled in this notebook
target_names = ["stress_value_center"]
colsY = [c for c in stress_cols if c in target_names]
if not colsY:
    # Fail here rather than later with an empty target frame
    raise ValueError(
        f"None of the requested target columns {target_names} were found in the data"
    )

_dfY = _df[colsY]
_dfX = _df[colsX]
print("Selected feature columns:", colsX)
print("Selected target columns:", colsY)
print(f"dfX.shape: {_dfX.shape}")
print(f"dfY.shape: {_dfY.shape}")

Selected feature columns: ['position', 'R', 'W', 'D', 'Fx_location', 'Fy_location', 'Fz_location', 'Mz_location', 'Fx__dwell__fft_coefficient__attr_"abs"__coeff_11', 'Fx__dwell__ar_coefficient__coeff_0__k_10', 'Fx__dwell__quantile__q_0.7', 'Fx__dwell__fft_coefficient__attr_"real"__coeff_62', 'Fx__dwell__partial_autocorrelation__lag_6', 'Fx__dwell__change_quantiles__f_agg_"mean"__isabs_True__qh_0.8__ql_0.2', 'Fy__dwell__fft_coefficient__attr_"real"__coeff_71', 'Fy__dwell__fft_coefficient__attr_"abs"__coeff_59', 'Fy__dwell__last_location_of_minimum', 'Fy__dwell__first_location_of_minimum', 'Fy__dwell__change_quantiles__f_agg_"mean"__isabs_True__qh_0.8__ql_0.2', 'Fy__dwell__autocorrelation__lag_3', 'Fz__dwell__fft_coefficient__attr_"angle"__coeff_4', 'Fz__dwell__fft_coefficient__attr_"real"__coeff_4', 'Fz__dwell__fft_coefficient__attr_"angle"__coeff_68', 'Fz__dwell__change_quantiles__f_agg_"mean"__isabs_True__qh_0.4__ql_0.0', 'Fz__dwell__approximate_entropy__m_2__r_0.7', 'Fz__dwell__appro

In [5]:
# Convert the feature / target frames to plain NumPy arrays for DataHandler
_X = _dfX.to_numpy()
_Y = _dfY.to_numpy()

# Sanity check: both arrays must have the same number of rows
print(f"_X.shape: {_X.shape}")
print(f"_Y.shape: {_Y.shape}")

_X.shape: (378, 56)
_Y.shape: (378, 1)


In [6]:
# Separate scaler objects for inputs and target; they are handed to the handler
# unfitted (splitting/scaling is triggered later via split_and_scale)
feature_scaler = StandardScaler()
target_scaler = StandardScaler()

# Bundle the raw arrays, their scalers and the column names in one DataHandler
data_handler = DataHandler(
    _X=_X,
    _Y=_Y,
    scalerX=feature_scaler,
    scalerY=target_scaler,
    colsX=colsX,
    colsY=colsY,
)

In [7]:
# Display the loaded study summary (one row per Optuna study) for inspection
study_info

Unnamed: 0,study_name,dt,model,n_trials,random_state,test_size,best_param,best_value,total_trial,mse_mean,mse_std,mape_mean,mape_std,r2_mean,r2_std,model_params
0,study_DTR_RS-1_TS-0_3,2026-01-30_04-30,DTR,1,1,0.3,"{'criterion': 'friedman_mse', 'splitter': 'bes...",-0.146928,52,0.838077,0.046165,1.827033,0.694127,0.146928,0.036472,"{'criterion': 'friedman_mse', 'splitter': 'bes..."
1,study_DTR_RS-2_TS-0_3,2026-01-30_04-30,DTR,1,2,0.3,"{'criterion': 'squared_error', 'splitter': 'be...",-0.115367,52,0.871588,0.107509,1.926611,0.352723,0.115367,0.061003,"{'criterion': 'squared_error', 'splitter': 'be..."
2,study_DTR_RS-3_TS-0_3,2026-01-30_04-30,DTR,1,3,0.3,"{'criterion': 'absolute_error', 'splitter': 'r...",-0.105443,52,0.879979,0.127294,1.69683,0.18093,0.105443,0.029103,"{'criterion': 'absolute_error', 'splitter': 'r..."
3,study_DTR_RS-4_TS-0_3,2026-01-30_04-30,DTR,1,4,0.3,"{'criterion': 'squared_error', 'splitter': 'be...",-0.137617,52,0.848966,0.040036,2.006716,0.392713,0.137617,0.02174,"{'criterion': 'squared_error', 'splitter': 'be..."
4,study_DTR_RS-5_TS-0_3,2026-01-30_04-30,DTR,1,5,0.3,"{'criterion': 'absolute_error', 'splitter': 'b...",-0.145449,52,0.824146,0.042275,17.987777,24.326394,0.145449,0.02257,"{'criterion': 'absolute_error', 'splitter': 'b..."
5,study_EN_RS-1_TS-0_3,2026-01-30_04-30,EN,1,1,0.3,"{'alpha': 0.07233779101064103, 'l1_ratio': 0.8...",-0.189238,52,0.796403,0.033227,1.591885,0.377639,0.189238,0.022447,"{'alpha': 0.07233779101064103, 'l1_ratio': 0.8..."
6,study_EN_RS-2_TS-0_3,2026-01-30_04-30,EN,1,2,0.3,"{'alpha': 0.10237815206681478, 'l1_ratio': 0.6...",-0.159695,52,0.827182,0.099264,1.761525,0.267865,0.159695,0.038073,"{'alpha': 0.10237815206681478, 'l1_ratio': 0.6..."
7,study_EN_RS-3_TS-0_3,2026-01-30_04-30,EN,1,3,0.3,"{'alpha': 0.0885690610531529, 'l1_ratio': 0.89...",-0.137036,52,0.840953,0.100242,1.437507,0.124851,0.137036,0.018267,"{'alpha': 0.0885690610531529, 'l1_ratio': 0.89..."
8,study_EN_RS-4_TS-0_3,2026-01-30_04-30,EN,1,4,0.3,"{'alpha': 0.06889337054498809, 'l1_ratio': 0.9...",-0.186765,52,0.800148,0.022023,1.718647,0.287858,0.186765,0.038329,"{'alpha': 0.06889337054498809, 'l1_ratio': 0.9..."
9,study_EN_RS-5_TS-0_3,2026-01-30_04-30,EN,1,5,0.3,"{'alpha': 0.15729272385950474, 'l1_ratio': 0.1...",-0.215084,52,0.771215,0.031874,9.480635,10.835316,0.215084,0.012047,"{'alpha': 0.15729272385950474, 'l1_ratio': 0.1..."


In [8]:
def _top_by_r2(group):
    """Return the single row of `group` with the highest `r2_mean`."""
    return group.sort_values("r2_mean", ascending=False).head(1)


# One winning study per model: rank each model's studies by mean R² and keep the top row.
# include_groups=False keeps the grouping column out of the frames passed to the helper.
best_per_model = study_info.groupby("model").apply(_top_by_r2, include_groups=False)

# The result is indexed by (model, original row label); discard the row label
# and turn `model` back into an ordinary column
study_info_best = best_per_model.droplevel(-1).reset_index()
study_info_best

Unnamed: 0,model,study_name,dt,n_trials,random_state,test_size,best_param,best_value,total_trial,mse_mean,mse_std,mape_mean,mape_std,r2_mean,r2_std,model_params
0,DTR,study_DTR_RS-1_TS-0_3,2026-01-30_04-30,1,1,0.3,"{'criterion': 'friedman_mse', 'splitter': 'bes...",-0.146928,52,0.838077,0.046165,1.827033,0.694127,0.146928,0.036472,"{'criterion': 'friedman_mse', 'splitter': 'bes..."
1,EN,study_EN_RS-5_TS-0_3,2026-01-30_04-30,1,5,0.3,"{'alpha': 0.15729272385950474, 'l1_ratio': 0.1...",-0.215084,52,0.771215,0.031874,9.480635,10.835316,0.215084,0.012047,"{'alpha': 0.15729272385950474, 'l1_ratio': 0.1..."
2,GBR,study_GBR_RS-5_TS-0_3,2026-01-30_04-30,1,5,0.3,"{'n_estimators': 356, 'learning_rate': 0.00633...",-0.221137,52,0.760523,0.024946,19.713722,26.293683,0.221137,0.015082,"{'n_estimators': 356, 'learning_rate': 0.00633..."
3,KNR,study_KNR_RS-5_TS-0_3,2026-01-30_04-30,1,5,0.3,"{'n_neighbors': 44, 'weights': 'distance', 'al...",-0.114948,52,0.873314,0.041455,11.560775,13.093146,0.114948,0.028722,"{'n_neighbors': 44, 'weights': 'distance', 'al..."
4,RFR,study_RFR_RS-5_TS-0_3,2026-01-30_04-30,1,5,0.3,"{'n_estimators': 323, 'max_depth': 8, 'min_sam...",-0.21002,52,0.771629,0.036225,17.759529,22.024575,0.21002,0.02227,"{'n_estimators': 323, 'max_depth': 8, 'min_sam..."
5,SVR,study_SVR_RS-5_TS-0_3,2026-01-30_04-30,1,5,0.3,"{'kernel': 'linear', 'C': 0.004174129907473797...",-0.180172,52,0.803706,0.036731,7.636933,8.340382,0.180172,0.022757,"{'kernel': 'linear', 'C': 0.004174129907473797..."
6,XGBR,study_XGBR_RS-5_TS-0_3,2026-01-30_04-30,1,5,0.3,"{'n_estimators': 183, 'max_depth': 9, 'learnin...",-0.217352,53,0.764681,0.032285,13.891006,17.584359,0.217352,0.019389,"{'n_estimators': 183, 'max_depth': 9, 'learnin..."


In [9]:
# Compute SHAP values for the best study of every model.
# `infos` collects the per-model settings; `shaps` holds the same settings plus
# the SHAP result. Both lists are pickled in the next cell.
shaps = []
infos = []
n_best = len(study_info_best)  # number of studies actually processed below
for step, (idx, study) in enumerate(study_info_best.iterrows(), start=1):
    model = study["model"]
    model_params = study["model_params"]

    # These parameters are reused so that the same StandardScaler fit and
    # train-test split as in the Optuna study are recovered
    random_state = study["random_state"]
    test_size = study["test_size"]

    # Progress is reported against the frame being iterated (study_info_best),
    # with a 1-based counter that does not depend on the frame's index labels
    print(
        f"Processing study {step}/{n_best}: model={model}, random_state={random_state}, test_size={test_size}"
    )
    data_handler.split_and_scale(random_state=random_state, test_size=test_size)
    df_X_train, df_Y_train = data_handler.get_train(as_dataframe=True)
    df_X_test, df_Y_test = data_handler.get_test(as_dataframe=True)

    # Combine train and test sets so SHAP is evaluated on every sample
    df_X_comb = pd.concat([df_X_train, df_X_test], axis=0)
    df_Y_comb = pd.concat([df_Y_train, df_Y_test], axis=0)
    reg = OptunaUtil.get_model(model_name=model, **model_params)

    # Fit the model on the combined dataset for SHAP analysis. Use the
    # underlying estimator instead of the MultiOutputRegressor wrapper;
    # the single target column is flattened to 1-D for it.
    estimator = reg.estimator
    estimator.fit(df_X_comb, df_Y_comb.values.ravel())
    explainer = ShapUtil.get_shap_explainer(
        model_name=model, estimator=estimator, X=df_X_comb
    )
    shap_values = explainer(df_X_comb)

    # Save info
    info = dict(
        model=model,
        model_params=model_params,
        test_size=test_size,
        random_state=random_state,
    )
    infos.append(info)

    # Store SHAP values together with the settings that produced them
    shaps.append(
        dict(
            **info,
            shap_values=shap_values,
        )
    )

Processing study 1/35: model=DTR, random_state=1, test_size=0.3
Processing study 2/35: model=EN, random_state=5, test_size=0.3


PermutationExplainer explainer: 379it [00:15, 16.00it/s]                         


Processing study 3/35: model=GBR, random_state=5, test_size=0.3
Processing study 4/35: model=KNR, random_state=5, test_size=0.3


PermutationExplainer explainer: 379it [06:11,  1.01s/it]                         


Processing study 5/35: model=RFR, random_state=5, test_size=0.3
Processing study 6/35: model=SVR, random_state=5, test_size=0.3


PermutationExplainer explainer: 379it [02:15,  2.58it/s]                         


Processing study 7/35: model=XGBR, random_state=5, test_size=0.3


In [10]:
# Persist the SHAP results and their run settings together in one pickle file,
# written relative to the current working directory
shap_results = {"shaps": shaps, "infos": infos}
output_path = Path("S01_shap_calc.pkl")
with output_path.open("wb") as fh:
    pickle.dump(shap_results, fh)