In [1]:
import ast
import pickle
import re
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from run1.lib.classes_ml import DataHandler
from run1.lib.directory import get_directory
from run1.lib.optuna_ml import OptunaUtil
from run1.lib.shap import ShapUtil

In [2]:
# Resolve the project locations, starting from the notebook's current working directory
CURRENT_DIR = Path.cwd()
directory = get_directory(CURRENT_DIR, verbose=True)

# Keep only the two paths this notebook reads from:
# the experimental data workbook and the combined Optuna study summary
DATA_PATH, STUDY_ML_PATH = (directory[key] for key in ("DATA_PATH", "STUDY_ML_PATH"))

Code is running in a Jupyter environment.
ROOT_DIR: c:\Users\admin\Coding\research\weld-ml
DATA_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P04_MF2\T01_af_features
DATA_PATH: c:\Users\admin\Coding\research\weld-ml\run1\P04_MF2\T01_af_features\S01_combined_data.xlsx
STUDY_ML_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P04_MF2\T02_optuna
STUDY_ML_PATH: c:\Users\admin\Coding\research\weld-ml\run1\P04_MF2\T02_optuna\S02_combine_study.xlsx
STUDY_TABPFN_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P04_MF2\T11_tabPFN
STUDY_TABPFN_PATH: c:\Users\admin\Coding\research\weld-ml\run1\P04_MF2\T11_tabPFN\S01_calculate_performance.xlsx


In [3]:
# Study summary table. The "model_params" cells are Python literals stored as text,
# so each one is parsed back into a Python object with ast.literal_eval.
study_info = pd.read_excel(STUDY_ML_PATH).assign(
    model_params=lambda table: table["model_params"].map(ast.literal_eval)
)

# Experimental data table
_df = pd.read_excel(DATA_PATH)
print(f"df.shape: {_df.shape}")


df.shape: (378, 25)


In [4]:
# Every column whose name contains "stress_value" is a candidate target
colsY = [name for name in _df.columns if "stress_value" in name]

# Features are all remaining columns, minus the two identifier columns.
# Note: ALL stress columns are excluded here, not only the target kept below.
non_feature_cols = ["sample_no", "location", *colsY]
colsX = [name for name in _df.columns if name not in non_feature_cols]

# Narrow the targets down to the single predefined target column
colsY = [name for name in colsY if name in ["stress_value_center"]]

_dfX = _df[colsX]
_dfY = _df[colsY]
print("Selected feature columns:", colsX)
print("Selected target columns:", colsY)
print(f"dfX.shape: {_dfX.shape}")
print(f"dfY.shape: {_dfY.shape}")

Selected feature columns: ['D', 'Fx__dwell__ar_coefficient__coeff_0__k_10', 'Fx__dwell__fft_coefficient__attr_"abs"__coeff_11', 'Fx__weld__change_quantiles__f_agg_"var"__isabs_True__qh_0.6__ql_0.2', 'Fx__weld__fft_coefficient__attr_"abs"__coeff_5', 'Fx_location', 'Fy__dwell__fft_coefficient__attr_"abs"__coeff_59', 'Fy__dwell__fft_coefficient__attr_"real"__coeff_71', 'Fy__weld__change_quantiles__f_agg_"mean"__isabs_True__qh_0.4__ql_0.0', 'Fy__weld__energy_ratio_by_chunks__num_segments_10__segment_focus_5', 'Fz__dwell__fft_coefficient__attr_"angle"__coeff_4', 'Fz__dwell__fft_coefficient__attr_"real"__coeff_4', 'Fz__weld__agg_linear_trend__attr_"rvalue"__chunk_len_5__f_agg_"var"', 'Fz__weld__change_quantiles__f_agg_"mean"__isabs_False__qh_1.0__ql_0.4', 'Fz_location', 'Mz__dwell__augmented_dickey_fuller__attr_"pvalue"__autolag_"AIC"', 'Mz__dwell__augmented_dickey_fuller__attr_"teststat"__autolag_"AIC"', 'R', 'W', 'position']
Selected target columns: ['stress_value_center']
dfX.shape: (378, 20)
dfY.shape: (378, 1)

In [5]:
# Convert the feature and target frames to plain NumPy arrays
_X, _Y = _dfX.to_numpy(), _dfY.to_numpy()
print(f"_X.shape: {_X.shape}")
print(f"_Y.shape: {_Y.shape}")

_X.shape: (378, 20)
_Y.shape: (378, 1)


In [6]:
# Create DataHandler instance
# Bundle the raw arrays, their column names and two fresh (unfitted) StandardScaler
# objects into one DataHandler; it is used later for splitting and scaling.
data_handler = DataHandler(
    _X=_X,
    _Y=_Y,
    colsX=colsX,
    colsY=colsY,
    scalerX=StandardScaler(),
    scalerY=StandardScaler(),
)

In [7]:
# Display the loaded study table (one row per Optuna study) for a visual check
study_info

Unnamed: 0,study_name,dt,model,n_trials,random_state,test_size,best_param,best_value,total_trial,mse_mean,mse_std,mape_mean,mape_std,r2_mean,r2_std,model_params
0,study_DTR_RS-1_TS-0_3,2026-02-03_22-53,DTR,50,1,0.3,"{'criterion': 'squared_error', 'splitter': 'ra...",-0.139346,80,0.84601,0.067927,1.451561,0.28835,0.139346,0.027745,"{'criterion': 'squared_error', 'splitter': 'ra..."
1,study_DTR_RS-2_TS-0_3,2026-02-03_22-53,DTR,50,2,0.3,"{'criterion': 'squared_error', 'splitter': 'ra...",-0.109533,80,0.875347,0.065682,1.887708,0.378771,0.109533,0.025279,"{'criterion': 'squared_error', 'splitter': 'ra..."
2,study_DTR_RS-3_TS-0_3,2026-02-03_22-53,DTR,50,3,0.3,"{'criterion': 'absolute_error', 'splitter': 'r...",-0.099489,80,0.876141,0.120888,1.817804,0.353386,0.099489,0.020083,"{'criterion': 'absolute_error', 'splitter': 'r..."
3,study_DTR_RS-4_TS-0_3,2026-02-03_22-53,DTR,50,4,0.3,"{'criterion': 'squared_error', 'splitter': 'ra...",-0.161086,80,0.834001,0.086591,1.901808,0.320215,0.161086,0.064964,"{'criterion': 'squared_error', 'splitter': 'ra..."
4,study_DTR_RS-5_TS-0_3,2026-02-03_22-53,DTR,50,5,0.3,"{'criterion': 'friedman_mse', 'splitter': 'ran...",-0.157235,80,0.826376,0.055954,17.544031,16.477837,0.157235,0.041788,"{'criterion': 'friedman_mse', 'splitter': 'ran..."
5,study_EN_RS-1_TS-0_3,2026-02-03_22-53,EN,50,1,0.3,"{'alpha': 0.08529802715173362, 'l1_ratio': 0.4...",-0.148314,80,0.836978,0.037242,1.474846,0.297348,0.148314,0.02476,"{'alpha': 0.08529802715173362, 'l1_ratio': 0.4..."
6,study_EN_RS-2_TS-0_3,2026-02-03_22-53,EN,50,2,0.3,"{'alpha': 0.06571140688552143, 'l1_ratio': 0.8...",-0.139557,80,0.848324,0.099716,1.619539,0.285157,0.139557,0.015875,"{'alpha': 0.06571140688552143, 'l1_ratio': 0.8..."
7,study_EN_RS-3_TS-0_3,2026-02-03_22-53,EN,50,3,0.3,"{'alpha': 0.09212379490393291, 'l1_ratio': 0.6...",-0.110966,80,0.866507,0.117218,1.477669,0.202124,0.110966,0.013944,"{'alpha': 0.09212379490393291, 'l1_ratio': 0.6..."
8,study_EN_RS-4_TS-0_3,2026-02-03_22-53,EN,50,4,0.3,"{'alpha': 0.03890452034262176, 'l1_ratio': 0.8...",-0.165341,80,0.819703,0.032863,1.677411,0.357119,0.165341,0.031745,"{'alpha': 0.03890452034262176, 'l1_ratio': 0.8..."
9,study_EN_RS-5_TS-0_3,2026-02-03_22-53,EN,50,5,0.3,"{'alpha': 0.03388469103413104, 'l1_ratio': 0.4...",-0.180867,80,0.800976,0.01951,11.570833,10.272896,0.180867,0.015216,"{'alpha': 0.03388469103413104, 'l1_ratio': 0.4..."


In [8]:
def _best_by_r2(group: pd.DataFrame) -> pd.DataFrame:
    """Return the single row of ``group`` with the highest ``r2_mean``."""
    return group.sort_values("r2_mean", ascending=False).head(1)


# One winning study per model type. The grouped result is indexed by
# (model, original row label); reset_index turns both into columns and the
# row-label column ("level_1") is then discarded.
best_per_model = study_info.groupby("model").apply(_best_by_r2, include_groups=False)
study_info_best = best_per_model.reset_index().drop(columns=["level_1"])
study_info_best

Unnamed: 0,model,study_name,dt,n_trials,random_state,test_size,best_param,best_value,total_trial,mse_mean,mse_std,mape_mean,mape_std,r2_mean,r2_std,model_params
0,DTR,study_DTR_RS-4_TS-0_3,2026-02-03_22-53,50,4,0.3,"{'criterion': 'squared_error', 'splitter': 'ra...",-0.161086,80,0.834001,0.086591,1.901808,0.320215,0.161086,0.064964,"{'criterion': 'squared_error', 'splitter': 'ra..."
1,EN,study_EN_RS-5_TS-0_3,2026-02-03_22-53,50,5,0.3,"{'alpha': 0.03388469103413104, 'l1_ratio': 0.4...",-0.180867,80,0.800976,0.01951,11.570833,10.272896,0.180867,0.015216,"{'alpha': 0.03388469103413104, 'l1_ratio': 0.4..."
2,GBR,study_GBR_RS-4_TS-0_3,2026-02-03_22-53,50,4,0.3,"{'n_estimators': 134, 'learning_rate': 0.02748...",-0.198749,80,0.789098,0.059153,1.873684,0.394312,0.198749,0.045296,"{'n_estimators': 134, 'learning_rate': 0.02748..."
3,KNR,study_KNR_RS-5_TS-0_3,2026-02-03_22-53,50,5,0.3,"{'n_neighbors': 36, 'weights': 'distance', 'al...",-0.123633,80,0.863818,0.042421,18.967107,23.887208,0.123633,0.026001,"{'n_neighbors': 36, 'weights': 'distance', 'al..."
4,RFR,study_RFR_RS-5_TS-0_3,2026-02-03_22-53,50,5,0.3,"{'n_estimators': 60, 'max_depth': 5, 'min_samp...",-0.197709,80,0.783847,0.029774,17.449966,22.48725,0.197709,0.031403,"{'n_estimators': 60, 'max_depth': 5, 'min_samp..."
5,SVR,study_SVR_RS-5_TS-0_3,2026-02-03_22-53,50,5,0.3,"{'kernel': 'linear', 'C': 0.012708183589980318...",-0.154886,80,0.831516,0.042596,12.912057,13.302529,0.154886,0.031947,"{'kernel': 'linear', 'C': 0.012708183589980318..."
6,XGBR,study_XGBR_RS-5_TS-0_3,2026-02-03_22-53,50,5,0.3,"{'n_estimators': 229, 'max_depth': 8, 'learnin...",-0.201031,60,0.781947,0.014281,15.033675,21.623964,0.201031,0.022359,"{'n_estimators': 229, 'max_depth': 8, 'learnin..."


In [9]:
# Optional (disabled): uncomment the lines below to drop the listed models
# (currently only "KNR") from study_info_best before the SHAP computation.
# filt = ~study_info_best["model"].isin(["KNR"])
# study_info_best = study_info_best[filt]
# study_info_best

In [10]:
# Compute SHAP values for the best study of every model type.
# Results are collected in two parallel lists:
#   infos - the settings (model, params, split) of each processed study
#   shaps - the same settings plus the SHAP values computed for that study
shaps = []
infos = []

# Progress is reported against the studies actually iterated (study_info_best),
# not against the full study table.
n_studies = len(study_info_best)
for position, (idx, study) in enumerate(study_info_best.iterrows(), start=1):
    model = study["model"]
    model_params = study["model_params"]

    # Reusing the study's random_state and test_size is meant to recover the same
    # StandardScaler and train-test split as in the Optuna study
    random_state = study["random_state"]
    test_size = study["test_size"]

    # Count by loop position rather than index label, so the message stays
    # correct even when study_info_best has a non-contiguous index.
    print(
        f"Processing study {position}/{n_studies}: model={model}, random_state={random_state}, test_size={test_size}"
    )
    data_handler.split_and_scale(random_state=random_state, test_size=test_size)
    df_X_train, df_Y_train = data_handler.get_train(as_dataframe=True)
    df_X_test, df_Y_test = data_handler.get_test(as_dataframe=True)

    # Combine train and test sets (train rows first, then test rows)
    df_X_comb = pd.concat([df_X_train, df_X_test], axis=0)
    df_Y_comb = pd.concat([df_Y_train, df_Y_test], axis=0)
    reg = OptunaUtil.get_model(model_name=model, **model_params)

    # Fit the model on the combined dataset for SHAP analysis. Use the underlying
    # estimator instead of the MultiOutputRegressor wrapper; the target is
    # flattened to 1-D for that single-output estimator.
    estimator = reg.estimator
    estimator.fit(df_X_comb, df_Y_comb.values.ravel())
    explainer = ShapUtil.get_shap_explainer(
        model_name=model, estimator=estimator, X=df_X_comb, check_additivity=True
    )
    shap_values = explainer(df_X_comb)

    # Save the settings that identify this study
    info = dict(
        model=model,
        model_params=model_params,
        test_size=test_size,
        random_state=random_state,
    )
    infos.append(info)

    # Store the SHAP values together with those settings
    shaps.append({**info, "shap_values": shap_values})

Processing study 1/35: model=DTR, random_state=4, test_size=0.3
Processing study 2/35: model=EN, random_state=5, test_size=0.3
Processing study 3/35: model=GBR, random_state=4, test_size=0.3


PermutationExplainer explainer: 379it [00:23,  9.10it/s]                         
[WinError 2] The system cannot find the file specified
  File "c:\Users\admin\Coding\research\weld-ml\.venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Python312\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python312\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Python312\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


Processing study 4/35: model=KNR, random_state=5, test_size=0.3


PermutationExplainer explainer: 379it [01:06,  4.90it/s]                         


Processing study 5/35: model=RFR, random_state=5, test_size=0.3


PermutationExplainer explainer: 379it [00:42,  6.81it/s]                         


Processing study 6/35: model=SVR, random_state=5, test_size=0.3


PermutationExplainer explainer: 379it [00:33,  7.87it/s]                         


Processing study 7/35: model=XGBR, random_state=5, test_size=0.3


PermutationExplainer explainer: 379it [02:56,  2.03it/s]                         


In [11]:
# Persist the SHAP results and their study settings as one pickle file
# in the current working directory
shap_payload = {"shaps": shaps, "infos": infos}
with Path("S01_shap_calc.pkl").open("wb") as sink:
    pickle.dump(shap_payload, sink)