In [109]:
import ast
import re
from pathlib import Path
import pickle

import pandas as pd
from sklearn.preprocessing import StandardScaler

from run1.lib.classes_ml import DataHandler
from run1.lib.optuna_ml import OptunaUtil
from run1.lib.shap import ShapUtil

In [110]:
# Directory layout, resolved from the process working directory (Path.cwd()).
# NOTE: this is the kernel's working directory, which is assumed to be the
# folder containing this notebook.
BASE_DIR = Path.cwd()
CURRENT_DIR = BASE_DIR

# The project root is taken to be three levels above the working directory.
ROOT_DIR = BASE_DIR.parent.parent.parent
DATA_DIR = ROOT_DIR.joinpath("run1", "data")
STUDY_DIR = ROOT_DIR.joinpath("run1", "P01_no_af", "T01_optuna")

# Echo the resolved locations so a wrong working directory is spotted early.
print(f"CURRENT_DIR: {CURRENT_DIR}")
print(f"DATA_DIR: {DATA_DIR}")
print(f"STUDY_DIR: {STUDY_DIR}")

CURRENT_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P01_no_af\T23_shap
DATA_DIR: c:\Users\admin\Coding\research\weld-ml\run1\data
STUDY_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P01_no_af\T01_optuna


In [111]:
# Read the combined study summary workbook from the study directory.
study_info_filename = "S02_combine_study.xlsx"
study_info_path = STUDY_DIR / study_info_filename
study_info = pd.read_excel(study_info_path)

# Each "model_params" cell is parsed with ast.literal_eval (the cells are
# presumably dict literals stored as text in the workbook).
parsed_model_params = study_info["model_params"].apply(ast.literal_eval)
study_info["model_params"] = parsed_model_params

# Read the experimental data workbook and report its dimensions.
exp_data_path = DATA_DIR / "S02_data_exp.xlsx"
_df = pd.read_excel(exp_data_path)
print(f"df.shape: {_df.shape}")


df.shape: (378, 9)


In [112]:
# Target columns: names containing "stress_value", narrowed to the predefined
# target list. Order follows the column order of the data frame.
target_names = ["stress_value_center"]
colsY = [
    c for c in _df.columns if re.search(r"stress_value", c) and c in target_names
]

# Feature columns: the predefined inputs, again in data-frame column order
# (not in the order they are listed here).
feature_names = ["R", "W", "D", "position"]
colsX = [c for c in _df.columns if c in feature_names]

# Slice the feature and target frames out of the experimental data.
_dfX = _df[colsX]
_dfY = _df[colsY]

print("Selected feature columns:", colsX)
print("Selected target columns:", colsY)
print(f"dfX.shape: {_dfX.shape}")
print(f"dfY.shape: {_dfY.shape}")

Selected feature columns: ['position', 'R', 'W', 'D']
Selected target columns: ['stress_value_center']
dfX.shape: (378, 4)
dfY.shape: (378, 1)


In [113]:
# Convert the feature and target frames into NumPy arrays
_X, _Y = _dfX.to_numpy(), _dfY.to_numpy()
print(f"_X.shape: {_X.shape}")
print(f"_Y.shape: {_Y.shape}")

_X.shape: (378, 4)
_Y.shape: (378, 1)


In [114]:
# Build the project DataHandler from the raw arrays and column names, giving
# features and targets their own, separate StandardScaler instance.
feature_scaler = StandardScaler()
target_scaler = StandardScaler()
data_handler = DataHandler(
    _X=_X,
    _Y=_Y,
    scalerX=feature_scaler,
    scalerY=target_scaler,
    colsX=colsX,
    colsY=colsY,
)

In [115]:
# Display the loaded study table (shown as the cell's last expression)
study_info

Unnamed: 0,study_name,dt,model,n_trials,random_state,test_size,best_param,best_value,total_trial,mse_mean,mse_std,mape_mean,mape_std,r2_mean,r2_std,model_params
0,study_DTR_RS-1_TS-0_3,2026-01-28_15-19,DTR,1,1,0.3,"{'criterion': 'friedman_mse', 'splitter': 'bes...",-0.086123,6,0.896066,0.047836,1.50236,0.315104,0.086123,0.022041,"{'criterion': 'friedman_mse', 'splitter': 'bes..."
1,study_DTR_RS-2_TS-0_3,2026-01-28_15-19,DTR,1,2,0.3,"{'criterion': 'friedman_mse', 'splitter': 'bes...",-0.051307,5,0.930573,0.07142,1.731294,0.292716,0.051307,0.032413,"{'criterion': 'friedman_mse', 'splitter': 'bes..."
2,study_DTR_RS-3_TS-0_3,2026-01-28_15-19,DTR,1,3,0.3,"{'criterion': 'absolute_error', 'splitter': 'r...",-0.049025,5,0.927986,0.126566,1.724843,0.339879,0.049025,0.010178,"{'criterion': 'absolute_error', 'splitter': 'r..."
3,study_DTR_RS-4_TS-0_3,2026-01-28_15-19,DTR,1,4,0.3,"{'criterion': 'absolute_error', 'splitter': 'r...",-0.08222,5,0.911722,0.091865,1.569615,0.292636,0.08222,0.06779,"{'criterion': 'absolute_error', 'splitter': 'r..."
4,study_DTR_RS-5_TS-0_3,2026-01-28_15-19,DTR,1,5,0.3,"{'criterion': 'friedman_mse', 'splitter': 'bes...",-0.073278,5,0.912384,0.054152,12.568603,11.040725,0.073278,0.058696,"{'criterion': 'friedman_mse', 'splitter': 'bes..."
5,study_EN_RS-1_TS-0_3,2026-01-28_15-19,EN,1,1,0.3,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ...",0.029026,1,1.011097,0.068451,1.111043,0.034699,-0.029026,0.027787,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ..."
6,study_EN_RS-2_TS-0_3,2026-01-28_15-19,EN,1,2,0.3,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ...",0.026222,1,1.00839,0.096378,1.30151,0.104215,-0.026222,0.028176,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ..."
7,study_EN_RS-3_TS-0_3,2026-01-28_15-19,EN,1,3,0.3,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ...",0.052901,1,1.03109,0.141004,1.332102,0.203464,-0.052901,0.025337,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ..."
8,study_EN_RS-4_TS-0_3,2026-01-28_15-19,EN,1,4,0.3,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ...",0.009735,1,1.002447,0.090162,1.20053,0.117793,-0.009735,0.019297,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ..."
9,study_EN_RS-5_TS-0_3,2026-01-28_15-19,EN,1,5,0.3,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ...",0.005825,1,0.99253,0.02987,5.890901,8.037083,-0.005825,0.01634,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ..."


In [116]:
# Get the best study info for each model based on r2_mean
def _best_by_r2(group):
    """Return the single row of ``group`` that has the highest ``r2_mean``."""
    return group.sort_values("r2_mean", ascending=False).head(1)


# Keep one study per model: the one with the highest mean R2.
# After the grouped apply the index is (model, original row label);
# reset_index turns both into columns and the row-label column ("level_1")
# is then dropped, leaving "model" as the first column.
study_info_best = (
    study_info.groupby("model")
    .apply(_best_by_r2, include_groups=False)
    .reset_index()
    .drop(columns=["level_1"])
)
study_info_best

Unnamed: 0,model,study_name,dt,n_trials,random_state,test_size,best_param,best_value,total_trial,mse_mean,mse_std,mape_mean,mape_std,r2_mean,r2_std,model_params
0,DTR,study_DTR_RS-1_TS-0_3,2026-01-28_15-19,1,1,0.3,"{'criterion': 'friedman_mse', 'splitter': 'bes...",-0.086123,6,0.896066,0.047836,1.50236,0.315104,0.086123,0.022041,"{'criterion': 'friedman_mse', 'splitter': 'bes..."
1,EN,study_EN_RS-5_TS-0_3,2026-01-28_15-19,1,5,0.3,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ...",0.005825,1,0.99253,0.02987,5.890901,8.037083,-0.005825,0.01634,"{'alpha': 5.5641802254313645e-05, 'l1_ratio': ..."
2,GBR,study_GBR_RS-4_TS-0_3,2026-01-28_15-19,1,4,0.3,"{'n_estimators': 76, 'learning_rate': 0.001648...",-0.016851,3,0.977015,0.076141,1.073809,0.071166,0.016851,0.010099,"{'n_estimators': 76, 'learning_rate': 0.001648..."
3,KNR,study_KNR_RS-4_TS-0_3,2026-01-28_15-19,1,4,0.3,"{'n_neighbors': 19, 'weights': 'uniform', 'alg...",-0.082493,1,0.910965,0.089471,1.623037,0.426968,0.082493,0.039093,"{'n_neighbors': 19, 'weights': 'uniform', 'alg..."
4,RFR,study_RFR_RS-4_TS-0_3,2026-01-28_15-19,1,4,0.3,"{'n_estimators': 118, 'max_depth': 106, 'min_s...",-0.11279,3,0.87985,0.069949,1.339074,0.136253,0.11279,0.0314,"{'n_estimators': 118, 'max_depth': 106, 'min_s..."
5,SVR,study_SVR_RS-4_TS-0_3,2026-01-28_15-19,1,4,0.3,"{'kernel': 'linear', 'C': 7.451156502282094e-0...",0.007415,1,1.000296,0.075689,1.076004,0.085703,-0.007415,0.003851,"{'kernel': 'linear', 'C': 7.451156502282094e-0..."
6,XGBR,study_XGBR_RS-4_TS-0_3,2026-01-28_15-19,1,4,0.3,"{'n_estimators': 118, 'max_depth': 12, 'learni...",-0.122445,3,0.86802,0.07416,1.609095,0.320927,0.122445,0.043932,"{'n_estimators': 118, 'max_depth': 12, 'learni..."


In [117]:
# Compute SHAP values for the best study of every model.
#
# Results are collected in two parallel lists:
#   infos - one dict per model with the settings used (model, model_params,
#           test_size, random_state)
#   shaps - the same dict extended with the computed "shap_values"
shaps = []
infos = []

# Progress is reported against the studies actually processed
# (study_info_best), not against the full study table.
n_best = len(study_info_best)
for position, (row_label, study) in enumerate(study_info_best.iterrows(), start=1):
    model = study["model"]
    model_params = study["model_params"]

    # These parameters are used so that the same StandardScaler fit and
    # train-test split as in the Optuna study can be recovered.
    random_state = study["random_state"]
    test_size = study["test_size"]

    print(
        f"Processing study {position}/{n_best}: model={model}, random_state={random_state}, test_size={test_size}"
    )
    data_handler.split_and_scale(random_state=random_state, test_size=test_size)
    df_X_train, df_Y_train = data_handler.get_train(as_dataframe=True)
    df_X_test, df_Y_test = data_handler.get_test(as_dataframe=True)

    # Combine train and test sets (train rows first, then test rows)
    df_X_comb = pd.concat([df_X_train, df_X_test], axis=0)
    df_Y_comb = pd.concat([df_Y_train, df_Y_test], axis=0)
    reg = OptunaUtil.get_model(model_name=model, **model_params)

    # Fit the model on the combined dataset for SHAP analysis. Use the
    # underlying estimator instead of the MultiOutputRegressor wrapper, and
    # flatten the target frame to 1-D for it.
    estimator = reg.estimator
    estimator.fit(df_X_comb, df_Y_comb.values.ravel())
    explainer = ShapUtil.get_shap_explainer(
        model_name=model, estimator=estimator, X=df_X_comb
    )
    shap_values = explainer(df_X_comb)

    # Save info
    info = dict(
        model=model,
        model_params=model_params,
        test_size=test_size,
        random_state=random_state,
    )
    infos.append(info)

    # Store SHAP values together with the settings that produced them
    shaps.append(
        dict(
            **info,
            shap_values=shap_values,
        )
    )

Processing study 1/35: model=DTR, random_state=1, test_size=0.3
Processing study 2/35: model=EN, random_state=5, test_size=0.3
Processing study 3/35: model=GBR, random_state=4, test_size=0.3
Processing study 4/35: model=KNR, random_state=4, test_size=0.3
Processing study 5/35: model=RFR, random_state=4, test_size=0.3
Processing study 6/35: model=SVR, random_state=4, test_size=0.3
Processing study 7/35: model=XGBR, random_state=4, test_size=0.3


In [118]:
# Persist the SHAP results and their settings as one pickled dict
# (keys "shaps" and "infos"); the relative path resolves against the
# current working directory.
shap_results = dict(shaps=shaps, infos=infos)
with open("S01_shap_calc.pkl", "wb") as out_file:
    pickle.dump(shap_results, out_file)