In [None]:
import ast
import pickle
import re
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import StandardScaler

from run1.lib.classes_ml import DataHandler
from run1.lib.directory import get_directory
from run1.lib.optuna_ml import OptunaUtil
from run1.lib.shap import ShapUtil

In [None]:
# Working directory of the running notebook; used as the anchor for path lookup
CURRENT_DIR = Path.cwd()

# Resolve the project directories for this run (verbose=True presumably echoes
# the resolved locations -- see the cell output)
directory = get_directory(CURRENT_DIR, verbose=True)

# Pull out the two paths that are read with pd.read_excel further down
DATA_PATH, STUDY_ML_PATH = (
    directory[key] for key in ("DATA_PATH", "STUDY_ML_PATH")
)

CURRENT_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P02_MF_1\T23_shap_ml
DATA_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P02_MF_1\T01_af_features
STUDY_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P02_MF_1\T02_optuna


In [None]:
# Study summary table. The "model_params" column is parsed with
# ast.literal_eval so each cell becomes a Python object instead of its
# textual representation (presumably a dict literal -- see the table output).
study_info = pd.read_excel(STUDY_ML_PATH).assign(
    model_params=lambda frame: frame["model_params"].map(ast.literal_eval)
)

# Experimental data set (features and targets in one sheet)
_df = pd.read_excel(DATA_PATH)
print(f"df.shape: {_df.shape}")


df.shape: (378, 37)


In [12]:
# Candidate targets: every column whose name matches "stress_value"
_target_pattern = re.compile(r"stress_value")
colsY = [name for name in _df.columns if _target_pattern.search(name)]

# Features: all remaining columns except the identifier columns. Note that
# *every* stress_value column is excluded here, because this runs before the
# target list is narrowed below.
_excluded = {"sample_no", "location", *colsY}
colsX = [name for name in _df.columns if name not in _excluded]

# Narrow the targets down to the predefined target column name(s)
colsY = [name for name in colsY if name in ("stress_value_center",)]

# Slice the feature / target frames out of the full data set
_dfX, _dfY = _df[colsX], _df[colsY]

print("Selected feature columns:", colsX)
print("Selected target columns:", colsY)
print(f"dfX.shape: {_dfX.shape}")
print(f"dfY.shape: {_dfY.shape}")

Selected feature columns: ['position', 'R', 'W', 'D', 'Fx_location', 'Fy_location', 'Fz_location', 'Mz_location', 'Fx__dwell__fft_coefficient__attr_"abs"__coeff_11', 'Fx__dwell__quantile__q_0.7', 'Fx__dwell__partial_autocorrelation__lag_6', 'Fy__dwell__fft_coefficient__attr_"real"__coeff_71', 'Fy__dwell__last_location_of_minimum', 'Fy__dwell__change_quantiles__f_agg_"mean"__isabs_True__qh_0.8__ql_0.2', 'Fz__dwell__fft_coefficient__attr_"angle"__coeff_4', 'Fz__dwell__fft_coefficient__attr_"angle"__coeff_68', 'Fz__dwell__approximate_entropy__m_2__r_0.7', 'Mz__dwell__augmented_dickey_fuller__attr_"teststat"__autolag_"AIC"', 'Mz__dwell__fft_coefficient__attr_"real"__coeff_62', 'Mz__dwell__change_quantiles__f_agg_"mean"__isabs_True__qh_0.8__ql_0.2', 'Fx__weld__change_quantiles__f_agg_"var"__isabs_True__qh_0.6__ql_0.2', 'Fx__weld__fft_coefficient__attr_"real"__coeff_31', 'Fx__weld__fft_coefficient__attr_"abs"__coeff_58', 'Fy__weld__energy_ratio_by_chunks__num_segments_10__segment_focus_5', '

In [13]:
# Convert the feature and target frames into plain NumPy arrays
_X, _Y = _dfX.to_numpy(), _dfY.to_numpy()

# Report the array shapes as a quick sanity check
print(f"_X.shape: {_X.shape}")
print(f"_Y.shape: {_Y.shape}")

_X.shape: (378, 32)
_Y.shape: (378, 1)


In [14]:
# Separate, unfitted scalers for the features and for the targets
feature_scaler = StandardScaler()
target_scaler = StandardScaler()

# Bundle the raw arrays, their scalers and the column names in one handler
data_handler = DataHandler(
    _X=_X,
    _Y=_Y,
    scalerX=feature_scaler,
    scalerY=target_scaler,
    colsX=colsX,
    colsY=colsY,
)

In [15]:
# Display the loaded study summary table for visual inspection
study_info

Unnamed: 0,study_name,dt,model,n_trials,random_state,test_size,best_param,best_value,total_trial,mse_mean,mse_std,mape_mean,mape_std,r2_mean,r2_std,model_params
0,study_DTR_RS-1_TS-0_3,2026-01-29_13-44,DTR,10,1,0.3,"{'criterion': 'friedman_mse', 'splitter': 'bes...",-0.134895,20,0.849797,0.072979,1.851424,0.548778,0.134895,0.044649,"{'criterion': 'friedman_mse', 'splitter': 'bes..."
1,study_DTR_RS-2_TS-0_3,2026-01-29_13-44,DTR,10,2,0.3,"{'criterion': 'absolute_error', 'splitter': 'b...",-0.12939,20,0.857561,0.138045,1.927578,0.324989,0.12939,0.095029,"{'criterion': 'absolute_error', 'splitter': 'b..."
2,study_DTR_RS-3_TS-0_3,2026-01-29_13-44,DTR,10,3,0.3,"{'criterion': 'absolute_error', 'splitter': 'b...",-0.075917,20,0.89402,0.100069,2.152368,0.5342,0.075917,0.053858,"{'criterion': 'absolute_error', 'splitter': 'b..."
3,study_DTR_RS-4_TS-0_3,2026-01-29_13-44,DTR,10,4,0.3,"{'criterion': 'absolute_error', 'splitter': 'b...",-0.1066,20,0.883583,0.060072,1.87269,0.245402,0.1066,0.011245,"{'criterion': 'absolute_error', 'splitter': 'b..."
4,study_DTR_RS-5_TS-0_3,2026-01-29_13-44,DTR,10,5,0.3,"{'criterion': 'absolute_error', 'splitter': 'b...",-0.122996,20,0.852444,0.080649,9.308267,10.717954,0.122996,0.053248,"{'criterion': 'absolute_error', 'splitter': 'b..."
5,study_EN_RS-1_TS-0_3,2026-01-29_13-44,EN,10,1,0.3,"{'alpha': 0.06944309426712889, 'l1_ratio': 0.6...",-0.19707,20,0.788233,0.032467,1.677665,0.452543,0.19707,0.021736,"{'alpha': 0.06944309426712889, 'l1_ratio': 0.6..."
6,study_EN_RS-2_TS-0_3,2026-01-29_13-44,EN,10,2,0.3,"{'alpha': 0.06944309426712889, 'l1_ratio': 0.6...",-0.167484,20,0.819554,0.097089,1.831482,0.310877,0.167484,0.049356,"{'alpha': 0.06944309426712889, 'l1_ratio': 0.6..."
7,study_EN_RS-3_TS-0_3,2026-01-29_13-44,EN,10,3,0.3,"{'alpha': 0.11480494080817177, 'l1_ratio': 0.6...",-0.145803,20,0.833466,0.104218,1.435583,0.133176,0.145803,0.021827,"{'alpha': 0.11480494080817177, 'l1_ratio': 0.6..."
8,study_EN_RS-4_TS-0_3,2026-01-29_13-44,EN,10,4,0.3,"{'alpha': 0.06944309426712889, 'l1_ratio': 0.6...",-0.179823,20,0.805234,0.019422,1.944129,0.477519,0.179823,0.045245,"{'alpha': 0.06944309426712889, 'l1_ratio': 0.6..."
9,study_EN_RS-5_TS-0_3,2026-01-29_13-44,EN,10,5,0.3,"{'alpha': 0.06036600023843763, 'l1_ratio': 0.5...",-0.221412,20,0.764626,0.03371,13.593533,12.728028,0.221412,0.017146,"{'alpha': 0.06036600023843763, 'l1_ratio': 0.5..."


In [16]:
# Keep, for every model type, the single study with the highest r2_mean
def _top_r2_row(group):
    """Return the one row of `group` that has the largest `r2_mean`."""
    return group.sort_values("r2_mean", ascending=False).head(1)


study_info_best = (
    study_info.groupby("model")
    .apply(_top_r2_row, include_groups=False)
    .reset_index(level=1, drop=True)  # discard the original row labels
    .reset_index()  # bring the "model" group key back as the first column
)
study_info_best

Unnamed: 0,model,study_name,dt,n_trials,random_state,test_size,best_param,best_value,total_trial,mse_mean,mse_std,mape_mean,mape_std,r2_mean,r2_std,model_params
0,DTR,study_DTR_RS-1_TS-0_3,2026-01-29_13-44,10,1,0.3,"{'criterion': 'friedman_mse', 'splitter': 'bes...",-0.134895,20,0.849797,0.072979,1.851424,0.548778,0.134895,0.044649,"{'criterion': 'friedman_mse', 'splitter': 'bes..."
1,EN,study_EN_RS-5_TS-0_3,2026-01-29_13-44,10,5,0.3,"{'alpha': 0.06036600023843763, 'l1_ratio': 0.5...",-0.221412,20,0.764626,0.03371,13.593533,12.728028,0.221412,0.017146,"{'alpha': 0.06036600023843763, 'l1_ratio': 0.5..."
2,GBR,study_GBR_RS-5_TS-0_3,2026-01-29_13-44,10,5,0.3,"{'n_estimators': 78, 'learning_rate': 0.050261...",-0.226948,20,0.755483,0.034357,13.123965,16.740373,0.226948,0.022396,"{'n_estimators': 78, 'learning_rate': 0.050261..."
3,KNR,study_KNR_RS-4_TS-0_3,2026-01-29_13-44,10,4,0.3,"{'n_neighbors': 30, 'weights': 'distance', 'al...",-0.129268,20,0.859712,0.045187,1.742595,0.375401,0.129268,0.031176,"{'n_neighbors': 30, 'weights': 'distance', 'al..."
4,RFR,study_RFR_RS-5_TS-0_3,2026-01-29_13-44,10,5,0.3,"{'n_estimators': 189, 'max_depth': 12, 'min_sa...",-0.218557,20,0.765511,0.044428,14.411316,16.128679,0.218557,0.018864,"{'n_estimators': 189, 'max_depth': 12, 'min_sa..."
5,SVR,study_SVR_RS-5_TS-0_3,2026-01-29_13-44,10,5,0.3,"{'kernel': 'poly', 'C': 0.020025747803671577, ...",-0.129303,20,0.857655,0.02678,3.858219,2.633922,0.129303,0.008464,"{'kernel': 'poly', 'C': 0.020025747803671577, ..."
6,XGBR,study_XGBR_RS-5_TS-0_3,2026-01-29_13-44,10,5,0.3,"{'n_estimators': 162, 'max_depth': 9, 'learnin...",-0.223896,20,0.758008,0.025142,15.591819,20.313118,0.223896,0.005962,"{'n_estimators': 162, 'max_depth': 9, 'learnin..."


In [17]:
# Compute SHAP values for the best study of every model type.
#   shaps: one dict per study with its metadata plus the SHAP explanation
#   infos: the same metadata only (no SHAP values)
shaps = []
infos = []

# Denominator for the progress message: the studies iterated here, not every
# row of the full study table
n_best = len(study_info_best)

for position, (idx, study) in enumerate(study_info_best.iterrows(), start=1):
    model = study["model"]
    model_params = study["model_params"]

    # These parameters are used to recover the same StandardScaler and
    # train-test split as in the Optuna study
    random_state = study["random_state"]
    test_size = study["test_size"]

    print(
        f"Processing study {position}/{n_best}: model={model}, random_state={random_state}, test_size={test_size}"
    )
    data_handler.split_and_scale(random_state=random_state, test_size=test_size)
    df_X_train, df_Y_train = data_handler.get_train(as_dataframe=True)
    df_X_test, df_Y_test = data_handler.get_test(as_dataframe=True)

    # Combine train and test sets (rows stacked: train first, then test)
    df_X_comb = pd.concat([df_X_train, df_X_test], axis=0)
    df_Y_comb = pd.concat([df_Y_train, df_Y_test], axis=0)
    reg = OptunaUtil.get_model(model_name=model, **model_params)

    # Fit the model on the combined dataset for SHAP analysis. Use the
    # underlying estimator instead of the MultiOutputRegressor wrapper, which
    # is why the target is flattened to 1-D before fitting.
    estimator = reg.estimator
    estimator.fit(df_X_comb, df_Y_comb.values.ravel())
    explainer = ShapUtil.get_shap_explainer(
        model_name=model, estimator=estimator, X=df_X_comb
    )
    shap_values = explainer(df_X_comb)

    # Save the metadata that identifies this study
    info = dict(
        model=model,
        model_params=model_params,
        test_size=test_size,
        random_state=random_state,
    )
    infos.append(info)

    # Store the SHAP values together with the same metadata
    shaps.append(
        dict(
            **info,
            shap_values=shap_values,
        )
    )

Processing study 1/35: model=DTR, random_state=1, test_size=0.3
Processing study 2/35: model=EN, random_state=5, test_size=0.3


Processing study 3/35: model=GBR, random_state=5, test_size=0.3
Processing study 4/35: model=KNR, random_state=4, test_size=0.3


PermutationExplainer explainer: 379it [02:59,  1.99it/s]                         


Processing study 5/35: model=RFR, random_state=5, test_size=0.3
Processing study 6/35: model=SVR, random_state=5, test_size=0.3


PermutationExplainer explainer: 379it [01:03,  5.01it/s]                         


Processing study 7/35: model=XGBR, random_state=5, test_size=0.3


In [18]:
# Persist the SHAP results and their metadata together in a single pickle
# file, written relative to the current working directory
payload = {"shaps": shaps, "infos": infos}
with open("S01_shap_calc.pkl", "wb") as pkl_file:
    pickle.dump(payload, pkl_file)