In [1]:
import os
import re
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import StandardScaler
from tabpfn import TabPFNRegressor
from tabpfn_extensions import interpretability

from run1.lib.classes_ml import DataHandler
from run1.lib.directory import get_directory
from run1.lib.utils import MyUtil

In [2]:
# Working directory of the kernel; get_directory() resolves the project paths from it.
# NOTE(review): this is the process working directory, which is only the
# notebook's own folder when the kernel was started there - confirm.
CURRENT_DIR = Path.cwd()

# verbose=True makes get_directory() report what it resolved (see the cell output).
directory = get_directory(CURRENT_DIR, verbose=True)

# Only these two entries of the resolved mapping are used in this notebook.
DATA_PATH, STUDY_TABPFN_PATH = (
    directory[key] for key in ("DATA_PATH", "STUDY_TABPFN_PATH")
)

Code is running in a Jupyter environment.
ROOT_DIR: c:\Users\admin\Coding\research\weld-ml
DATA_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P04_MF2\T01_af_features
DATA_PATH: c:\Users\admin\Coding\research\weld-ml\run1\P04_MF2\T01_af_features\S01_combined_data.xlsx
STUDY_ML_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P04_MF2\T02_optuna
STUDY_ML_PATH: c:\Users\admin\Coding\research\weld-ml\run1\P04_MF2\T02_optuna\S02_combine_study.xlsx
STUDY_TABPFN_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P04_MF2\T11_tabPFN
STUDY_TABPFN_PATH: c:\Users\admin\Coding\research\weld-ml\run1\P04_MF2\T11_tabPFN\S01_calculate_performance.xlsx


In [3]:
# Output folder for the files written by this notebook; created on first run
# and reused (without error) on later runs.
OUTDIR = CURRENT_DIR / "S01"
OUTDIR.mkdir(parents=True, exist_ok=True)

In [4]:
# Experimental data: one Excel sheet holding identifiers, features and targets.
_df = pd.read_excel(io=DATA_PATH)

# Previously computed TabPFN performance table (read from STUDY_TABPFN_PATH).
study_info = pd.read_excel(io=STUDY_TABPFN_PATH)

print(f"df.shape: {_df.shape}")


df.shape: (378, 25)


In [5]:
# Split the sheet's columns into targets (Y) and features (X).
stress_pattern = re.compile(r"stress_value")
non_feature_cols = ["sample_no", "location"]

# Every column matching "stress_value" is a candidate target, so all of them
# are kept out of the feature set - not just the one modelled below.
stress_cols = [name for name in _df.columns if stress_pattern.search(name)]
colsX = [
    name
    for name in _df.columns
    if name not in non_feature_cols and name not in stress_cols
]

# Narrow the targets to the single column analysed in this notebook.
colsY = [name for name in stress_cols if name == "stress_value_center"]

_dfX, _dfY = _df[colsX], _df[colsY]

print("Selected feature columns:", colsX)
print("Selected target columns:", colsY)
print(f"dfX.shape: {_dfX.shape}")
print(f"dfY.shape: {_dfY.shape}")

Selected feature columns: ['D', 'Fx__dwell__ar_coefficient__coeff_0__k_10', 'Fx__dwell__fft_coefficient__attr_"abs"__coeff_11', 'Fx__weld__change_quantiles__f_agg_"var"__isabs_True__qh_0.6__ql_0.2', 'Fx__weld__fft_coefficient__attr_"abs"__coeff_5', 'Fx_location', 'Fy__dwell__fft_coefficient__attr_"abs"__coeff_59', 'Fy__dwell__fft_coefficient__attr_"real"__coeff_71', 'Fy__weld__change_quantiles__f_agg_"mean"__isabs_True__qh_0.4__ql_0.0', 'Fy__weld__energy_ratio_by_chunks__num_segments_10__segment_focus_5', 'Fz__dwell__fft_coefficient__attr_"angle"__coeff_4', 'Fz__dwell__fft_coefficient__attr_"real"__coeff_4', 'Fz__weld__agg_linear_trend__attr_"rvalue"__chunk_len_5__f_agg_"var"', 'Fz__weld__change_quantiles__f_agg_"mean"__isabs_False__qh_1.0__ql_0.4', 'Fz_location', 'Mz__dwell__augmented_dickey_fuller__attr_"pvalue"__autolag_"AIC"', 'Mz__dwell__augmented_dickey_fuller__attr_"teststat"__autolag_"AIC"', 'R', 'W', 'position']
Selected target columns: ['stress_value_center']
dfX.shape: (378,

In [6]:
# %% Extract features and targets
# Plain NumPy arrays are what DataHandler is given in the next cell.
_X = _dfX.to_numpy()
_Y = _dfY.to_numpy()  # stays 2-D because _dfY is a DataFrame, not a Series

print(f"_X.shape: {_X.shape}")
print(f"_Y.shape: {_Y.shape}")

_X.shape: (378, 20)
_Y.shape: (378, 1)


In [7]:
# Independent scaler instances for features and targets, so each can be
# fitted on its own data.
# NOTE(review): presumably DataHandler fits them inside split_and_scale() -
# confirm in run1.lib.classes_ml.
feature_scaler = StandardScaler()
target_scaler = StandardScaler()

# Bundle the raw arrays, their column names and the scalers in one handler.
data_handler = DataHandler(
    _X=_X,
    _Y=_Y,
    scalerX=feature_scaler,
    scalerY=target_scaler,
    colsX=colsX,
    colsY=colsY,
)

In [8]:
# Display the TabPFN performance table that was read from STUDY_TABPFN_PATH
# (columns in the recorded output: random_state, test_size, model, Y, MSE, MAPE, R2).
study_info

Unnamed: 0,random_state,test_size,model,Y,MSE,MAPE,R2
0,1,0.3,TabPFN,Y-1,0.908799,1.632796,0.289078
1,1,0.3,TabPFN,Y-2,0.841909,2.104804,-0.026687
2,1,0.3,TabPFN,Y-3,0.429463,1.177243,0.496522
3,1,0.3,TabPFN,Y-All,0.726724,1.638281,0.252971
4,2,0.3,TabPFN,Y-1,0.611242,1.761754,0.299447
5,2,0.3,TabPFN,Y-2,0.903019,0.978415,-0.009454
6,2,0.3,TabPFN,Y-3,0.599715,1.211344,0.456713
7,2,0.3,TabPFN,Y-All,0.704659,1.317171,0.248902
8,3,0.3,TabPFN,Y-1,0.905557,2.446737,0.195266
9,3,0.3,TabPFN,Y-2,0.924437,1.3499,-0.003143


In [9]:
# Keep the single row with the highest R2 as a one-row DataFrame.
# NOTE(review): no filter on the "Y" or "model" columns is applied, so rows for
# every Y label compete here - confirm the winner matches the target analysed
# in this notebook.
study_info_ranked = study_info.sort_values("R2", ascending=False)
study_info_best = study_info_ranked.iloc[:1]
study_info_best

Unnamed: 0,random_state,test_size,model,Y,MSE,MAPE,R2
2,1,0.3,TabPFN,Y-3,0.429463,1.177243,0.496522


In [10]:
def get_batch_info(batch_size, data_length):
    """Describe how ``data_length`` rows split into consecutive batches.

    Parameters
    ----------
    batch_size : int
        Maximum number of rows per batch. Must be positive.
    data_length : int
        Total number of rows to partition.

    Returns
    -------
    pandas.DataFrame
        One row per batch with the columns ``start_idx`` (inclusive),
        ``end_idx`` (exclusive), ``current_batch_number`` (0-based) and
        ``idx_range`` (list of the row positions covered by the batch).
        The last batch is shorter when ``data_length`` is not a multiple of
        ``batch_size``. When there is nothing to partition the frame is
        empty but still carries all four columns.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    # A negative step would make range() silently produce no batches at all,
    # and a zero step fails with a message that does not mention batch_size.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    columns = ["start_idx", "end_idx", "current_batch_number", "idx_range"]
    records = []
    for batch_number, start_idx in enumerate(range(0, data_length, batch_size)):
        # Clamp so the final batch never runs past the end of the data.
        end_idx = min(start_idx + batch_size, data_length)
        records.append(
            {
                "start_idx": start_idx,
                "end_idx": end_idx,
                "current_batch_number": batch_number,
                "idx_range": list(range(start_idx, end_idx)),
            }
        )

    # Passing the columns explicitly keeps the schema stable for empty input,
    # so column lookups on the result do not fail with KeyError.
    return pd.DataFrame(records, columns=columns)

In [11]:
# Settings of the best-scoring run. Re-using its random_state and test_size is
# meant to reproduce the same train/test split and scaler fit as that run.
model = study_info_best["model"].iloc[0]
random_state = study_info_best["random_state"].iloc[0]
test_size = study_info_best["test_size"].iloc[0]

data_handler.split_and_scale(random_state=random_state, test_size=test_size)
df_X_train, df_Y_train = data_handler.get_train(as_dataframe=True)
df_X_test, df_Y_test = data_handler.get_test(as_dataframe=True)

# Stack train on top of test: the SHAP analysis below uses every sample.
df_X_comb = pd.concat([df_X_train, df_X_test])
df_Y_comb = pd.concat([df_Y_train, df_Y_test])

# Fit one TabPFN regressor on the combined data. The target frame is
# flattened to 1-D before it is handed to fit().
reg = TabPFNRegressor()
reg.fit(df_X_comb.to_numpy(), df_Y_comb.to_numpy().ravel())

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client
  _validate_num_samples_for_cpu(


In [12]:
# Partition the combined dataset into small batches so SHAP values can be
# computed and saved one batch at a time (each 3-row batch took roughly two
# minutes in the recorded run below).
batch_size = 3
batch_info = get_batch_info(batch_size=batch_size, data_length=df_X_comb.shape[0])
print(f"Total batches: {batch_info.shape[0]}")
display(batch_info)


Totoal batches: 126


Unnamed: 0,start_idx,end_idx,current_batch_number,idx_range
0,0,3,0,"[0, 1, 2]"
1,3,6,1,"[3, 4, 5]"
2,6,9,2,"[6, 7, 8]"
3,9,12,3,"[9, 10, 11]"
4,12,15,4,"[12, 13, 14]"
...,...,...,...,...
121,363,366,121,"[363, 364, 365]"
122,366,369,122,"[366, 367, 368]"
123,369,372,123,"[369, 370, 371]"
124,372,375,124,"[372, 373, 374]"


In [13]:
# Inclusive range of batches to process in this run; adjust to resume elsewhere.
batch_number_start = 7
batch_number_end = 13

# Fail fast: each batch takes minutes, so reject batch numbers that do not
# exist before any SHAP computation starts, instead of hitting an opaque
# IndexError from `.values[0]` part-way through the run.
requested_batches = range(batch_number_start, batch_number_end + 1)
available_batches = set(batch_info["current_batch_number"].tolist())
missing_batches = [b for b in requested_batches if b not in available_batches]
if missing_batches:
    raise ValueError(
        f"Batch numbers not present in batch_info: {missing_batches} "
        f"(available: 0..{len(batch_info) - 1})"
    )

# Loop-invariant inputs, computed once instead of once per batch.
X_comb_values = df_X_comb.values
feature_names = df_X_comb.columns.tolist()

for batch_number in requested_batches:
    row = batch_info.loc[batch_info["current_batch_number"] == batch_number]
    start_idx = row["start_idx"].values[0]
    end_idx = row["end_idx"].values[0]
    print(
        f"Processing batch {batch_number}, Start idx: {start_idx}, End idx: {end_idx}..."
    )

    # SHAP values are computed only for the rows of this batch; the fitted
    # regressor `reg` is the model being explained.
    X_batch = X_comb_values[start_idx:end_idx]
    shap_values = interpretability.shap.get_shap_values(
        estimator=reg,
        test_x=X_batch,
        attribute_names=feature_names,
    )

    # One file per batch, so an interrupted run keeps what it has finished.
    MyUtil.save_data(
        OUTDIR / f"batch_{batch_number}.pkl",
        dict(shap_values=shap_values),
    )

Processing batch 7, Start idx: 21, End idx: 24...


PermutationExplainer explainer: 4it [02:11, 43.84s/it]                       


Processing batch 8, Start idx: 24, End idx: 27...


PermutationExplainer explainer: 4it [02:12, 44.22s/it]                       


Processing batch 9, Start idx: 27, End idx: 30...


PermutationExplainer explainer: 4it [02:34, 51.61s/it]                       


Processing batch 10, Start idx: 30, End idx: 33...


PermutationExplainer explainer: 4it [02:23, 47.86s/it]                       


Processing batch 11, Start idx: 33, End idx: 36...


PermutationExplainer explainer: 4it [02:24, 48.20s/it]                       


Processing batch 12, Start idx: 36, End idx: 39...


PermutationExplainer explainer: 4it [02:23, 47.87s/it]                       


Processing batch 13, Start idx: 39, End idx: 42...


PermutationExplainer explainer: 4it [02:37, 52.56s/it]                       
