In [1]:
import os
import re
from pathlib import Path
from tabpfn import TabPFNRegressor
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tabpfn_extensions import interpretability
from run1.lib.classes_ml import DataHandler
from run1.lib.utils import MyUtil

In [2]:
# All locations are derived from the kernel's working directory (Path.cwd());
# ROOT_DIR sits three levels above it.
BASE_DIR = Path.cwd()
CURRENT_DIR = BASE_DIR
ROOT_DIR = BASE_DIR.parent.parent.parent
DATA_DIR = ROOT_DIR.joinpath("run1", "data")
STUDY_DIR = ROOT_DIR.joinpath("run1", "P01_no_af", "T11_tabPFN")

# Run timestamp obtained from the project helper
dt = MyUtil.get_dt()

print(
    f"CURRENT_DIR: {CURRENT_DIR}",
    f"DATA_DIR: {DATA_DIR}",
    f"STUDY_DIR: {STUDY_DIR}",
    f"dt: {dt}",
    sep="\n",
)

CURRENT_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P01_no_af\T24_shap_tabFPN
DATA_DIR: c:\Users\admin\Coding\research\weld-ml\run1\data
STUDY_DIR: c:\Users\admin\Coding\research\weld-ml\run1\P01_no_af\T11_tabPFN
dt: 2026-01-29_12-16


In [3]:
# Output folder for this notebook's results; created if it does not exist yet
OUTDIR = CURRENT_DIR.joinpath("S01")
OUTDIR.mkdir(parents=True, exist_ok=True)

In [4]:
# Study results table, read from STUDY_DIR
study_info_filename = "S01_calculate_performance.xlsx"
study_info = pd.read_excel(STUDY_DIR.joinpath(study_info_filename))

# Experimental data, read from DATA_DIR
_df = pd.read_excel(DATA_DIR.joinpath("S02_data_exp.xlsx"))
print(f"df.shape: {_df.shape}")


df.shape: (378, 9)


In [5]:
# Target columns: names containing "stress_value", narrowed to the single
# target used in this notebook.
colsY = [
    col
    for col in _df.columns
    if re.search(r"stress_value", col) and col in ("stress_value_center",)
]

# Feature columns: the predefined inputs, kept in the order they appear in _df.
colsX = [col for col in _df.columns if col in ("R", "W", "D", "position")]

_dfX = _df[colsX]
_dfY = _df[colsY]

print("Selected feature columns:", colsX)
print("Selected target columns:", colsY)
print(f"dfX.shape: {_dfX.shape}", f"dfY.shape: {_dfY.shape}", sep="\n")

Selected feature columns: ['position', 'R', 'W', 'D']
Selected target columns: ['stress_value_center']
dfX.shape: (378, 4)
dfY.shape: (378, 1)


In [6]:
# %% Convert the selected feature/target frames to NumPy arrays
_X, _Y = _dfX.to_numpy(), _dfY.to_numpy()
print(f"_X.shape: {_X.shape}", f"_Y.shape: {_Y.shape}", sep="\n")

_X.shape: (378, 4)
_Y.shape: (378, 1)


In [7]:
# Bundle the arrays, their column names and one fresh StandardScaler per side
# (features / target) into the project's DataHandler.
data_handler = DataHandler(
    _X=_X,
    colsX=colsX,
    scalerX=StandardScaler(),
    _Y=_Y,
    colsY=colsY,
    scalerY=StandardScaler(),
)

In [8]:
# Display the study results table loaded from the Excel file
study_info

Unnamed: 0,random_state,test_size,model,Y,MSE,MAPE,R2
0,1,0.3,TabPFN,Y-1,1.277798,1.604244,0.000423
1,1,0.3,TabPFN,Y-2,0.831836,1.053364,-0.014403
2,1,0.3,TabPFN,Y-3,0.550377,1.20343,0.354769
3,1,0.3,TabPFN,Y-All,0.88667,1.287012,0.113596
4,2,0.3,TabPFN,Y-1,0.893694,1.12366,-0.024277
5,2,0.3,TabPFN,Y-2,0.918113,0.989473,-0.026327
6,2,0.3,TabPFN,Y-3,0.75042,1.577042,0.320189
7,2,0.3,TabPFN,Y-All,0.854076,1.230058,0.089862
8,3,0.3,TabPFN,Y-1,1.079286,0.983859,0.040879
9,3,0.3,TabPFN,Y-2,0.934747,1.264602,-0.01433


In [9]:
# Rank the study rows by R2 (highest first) and keep only the top row
study_info_best = study_info.sort_values("R2", ascending=False).iloc[:1]
study_info_best

Unnamed: 0,random_state,test_size,model,Y,MSE,MAPE,R2
10,3,0.3,TabPFN,Y-3,0.469651,1.266372,0.360448


In [10]:
def get_batch_info(batch_size, data_length):
    """Split ``range(data_length)`` into consecutive batches.

    Parameters
    ----------
    batch_size : int
        Maximum number of indices per batch. Must be positive.
    data_length : int
        Total number of indices to cover. The last batch is shorter when
        ``data_length`` is not a multiple of ``batch_size``.

    Returns
    -------
    pandas.DataFrame
        One row per batch with the columns ``start_idx`` (inclusive),
        ``end_idx`` (exclusive), ``current_batch_number`` (0-based) and
        ``idx_range`` (list of the indices in the batch). The columns are
        present even when there are no batches, so column lookups on the
        result never fail.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    columns = ["start_idx", "end_idx", "current_batch_number", "idx_range"]
    records = []
    for number, start in enumerate(range(0, data_length, batch_size)):
        # Clamp the final batch so it never runs past the data
        stop = min(start + batch_size, data_length)
        records.append(
            {
                "start_idx": start,
                "end_idx": stop,
                "current_batch_number": number,
                "idx_range": list(range(start, stop)),
            }
        )
    # Passing `columns` keeps the schema stable for an empty result
    return pd.DataFrame(records, columns=columns)

In [11]:
# Settings of the best study row. random_state and test_size are passed on so
# that split_and_scale is called with the same arguments as in the study.
model = study_info_best["model"].iloc[0]
random_state = study_info_best["random_state"].iloc[0]
test_size = study_info_best["test_size"].iloc[0]

data_handler.split_and_scale(test_size=test_size, random_state=random_state)
df_X_train, df_Y_train = data_handler.get_train(as_dataframe=True)
df_X_test, df_Y_test = data_handler.get_test(as_dataframe=True)

# Stack the train rows followed by the test rows
df_X_comb = pd.concat([df_X_train, df_X_test])
df_Y_comb = pd.concat([df_Y_train, df_Y_test])

# Fit a single TabPFNRegressor on the combined rows for the SHAP analysis;
# the target is flattened to 1-D before fitting.
reg = TabPFNRegressor()
reg.fit(df_X_comb.to_numpy(), df_Y_comb.to_numpy().ravel())

Consider using a GPU or the tabpfn-client API: https://github.com/PriorLabs/tabpfn-client
  _validate_num_samples_for_cpu(


In [12]:
# Number of rows handed to each SHAP call
batch_size = 3
batch_info = get_batch_info(batch_size=batch_size, data_length=df_X_comb.shape[0])
print(f"Total batches: {batch_info.shape[0]}")
display(batch_info)


Totoal batches: 126


Unnamed: 0,start_idx,end_idx,current_batch_number,idx_range
0,0,3,0,"[0, 1, 2]"
1,3,6,1,"[3, 4, 5]"
2,6,9,2,"[6, 7, 8]"
3,9,12,3,"[9, 10, 11]"
4,12,15,4,"[12, 13, 14]"
...,...,...,...,...
121,363,366,121,"[363, 364, 365]"
122,366,369,122,"[366, 367, 368]"
123,369,372,123,"[369, 370, 371]"
124,372,375,124,"[372, 373, 374]"


In [13]:
# Inclusive range of batch numbers to process in this run
batch_number_start = 0
batch_number_end = 2

for batch_number in range(batch_number_start, batch_number_end + 1):
    # Look up the slice bounds recorded for this batch
    row = batch_info[batch_info["current_batch_number"].eq(batch_number)]
    start_idx = row["start_idx"].iloc[0]
    end_idx = row["end_idx"].iloc[0]
    print(
        f"Processing batch {batch_number}, Start idx: {start_idx}, End idx: {end_idx}..."
    )

    # Compute SHAP values for the rows of this batch only
    X_batch = df_X_comb.to_numpy()[start_idx:end_idx]
    shap_values = interpretability.shap.get_shap_values(
        test_x=X_batch,
        estimator=reg,
        attribute_names=list(df_X_comb.columns),
    )

    # One pickle per batch, written into OUTDIR
    MyUtil.save_data(
        OUTDIR / f"batch_{batch_number}.pkl",
        {"shap_values": shap_values},
    )

Processing batch 0, Start idx: 0, End idx: 3...


ExactExplainer explainer: 4it [00:10, 10.41s/it]               


Processing batch 1, Start idx: 3, End idx: 6...
Processing batch 2, Start idx: 6, End idx: 9...
