# Tabular GAD 실험 (ADBench + TabPFN Embedding)



##### import

In [6]:
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from tabpfn_extensions import TabPFNClassifier
from tabpfn_extensions.embedding import TabPFNEmbedding

from datetime import datetime

##### 데이터셋 전처리
- 결과 분석이나 LLM 컬럼 처리 등에서 컬럼명이 필요하므로 가져오기
- TabPFN에서도 A/B 도메인에서 학습하고 C에서 테스트하려면 여러 도메인 필요

In [7]:
# Paths
ADBENCH_ROOT = Path("/home/haeylee/main/dataset")
CLASSICAL_DIR = ADBENCH_ROOT / "Classical"
OUT_DIR = ADBENCH_ROOT / "export_with_columns"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Column name maps
COLS = {
    "breastw": [
        "Clump_Thickness","Uniformity_of_Cell_Size","Uniformity_of_Cell_Shape",
        "Marginal_Adhesion","Single_Epithelial_Cell_Size","Bare_Nuclei",
        "Bland_Chromatin","Normal_Nucleoli","Mitoses",
    ],
    "Pima": [
        "Pregnancies","Glucose","BloodPressure","SkinThickness",
        "Insulin","BMI","DiabetesPedigreeFunction","Age",
    ],
    "wine": [
        "Alcohol","Malicacid","Ash","Alcalinity_of_ash","Magnesium",
        "Total_phenols","Flavanoids","Nonflavanoid_phenols","Proanthocyanins",
        "Color_intensity","Hue","OD280_OD315_of_diluted_wines","Proline",
    ],
    "WDBC": [
        "radius_mean","texture_mean","perimeter_mean","area_mean","smoothness_mean",
        "compactness_mean","concavity_mean","concave_points_mean","symmetry_mean","fractal_dimension_mean",
        "radius_se","texture_se","perimeter_se","area_se","smoothness_se",
        "compactness_se","concavity_se","concave_points_se","symmetry_se","fractal_dimension_se",
        "radius_worst","texture_worst","perimeter_worst","area_worst","smoothness_worst",
        "compactness_worst","concavity_worst","concave_points_worst","symmetry_worst","fractal_dimension_worst",
    ],
}

# Datasets to export
TARGETS = [
    (4,  "breastw"),
    (29, "Pima"),
    (43, "WDBC"),
    (45, "wine"),
]

def make_columns(d: int, preferred: list[str] | None) -> list[str]:

    if preferred is not None and len(preferred) == d:
        return preferred
    return [f"f{i+1}" for i in range(d)]

def export_one(ds_id: int, name: str):
    npz_path = CLASSICAL_DIR / f"{ds_id}_{name}.npz"
    data = np.load(npz_path, allow_pickle=True)

    X = data["X"]
    y = data["y"].astype(int)

    cols = make_columns(X.shape[1], COLS.get(name))
    df = pd.DataFrame(X, columns=cols)
    df["is_anomaly"] = y

    df.to_parquet(OUT_DIR / f"{ds_id}_{name}.parquet", index=False)
    df.to_csv(OUT_DIR / f"{ds_id}_{name}.csv", index=False)

    print(f"[OK] {ds_id}_{name} | X={X.shape} -> {OUT_DIR}")

for ds_id, name in TARGETS:
    export_one(ds_id, name)


[OK] 4_breastw | X=(683, 9) -> /home/haeylee/main/dataset/export_with_columns
[OK] 29_Pima | X=(768, 8) -> /home/haeylee/main/dataset/export_with_columns
[OK] 43_WDBC | X=(367, 30) -> /home/haeylee/main/dataset/export_with_columns
[OK] 45_wine | X=(129, 13) -> /home/haeylee/main/dataset/export_with_columns


In [8]:
# 데이터셋 예시 봐보기

EXPORT_DIR = ADBENCH_ROOT / "export_with_columns"

def load_exported_df(ds_id: int, name: str, prefer: str = "parquet") -> pd.DataFrame:
    """export_with_columns에서 DataFrame을 읽어오기"""
    p_parq = EXPORT_DIR / f"{ds_id}_{name}.parquet"
    p_csv  = EXPORT_DIR / f"{ds_id}_{name}.csv"

    if prefer == "parquet" and p_parq.exists():
        return pd.read_parquet(p_parq)
    if prefer == "csv" and p_csv.exists():
        return pd.read_csv(p_csv)

    # fallback
    return pd.read_parquet(p_parq) if p_parq.exists() else pd.read_csv(p_csv)

def df_to_xy(df: pd.DataFrame):
    """DataFrame -> (X, y, feature_names)로 변환"""
    y = df["is_anomaly"].to_numpy().astype(int) 
    X = df.drop(columns=["is_anomaly"]).to_numpy() # anomaly 라벨 제외한 feature 행렬
    feature_names = df.columns.drop("is_anomaly").tolist() # 결과해석용으로 컬럼명도 뽑기
    return X, y, feature_names

# 예시: WDBC 확인
df_wdbc = load_exported_df(43, "WDBC", prefer="parquet")
print(df_wdbc.shape)
print(df_wdbc.columns[:8])
print(df_wdbc["is_anomaly"].value_counts())

df_wdbc.head(3)



(367, 31)
Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave_points_mean'],
      dtype='object')
is_anomaly
0    357
1     10
Name: count, dtype: int64


Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst,is_anomaly
0,15.3,25.27,102.4,732.4,0.1082,0.1697,0.1683,0.08751,0.1926,0.0654,...,36.71,149.3,1269.0,0.1641,0.611,0.6335,0.2024,0.4027,0.09876,1
1,14.87,16.67,98.64,682.5,0.1162,0.1649,0.169,0.08923,0.2157,0.06768,...,27.37,127.1,1095.0,0.1878,0.448,0.4704,0.2027,0.3585,0.1065,1
2,17.05,19.08,113.4,895.0,0.1141,0.1572,0.191,0.109,0.2131,0.06325,...,24.89,133.5,1189.0,0.1703,0.3934,0.5018,0.2543,0.3109,0.09061,1


### TabPFN (1) 임베딩 기반 파이프라인

- 각 row를 벡터 임베딩으로 만들고, 그 임베딩 위에 anomaly detector 얹어 AD 성능 구하는 코드
- TabPFN 임베딩 추출은 `tabpfn-extensions`의 Embeddings extension 그대로 씀
- 흐름
1) ADBench 데이터 로드
2) train/test split
3) TabPFNEmbedding으로 H_train, H_test 추출
4) 임베딩에 대해 간단한 detector(IsolationForest) 학습 - test에서 anomaly score
5) AD 성능 계산 (AUROC / Average Precision)
- 라벨을 임베딩 추출에 쓸지말지 둘다 실험

##### TabPFN을 임베딩 추출기로 사용해 row embedding 뽑기
- n_fold = 0 : 빠른 대신 덜 robust (vanilla)
- n_fold > 0 : CV 기반 임베딩 (default:5)

In [9]:
# TabPFNEmbedding으로 H_train, H_test 뽑기

def tabpfn_extract_embeddings(
    X_train, y_train_for_embed, X_test,
    n_fold=5,
    n_estimators=1,
):
    
    clf = TabPFNClassifier(n_estimators=n_estimators)
    embedder = TabPFNEmbedding(tabpfn_clf=clf, n_fold=n_fold)

    # n_fold=0일 때만 fit 필요
    if n_fold == 0:
        embedder.fit(X_train, y_train_for_embed)

    H_train = embedder.get_embeddings(X_train, y_train_for_embed, X_test, data_source="train")
    H_test  = embedder.get_embeddings(X_train, y_train_for_embed, X_test, data_source="test")
    return H_train, H_test


- 각 데이터셋마다 train/test 한 번 나누고
- 같은 split에 대해 
    - dummy 버전 임베딩 (라벨 안씀) : H_train_dummy, H_test_dummy
    - use_anomaly_labels 버전 임베딩 (라벨 씀) : H_train_lbl, H_test_lbl
    두 가지 각각 저장하기
- 위 과정을 4개 데이터셋 모두에 대해 반복
- 평가 : 저장된 `H_train`, `H_test`, `y_train`, `y_test`를 불러와서 detector(ex.IsolationForest)로 성능평가하기
    - in-domain : 같은 데이터셋의 `H_train`으로 학습 --> 같은 데이터셋의 `H_test`로 평가
    - cross-domain : A 데이터셋의 `H_train`으로 학습 --> B 데이터셋의 `H_test`로 평가

In [10]:
EMB_OUT_DIR = EXPORT_DIR / "tabpfn_embeddings"
EMB_OUT_DIR.mkdir(parents=True, exist_ok=True)

RUN_ID = datetime.now().strftime("%Y%m%d_%H%M")
RUN_DIR = EMB_OUT_DIR / RUN_ID
RUN_DIR.mkdir(parents=True, exist_ok=True)

print(">>> New run directory:", RUN_DIR)

>>> New run directory: /home/haeylee/main/dataset/export_with_columns/tabpfn_embeddings/20251211_1628


In [17]:
# Save embeddings (both modes) for ONE dataset

def save_embeddings_for_one_dataset(
    ds_id: int,
    name: str,
    prefer: str = "parquet",
    test_size: float = 0.5,
    seed: int = 42,
    n_fold: int = 5,
):

    # 1) load
    df = load_exported_df(ds_id, name, prefer=prefer)
    X, y, feature_names = df_to_xy(df)

    # 2) split (anomaly ratio 유지)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=seed,
        stratify=y # 아노말리 비율 유지
    )

    # 3) 저장 경로: RUN_DIR / dataset / fold / mode
    base_dir = RUN_DIR / name / f"fold{n_fold}"
    base_dir.mkdir(parents=True, exist_ok=True)

    metas = {}

    # -------------------------
    # (A) dummy
    # -------------------------
    mode = "dummy"
    save_dir = base_dir / mode
    save_dir.mkdir(parents=True, exist_ok=True)

    y_for_embed = np.zeros_like(y_train)

    H_train, H_test = tabpfn_extract_embeddings(
        X_train, y_for_embed, X_test,
        n_fold=n_fold,
        n_estimators=1
    )

    np.save(save_dir / "H_train.npy", H_train)
    np.save(save_dir / "H_test.npy",  H_test)
    np.save(save_dir / "y_train.npy", y_train)
    np.save(save_dir / "y_test.npy",  y_test)

    metas[mode] = {
        "ds": f"{ds_id}_{name}",
        "mode": mode,
        "path": str(save_dir),
        "train_n": int(len(y_train)),
        "test_n": int(len(y_test)),
        "embed_dim": int(H_train.shape[1]),
        "feature_names": feature_names,
    }

    # -------------------------
    # (B) use_anomaly_labels
    # -------------------------
    mode = "use_anomaly_labels"
    save_dir = base_dir / mode
    save_dir.mkdir(parents=True, exist_ok=True)

    y_for_embed = y_train  # label 사용

    H_train, H_test = tabpfn_extract_embeddings(
        X_train, y_for_embed, X_test,
        n_fold=n_fold,
        n_estimators=1
    )

    np.save(save_dir / "H_train.npy", H_train)
    np.save(save_dir / "H_test.npy",  H_test)
    np.save(save_dir / "y_train.npy", y_train)
    np.save(save_dir / "y_test.npy",  y_test)

    metas[mode] = {
        "ds": f"{ds_id}_{name}",
        "mode": mode,
        "path": str(save_dir),
        "train_n": int(len(y_train)),
        "test_n": int(len(y_test)),
        "embed_dim": int(H_train.shape[1]),
        "feature_names": feature_names,
    }

    return metas


In [26]:
# Run: save for ALL datasets

all_meta = {}

for ds_id, name in TARGETS:
    print(f"\n>>> Running {ds_id}_{name}")
    all_meta[name] = save_embeddings_for_one_dataset(
        ds_id, name,
        prefer="parquet",
        test_size=0.5,
        seed=42,
        n_fold=5,
    )

print("\n>>> Done. Saved under:", RUN_DIR)
all_meta



>>> Running 4_breastw

>>> Running 29_Pima

>>> Running 43_WDBC

>>> Running 45_wine

>>> Done. Saved under: /home/haeylee/main/dataset/export_with_columns/tabpfn_embeddings/20251211_1628


{'breastw': {'dummy': {'ds': '4_breastw',
   'mode': 'dummy',
   'path': '/home/haeylee/main/dataset/export_with_columns/tabpfn_embeddings/20251211_1628/breastw/fold5/dummy',
   'train_n': 341,
   'test_n': 342,
   'embed_dim': 341,
   'feature_names': ['Clump_Thickness',
    'Uniformity_of_Cell_Size',
    'Uniformity_of_Cell_Shape',
    'Marginal_Adhesion',
    'Single_Epithelial_Cell_Size',
    'Bare_Nuclei',
    'Bland_Chromatin',
    'Normal_Nucleoli',
    'Mitoses']},
  'use_anomaly_labels': {'ds': '4_breastw',
   'mode': 'use_anomaly_labels',
   'path': '/home/haeylee/main/dataset/export_with_columns/tabpfn_embeddings/20251211_1628/breastw/fold5/use_anomaly_labels',
   'train_n': 341,
   'test_n': 342,
   'embed_dim': 341,
   'feature_names': ['Clump_Thickness',
    'Uniformity_of_Cell_Size',
    'Uniformity_of_Cell_Shape',
    'Marginal_Adhesion',
    'Single_Epithelial_Cell_Size',
    'Bare_Nuclei',
    'Bland_Chromatin',
    'Normal_Nucleoli',
    'Mitoses']}},
 'Pima': {'dumm

- 결과보면, 샘플수 367, feature개수 30개, TabPFN 임베딩 차원 256
- `H_train.npy`, `H_test.npy` : 원래 30차원 입력(d=30)이 256차원 임베딩(embed_dim=256)으로 변환됨
- `y_train.npy`, `y_test.npy` : 임베딩 위에서 anomaly detector를 학습/평가하려고 저장해둠
- 이제 임베딩(H)을 입력으로 detector를 학습하고 AUROC/AP를 계산하면 됨

### 성능평가

In [27]:
def to_2d(H: np.ndarray, n_samples: int) -> np.ndarray:
    H = np.asarray(H)

    if H.ndim == 2:
        return H

    if H.ndim == 3:
        # (k, n, d) 형태
        if H.shape[1] == n_samples:
            return H.mean(axis=0)   # k축 평균 (k=1이면 squeeze랑 같음)
        # (n, k, d) 형태
        if H.shape[0] == n_samples:
            return H.mean(axis=1)
        raise ValueError(f"Can't align embeddings with n_samples={n_samples}, H.shape={H.shape}")

    raise ValueError(f"Unexpected embedding ndim={H.ndim}, shape={H.shape}")


In [28]:
# Load saved embeddings from RUN_DIR

def load_saved_embeddings(
    dataset_name: str,
    n_fold: int = 5,
    mode: str = "dummy",   # "dummy" or "use_anomaly_labels"
):
    
    d = RUN_DIR / dataset_name / f"fold{n_fold}" / mode

    H_train = np.load(d / "H_train.npy")
    H_test  = np.load(d / "H_test.npy")
    y_train = np.load(d / "y_train.npy").astype(int)
    y_test  = np.load(d / "y_test.npy").astype(int)

    H_train = to_2d(H_train, n_samples=len(y_train))
    H_test  = to_2d(H_test,  n_samples=len(y_test))

    return H_train, H_test, y_train, y_test

In [29]:
# Detector: IsolationForest + AUROC/AP

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, average_precision_score

def fit_iforest(H_train, seed=42):
    """H_train으로 IsolationForest 학습 (scaler 포함)"""
    scaler = StandardScaler()
    Htr = scaler.fit_transform(H_train)

    det = IsolationForest(
        n_estimators=400,
        contamination="auto",
        random_state=seed,
        n_jobs=-1
    )
    det.fit(Htr)
    return scaler, det

def score_iforest(scaler, det, H):
    """점수는 클수록 anomaly가 되도록 부호 반전"""
    Hs = scaler.transform(H)
    return -det.decision_function(Hs)

def eval_scores(y_true, scores):
    return {
        "AUROC": float(roc_auc_score(y_true, scores)),
        "AP": float(average_precision_score(y_true, scores)),
        "test_anom_rate": float(y_true.mean()),
        "n_test": int(len(y_true)),
    }


In [30]:
# In-domain eval: train/test same dataset

rows = []
for _, name in TARGETS:
    for mode in ["dummy", "use_anomaly_labels"]:
        H_train, H_test, y_train, y_test = load_saved_embeddings(name, n_fold=5, mode=mode)

        scaler, det = fit_iforest(H_train, seed=42)
        scores = score_iforest(scaler, det, H_test)

        m = eval_scores(y_test, scores)
        rows.append({
            "scenario": "in-domain",
            "train_ds": name,
            "test_ds": name,
            "mode": mode,
            **m
        })

results_in = pd.DataFrame(rows).sort_values(["train_ds", "mode"]).reset_index(drop=True)
results_in


Unnamed: 0,scenario,train_ds,test_ds,mode,AUROC,AP,test_anom_rate,n_test
0,in-domain,Pima,Pima,dummy,0.575284,0.472338,0.348958,384
1,in-domain,Pima,Pima,use_anomaly_labels,0.57994,0.432899,0.348958,384
2,in-domain,WDBC,WDBC,dummy,0.934078,0.43702,0.027174,184
3,in-domain,WDBC,WDBC,use_anomaly_labels,1.0,1.0,0.027174,184
4,in-domain,breastw,breastw,dummy,0.747485,0.62627,0.350877,342
5,in-domain,breastw,breastw,use_anomaly_labels,0.778754,0.639536,0.350877,342
6,in-domain,wine,wine,dummy,0.766667,0.223117,0.076923,65
7,in-domain,wine,wine,use_anomaly_labels,1.0,1.0,0.076923,65


- `test_anom_rate`은 테스트셋에서 anomaly(=1) 라벨의 비율
- WDBC / wine에서 use_anomaly_labels가 AUROC=1, AP=1
    - TabPFN 임베딩 단계에서 라벨 정보가 들어가 분리가 과도하게 쉬워짐.
- dummy 모드는 전체적으로 괜찮음.
- WDBC(dummy) AUROC 0.97은 꽤 좋고, breastw(dummy) 0.78도 나쁘지 않음.
- Pima(dummy) 0.53은 거의 랜덤에 가까움.
- wine(dummy) AUROC 0.72인데 AP 0.16은 불균형(7.7%)에서 Precision-Recall이 어려운 케이스라 그럴 수 있음(표본도 39개로 작음)

In [24]:
# Cross-domain eval: train on A, test on B

rows = []
names = [name for _, name in TARGETS]

for train_name in names:
    for test_name in names:
        if train_name == test_name:
            continue

        for mode in ["dummy", "use_anomaly_labels"]:
            H_train_A, _, _, _ = load_saved_embeddings(train_name, n_fold=5, mode=mode)
            _, H_test_B, _, y_test_B = load_saved_embeddings(test_name, n_fold=5, mode=mode)

            scaler, det = fit_iforest(H_train_A, seed=42)
            scores = score_iforest(scaler, det, H_test_B)

            m = eval_scores(y_test_B, scores)
            rows.append({
                "scenario": "cross-domain",
                "train_ds": train_name,
                "test_ds": test_name,
                "mode": mode,
                **m
            })

results_cross = pd.DataFrame(rows).sort_values(["mode", "train_ds", "test_ds"]).reset_index(drop=True)
results_cross


Unnamed: 0,scenario,train_ds,test_ds,mode,AUROC,AP,test_anom_rate,n_test
0,cross-domain,Pima,WDBC,dummy,0.805556,0.460573,0.027027,111
1,cross-domain,Pima,breastw,dummy,0.262636,0.247679,0.35122,205
2,cross-domain,Pima,wine,dummy,0.898148,0.337302,0.076923,39
3,cross-domain,WDBC,Pima,dummy,0.346914,0.297775,0.350649,231
4,cross-domain,WDBC,breastw,dummy,0.468463,0.383442,0.35122,205
5,cross-domain,WDBC,wine,dummy,0.722222,0.159748,0.076923,39
6,cross-domain,breastw,Pima,dummy,0.423868,0.36073,0.350649,231
7,cross-domain,breastw,WDBC,dummy,0.904321,0.463054,0.027027,111
8,cross-domain,breastw,wine,dummy,0.805556,0.213131,0.076923,39
9,cross-domain,wine,Pima,dummy,0.335638,0.275115,0.350649,231


1) dummy 결과 해석
- WDBC를 test로 두면 꽤 잘 나옴
    - Pima→WDBC (0.81), breastw→WDBC (0.90), wine→WDBC (0.79)
    - TabPFN 임베딩 공간에서 WDBC의 anomaly는 다른 도메인에서 학습한 정상성으로도 어느 정도 잡힌다는 신호?
- breastw를 test로 두면 일부는 거의 반대로 나옴
    - wine→breastw (0.056), Pima→breastw (0.263), WDBC→breastw (0.468)
    - AUROC가 0.5보다 한참 아래면, 보통 (i) 점수 방향이 뒤집혔거나(=정상에 높은 점수를 줌), (ii) 소스에서 학습한 정상성이 타깃 도메인에서는 정반대 의미가 되어버렸거나, (iii) 단순히 도메인 시프트가 커서 실패일 가능성..
    - AUROC는 순위 기반이라, 완전 반대로라면 1−AUROC로 뒤집으면 높아진다고 함.. (예: 0.056 → 뒤집으면 0.944)
- representation이 도메인별로 방향/스케일이 바뀌는 문제가 있다는 힌트..?

2) `use_anomaly_labels cross-domain` 결과 해석.. 일단 그대로 믿으면 안 됨
- 현재 `use_anomaly_labels` 모드에서 타깃(test) 도메인 임베딩을 만들 때도 그 도메인의 y_train 라벨을 써서 임베딩을 뽑는 상태.
- 즉 cross-domain에서 source는 라벨 사용은 OK지만, target 임베딩이 target 라벨을 이미 사용한 상태라서
    - WDBC→wine (1.0) 같은 값이 많은 이유가 **타깃 쪽 임베딩 단계에 정보가 섞인 효과**일 가능성이 큼.
- 공정하게 하려면:
    - target(test) 쪽 임베딩은 항상 dummy로 고정해야 할듯.. 
    - 비교 실험해봐야할듯.

In [25]:
# Summary (cross-domain): dummy vs labels

summary_cross = (results_cross
                 .groupby("mode", as_index=False)[["AUROC", "AP"]]
                 .mean()
                )
summary_cross


Unnamed: 0,mode,AUROC,AP
0,dummy,0.568542,0.331379
1,use_anomaly_labels,0.868467,0.786983
