# Tabular GAD 실험 (ADBench + TabPFN + LLM Embedding)



##### import

In [10]:
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from tabpfn_extensions import TabPFNClassifier
from tabpfn_extensions.embedding import TabPFNEmbedding

from datetime import datetime

##### 데이터셋 전처리
- 결과 분석이나 LLM 컬럼 처리 등에서 컬럼명이 필요하므로 가져오기
- TabPFN에서도 A/B 도메인에서 학습하고 C에서 테스트하려면 여러 도메인 필요

In [3]:
# Paths
ADBENCH_ROOT = Path("/home/haeylee/main/dataset")
CLASSICAL_DIR = ADBENCH_ROOT / "Classical"
OUT_DIR = ADBENCH_ROOT / "export_with_columns"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Column name maps
COLS = {
    "breastw": [
        "Clump_Thickness","Uniformity_of_Cell_Size","Uniformity_of_Cell_Shape",
        "Marginal_Adhesion","Single_Epithelial_Cell_Size","Bare_Nuclei",
        "Bland_Chromatin","Normal_Nucleoli","Mitoses",
    ],
    "Pima": [
        "Pregnancies","Glucose","BloodPressure","SkinThickness",
        "Insulin","BMI","DiabetesPedigreeFunction","Age",
    ],
    "wine": [
        "Alcohol","Malicacid","Ash","Alcalinity_of_ash","Magnesium",
        "Total_phenols","Flavanoids","Nonflavanoid_phenols","Proanthocyanins",
        "Color_intensity","Hue","OD280_OD315_of_diluted_wines","Proline",
    ],
    "WDBC": [
        "radius_mean","texture_mean","perimeter_mean","area_mean","smoothness_mean",
        "compactness_mean","concavity_mean","concave_points_mean","symmetry_mean","fractal_dimension_mean",
        "radius_se","texture_se","perimeter_se","area_se","smoothness_se",
        "compactness_se","concavity_se","concave_points_se","symmetry_se","fractal_dimension_se",
        "radius_worst","texture_worst","perimeter_worst","area_worst","smoothness_worst",
        "compactness_worst","concavity_worst","concave_points_worst","symmetry_worst","fractal_dimension_worst",
    ],
}

# Datasets to export
TARGETS = [
    (4,  "breastw"),
    (29, "Pima"),
    (43, "WDBC"),
    (45, "wine"),
]

def make_columns(d: int, preferred: list[str] | None) -> list[str]:

    if preferred is not None and len(preferred) == d:
        return preferred
    return [f"f{i+1}" for i in range(d)]

def export_one(ds_id: int, name: str):
    npz_path = CLASSICAL_DIR / f"{ds_id}_{name}.npz"
    data = np.load(npz_path, allow_pickle=True)

    X = data["X"]
    y = data["y"].astype(int)

    cols = make_columns(X.shape[1], COLS.get(name))
    df = pd.DataFrame(X, columns=cols)
    df["is_anomaly"] = y

    df.to_parquet(OUT_DIR / f"{ds_id}_{name}.parquet", index=False)
    df.to_csv(OUT_DIR / f"{ds_id}_{name}.csv", index=False)

    print(f"[OK] {ds_id}_{name} | X={X.shape} -> {OUT_DIR}")

for ds_id, name in TARGETS:
    export_one(ds_id, name)


[OK] 4_breastw | X=(683, 9) -> /home/haeylee/main/dataset/export_with_columns
[OK] 29_Pima | X=(768, 8) -> /home/haeylee/main/dataset/export_with_columns
[OK] 43_WDBC | X=(367, 30) -> /home/haeylee/main/dataset/export_with_columns
[OK] 45_wine | X=(129, 13) -> /home/haeylee/main/dataset/export_with_columns


In [4]:
# 데이터셋 예시 봐보기

EXPORT_DIR = ADBENCH_ROOT / "export_with_columns"

def load_exported_df(ds_id: int, name: str, prefer: str = "parquet") -> pd.DataFrame:
    """export_with_columns에서 DataFrame을 읽어오기"""
    p_parq = EXPORT_DIR / f"{ds_id}_{name}.parquet"
    p_csv  = EXPORT_DIR / f"{ds_id}_{name}.csv"

    if prefer == "parquet" and p_parq.exists():
        return pd.read_parquet(p_parq)
    if prefer == "csv" and p_csv.exists():
        return pd.read_csv(p_csv)

    # fallback
    return pd.read_parquet(p_parq) if p_parq.exists() else pd.read_csv(p_csv)

def df_to_xy(df: pd.DataFrame):
    """DataFrame -> (X, y, feature_names)로 변환"""
    y = df["is_anomaly"].to_numpy().astype(int) 
    X = df.drop(columns=["is_anomaly"]).to_numpy() # anomaly 라벨 제외한 feature 행렬
    feature_names = df.columns.drop("is_anomaly").tolist() # 결과해석용으로 컬럼명도 뽑기
    return X, y, feature_names

# 예시: WDBC 확인
df_wdbc = load_exported_df(43, "WDBC", prefer="parquet")
print(df_wdbc.shape)
print(df_wdbc.columns[:8])
print(df_wdbc["is_anomaly"].value_counts())

df_wdbc.head(3)



(367, 31)
Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave_points_mean'],
      dtype='object')
is_anomaly
0    357
1     10
Name: count, dtype: int64


Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst,is_anomaly
0,15.3,25.27,102.4,732.4,0.1082,0.1697,0.1683,0.08751,0.1926,0.0654,...,36.71,149.3,1269.0,0.1641,0.611,0.6335,0.2024,0.4027,0.09876,1
1,14.87,16.67,98.64,682.5,0.1162,0.1649,0.169,0.08923,0.2157,0.06768,...,27.37,127.1,1095.0,0.1878,0.448,0.4704,0.2027,0.3585,0.1065,1
2,17.05,19.08,113.4,895.0,0.1141,0.1572,0.191,0.109,0.2131,0.06325,...,24.89,133.5,1189.0,0.1703,0.3934,0.5018,0.2543,0.3109,0.09061,1


### TabPFN (1) 임베딩 기반 파이프라인

- 각 row를 벡터 임베딩으로 만들고, 그 임베딩 위에 anomaly detector 얹어 AD 성능 구하는 코드
- TabPFN 임베딩 추출은 `tabpfn-extensions`의 Embeddings extension 그대로 씀
- 흐름
1) ADBench 데이터 로드
2) train/test split
3) TabPFNEmbedding으로 H_train, H_test 추출
4) 임베딩에 대해 간단한 detector(IsolationForest) 학습 - test에서 anomaly score
5) AD 성능 계산 (AUROC / Average Precision)
- 라벨을 임베딩 추출에 쓸지말지 둘다 실험

##### TabPFN을 임베딩 추출기로 사용해 row embedding 뽑기
- n_fold = 0 : 빠른 대신 덜 robust (vanilla)
- n_fold > 0 : CV 기반 임베딩 (default:5)

In [5]:
# TabPFNEmbedding으로 H_train, H_test 뽑기

def tabpfn_extract_embeddings(
    X_train, y_train_for_embed, X_test,
    n_fold=5,
    n_estimators=1,
):
    
    clf = TabPFNClassifier(n_estimators=n_estimators)
    embedder = TabPFNEmbedding(tabpfn_clf=clf, n_fold=n_fold)

    # n_fold=0일 때만 fit 필요
    if n_fold == 0:
        embedder.fit(X_train, y_train_for_embed)

    H_train = embedder.get_embeddings(X_train, y_train_for_embed, X_test, data_source="train")
    H_test  = embedder.get_embeddings(X_train, y_train_for_embed, X_test, data_source="test")
    return H_train, H_test


- 하나의 데이터셋에 대해 split - 임베딩 - 저장하기
    1) export된 데이터 로드
    2) train/test split
    3) TabPFN 임베딩 추출 (H_train, H_test)
    4) npy로 저장

In [None]:
# =========================
# (1) Save directories
# =========================

EMB_OUT_DIR = EXPORT_DIR / "tabpfn_embeddings"
EMB_OUT_DIR.mkdir(parents=True, exist_ok=True)

RUN_ID = datetime.now().strftime("%Y%m%d_%H%M%S")
RUN_DIR = EMB_OUT_DIR / RUN_ID
RUN_DIR.mkdir(parents=True, exist_ok=True)

print(">>> New run directory:", RUN_DIR)


In [6]:
EMB_OUT_DIR = EXPORT_DIR / "tabpfn_embeddings"
EMB_OUT_DIR.mkdir(parents=True, exist_ok=True)

def run_one_dataset_tabpfn_embedding(
    ds_id: int, name: str,
    prefer="parquet",
    test_size=0.3,
    seed=42,
    max_n=4000,
    n_fold=5,
    label_mode="dummy",  # "dummy" or "use_anomaly_labels"
):

    df = load_exported_df(ds_id, name, prefer=prefer)
    X, y, feature_names = df_to_xy(df)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed, stratify=y # 아노말리 비율 맞추면서 나눔
    )

    # 임베딩 추출에 쓸 y
    if label_mode == "dummy":
        y_for_embed = np.zeros_like(y_train)
    elif label_mode == "use_anomaly_labels":
        y_for_embed = y_train
    else:
        raise ValueError("label_mode must be 'dummy' or 'use_anomaly_labels'")

    H_train, H_test = tabpfn_extract_embeddings(
        X_train, y_for_embed, X_test,
        n_fold=n_fold,
        n_estimators=1
    )

    prefix = f"{ds_id}_{name}_fold{n_fold}_{label_mode}"
    np.save(EMB_OUT_DIR / f"{prefix}_H_train.npy", H_train)
    np.save(EMB_OUT_DIR / f"{prefix}_H_test.npy",  H_test)
    np.save(EMB_OUT_DIR / f"{prefix}_y_train.npy", y_train)
    np.save(EMB_OUT_DIR / f"{prefix}_y_test.npy",  y_test)

    meta = {
        "ds": f"{ds_id}_{name}",
        "n": int(X.shape[0]),
        "d": int(X.shape[1]),
        "embed_dim": int(H_train.shape[1]),
        "n_fold": int(n_fold),
        "label_mode": label_mode,
        "saved_prefix": prefix,
        "feature_names": feature_names,
    }
    return meta


In [7]:
meta = run_one_dataset_tabpfn_embedding(
    43, "WDBC",
    label_mode="dummy",
    n_fold=5,
    max_n=4000
)
print(meta)

{'ds': '43_WDBC', 'n': 367, 'd': 30, 'embed_dim': 256, 'n_fold': 5, 'label_mode': 'dummy', 'saved_prefix': '43_WDBC_fold5_dummy', 'feature_names': ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave_points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst']}


- 결과보면, 샘플수 367, feature개수 30개, TabPFN 임베딩 차원 256
- `H_train.npy`, `H_test.npy` : 원래 30차원 입력(d=30)이 256차원 임베딩(embed_dim=256)으로 변환됨
- `y_train.npy`, `y_test.npy` : 임베딩 위에서 anomaly detector를 학습/평가하려고 저장해둠
- 이제 임베딩(H)을 입력으로 detector를 학습하고 AUROC/AP를 계산하면 됨

In [8]:
# dummy, use_labels 둘다 임베딩 저장 (4개 전체)

def save_embeddings_both_modes(ds_id: int, name: str, n_fold=5, seed=42, test_size=0.3, prefer="parquet"):
    """같은 split으로 dummy / use_anomaly_labels 임베딩 둘 다 저장"""
    meta_dummy = run_one_dataset_tabpfn_embedding(
        ds_id, name,
        prefer=prefer,
        test_size=test_size,
        seed=seed,
        n_fold=n_fold,
        label_mode="dummy",
    )
    meta_lbl = run_one_dataset_tabpfn_embedding(
        ds_id, name,
        prefer=prefer,
        test_size=test_size,
        seed=seed,
        n_fold=n_fold,
        label_mode="use_anomaly_labels",
    )
    return meta_dummy, meta_lbl


In [9]:
metas = []
for ds_id, name in TARGETS:
    md, ml = save_embeddings_both_modes(ds_id, name, n_fold=5, seed=42, test_size=0.3)
    metas.append(md)
    metas.append(ml)
    print("[saved]", md["saved_prefix"])
    print("[saved]", ml["saved_prefix"])

[saved] 4_breastw_fold5_dummy
[saved] 4_breastw_fold5_use_anomaly_labels
[saved] 29_Pima_fold5_dummy
[saved] 29_Pima_fold5_use_anomaly_labels
[saved] 43_WDBC_fold5_dummy
[saved] 43_WDBC_fold5_use_anomaly_labels
[saved] 45_wine_fold5_dummy
[saved] 45_wine_fold5_use_anomaly_labels
