# Tabular GAD 실험 (ADBench + LLM Embedding)


##### import

In [None]:
import torch
import numpy as np
import pandas as pd
import os
from pathlib import Path
from datetime import datetime

from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, average_precision_score


In [2]:
# Paths
DATA_ROOT = Path("/home/haeylee/main/dataset")
EXPORT_DIR = DATA_ROOT / "export_with_columns"   # already-preprocessed parquet/csv files live here
OUT_ROOT = DATA_ROOT / "llm_embeddings"          # output folder for the new embeddings (kept under the dataset root)
OUT_ROOT.mkdir(parents=True, exist_ok=True)

# Targets as (dataset id, name) pairs; the exported files are looked up as
# "<id>_<name>.parquet/csv", so both parts must match the preprocessed file names.
TARGETS = [
    (4,  "breastw"),
    (29, "Pima"),
    (43, "WDBC"),
    (45, "wine"),
]

print("EXPORT_DIR =", EXPORT_DIR)
print("OUT_ROOT   =", OUT_ROOT)


EXPORT_DIR = /home/haeylee/main/dataset/export_with_columns
OUT_ROOT   = /home/haeylee/main/dataset/llm_embeddings


In [3]:
# 데이터 로딩

def load_exported_df(ds_id: int, name: str, prefer: str = "parquet") -> pd.DataFrame:
    """Read the exported table ``<ds_id>_<name>`` from ``EXPORT_DIR``.

    Args:
        ds_id: Numeric dataset id used as the file-name prefix.
        name: Dataset name used as the file-name suffix.
        prefer: ``"csv"`` tries the CSV file first; any other value tries
            the parquet file first. The other format is the fallback.

    Returns:
        The DataFrame read from the first candidate file that exists.

    Raises:
        FileNotFoundError: If neither the parquet nor the CSV file exists.
    """
    readers = {"parquet": pd.read_parquet, "csv": pd.read_csv}
    # The preferred format goes first; the remaining one acts as the fallback.
    search_order = ("csv", "parquet") if prefer == "csv" else ("parquet", "csv")

    for ext in search_order:
        candidate = EXPORT_DIR / f"{ds_id}_{name}.{ext}"
        if candidate.exists():
            return readers[ext](candidate)

    raise FileNotFoundError(f"Cannot find {ds_id}_{name}.parquet/csv under {EXPORT_DIR}")


def df_to_xy(df: pd.DataFrame):
    """Split an exported table into features, labels and feature names.

    Args:
        df: Table containing an ``is_anomaly`` column plus feature columns.

    Returns:
        A tuple ``(X, y, feature_names)``: ``X`` is the NumPy array of all
        columns except ``is_anomaly``, ``y`` is ``is_anomaly`` cast to int,
        and ``feature_names`` is the list of remaining column names in
        their original order.
    """
    label_col = "is_anomaly"
    labels = df[label_col].to_numpy().astype(int)
    feature_frame = df.drop(columns=[label_col])
    column_names = df.columns.drop(label_col).tolist()
    return feature_frame.to_numpy(), labels, column_names


In [4]:
# Load WDBC (dataset id 43), split it into X / y, and print the feature-matrix
# shape together with the mean of the anomaly labels.
df_wdbc = load_exported_df(43, "WDBC", prefer="parquet")
X, y, feature_names = df_to_xy(df_wdbc)
print("WDBC X shape:", X.shape, "| anomaly rate:", y.mean())
df_wdbc.head(3)  # last expression of the cell: rendered as the cell output


WDBC X shape: (367, 30) | anomaly rate: 0.027247956403269755


Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst,is_anomaly
0,15.3,25.27,102.4,732.4,0.1082,0.1697,0.1683,0.08751,0.1926,0.0654,...,36.71,149.3,1269.0,0.1641,0.611,0.6335,0.2024,0.4027,0.09876,1
1,14.87,16.67,98.64,682.5,0.1162,0.1649,0.169,0.08923,0.2157,0.06768,...,27.37,127.1,1095.0,0.1878,0.448,0.4704,0.2027,0.3585,0.1065,1
2,17.05,19.08,113.4,895.0,0.1141,0.1572,0.191,0.109,0.2131,0.06325,...,24.89,133.5,1189.0,0.1703,0.3934,0.5018,0.2543,0.3109,0.09061,1


In [None]:
# tabular --> text 텍스트화

def row_to_text(values, feature_names, precision=4, nan_token="NA"):
    """Serialize one table row as a ``"name=value; name=value"`` string.

    Args:
        values: Cell values of the row, paired positionally with
            ``feature_names`` (pairing stops at the shorter of the two).
        feature_names: Column names, one per value.
        precision: Number of digits kept after the decimal point for
            floating-point values.
        nan_token: Text written in place of a missing value (``None`` or NaN).

    Returns:
        The ``name=value`` parts joined with ``"; "``; an empty string for
        an empty row.
    """
    parts = []
    for key, value in zip(feature_names, values):
        # np.float32 / np.float16 scalars are not subclasses of Python float,
        # so NumPy floating types must be checked explicitly; otherwise a
        # float32 NaN would be written as "nan" instead of nan_token.
        is_float = isinstance(value, (float, np.floating))
        if value is None or (is_float and np.isnan(value)):
            parts.append(f"{key}={nan_token}")
        elif is_float:
            # Round floats to a fixed precision so the text stays short.
            parts.append(f"{key}={float(value):.{precision}f}")
        else:
            # Anything else (ints, strings, ...) is written as-is.
            parts.append(f"{key}={value}")
    return "; ".join(parts)

def X_to_texts(X, feature_names, precision=4):
    """Convert every row of ``X`` to its text form via ``row_to_text``.

    Args:
        X: Iterable of rows (each row is an iterable of cell values).
        feature_names: Column names shared by all rows.
        precision: Decimal digits kept for floating-point values.

    Returns:
        A list with one serialized string per row, in input order.
    """
    texts = []
    for row in X:
        texts.append(row_to_text(row, feature_names, precision=precision))
    return texts


In [16]:
# Print a few example text serializations
ds_id, name = 43, "WDBC"
df = load_exported_df(ds_id, name, prefer="parquet")
X, y, feature_names = df_to_xy(df)

# Only the first 5 rows
k = 5
texts = X_to_texts(X[:k], feature_names, precision=4)

# Show each serialized row next to its label, separated by a ruler line.
for i, t in enumerate(texts):
    print(f"[sample {i}] y={y[i]}")
    print(t)
    print("-" * 120)


[sample 0] y=1
radius_mean=15.3000; texture_mean=25.2700; perimeter_mean=102.4000; area_mean=732.4000; smoothness_mean=0.1082; compactness_mean=0.1697; concavity_mean=0.1683; concave_points_mean=0.0875; symmetry_mean=0.1926; fractal_dimension_mean=0.0654; radius_se=0.4390; texture_se=1.0120; perimeter_se=3.4980; area_se=43.5000; smoothness_se=0.0052; compactness_se=0.0306; concavity_se=0.0358; concave_points_se=0.0108; symmetry_se=0.0177; fractal_dimension_se=0.0030; radius_worst=20.2700; texture_worst=36.7100; perimeter_worst=149.3000; area_worst=1269.0000; smoothness_worst=0.1641; compactness_worst=0.6110; concavity_worst=0.6335; concave_points_worst=0.2024; symmetry_worst=0.4027; fractal_dimension_worst=0.0988
------------------------------------------------------------------------------------------------------------------------
[sample 1] y=1
radius_mean=14.8700; texture_mean=16.6700; perimeter_mean=98.6400; area_mean=682.5000; smoothness_mean=0.1162; compactness_mean=0.1649; conca

In [15]:
# Load Qwen3-Embedding through SentenceTransformer

QWEN_EMB_MODEL = "Qwen/Qwen3-Embedding-0.6B"   # start with the lightweight variant
# Use the GPU when torch reports one is available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)

# Module-level model instance used by embed_texts().
llm_model = SentenceTransformer(QWEN_EMB_MODEL, device=device)


device: cuda


### 임베딩 함수
- 입력: texts = ["radius_mean=17.99; ...", "radius_mean=12.34; ...", ...] 처럼 문자열 리스트
- 출력: H = np.ndarray 형태의 임베딩 행렬
    - shape은 보통 (n_samples, embed_dim) (예: (367, 1024) 또는 (367, 768))
- batch_size : 한 번에 몇 문장씩 묶어서 GPU로 처리할지
- normalize_embeddings=True : 각 벡터를 길이 1로 정규화(L2 normalize)
    - 거리 비교(코사인 유사도)나 스케일 안정성을 위해
- prompt_name : 이 텍스트를 어떤 역할로 임베딩할지 모델에게 알려주는 옵션 
    - 큐웬3임베딩에서 제공해주는 템플릿 붙여서 encode하는 기능으로, `query`, `None`(=document) 옵션이 있음. 
    - `query`는 검색어용 프롬프트를 붙여서 임베딩을 만들고, (이때 실제로 쿼리를 직접 주는게 아니라, 내부에 저장된 쿼리모드로 인코딩하는 것)
    - `None` = `document`는 그냥 기본 방식(문서처럼) 임베딩 만드는 것.
    - 표의 샘플 텍스트화해서 임베딩하는거니까 일단 document 처럼 임베딩하는 것으로 1차 테스트
    - 2차 테스트로는 : anomaly detection 목적으로, 이 샘플의 이상 여부를 판별하기 위한 임베딩으로 만들자고 쿼리 주고 만들어보기

In [None]:
def embed_texts(texts, batch_size=64, normalize=True, prompt_name=None):
    """Encode strings with the module-level ``llm_model``.

    Args:
        texts: list[str] of input strings.
        batch_size: Forwarded to ``encode`` as ``batch_size``.
        normalize: Forwarded as ``normalize_embeddings``; True requests
            L2-normalized embeddings.
        prompt_name: Forwarded as ``prompt_name``; None (default) or
            ``"query"`` (used for the comparison experiment).

    Returns:
        The result of ``llm_model.encode`` called with
        ``convert_to_numpy=True``.
    """
    encode_options = {
        "batch_size": batch_size,
        "convert_to_numpy": True,
        "normalize_embeddings": normalize,
        "prompt_name": prompt_name,
    }
    embeddings = llm_model.encode(texts, **encode_options)
    return embeddings

In [7]:
# Embedding storage

# Timestamp-based run id (second resolution); all files of this run go under OUT_ROOT/RUN_ID.
RUN_ID = datetime.now().strftime("%Y%m%d_%H%M%S")
RUN_DIR = OUT_ROOT / RUN_ID
RUN_DIR.mkdir(parents=True, exist_ok=True)
print("RUN_DIR =", RUN_DIR)


RUN_DIR = /home/haeylee/main/dataset/llm_embeddings/20251209_134712


In [17]:
# Save embeddings for ONE dataset

def save_one_dataset_llm_embeddings(
    ds_id: int,
    name: str,
    prefer="parquet",
    test_size=0.3,
    seed=42,
    precision=4,
    batch_size=64,
    prompt_name=None,   # None or "query"
):
    """Embed one dataset's train/test rows and save them under ``RUN_DIR / name``.

    Steps: load the exported table, make a stratified train/test split,
    serialize each row to text, embed the texts (L2-normalized), and write
    ``<tag>_{H_train,H_test,y_train,y_test}.npy``.

    Args:
        ds_id: Dataset id (file-name prefix of the exported table).
        name: Dataset name (file-name suffix and output sub-folder).
        prefer: Preferred input format, passed to ``load_exported_df``.
        test_size: Test fraction passed to ``train_test_split``.
        seed: ``random_state`` of the split; also part of the tag.
        precision: Decimal digits used when serializing floats.
        batch_size: Encoding batch size.
        prompt_name: Prompt name passed to the encoder; a falsy value is
            recorded as ``"none"`` in the tag and metadata.

    Returns:
        A metadata dict with keys ``ds``, ``n``, ``d``, ``embed_dim``,
        ``anom_rate``, ``run_dir``, ``tag`` and ``prompt_name``.
    """
    prompt_label = prompt_name if prompt_name else "none"

    # (1) load
    X, y, feature_names = df_to_xy(load_exported_df(ds_id, name, prefer=prefer))

    # (2) split (in-domain protocol: fit on train, evaluate on test)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed, stratify=y
    )

    # (3) + (4) tabular -> text -> embedding, per split
    embedded = {}
    for split_name, X_split in (("train", X_train), ("test", X_test)):
        split_texts = X_to_texts(X_split, feature_names, precision=precision)
        embedded[split_name] = embed_texts(
            split_texts, batch_size=batch_size, normalize=True, prompt_name=prompt_name
        )
    H_train, H_test = embedded["train"], embedded["test"]

    # (5) save
    ds_dir = RUN_DIR / name
    ds_dir.mkdir(parents=True, exist_ok=True)

    tag = f"qwen3emb_prompt{prompt_label}_seed{seed}_ts{test_size}_prec{precision}"
    arrays_to_save = {
        "H_train": H_train,
        "H_test": H_test,
        "y_train": y_train.astype(int),
        "y_test": y_test.astype(int),
    }
    for suffix, array in arrays_to_save.items():
        np.save(ds_dir / f"{tag}_{suffix}.npy", array)

    n_rows, n_features = X.shape[0], X.shape[1]
    return {
        "ds": f"{ds_id}_{name}",
        "n": int(n_rows),
        "d": int(n_features),
        "embed_dim": int(H_train.shape[1]),
        "anom_rate": float(y.mean()),
        "run_dir": str(ds_dir),
        "tag": tag,
        "prompt_name": prompt_label,
    }


In [20]:
# Run for ALL datasets
# Embeds every (id, name) pair in TARGETS with the default (no prompt) setting
# and collects the metadata dict returned for each dataset.

llm_metas = []
for ds_id, name in TARGETS:
    print(f"\n>>> Embedding {ds_id}_{name}")
    meta = save_one_dataset_llm_embeddings(
        ds_id, name,
        prefer="parquet",
        test_size=0.3,
        seed=42,
        precision=4,
        batch_size=64,
        prompt_name=None,
    )
    llm_metas.append(meta)
    print(meta)

llm_metas



>>> Embedding 4_breastw
{'ds': '4_breastw', 'n': 683, 'd': 9, 'embed_dim': 1024, 'anom_rate': 0.34992679355783307, 'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/breastw', 'tag': 'qwen3emb_promptnone_seed42_ts0.3_prec4', 'prompt_name': 'none'}

>>> Embedding 29_Pima
{'ds': '29_Pima', 'n': 768, 'd': 8, 'embed_dim': 1024, 'anom_rate': 0.3489583333333333, 'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/Pima', 'tag': 'qwen3emb_promptnone_seed42_ts0.3_prec4', 'prompt_name': 'none'}

>>> Embedding 43_WDBC
{'ds': '43_WDBC', 'n': 367, 'd': 30, 'embed_dim': 1024, 'anom_rate': 0.027247956403269755, 'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/WDBC', 'tag': 'qwen3emb_promptnone_seed42_ts0.3_prec4', 'prompt_name': 'none'}

>>> Embedding 45_wine
{'ds': '45_wine', 'n': 129, 'd': 13, 'embed_dim': 1024, 'anom_rate': 0.07751937984496124, 'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/wine', 'tag': 'qwen3emb_pr

[{'ds': '4_breastw',
  'n': 683,
  'd': 9,
  'embed_dim': 1024,
  'anom_rate': 0.34992679355783307,
  'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/breastw',
  'tag': 'qwen3emb_promptnone_seed42_ts0.3_prec4',
  'prompt_name': 'none'},
 {'ds': '29_Pima',
  'n': 768,
  'd': 8,
  'embed_dim': 1024,
  'anom_rate': 0.3489583333333333,
  'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/Pima',
  'tag': 'qwen3emb_promptnone_seed42_ts0.3_prec4',
  'prompt_name': 'none'},
 {'ds': '43_WDBC',
  'n': 367,
  'd': 30,
  'embed_dim': 1024,
  'anom_rate': 0.027247956403269755,
  'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/WDBC',
  'tag': 'qwen3emb_promptnone_seed42_ts0.3_prec4',
  'prompt_name': 'none'},
 {'ds': '45_wine',
  'n': 129,
  'd': 13,
  'embed_dim': 1024,
  'anom_rate': 0.07751937984496124,
  'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/wine',
  'tag': 'qwen3emb_promptnone_seed42_ts0.3_prec4',
  '

In [21]:
# Load saved embeddings

def load_saved_llm_embeddings(name: str, tag: str):
    """Load the four arrays saved for one dataset and tag.

    Args:
        name: Dataset name (sub-folder of ``RUN_DIR``).
        tag: File-name prefix identifying the embedding configuration.

    Returns:
        ``(H_train, H_test, y_train, y_test)``; the label arrays are cast
        to int.
    """
    ds_dir = RUN_DIR / name

    def _load(suffix):
        # Files are named "<tag>_<suffix>.npy" inside the dataset folder.
        return np.load(ds_dir / f"{tag}_{suffix}.npy")

    H_train = _load("H_train")
    H_test = _load("H_test")
    y_train = _load("y_train").astype(int)
    y_test = _load("y_test").astype(int)
    return H_train, H_test, y_train, y_test

# Default tag saved above (all datasets share this tag when their settings are identical)
TAG = "qwen3emb_promptnone_seed42_ts0.3_prec4"
print("TAG =", TAG)


TAG = qwen3emb_promptnone_seed42_ts0.3_prec4


In [22]:
# Evaluation helpers

def fit_iforest(H_train, seed=42):
    """Fit a StandardScaler and an IsolationForest on training embeddings.

    Args:
        H_train: Training embedding matrix.
        seed: ``random_state`` of the IsolationForest.

    Returns:
        ``(scaler, det)``: the fitted scaler and the forest fitted on the
        scaled training embeddings.
    """
    scaler = StandardScaler().fit(H_train)
    scaled_train = scaler.transform(H_train)

    forest_params = {
        "n_estimators": 400,
        "contamination": "auto",
        "random_state": seed,
        "n_jobs": -1,
    }
    det = IsolationForest(**forest_params)
    det.fit(scaled_train)
    return scaler, det

def score_iforest(scaler, det, H):
    """Return anomaly scores for ``H`` (higher means more anomalous).

    Args:
        scaler: Fitted object exposing ``transform``.
        det: Fitted detector exposing ``score_samples``.
        H: Embeddings to score.

    Returns:
        The negated ``det.score_samples`` output on the scaled input.
    """
    standardized = scaler.transform(H)
    normality = det.score_samples(standardized)
    # score_samples is larger for normal points, so flip the sign to get an
    # anomaly score.
    return -normality

def eval_scores(y_true, scores):
    """Compute ranking metrics for anomaly scores.

    Args:
        y_true: Ground-truth labels; must support ``.mean()`` and ``len()``.
        scores: Anomaly scores aligned with ``y_true``.

    Returns:
        Dict with ``AUROC``, ``AP``, ``test_anom_rate`` (mean of ``y_true``)
        and ``n_test`` (number of labels).
    """
    auroc = roc_auc_score(y_true, scores)
    avg_precision = average_precision_score(y_true, scores)
    anomaly_rate = y_true.mean()
    sample_count = len(y_true)
    return {
        "AUROC": float(auroc),
        "AP": float(avg_precision),
        "test_anom_rate": float(anomaly_rate),
        "n_test": int(sample_count),
    }


In [23]:
# In-domain evaluation
# For each dataset: fit scaler + IsolationForest on its own train embeddings
# and score its own test embeddings (default-prompt tag).

names = [name for _, name in TARGETS]

rows = []
for name in names:
    H_train, H_test, y_train, y_test = load_saved_llm_embeddings(name, TAG)

    scaler, det = fit_iforest(H_train, seed=42)
    scores = score_iforest(scaler, det, H_test)

    # One result row per dataset; train and test dataset are the same here.
    rows.append({
        "scenario": "in-domain",
        "train_ds": name,
        "test_ds": name,
        **eval_scores(y_test, scores)
    })

results_llm_in = pd.DataFrame(rows).sort_values(["train_ds"]).reset_index(drop=True)
results_llm_in


Unnamed: 0,scenario,train_ds,test_ds,AUROC,AP,test_anom_rate,n_test
0,in-domain,Pima,Pima,0.565761,0.395157,0.350649,231
1,in-domain,WDBC,WDBC,0.944444,0.241703,0.027027,111
2,in-domain,breastw,breastw,0.924081,0.775968,0.35122,205
3,in-domain,wine,wine,0.833333,0.245238,0.076923,39


In [24]:
# Cross-domain evaluation
# Fit on one dataset's train embeddings, then score the test embeddings of
# every other dataset.

rows = []
for train_name in names:
    H_train_src, _, _, _ = load_saved_llm_embeddings(train_name, TAG)
    scaler, det = fit_iforest(H_train_src, seed=42)

    for test_name in names:
        # Same-dataset pairs are the in-domain case, so skip them here.
        if test_name == train_name:
            continue

        _, H_test_tgt, _, y_test_tgt = load_saved_llm_embeddings(test_name, TAG)
        scores = score_iforest(scaler, det, H_test_tgt)

        rows.append({
            "scenario": "cross-domain",
            "train_ds": train_name,
            "test_ds": test_name,
            **eval_scores(y_test_tgt, scores)
        })

results_llm_cross = pd.DataFrame(rows).sort_values(["train_ds","test_ds"]).reset_index(drop=True)
results_llm_cross


Unnamed: 0,scenario,train_ds,test_ds,AUROC,AP,test_anom_rate,n_test
0,cross-domain,Pima,WDBC,0.462963,0.032996,0.027027,111
1,cross-domain,Pima,breastw,0.594089,0.475221,0.35122,205
2,cross-domain,Pima,wine,0.898148,0.341667,0.076923,39
3,cross-domain,WDBC,Pima,0.433827,0.316216,0.350649,231
4,cross-domain,WDBC,breastw,0.559524,0.493231,0.35122,205
5,cross-domain,WDBC,wine,0.833333,0.248316,0.076923,39
6,cross-domain,breastw,Pima,0.507819,0.353772,0.350649,231
7,cross-domain,breastw,WDBC,0.611111,0.065828,0.027027,111
8,cross-domain,breastw,wine,0.842593,0.506944,0.076923,39
9,cross-domain,wine,Pima,0.552263,0.378919,0.350649,231


In [25]:
# Sanity check: print the saved embedding shapes and the mean test label per dataset

for name in names:
    H_train, H_test, y_train, y_test = load_saved_llm_embeddings(name, TAG)
    print(name, "H_train", H_train.shape, "H_test", H_test.shape, "anom_rate(test)", y_test.mean())


breastw H_train (478, 1024) H_test (205, 1024) anom_rate(test) 0.35121951219512193
Pima H_train (537, 1024) H_test (231, 1024) anom_rate(test) 0.35064935064935066
WDBC H_train (256, 1024) H_test (111, 1024) anom_rate(test) 0.02702702702702703
wine H_train (90, 1024) H_test (39, 1024) anom_rate(test) 0.07692307692307693


In [14]:
# Summary: mean AUROC / AP over all cross-domain pairs
results_llm_cross[["AUROC","AP"]].mean()


AUROC    0.607117
AP       0.297409
dtype: float64

### query 버전 추가실험

In [26]:
# The query variant is saved under the same RUN_DIR (distinguished by the tag in the file names)

llm_metas_query = []
for ds_id, name in TARGETS:
    print(f"\n>>> [QUERY] Embedding {ds_id}_{name}")
    meta = save_one_dataset_llm_embeddings(
        ds_id, name,
        prefer="parquet",
        test_size=0.3,
        seed=42,
        precision=4,
        batch_size=64,
        prompt_name="query",   # the only difference from the default run
    )
    llm_metas_query.append(meta)
    print(meta)

llm_metas_query



>>> [QUERY] Embedding 4_breastw
{'ds': '4_breastw', 'n': 683, 'd': 9, 'embed_dim': 1024, 'anom_rate': 0.34992679355783307, 'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/breastw', 'tag': 'qwen3emb_promptquery_seed42_ts0.3_prec4', 'prompt_name': 'query'}

>>> [QUERY] Embedding 29_Pima
{'ds': '29_Pima', 'n': 768, 'd': 8, 'embed_dim': 1024, 'anom_rate': 0.3489583333333333, 'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/Pima', 'tag': 'qwen3emb_promptquery_seed42_ts0.3_prec4', 'prompt_name': 'query'}

>>> [QUERY] Embedding 43_WDBC
{'ds': '43_WDBC', 'n': 367, 'd': 30, 'embed_dim': 1024, 'anom_rate': 0.027247956403269755, 'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/WDBC', 'tag': 'qwen3emb_promptquery_seed42_ts0.3_prec4', 'prompt_name': 'query'}

>>> [QUERY] Embedding 45_wine
{'ds': '45_wine', 'n': 129, 'd': 13, 'embed_dim': 1024, 'anom_rate': 0.07751937984496124, 'run_dir': '/home/haeylee/main/dataset/llm_embeddings/2025

[{'ds': '4_breastw',
  'n': 683,
  'd': 9,
  'embed_dim': 1024,
  'anom_rate': 0.34992679355783307,
  'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/breastw',
  'tag': 'qwen3emb_promptquery_seed42_ts0.3_prec4',
  'prompt_name': 'query'},
 {'ds': '29_Pima',
  'n': 768,
  'd': 8,
  'embed_dim': 1024,
  'anom_rate': 0.3489583333333333,
  'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/Pima',
  'tag': 'qwen3emb_promptquery_seed42_ts0.3_prec4',
  'prompt_name': 'query'},
 {'ds': '43_WDBC',
  'n': 367,
  'd': 30,
  'embed_dim': 1024,
  'anom_rate': 0.027247956403269755,
  'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/WDBC',
  'tag': 'qwen3emb_promptquery_seed42_ts0.3_prec4',
  'prompt_name': 'query'},
 {'ds': '45_wine',
  'n': 129,
  'd': 13,
  'embed_dim': 1024,
  'anom_rate': 0.07751937984496124,
  'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/wine',
  'tag': 'qwen3emb_promptquery_seed42_ts0.3_prec

In [27]:
# Evaluation of the query variant

TAG_QUERY = "qwen3emb_promptquery_seed42_ts0.3_prec4"
print("TAG_QUERY =", TAG_QUERY)

# in-domain (query)
# Same protocol as the default in-domain evaluation, reading the
# TAG_QUERY embeddings instead.
rows = []
names = [name for _, name in TARGETS]
for name in names:
    H_train, H_test, y_train, y_test = load_saved_llm_embeddings(name, TAG_QUERY)
    scaler, det = fit_iforest(H_train, seed=42)
    scores = score_iforest(scaler, det, H_test)
    rows.append({"scenario":"in-domain", "train_ds":name, "test_ds":name, **eval_scores(y_test, scores)})

results_llm_in_query = pd.DataFrame(rows).sort_values(["train_ds"]).reset_index(drop=True)
results_llm_in_query


TAG_QUERY = qwen3emb_promptquery_seed42_ts0.3_prec4


Unnamed: 0,scenario,train_ds,test_ds,AUROC,AP,test_anom_rate,n_test
0,in-domain,Pima,Pima,0.535802,0.381578,0.350649,231
1,in-domain,WDBC,WDBC,0.947531,0.261905,0.027027,111
2,in-domain,breastw,breastw,0.930869,0.752574,0.35122,205
3,in-domain,wine,wine,0.87963,0.729167,0.076923,39


In [28]:
# cross-domain (query)
# Fit on each source dataset's TAG_QUERY train embeddings and score the
# TAG_QUERY test embeddings of every other dataset.
rows = []
for train_name in names:
    H_train_src, _, _, _ = load_saved_llm_embeddings(train_name, TAG_QUERY)
    scaler, det = fit_iforest(H_train_src, seed=42)

    for test_name in names:
        # Skip the in-domain pair.
        if test_name == train_name:
            continue
        _, H_test_tgt, _, y_test_tgt = load_saved_llm_embeddings(test_name, TAG_QUERY)
        scores = score_iforest(scaler, det, H_test_tgt)
        rows.append({"scenario":"cross-domain", "train_ds":train_name, "test_ds":test_name, **eval_scores(y_test_tgt, scores)})

results_llm_cross_query = pd.DataFrame(rows).sort_values(["train_ds","test_ds"]).reset_index(drop=True)
results_llm_cross_query


Unnamed: 0,scenario,train_ds,test_ds,AUROC,AP,test_anom_rate,n_test
0,cross-domain,Pima,WDBC,0.453704,0.036463,0.027027,111
1,cross-domain,Pima,breastw,0.08751,0.209928,0.35122,205
2,cross-domain,Pima,wine,0.796296,0.325397,0.076923,39
3,cross-domain,WDBC,Pima,0.612428,0.481816,0.350649,231
4,cross-domain,WDBC,breastw,0.472327,0.318745,0.35122,205
5,cross-domain,WDBC,wine,0.75,0.239899,0.076923,39
6,cross-domain,breastw,Pima,0.435885,0.312022,0.350649,231
7,cross-domain,breastw,WDBC,0.354938,0.031714,0.027027,111
8,cross-domain,breastw,wine,0.555556,0.135962,0.076923,39
9,cross-domain,wine,Pima,0.500658,0.357441,0.350649,231


In [33]:
# Side-by-side comparison of the "none" and "query" results

# Stack both in-domain tables, tagging each row with its prompt mode.
in_comp = pd.concat(
    [results_llm_in.assign(mode="none"),
     results_llm_in_query.assign(mode="query")],
    ignore_index=True
)

# Same for the cross-domain tables.
cross_comp = pd.concat(
    [results_llm_cross.assign(mode="none"),
     results_llm_cross_query.assign(mode="query")],
    ignore_index=True
)

# NOTE(review): `display` is not imported in the visible code; presumably the
# notebook-provided display helper. Confirm when running outside a notebook.
display(in_comp.sort_values(["train_ds","mode"]).reset_index(drop=True))
display(cross_comp.sort_values(["train_ds","test_ds","mode"]).reset_index(drop=True))


Unnamed: 0,scenario,train_ds,test_ds,AUROC,AP,test_anom_rate,n_test,mode
0,in-domain,Pima,Pima,0.565761,0.395157,0.350649,231,none
1,in-domain,Pima,Pima,0.535802,0.381578,0.350649,231,query
2,in-domain,WDBC,WDBC,0.944444,0.241703,0.027027,111,none
3,in-domain,WDBC,WDBC,0.947531,0.261905,0.027027,111,query
4,in-domain,breastw,breastw,0.924081,0.775968,0.35122,205,none
5,in-domain,breastw,breastw,0.930869,0.752574,0.35122,205,query
6,in-domain,wine,wine,0.833333,0.245238,0.076923,39,none
7,in-domain,wine,wine,0.87963,0.729167,0.076923,39,query


Unnamed: 0,scenario,train_ds,test_ds,AUROC,AP,test_anom_rate,n_test,mode
0,cross-domain,Pima,WDBC,0.462963,0.032996,0.027027,111,none
1,cross-domain,Pima,WDBC,0.453704,0.036463,0.027027,111,query
2,cross-domain,Pima,breastw,0.594089,0.475221,0.35122,205,none
3,cross-domain,Pima,breastw,0.08751,0.209928,0.35122,205,query
4,cross-domain,Pima,wine,0.898148,0.341667,0.076923,39,none
5,cross-domain,Pima,wine,0.796296,0.325397,0.076923,39,query
6,cross-domain,WDBC,Pima,0.433827,0.316216,0.350649,231,none
7,cross-domain,WDBC,Pima,0.612428,0.481816,0.350649,231,query
8,cross-domain,WDBC,breastw,0.559524,0.493231,0.35122,205,none
9,cross-domain,WDBC,breastw,0.472327,0.318745,0.35122,205,query


### 결과 해석
- In-domain : `none` vs `query`
    - 둘다 in-domain 은 괜찮음. 
    - query가 항상 좋고 나쁘다는건 없고, wine에서 query가 AP 많이 개선됨.
    - WDBC 매우 좋음.
    - wine은 none에 비해 query에서 크게 상승. 표본이 n_test=39라 변동성이 크지만, 쿼리 모드가 작거나 짧은 데이터에서 더 잘 맞을수도?
- Cross-domain
    - none이 query보다 전반적으로 훨씬 안정적!!!
    - none cross-domain에서 성능 높은 조합:
        - Pima→wine: AUROC 0.898
        - WDBC→wine: AUROC 0.833
        - breastw→wine: AUROC 0.843
        - wine→WDBC: AUROC 0.676 (AP는 낮지만 WDBC 특성임)
        - 즉 도메인 바뀌어도 어느 정도 버티는 조합들이 존재.
    - 하지만 query cross-domain은 많은 조합에서 0.5 근처 또는 그 이하로 떨어짐:
        - Pima→breastw: AUROC 0.087 (거의 완전 반대 수준)
        - breastw→WDBC: AUROC 0.355
        - wine→WDBC: AUROC 0.488
        - breastw→wine: AUROC 0.556 (none 대비 크게 하락)
- 즉 query 모드는 cross-domain에서 표현이 더 도메인/스케일/분포에 민감해진 느낌.

### 추가실험 : custom instruction
- 앞서 baseline (no prompt), built-in query prompt 를 썼다면, 이제 custom instruction으로 임베딩 뽑기
- 큐웬3임베딩 모델에서 입력 문자열 줄 때 원하는 task 에 맞게 임베딩 뽑도록 프롬프트 함께 줄 수 있음. 그 기능을 활용할 것임.
- `Instruct: {TASK}\nQuery:{row_text}` 로 바꿔서 encode 
- 이때는 prompt_name="query"를 함께 쓰지 않음 (프롬프트가 이중으로 붙는 것을 방지)

In [None]:
# Instruct + task variant

# Default task description inserted after "Instruct: " in each prompt.
TASK = "Embed a tabular record for cross-domain anomaly detection based on feature relations."


def instruct(task: str, text: str) -> str:
    """Wrap ``text`` in the instruction template.

    Args:
        task: Task description placed after ``"Instruct: "``.
        text: Row text placed directly after ``"Query:"`` (no space).

    Returns:
        ``"Instruct: <task>"``, a newline, then ``"Query:<text>"``.
    """
    template = "Instruct: {}\nQuery:{}"
    return template.format(task, text)


In [35]:
def save_one_dataset_llm_embeddings_instruct(
    ds_id: int,
    name: str,
    prefer="parquet",
    test_size=0.3,
    seed=42,
    precision=4,
    batch_size=64,
    task: str = TASK,
):
    """Embed one dataset with an instruction-prefixed prompt and save the result.

    Every row text is wrapped by ``instruct(task, row_text)`` before
    encoding. ``prompt_name`` is fixed to None so the encoder does not add a
    second prompt on top of the instruction.

    Args:
        ds_id: Dataset id (file-name prefix of the exported table).
        name: Dataset name (file-name suffix and output sub-folder).
        prefer: Preferred input format, passed to ``load_exported_df``.
        test_size: Test fraction passed to ``train_test_split``.
        seed: ``random_state`` of the split; also part of the tag.
        precision: Decimal digits used when serializing floats.
        batch_size: Encoding batch size.
        task: Instruction text placed in front of every row.

    Returns:
        A metadata dict with keys ``ds``, ``n``, ``d``, ``embed_dim``,
        ``anom_rate``, ``run_dir``, ``tag``, ``prompt`` and ``task``.
    """
    # (1) load
    X, y, feature_names = df_to_xy(load_exported_df(ds_id, name, prefer=prefer))

    # (2) split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed, stratify=y
    )

    # (3)-(5) tabular -> row text -> instruction text -> embedding, per split
    embedded = {}
    for split_name, X_split in (("train", X_train), ("test", X_test)):
        row_texts = X_to_texts(X_split, feature_names, precision=precision)
        prompts = [instruct(task, row_text) for row_text in row_texts]
        # prompt_name stays None: the instruction is already in the text.
        embedded[split_name] = embed_texts(
            prompts, batch_size=batch_size, normalize=True, prompt_name=None
        )
    H_train, H_test = embedded["train"], embedded["test"]

    # (6) save
    ds_dir = RUN_DIR / name
    ds_dir.mkdir(parents=True, exist_ok=True)

    # The tag marks this as the instruct variant.
    # NOTE(review): the tag does not encode the task text, so runs with
    # different tasks write to the same file names within one RUN_DIR.
    tag = f"qwen3emb_instruct_seed{seed}_ts{test_size}_prec{precision}"
    arrays_to_save = {
        "H_train": H_train,
        "H_test": H_test,
        "y_train": y_train.astype(int),
        "y_test": y_test.astype(int),
    }
    for suffix, array in arrays_to_save.items():
        np.save(ds_dir / f"{tag}_{suffix}.npy", array)

    n_rows, n_features = X.shape[0], X.shape[1]
    return {
        "ds": f"{ds_id}_{name}",
        "n": int(n_rows),
        "d": int(n_features),
        "embed_dim": int(H_train.shape[1]),
        "anom_rate": float(y.mean()),
        "run_dir": str(ds_dir),
        "tag": tag,
        "prompt": "instruct",
        "task": task,
    }


In [36]:
# Generate instruct embeddings for the 4 datasets

llm_metas_instruct = []
for ds_id, name in TARGETS:
    print(f"\n>>> [INSTRUCT] Embedding {ds_id}_{name}")
    meta = save_one_dataset_llm_embeddings_instruct(
        ds_id, name,
        prefer="parquet",
        test_size=0.3,
        seed=42,
        precision=4,
        batch_size=64,
        task=TASK
    )
    llm_metas_instruct.append(meta)
    print(meta)

llm_metas_instruct



>>> [INSTRUCT] Embedding 4_breastw
{'ds': '4_breastw', 'n': 683, 'd': 9, 'embed_dim': 1024, 'anom_rate': 0.34992679355783307, 'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/breastw', 'tag': 'qwen3emb_instruct_seed42_ts0.3_prec4', 'prompt': 'instruct', 'task': 'Embed a tabular record for cross-domain anomaly detection based on feature relations.'}

>>> [INSTRUCT] Embedding 29_Pima
{'ds': '29_Pima', 'n': 768, 'd': 8, 'embed_dim': 1024, 'anom_rate': 0.3489583333333333, 'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/Pima', 'tag': 'qwen3emb_instruct_seed42_ts0.3_prec4', 'prompt': 'instruct', 'task': 'Embed a tabular record for cross-domain anomaly detection based on feature relations.'}

>>> [INSTRUCT] Embedding 43_WDBC
{'ds': '43_WDBC', 'n': 367, 'd': 30, 'embed_dim': 1024, 'anom_rate': 0.027247956403269755, 'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/WDBC', 'tag': 'qwen3emb_instruct_seed42_ts0.3_prec4', 'prompt': 'in

[{'ds': '4_breastw',
  'n': 683,
  'd': 9,
  'embed_dim': 1024,
  'anom_rate': 0.34992679355783307,
  'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/breastw',
  'tag': 'qwen3emb_instruct_seed42_ts0.3_prec4',
  'prompt': 'instruct',
  'task': 'Embed a tabular record for cross-domain anomaly detection based on feature relations.'},
 {'ds': '29_Pima',
  'n': 768,
  'd': 8,
  'embed_dim': 1024,
  'anom_rate': 0.3489583333333333,
  'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/Pima',
  'tag': 'qwen3emb_instruct_seed42_ts0.3_prec4',
  'prompt': 'instruct',
  'task': 'Embed a tabular record for cross-domain anomaly detection based on feature relations.'},
 {'ds': '43_WDBC',
  'n': 367,
  'd': 30,
  'embed_dim': 1024,
  'anom_rate': 0.027247956403269755,
  'run_dir': '/home/haeylee/main/dataset/llm_embeddings/20251209_134712/WDBC',
  'tag': 'qwen3emb_instruct_seed42_ts0.3_prec4',
  'prompt': 'instruct',
  'task': 'Embed a tabular record for cross-

#### instruct 성능평가

In [37]:
# in-domain (instruct)
# Same protocol as the other in-domain evaluations, reading the TAG_INST
# embeddings.

TAG_INST = "qwen3emb_instruct_seed42_ts0.3_prec4"
print("TAG_INST =", TAG_INST)

rows = []
for name in names:
    H_train, H_test, y_train, y_test = load_saved_llm_embeddings(name, TAG_INST)
    scaler, det = fit_iforest(H_train, seed=42)
    scores = score_iforest(scaler, det, H_test)
    rows.append({"scenario":"in-domain", "train_ds":name, "test_ds":name, **eval_scores(y_test, scores)})

results_llm_in_inst = pd.DataFrame(rows).sort_values(["train_ds"]).reset_index(drop=True)
results_llm_in_inst


TAG_INST = qwen3emb_instruct_seed42_ts0.3_prec4


Unnamed: 0,scenario,train_ds,test_ds,AUROC,AP,test_anom_rate,n_test
0,in-domain,Pima,Pima,0.564938,0.387036,0.350649,231
1,in-domain,WDBC,WDBC,0.959877,0.527778,0.027027,111
2,in-domain,breastw,breastw,0.885025,0.765553,0.35122,205
3,in-domain,wine,wine,0.87963,0.622222,0.076923,39


In [38]:
# cross-domain (instruct)
# Fit on each source dataset's TAG_INST train embeddings and score the
# TAG_INST test embeddings of every other dataset.

rows = []
for train_name in names:
    H_train_src, _, _, _ = load_saved_llm_embeddings(train_name, TAG_INST)
    scaler, det = fit_iforest(H_train_src, seed=42)

    for test_name in names:
        # Skip the in-domain pair.
        if test_name == train_name:
            continue
        _, H_test_tgt, _, y_test_tgt = load_saved_llm_embeddings(test_name, TAG_INST)
        scores = score_iforest(scaler, det, H_test_tgt)
        rows.append({"scenario":"cross-domain", "train_ds":train_name, "test_ds":test_name, **eval_scores(y_test_tgt, scores)})

results_llm_cross_inst = pd.DataFrame(rows).sort_values(["train_ds","test_ds"]).reset_index(drop=True)
results_llm_cross_inst


Unnamed: 0,scenario,train_ds,test_ds,AUROC,AP,test_anom_rate,n_test
0,cross-domain,Pima,WDBC,0.441358,0.184534,0.027027,111
1,cross-domain,Pima,breastw,0.305347,0.292505,0.35122,205
2,cross-domain,Pima,wine,0.296296,0.086355,0.076923,39
3,cross-domain,WDBC,Pima,0.48856,0.35349,0.350649,231
4,cross-domain,WDBC,breastw,0.805973,0.734786,0.35122,205
5,cross-domain,WDBC,wine,0.907407,0.433333,0.076923,39
6,cross-domain,breastw,Pima,0.452922,0.346785,0.350649,231
7,cross-domain,breastw,WDBC,0.429012,0.031224,0.027027,111
8,cross-domain,breastw,wine,0.157407,0.059765,0.076923,39
9,cross-domain,wine,Pima,0.495391,0.380467,0.350649,231


### 전체 비교표

In [None]:
# Stack the in-domain results of all three prompt modes, tagging each row
# with its mode.
in_comp = pd.concat(
    [
        results_llm_in.assign(mode="none"),
        results_llm_in_query.assign(mode="query"),
        results_llm_in_inst.assign(mode="instruct"),
    ],
    ignore_index=True
)

# Same for the cross-domain results.
cross_comp = pd.concat(
    [
        results_llm_cross.assign(mode="none"),
        results_llm_cross_query.assign(mode="query"),
        results_llm_cross_inst.assign(mode="instruct"),
    ],
    ignore_index=True
)

# NOTE(review): `display` is not imported in the visible code; presumably the
# notebook-provided display helper. Confirm when running outside a notebook.
display(in_comp.sort_values(["train_ds","mode"]).reset_index(drop=True))
display(cross_comp.sort_values(["train_ds","test_ds","mode"]).reset_index(drop=True))


Unnamed: 0,scenario,train_ds,test_ds,AUROC,AP,test_anom_rate,n_test,mode
0,in-domain,Pima,Pima,0.564938,0.387036,0.350649,231,instruct
1,in-domain,Pima,Pima,0.565761,0.395157,0.350649,231,none
2,in-domain,Pima,Pima,0.535802,0.381578,0.350649,231,query
3,in-domain,WDBC,WDBC,0.959877,0.527778,0.027027,111,instruct
4,in-domain,WDBC,WDBC,0.944444,0.241703,0.027027,111,none
5,in-domain,WDBC,WDBC,0.947531,0.261905,0.027027,111,query
6,in-domain,breastw,breastw,0.885025,0.765553,0.35122,205,instruct
7,in-domain,breastw,breastw,0.924081,0.775968,0.35122,205,none
8,in-domain,breastw,breastw,0.930869,0.752574,0.35122,205,query
9,in-domain,wine,wine,0.87963,0.622222,0.076923,39,instruct


Unnamed: 0,scenario,train_ds,test_ds,AUROC,AP,test_anom_rate,n_test,mode
0,cross-domain,Pima,WDBC,0.441358,0.184534,0.027027,111,instruct
1,cross-domain,Pima,WDBC,0.462963,0.032996,0.027027,111,none
2,cross-domain,Pima,WDBC,0.453704,0.036463,0.027027,111,query
3,cross-domain,Pima,breastw,0.305347,0.292505,0.35122,205,instruct
4,cross-domain,Pima,breastw,0.594089,0.475221,0.35122,205,none
5,cross-domain,Pima,breastw,0.08751,0.209928,0.35122,205,query
6,cross-domain,Pima,wine,0.296296,0.086355,0.076923,39,instruct
7,cross-domain,Pima,wine,0.898148,0.341667,0.076923,39,none
8,cross-domain,Pima,wine,0.796296,0.325397,0.076923,39,query
9,cross-domain,WDBC,Pima,0.48856,0.35349,0.350649,231,instruct


### instruct 결과 해석

- instruct 효과 별로임.
    - in-domain에서는 instruct가 가끔 좋아지고,
    - cross-domain에서는 instruct 대부분 많이 하락. (WDBC가 train일 때는 instruct 꽤 좋음)
- `none` 버전을 main baseline으로 하는게 좋겠음.