In [1]:

## 라이브러리 설치 및 임포트
import os, sys, json, math, time, warnings
from typing import List, Tuple, Dict, Any, Optional

import numpy as np
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA

try:
    import umap  # Optional
except Exception:
    umap = None

try:
    from sentence_transformers import SentenceTransformer
except Exception as e:
    SentenceTransformer = None
    warnings.warn(f"sentence-transformers 불러오기 실패: {e}")

try:
    from xgboost import XGBClassifier
except Exception as e:
    XGBClassifier = None
    warnings.warn(f"xgboost 불러오기 실패: {e}")

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# 디렉터리 준비
os.makedirs('data/raw', exist_ok=True)
os.makedirs('data/processed', exist_ok=True)
os.makedirs('data/embeddings', exist_ok=True)
os.makedirs('models/encoders', exist_ok=True)
os.makedirs('models/checkpoints', exist_ok=True)


  from .autonotebook import tqdm as notebook_tqdm





In [2]:

## 설정 파일 생성
config: Dict[str, Any] = {
    "data": {
        "raw_file": "data/raw/gangwon_places_1000.xlsx",
        "processed_file": "data/processed/gangwon_places_1000.csv",
        "embeddings_file": "data/embeddings/place_embeddings.npy",
        "reduced_embeddings_file": "data/embeddings/place_embeddings_pca128.npy",
    },
    "text": {
        "fields_for_embedding": ["name", "short_description"],
        "max_tokens_per_field": 64
    },
    "model": {
        "sbert_model": "snunlp/KR-SBERT-V40K-klueNLI-augSTS",
        "dimensionality_reduction": "PCA",   # 'PCA' or 'UMAP'
        "reduced_dim": 128,
        "xgb_params": {
            "n_estimators": 300,
            "max_depth": 6,
            "learning_rate": 0.08,
            "subsample": 0.9,
            "colsample_bytree": 0.9,
            "reg_lambda": 1.0,
            "random_state": RANDOM_STATE,
            "n_jobs": -1,
            "tree_method": "hist"
        }
    }
}


In [3]:

## 데이터 전처리 함수 정의
EXPECTED_COLUMNS = [
    "name","season","nature","vibe","target","fee","parking","address",
    "open_time","latitude","longitude","full_address","short_description"
]

def _split_labels(raw: Any) -> List[str]:
    if pd.isna(raw):
        return []
    s = str(raw).strip()
    if not s:
        return []
    for sep in [',',';','/','|']:
        s = s.replace(sep, ' ')
    parts = [p.strip() for p in s.split() if p.strip()]
    return parts

class DataPreprocessor:
    def __init__(self, max_labels_per_tag: int = 5):
        self.max_labels_per_tag = max_labels_per_tag
        self.encoders: Dict[str, MultiLabelBinarizer] = {}

    def preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
        missing = [c for c in EXPECTED_COLUMNS if c not in df.columns]
        if missing:
            raise ValueError(f"스키마 불일치: 누락 컬럼 {missing}")
        df = df[EXPECTED_COLUMNS].copy()

        for c in ["name","short_description","address","full_address","open_time","fee","parking","season","nature","vibe","target"]:
            df[c] = df[c].astype(str).str.strip()

        for c in ["latitude","longitude"]:
            df[c] = pd.to_numeric(df[c], errors="coerce")

        df.drop_duplicates(subset=["name","address"], inplace=True)

        for tag in ["season","nature","vibe","target"]:
            df[tag] = df[tag].apply(_split_labels).apply(lambda arr: arr[: self.max_labels_per_tag])
        return df

    def fit_encoders(self, df: pd.DataFrame) -> None:
        for tag in ["season","nature","vibe","target"]:
            mlb = MultiLabelBinarizer()
            mlb.fit(df[tag].tolist())
            self.encoders[tag] = mlb
        for tag, mlb in self.encoders.items():
            np.save(f"models/encoders/{tag}_classes.npy", mlb.classes_)

    def load_encoders(self) -> None:
        enc = {}
        for tag in ["season","nature","vibe","target"]:
            path = f"models/encoders/{tag}_classes.npy"
            if not os.path.exists(path):
                raise FileNotFoundError(f"인코더 파일이 없습니다: {path}")
            classes = np.load(path, allow_pickle=True)
            mlb = MultiLabelBinarizer()
            mlb.fit([classes.tolist()])
            enc[tag] = mlb
        self.encoders = enc

    def encode_labels(self, df: pd.DataFrame) -> Dict[str, np.ndarray]:
        if not self.encoders:
            self.load_encoders()
        y = {}
        for tag, mlb in self.encoders.items():
            y[tag] = mlb.transform(df[tag].tolist())
        return y


In [4]:

## 임베딩 생성 클래스 정의
class EmbeddingGenerator:
    def __init__(self, model_name: Optional[str] = None):
        self.model_name = model_name or config["model"]["sbert_model"]
        self.model = None
        self.dimension_reducer = None
        self.reduced_dim = config["model"]["reduced_dim"]

    def load_model(self):
        if self.model is not None:
            return
        if SentenceTransformer is None:
            raise RuntimeError("SentenceTransformer를 불러올 수 없습니다.")
        self.model = SentenceTransformer(self.model_name)

    def _concat_text_fields(self, row: pd.Series, fields: List[str]) -> str:
        parts = []
        for f in fields:
            val = str(row.get(f, "")).strip()
            if val and val.lower() != "nan":
                parts.append(val)
        return " ".join(parts)

    def build_texts(self, df: pd.DataFrame, fields: Optional[List[str]] = None) -> List[str]:
        fields = fields or config["text"]["fields_for_embedding"]
        return [self._concat_text_fields(row, fields) for _, row in df.iterrows()]

    def generate_embeddings(self, texts: List[str], cache_path: str) -> np.ndarray:
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        if os.path.exists(cache_path):
            embs = np.load(cache_path)
            print(f"🔁 임베딩 캐시 로드: {cache_path} {embs.shape}")
            return embs
        self.load_model()
        embs = self.model.encode(texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)
        np.save(cache_path, embs)
        print(f"✅ 임베딩 저장: {cache_path} {embs.shape}")
        return embs

    def fit_dimension_reducer(self, X: np.ndarray, method: str = "PCA", n_components: int = 128):
        method = (method or "PCA").upper()
        if method == "UMAP":
            if umap is None:
                warnings.warn("UMAP이 설치되어 있지 않아 PCA로 대체합니다.")
            else:
                self.dimension_reducer = umap.UMAP(
                    n_components=n_components, n_neighbors=15, min_dist=0.1, metric="cosine", random_state=RANDOM_STATE
                )
                self.reduced_dim = n_components
                self.dimension_reducer.fit(X)
                return
        self.dimension_reducer = PCA(n_components=n_components, random_state=RANDOM_STATE)
        self.reduced_dim = n_components
        self.dimension_reducer.fit(X)

    def reduce_dimensions(self, X: np.ndarray) -> np.ndarray:
        if self.dimension_reducer is None:
            raise RuntimeError("dimension_reducer가 없습니다. fit_dimension_reducer 먼저 호출하세요.")
        return self.dimension_reducer.transform(X)


In [14]:

## 실제 csv 파일 로드 및 검증
def load_and_validate_csv(file_path: str) -> pd.DataFrame:
    """XLSX/CSV 자동 판별 + 13컬럼 검증 + processed 저장"""
    candidates = [
        config["data"]["raw_file"],
        "data/raw/gangwon_places_1000.xlsx",
        file_path
    ]
    src = next((p for p in candidates if p and os.path.exists(p)), None)
    if src is None:
        raise FileNotFoundError("데이터 파일을 찾을 수 없습니다. data/raw에 1000개 파일을 두세요.")

    if src.lower().endswith((".xlsx", ".xls")):
        df = pd.read_excel(src)
    else:
        df = pd.read_csv(src)

    missing = [c for c in EXPECTED_COLUMNS if c not in df.columns]
    if missing:
        raise ValueError(f"스키마 불일치: 누락 컬럼 {missing}")
    df = df[EXPECTED_COLUMNS].copy()

    out = config["data"]["processed_file"]
    os.makedirs(os.path.dirname(out), exist_ok=True)
    df.to_csv(out, index=False, encoding="utf-8-sig")
    print(f"✅ 데이터 저장: {out} ({len(df):,} rows)")
    return df


In [16]:

## XGBoost 트레이너
class XGBoostTrainer:
    def __init__(self, params: Optional[Dict[str, Any]] = None):
        if XGBClassifier is None:
            raise RuntimeError("XGBoost를 불러올 수 없습니다.")
        self.params = params or config["model"]["xgb_params"]
        self.models: Dict[str, Any] = {}

    def train_models(self, X: np.ndarray, y_dict: Dict[str, np.ndarray]) -> None:
        self.models = {}
        for tag, y in y_dict.items():
            clf = XGBClassifier(**self.params)
            X_tr, X_val, y_tr, y_val = train_test_split(
                X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=(y.sum(axis=1)>0)
            )
            clf.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
            self.models[tag] = clf

    def evaluate_models(self, X_test: np.ndarray, y_true: Dict[str, np.ndarray]) -> Dict[str, Any]:
        report = {}
        for tag, clf in self.models.items():
            if hasattr(clf, "predict_proba"):
                y_pred = (clf.predict_proba(X_test) > 0.5).astype(int)
            else:
                y_pred = (clf.predict(X_test) > 0.5).astype(int)
            micro_f1 = f1_score(y_true[tag], y_pred, average="micro", zero_division=0)
            samples_f1 = f1_score(y_true[tag], y_pred, average="samples", zero_division=0)
            report[tag] = {"micro_f1": micro_f1, "samples_f1": samples_f1}
        return report


In [18]:

## 추천기 (공개 메서드 이름 유지)
class GangwonPlaceRecommender:
    def __init__(self, cfg: Dict[str, Any] = None):
        self.config = cfg or config
        self.dp = DataPreprocessor()
        self.eg = EmbeddingGenerator()
        self.trainer = None
        self.df: Optional[pd.DataFrame] = None
        self.embeddings: Optional[np.ndarray] = None
        self.reduced: Optional[np.ndarray] = None

    def prepare(self) -> None:
        df = load_and_validate_csv(self.config["data"]["raw_file"])
        df = self.dp.preprocess_data(df)
        self.dp.fit_encoders(df)
        self.df = df

        texts = self.eg.build_texts(df, self.config["text"]["fields_for_embedding"])
        self.embeddings = self.eg.generate_embeddings(texts, self.config["data"]["embeddings_file"])

        method = self.config["model"]["dimensionality_reduction"]
        n_comp = self.config["model"]["reduced_dim"]
        self.eg.fit_dimension_reducer(self.embeddings, method=method, n_components=n_comp)
        self.reduced = self.eg.reduce_dimensions(self.embeddings)
        np.save(self.config["data"]["reduced_embeddings_file"], self.reduced)

        y = self.dp.encode_labels(df)
        self.trainer = XGBoostTrainer(self.config["model"]["xgb_params"])
        self.trainer.train_models(self.reduced, y)

    def recommend_places(self, query_text: str, top_k: int = 10) -> pd.DataFrame:
        if self.df is None or self.reduced is None:
            raise RuntimeError("모델이 준비되지 않았습니다. prepare()를 먼저 호출하세요.")
        q_emb = self.eg.generate_embeddings([query_text], cache_path="data/embeddings/_tmp_query.npy")
        q_red = self.eg.reduce_dimensions(q_emb)[0]

        a = self.reduced / (np.linalg.norm(self.reduced, axis=1, keepdims=True) + 1e-9)
        b = q_red / (np.linalg.norm(q_red) + 1e-9)
        sims = a @ b

        idx = np.argsort(-sims)[:top_k]
        out = self.df.iloc[idx].copy()
        out["score"] = sims[idx]
        return out.reset_index(drop=True)

    def recommend_places_api(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        query = payload.get("q") or payload.get("query") or ""
        k = int(payload.get("k", 10))
        rec = self.recommend_places(query, top_k=k)
        return {"results": rec.to_dict(orient="records")}


In [20]:
## 실행 예시: 벤치마크 + 추천 결과 출력 (표준 Jupyter 표시)
from IPython.display import display
import pandas as pd
import os

# 벤치마크 실행
rec, summary = benchmark_pipeline()

print(f"✅ 준비(임베딩+차원축소+학습) 시간: {summary['prepare_time_sec']:.2f}s")
print(f"✅ 홀드아웃 재학습 시간: {summary['retrain_time_sec']:.2f}s")
print("✅ 평가 리포트(마이크로/샘플 F1):")
for tag, rpt in summary["eval_report"].items():
    print(f"  - {tag}: micro_f1={rpt['micro_f1']:.4f}, samples_f1={rpt['samples_f1']:.4f}")

# 추천 질의 응답 시간(ms) 표
rec_times_df = pd.DataFrame(summary["recommend_times"]).assign(ms=lambda d: d["elapsed_sec"]*1000)\
                                                      .drop(columns=["elapsed_sec"])
print("\n[추천 질의 응답 시간(ms)]")
display(rec_times_df)

# 결과 저장(원하면 첨부/공유 가능)
os.makedirs("outputs", exist_ok=True)
rec_times_path = "outputs/recommend_latency_ms.csv"
rec_times_df.to_csv(rec_times_path, index=False, encoding="utf-8-sig")
print(f"📁 저장: {rec_times_path}")

# 각 예시 질의의 Top-K 추천 미리보기 테이블 표시 + 저장
for q, df_topk in summary["examples"]:
    print(f"\n[{q}] Top 추천 미리보기")
    display(df_topk)
    out_path = f"outputs/preview_{q.replace(' ','_')}.csv"
    df_topk.to_csv(out_path, index=False, encoding="utf-8-sig")
    print(f"📁 저장: {out_path}")


NameError: name 'benchmark_pipeline' is not defined

In [22]:
## 실행 예시: 벤치마크 + 추천 결과 출력
from caas_jupyter_tools import display_dataframe_to_user
rec, summary = benchmark_pipeline()

print(f"✅ 준비(임베딩+차원축소+학습) 시간: {summary['prepare_time_sec']:.2f}s")
print(f"✅ 홀드아웃 재학습 시간: {summary['retrain_time_sec']:.2f}s")
print("✅ 평가 리포트(마이크로/샘플 F1):")
for tag, rpt in summary["eval_report"].items():
    print(f"  - {tag}: micro_f1={rpt['micro_f1']:.4f}, samples_f1={rpt['samples_f1']:.4f}")

import pandas as pd
rec_times_df = pd.DataFrame(summary["recommend_times"])
display_dataframe_to_user("추천 질의 응답 시간(ms)", (rec_times_df.assign(ms=(rec_times_df["elapsed_sec"]*1000)).drop(columns=["elapsed_sec"])))

# 각 예시 쿼리의 Top-K 추천 미리보기 테이블도 띄우기
for q, df in summary["examples"]:
    display_dataframe_to_user(f"[{q}] Top 추천 미리보기", df)


ModuleNotFoundError: No module named 'caas_jupyter_tools'

In [25]:
# ============================================================
# Gangwon Recommendation System — Upgraded (1000개 + 차원축소 + 벤치마크)
# ※ 함수/클래스/공개 메서드 이름은 기존과 호환되도록 유지
# ============================================================

# --- Imports / Dirs / Constants --------------------------------------------
import os, sys, json, math, time, warnings
from typing import List, Tuple, Dict, Any, Optional

import numpy as np
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA

try:
    import umap  # Optional
except Exception:
    umap = None

try:
    from sentence_transformers import SentenceTransformer
except Exception as e:
    SentenceTransformer = None
    warnings.warn(f"sentence-transformers 불러오기 실패: {e}")

try:
    from xgboost import XGBClassifier
except Exception as e:
    XGBClassifier = None
    warnings.warn(f"xgboost 불러오기 실패: {e}")

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# 디렉터리 준비
os.makedirs('data/raw', exist_ok=True)
os.makedirs('data/processed', exist_ok=True)
os.makedirs('data/embeddings', exist_ok=True)
os.makedirs('models/encoders', exist_ok=True)
os.makedirs('models/checkpoints', exist_ok=True)
os.makedirs('outputs', exist_ok=True)

# --- Config -----------------------------------------------------------------
config: Dict[str, Any] = {
    "data": {
        "raw_file": "data/raw/gangwon_places_1000.xlsx",          # 우선 사용
        "processed_file": "data/processed/gangwon_places_1000.csv",
        "embeddings_file": "data/embeddings/place_embeddings.npy",
        "reduced_embeddings_file": "data/embeddings/place_embeddings_pca128.npy",
    },
    "text": {
        "fields_for_embedding": ["name", "short_description"],     # 임베딩에 사용할 필드
        "max_tokens_per_field": 64
    },
    "model": {
        "sbert_model": "snunlp/KR-SBERT-V40K-klueNLI-augSTS",
        "dimensionality_reduction": "PCA",   # 'PCA' or 'UMAP'
        "reduced_dim": 128,
        "xgb_params": {
            "n_estimators": 300,
            "max_depth": 6,
            "learning_rate": 0.08,
            "subsample": 0.9,
            "colsample_bytree": 0.9,
            "reg_lambda": 1.0,
            "random_state": RANDOM_STATE,
            "n_jobs": -1,
            "tree_method": "hist"
        }
    }
}

# --- Schema / Preprocessor ---------------------------------------------------
EXPECTED_COLUMNS = [
    "name","season","nature","vibe","target","fee","parking","address",
    "open_time","latitude","longitude","full_address","short_description"
]

def _split_labels(raw: Any) -> List[str]:
    if pd.isna(raw):
        return []
    s = str(raw).strip()
    if not s:
        return []
    for sep in [',',';','/','|']:
        s = s.replace(sep, ' ')
    parts = [p.strip() for p in s.split() if p.strip()]
    return parts

class DataPreprocessor:
    def __init__(self, max_labels_per_tag: int = 5):
        self.max_labels_per_tag = max_labels_per_tag
        self.encoders: Dict[str, MultiLabelBinarizer] = {}

    def preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
        # 스키마 검증/정렬
        missing = [c for c in EXPECTED_COLUMNS if c not in df.columns]
        if missing:
            raise ValueError(f"스키마 불일치: 누락 컬럼 {missing}")
        df = df[EXPECTED_COLUMNS].copy()

        # 문자열 클린
        for c in ["name","short_description","address","full_address","open_time","fee","parking","season","nature","vibe","target"]:
            df[c] = df[c].astype(str).str.strip()

        # 위/경도 숫자화
        for c in ["latitude","longitude"]:
            df[c] = pd.to_numeric(df[c], errors="coerce")

        # 중복 제거
        df.drop_duplicates(subset=["name","address"], inplace=True)

        # 라벨 파싱(다중라벨)
        for tag in ["season","nature","vibe","target"]:
            df[tag] = df[tag].apply(_split_labels).apply(lambda arr: arr[: self.max_labels_per_tag])
        return df

    def fit_encoders(self, df: pd.DataFrame) -> None:
        for tag in ["season","nature","vibe","target"]:
            mlb = MultiLabelBinarizer()
            mlb.fit(df[tag].tolist())
            self.encoders[tag] = mlb
        # 저장
        for tag, mlb in self.encoders.items():
            np.save(f"models/encoders/{tag}_classes.npy", mlb.classes_)

    def load_encoders(self) -> None:
        enc = {}
        for tag in ["season","nature","vibe","target"]:
            path = f"models/encoders/{tag}_classes.npy"
            if not os.path.exists(path):
                raise FileNotFoundError(f"인코더 파일이 없습니다: {path}")
            classes = np.load(path, allow_pickle=True)
            mlb = MultiLabelBinarizer()
            mlb.fit([classes.tolist()])  # 동일 클래스 세트로 맞춤
            enc[tag] = mlb
        self.encoders = enc

    def encode_labels(self, df: pd.DataFrame) -> Dict[str, np.ndarray]:
        if not self.encoders:
            self.load_encoders()
        y = {}
        for tag, mlb in self.encoders.items():
            y[tag] = mlb.transform(df[tag].tolist())
        return y

# --- Embedding Generator -----------------------------------------------------
class EmbeddingGenerator:
    def __init__(self, model_name: Optional[str] = None):
        self.model_name = model_name or config["model"]["sbert_model"]
        self.model = None
        self.dimension_reducer = None
        self.reduced_dim = config["model"]["reduced_dim"]

    def load_model(self):
        if self.model is not None:
            return
        if SentenceTransformer is None:
            raise RuntimeError("SentenceTransformer를 불러올 수 없습니다.")
        self.model = SentenceTransformer(self.model_name)

    def _concat_text_fields(self, row: pd.Series, fields: List[str]) -> str:
        parts = []
        for f in fields:
            val = str(row.get(f, "")).strip()
            if val and val.lower() != "nan":
                parts.append(val)
        return " ".join(parts)

    def build_texts(self, df: pd.DataFrame, fields: Optional[List[str]] = None) -> List[str]:
        fields = fields or config["text"]["fields_for_embedding"]
        return [self._concat_text_fields(row, fields) for _, row in df.iterrows()]

    def generate_embeddings(self, texts: List[str], cache_path: str) -> np.ndarray:
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        if os.path.exists(cache_path):
            embs = np.load(cache_path)
            print(f"🔁 임베딩 캐시 로드: {cache_path} {embs.shape}")
            return embs
        self.load_model()
        embs = self.model.encode(texts, batch_size=64, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)
        np.save(cache_path, embs)
        print(f"✅ 임베딩 저장: {cache_path} {embs.shape}")
        return embs

    def fit_dimension_reducer(self, X: np.ndarray, method: str = "PCA", n_components: int = 128):
        method = (method or "PCA").upper()
        if method == "UMAP":
            if umap is None:
                warnings.warn("UMAP이 설치되어 있지 않아 PCA로 대체합니다.")
            else:
                self.dimension_reducer = umap.UMAP(
                    n_components=n_components, n_neighbors=15, min_dist=0.1, metric="cosine", random_state=RANDOM_STATE
                )
                self.reduced_dim = n_components
                self.dimension_reducer.fit(X)
                return
        # PCA fallback
        self.dimension_reducer = PCA(n_components=n_components, random_state=RANDOM_STATE)
        self.reduced_dim = n_components
        self.dimension_reducer.fit(X)

    def reduce_dimensions(self, X: np.ndarray) -> np.ndarray:
        if self.dimension_reducer is None:
            raise RuntimeError("dimension_reducer가 없습니다. fit_dimension_reducer 먼저 호출하세요.")
        return self.dimension_reducer.transform(X)

# --- Data Loader -------------------------------------------------------------
def load_and_validate_csv(file_path: str) -> pd.DataFrame:
    """XLSX/CSV 자동 판별 + 13컬럼 검증 + processed 저장"""
    candidates = [
        config["data"]["raw_file"],
        "data/raw/gangwon_places_1000.csv",
        file_path
    ]
    src = next((p for p in candidates if p and os.path.exists(p)), None)
    if src is None:
        raise FileNotFoundError("데이터 파일을 찾을 수 없습니다. data/raw에 1000개 파일을 두세요.")

    if src.lower().endswith((".xlsx", ".xls")):
        df = pd.read_excel(src)
    else:
        df = pd.read_csv(src)

    missing = [c for c in EXPECTED_COLUMNS if c not in df.columns]
    if missing:
        raise ValueError(f"스키마 불일치: 누락 컬럼 {missing}")
    df = df[EXPECTED_COLUMNS].copy()

    out = config["data"]["processed_file"]
    os.makedirs(os.path.dirname(out), exist_ok=True)
    df.to_csv(out, index=False, encoding="utf-8-sig")
    print(f"✅ 데이터 저장: {out} ({len(df):,} rows)")
    return df

# --- XGBoost Trainer ---------------------------------------------------------
class XGBoostTrainer:
    def __init__(self, params: Optional[Dict[str, Any]] = None):
        if XGBClassifier is None:
            raise RuntimeError("XGBoost를 불러올 수 없습니다.")
        self.params = params or config["model"]["xgb_params"]
        self.models: Dict[str, Any] = {}

    def train_models(self, X: np.ndarray, y_dict: Dict[str, np.ndarray]) -> None:
        self.models = {}
        for tag, y in y_dict.items():
            clf = XGBClassifier(**self.params)
            X_tr, X_val, y_tr, y_val = train_test_split(
                X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=(y.sum(axis=1)>0)
            )
            clf.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
            self.models[tag] = clf

    def evaluate_models(self, X_test: np.ndarray, y_true: Dict[str, np.ndarray]) -> Dict[str, Any]:
        report = {}
        for tag, clf in self.models.items():
            if hasattr(clf, "predict_proba"):
                y_pred = (clf.predict_proba(X_test) > 0.5).astype(int)
            else:
                y_pred = (clf.predict(X_test) > 0.5).astype(int)
            micro_f1 = f1_score(y_true[tag], y_pred, average="micro", zero_division=0)
            samples_f1 = f1_score(y_true[tag], y_pred, average="samples", zero_division=0)
            report[tag] = {"micro_f1": micro_f1, "samples_f1": samples_f1}
        return report

# --- Recommender (Public API kept) ------------------------------------------
class GangwonPlaceRecommender:
    def __init__(self, cfg: Dict[str, Any] = None):
        self.config = cfg or config
        self.dp = DataPreprocessor()
        self.eg = EmbeddingGenerator()
        self.trainer = None
        self.df: Optional[pd.DataFrame] = None
        self.embeddings: Optional[np.ndarray] = None
        self.reduced: Optional[np.ndarray] = None

    def prepare(self) -> None:
        # 1) 데이터 로드 & 저장
        df = load_and_validate_csv(self.config["data"]["raw_file"])
        # 2) 전처리
        df = self.dp.preprocess_data(df)
        self.dp.fit_encoders(df)
        self.df = df
        # 3) 임베딩 (캐시)
        texts = self.eg.build_texts(df, self.config["text"]["fields_for_embedding"])
        self.embeddings = self.eg.generate_embeddings(texts, self.config["data"]["embeddings_file"])
        # 4) 차원축소
        method = self.config["model"]["dimensionality_reduction"]
        n_comp = self.config["model"]["reduced_dim"]
        self.eg.fit_dimension_reducer(self.embeddings, method=method, n_components=n_comp)
        self.reduced = self.eg.reduce_dimensions(self.embeddings)
        np.save(self.config["data"]["reduced_embeddings_file"], self.reduced)
        # 5) 분류기 학습
        y = self.dp.encode_labels(df)
        self.trainer = XGBoostTrainer(self.config["model"]["xgb_params"])
        self.trainer.train_models(self.reduced, y)

    def recommend_places(self, query_text: str, top_k: int = 10) -> pd.DataFrame:
        if self.df is None or self.reduced is None:
            raise RuntimeError("모델이 준비되지 않았습니다. prepare()를 먼저 호출하세요.")
        # 쿼리 임베딩 → 차원축소
        q_emb = self.eg.generate_embeddings([query_text], cache_path="data/embeddings/_tmp_query.npy")
        q_red = self.eg.reduce_dimensions(q_emb)[0]
        # 코사인 유사도
        a = self.reduced / (np.linalg.norm(self.reduced, axis=1, keepdims=True) + 1e-9)
        b = q_red / (np.linalg.norm(q_red) + 1e-9)
        sims = a @ b
        idx = np.argsort(-sims)[:top_k]
        out = self.df.iloc[idx].copy()
        out["score"] = sims[idx]
        return out.reset_index(drop=True)

    def recommend_places_api(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        query = payload.get("q") or payload.get("query") or ""
        k = int(payload.get("k", 10))
        rec = self.recommend_places(query, top_k=k)
        return {"results": rec.to_dict(orient="records")}

# --- Benchmark / Demo --------------------------------------------------------
def _split_y_dict(y_dict, idx_train, idx_test):
    y_tr, y_te = {}, {}
    for tag, y in y_dict.items():
        y_tr[tag] = y[idx_train]
        y_te[tag] = y[idx_test]
    return y_tr, y_te

def benchmark_pipeline(config_override=None, demo_queries=None, top_k=5):
    """
    파이프라인 준비 시간, 간단 홀드아웃 정확도, 추천 API 응답 시간까지 측정.
    - 기존 함수/클래스 이름은 변경하지 않음.
    """
    cfg = config_override or config

    # 1) 준비(임베딩+차원축소+학습) 시간
    t0 = time.perf_counter()
    rec = GangwonPlaceRecommender(cfg)
    rec.prepare()
    t1 = time.perf_counter()

    # 2) 간단 홀드아웃 평가 (별도 트레이너로 재학습/평가)
    y_full = rec.dp.encode_labels(rec.df)
    idx = list(range(len(rec.reduced)))
    idx_tr, idx_te = train_test_split(idx, test_size=0.2, random_state=RANDOM_STATE)

    X_tr = rec.reduced[idx_tr]
    X_te = rec.reduced[idx_te]
    y_tr, y_te = _split_y_dict(y_full, idx_tr, idx_te)

    trainer = XGBoostTrainer(cfg["model"]["xgb_params"])
    t2 = time.perf_counter()
    trainer.train_models(X_tr, y_tr)
    t3 = time.perf_counter()
    report = trainer.evaluate_models(X_te, y_te)

    # 3) 추천 API 데모 및 응답 시간
    queries = demo_queries or ["가을 감성 바다 카페", "가족 산책 코스", "스릴 액티비티", "역사 유적"]
    rec_times = []
    rec_examples = []
    for q in queries:
        s = time.perf_counter()
        df_rec = rec.recommend_places(q, top_k=top_k)
        e = time.perf_counter()
        rec_times.append({"query": q, "elapsed_sec": e - s})
        rec_examples.append((q, df_rec[["name","address","score"]].head(top_k)))

    summary = {
        "prepare_time_sec": t1 - t0,
        "retrain_time_sec": t3 - t2,
        "eval_report": report,
        "recommend_times": rec_times,
        "examples": rec_examples
    }
    return rec, summary

# --- Main (optional) ---------------------------------------------------------
if __name__ == "__main__":
    # 벤치마크 실행
    rec, summary = benchmark_pipeline()

    print(f"✅ 준비(임베딩+차원축소+학습) 시간: {summary['prepare_time_sec']:.2f}s")
    print(f"✅ 홀드아웃 재학습 시간: {summary['retrain_time_sec']:.2f}s")
    print("✅ 평가 리포트(마이크로/샘플 F1):")
    for tag, rpt in summary["eval_report"].items():
        print(f"  - {tag}: micro_f1={rpt['micro_f1']:.4f}, samples_f1={rpt['samples_f1']:.4f}")

    # 추천 질의 응답 시간(ms) 표
    rec_times_df = pd.DataFrame(summary["recommend_times"]).assign(ms=lambda d: d["elapsed_sec"]*1000)\
                                                          .drop(columns=["elapsed_sec"])
    print("\n[추천 질의 응답 시간(ms)]")
    print(rec_times_df.to_string(index=False))

    # 결과 저장
    rec_times_path = "outputs/recommend_latency_ms.csv"
    rec_times_df.to_csv(rec_times_path, index=False, encoding="utf-8-sig")
    print(f"📁 저장: {rec_times_path}")

    # 각 예시 질의의 Top-K 추천 미리보기 테이블 표시 + 저장
    for q, df_topk in summary["examples"]:
        print(f"\n[{q}] Top 추천 미리보기")
        print(df_topk.to_string(index=False))
        out_path = f"outputs/preview_{q.replace(' ','_')}.csv"
        df_topk.to_csv(out_path, index=False, encoding="utf-8-sig")
        print(f"📁 저장: {out_path}")


✅ 데이터 저장: data/processed/gangwon_places_1000.csv (1,000 rows)
🔁 임베딩 캐시 로드: data/embeddings/place_embeddings.npy (923, 768)
🔁 임베딩 캐시 로드: data/embeddings/_tmp_query.npy (1, 768)
🔁 임베딩 캐시 로드: data/embeddings/_tmp_query.npy (1, 768)
🔁 임베딩 캐시 로드: data/embeddings/_tmp_query.npy (1, 768)
🔁 임베딩 캐시 로드: data/embeddings/_tmp_query.npy (1, 768)
✅ 준비(임베딩+차원축소+학습) 시간: 25.60s
✅ 홀드아웃 재학습 시간: 20.25s
✅ 평가 리포트(마이크로/샘플 F1):
  - season: micro_f1=0.9426, samples_f1=0.9315
  - nature: micro_f1=0.7083, samples_f1=0.6807
  - vibe: micro_f1=0.8262, samples_f1=0.8568
  - target: micro_f1=0.9100, samples_f1=0.8973

[추천 질의 응답 시간(ms)]
      query     ms
가을 감성 바다 카페 8.8387
   가족 산책 코스 3.6357
    스릴 액티비티 2.6373
      역사 유적 4.5844
📁 저장: outputs/recommend_latency_ms.csv

[가을 감성 바다 카페] Top 추천 미리보기
       name                      address    score
   경포 아쿠아리움         강원특별자치도 강릉시 난설헌로 131 0.508496
      자작도해변  강원특별자치도 고성군 죽왕면 자작도선사길(죽왕면) 0.507921
   작은후진해수욕장    강원특별자치도 삼척시 새천년도로 467(교동) 0.499675
쏠비치삼척 오션플레이  강원특별자치도 삼척시 수