## Chapter 2-6, 2강 NLP 임베딩 검색 — SBERT/TF‑IDF/Hybrid (CPU)

- 목표: AG News 코퍼스에서 SBERT/TF‑IDF/Hybrid 검색 성능 비교 분석


### 구성 (Overview)
- 0. 환경 설정 및 라이브러리
- 1. 데이터 로드 및 corpus 생성 (AG News)
- 2. 임베딩 2가지 — SBERT / TF‑IDF
- 3. 검색 — SBERT / TF‑IDF / Hybrid
- 4. 검색 성능 및 속도 분석 (Recall@k, MRR, 시간)
- 5. 추가: 하이브리드 알파 스윕, 파라미터화


### 0. 환경 설정 및 라이브러리


In [1]:
# =========================
# 0. 환경 설정 및 라이브러리
# =========================

# 표준 라이브러리
import os, time, random
from typing import List, Tuple, Dict
from contextlib import contextmanager

# 서드파티: 수치/데이터/시각화
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 서드파티: 사이킷런(전처리/평가/최근접 이웃)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances, pairwise_distances_argmin_min
from sklearn.metrics.pairwise import cosine_similarity

# SBERT 임베딩
from sentence_transformers import SentenceTransformer, util
# 데이터셋 로드
from datasets import load_dataset

# 경고 억제 (환경 차이로 발생하는 워닝 최소화)
import warnings
warnings.filterwarnings("ignore", message=r".*matmul.*")

# -----------------------------------------
# Matplotlib: 한글 폰트 및 마이너스 기호 설정
# -----------------------------------------
plt.rcParams["font.family"] = "AppleGothic"  # macOS
plt.rcParams["axes.unicode_minus"] = False

# ------------------------
# 재현성(시드) 고정
# ------------------------
def set_seed(seed: int = 42) -> None:
    random.seed(seed)
    np.random.seed(seed)

set_seed(42)

# =========================
# 타이머 유틸
# =========================
@contextmanager
def timer(msg: str):
    t0 = time.perf_counter()
    yield
    print(f"[TIME] {msg}: {time.perf_counter()-t0:.2f}s")


  from .autonotebook import tqdm as notebook_tqdm


### 1. 데이터 로드 및 corpus 생성 (AG News)
- label, title, description을 결합해 문서 텍스트를 구성
- 소규모 서브셋(클래스당 N개)으로 코퍼스와 쿼리 세트 분리


In [2]:
# =========================
# 1. AG News 로드 및 코퍼스/쿼리 구성
# =========================
LABEL_NAMES: List[str] = ["World", "Sports", "Business", "Sci/Tech"]
N_PER_CLASS = int(os.environ.get("N_PER_CLASS", 300))  # 코퍼스용
N_QUERY_PER_CLASS = int(os.environ.get("N_QUERY_PER_CLASS", 30))  # 쿼리용


def load_ag_news_texts(n_per_class: int) -> Tuple[List[str], List[int]]:
    ds = load_dataset("ag_news", split="train")
    texts, labels = [], []
    for lab in range(4):
        sub = ds.filter(lambda ex: ex["label"] == lab).select(range(n_per_class))
        # title + description 결합 (없으면 text만 사용)
        for r in sub:
            title = r.get("title") or ""
            desc = r.get("description") or r.get("text") or ""
            txt = (str(title) + " \n" + str(desc)).strip()
            texts.append(txt)
            labels.append(int(r["label"]))
    return texts, labels

# 전체에서 코퍼스/쿼리 분할 (클래스 균등)
corpus_texts, corpus_labels = load_ag_news_texts(N_PER_CLASS)
query_texts, query_labels = load_ag_news_texts(N_QUERY_PER_CLASS)

print(len(corpus_texts), "corpus", "|", len(query_texts), "queries")
print("labels:", sorted(set(corpus_labels)))


Filter: 100%|██████████| 120000/120000 [00:00<00:00, 172457.19 examples/s]
Filter: 100%|██████████| 120000/120000 [00:00<00:00, 264589.25 examples/s]
Filter: 100%|██████████| 120000/120000 [00:00<00:00, 303549.95 examples/s]
Filter: 100%|██████████| 120000/120000 [00:00<00:00, 304123.31 examples/s]


1200 corpus | 120 queries
labels: [0, 1, 2, 3]


### 2. 임베딩 — SBERT / TF‑IDF
- CPU 강제 사용, 임베딩 L2 정규화로 코사인 검색 가속
- 대용량 대비를 위한 간단 캐싱(npz) 지원


In [3]:
# =========================
# 2. 임베딩 생성 유틸 (캐싱 포함)
# =========================
VEC_MAX_FEATURES = int(os.environ.get("VEC_MAX_FEATURES", 30000))
SBERT_MODEL = os.environ.get("SBERT_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
CACHE_DIR = os.environ.get("EMB_CACHE", "data")

os.makedirs(CACHE_DIR, exist_ok=True)


def sbert_encode(texts: List[str], batch_size: int = 64) -> np.ndarray:
    model = SentenceTransformer(SBERT_MODEL, device="cpu")
    # encode → numpy, no auto-normalize (직접 수행)
    embs = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=False,
        convert_to_numpy=True,
        normalize_embeddings=False,
    ).astype(np.float64)
    # L2 normalize + NaN/Inf 방지
    norms = np.linalg.norm(embs, axis=1, keepdims=True)
    embs = embs / np.clip(norms, 1e-12, None)
    embs = np.nan_to_num(embs, nan=0.0, posinf=0.0, neginf=0.0)
    return embs


def tfidf_fit_transform(corpus: List[str]) -> Tuple[object, np.ndarray]:
    vec = TfidfVectorizer(max_features=VEC_MAX_FEATURES, ngram_range=(1,2))
    X = vec.fit_transform(corpus)
    return vec, X


def save_npz(path: str, **arrays):
    np.savez_compressed(path, **arrays)


def load_npz(path: str) -> Dict[str, np.ndarray]:
    with np.load(path, allow_pickle=False) as f:
        return {k: f[k] for k in f.files}


# 실제 임베딩 생성 + 캐싱
with timer("임베딩 생성 및 캐싱"):
    # SBERT 캐시
    sbert_cache = os.path.join(CACHE_DIR, "embeddings_agnews_sbert.npz")
    if os.path.exists(sbert_cache):
        dat = load_npz(sbert_cache)
        corpus_sbert = dat["corpus_sbert"]
    else:
        corpus_sbert = sbert_encode(corpus_texts)
        save_npz(sbert_cache, corpus_sbert=corpus_sbert)

    # 쿼리도 즉시 생성 (캐싱은 선택)
    query_sbert = sbert_encode(query_texts)

    # TF-IDF 캐시
    tfidf_cache = os.path.join(CACHE_DIR, "embeddings_agnews_tfidf.npz")
    if os.path.exists(tfidf_cache):
        dat = load_npz(tfidf_cache)
        # 벡터라이저는 재학습 필요하므로 캐시에서는 행렬만 사용하지 않음
        # 학습 재현을 위해 다시 fit 수행
        vectorizer, X_corpus = tfidf_fit_transform(corpus_texts)
    else:
        vectorizer, X_corpus = tfidf_fit_transform(corpus_texts)
        save_npz(tfidf_cache, X_corpus=X_corpus.astype(np.float32).toarray())
        # 주의: 간단화를 위해 dense 저장. 실제 대용량은 joblib+희소 저장 권장

    X_query = vectorizer.transform(query_texts)


[TIME] 임베딩 생성 및 캐싱: 34.26s


### 3. 검색 — SBERT / TF‑IDF / Hybrid
- SBERT: 코사인 유사도 기반 top‑k
- TF‑IDF: 코사인 유사도 기반 top‑k (희소 행렬 고려)
- Hybrid: α·SBERT + (1-α)·TF‑IDF 점수 결합


In [4]:
# =========================
# 3. 검색 함수들 (SBERT / TF-IDF / Hybrid)
# =========================
from typing import Optional


def topk_indices(scores: np.ndarray, k: int) -> np.ndarray:
    # scores: shape (num_queries, num_corpus)
    return np.argpartition(-scores, kth=min(k, scores.shape[1]-1), axis=1)[:, :k]


def search_sbert(query_embs: np.ndarray, corpus_embs: np.ndarray, top_k: int = 5) -> Tuple[np.ndarray, np.ndarray]:
    # 코사인 = dot (이미 L2 정규화 가정)
    sims = (query_embs @ corpus_embs.T)  # (Q, C)
    idx = topk_indices(sims, top_k)
    # 정렬된 인덱스와 점수 반환
    sorted_idx = np.take_along_axis(idx, np.argsort(np.take_along_axis(sims, idx, axis=1) * -1, axis=1), axis=1)
    sorted_scores = np.take_along_axis(sims, sorted_idx, axis=1)
    return sorted_idx, sorted_scores


def search_tfidf(X_query, X_corpus, top_k: int = 5) -> Tuple[np.ndarray, np.ndarray]:
    sims = cosine_similarity(X_query, X_corpus)  # (Q, C)
    idx = topk_indices(sims, top_k)
    sorted_idx = np.take_along_axis(idx, np.argsort(np.take_along_axis(sims, idx, axis=1) * -1, axis=1), axis=1)
    sorted_scores = np.take_along_axis(sims, sorted_idx, axis=1)
    return sorted_idx, sorted_scores


def search_hybrid(query_embs: np.ndarray, corpus_embs: np.ndarray, X_query, X_corpus, alpha: float = 0.5, top_k: int = 5):
    idx_s, sbert = search_sbert(query_embs, corpus_embs, top_k)
    idx_t, tfidf = search_tfidf(X_query, X_corpus, top_k)
    # 동일 top_k 위치에서 단순 가중 합산 (후처리 간소화)
    # 실제 서비스는 전체 점수 재정렬 권장
    hybrid = alpha * sbert + (1 - alpha) * tfidf
    # 각 쿼리별로 재정렬
    idx = np.arange(hybrid.shape[0])[:, None]
    order = np.argsort(-hybrid, axis=1)
    final_idx = np.take_along_axis(idx_s, order, axis=1)
    final_scores = np.take_along_axis(hybrid, order, axis=1)
    return final_idx, final_scores


# 샘플 검색 실행
with timer("검색 실행 (top-5)"):
    topk = 5
    s_idx, s_scores = search_sbert(query_sbert, corpus_sbert, topk)
    t_idx, t_scores = search_tfidf(X_query, X_corpus, topk)
    h_idx, h_scores = search_hybrid(query_sbert, corpus_sbert, X_query, X_corpus, alpha=0.5, top_k=topk)

# 결과 예시 출력 (첫 쿼리)
q0 = 0
print("[Query]", query_texts[q0][:120].replace("\n"," "), "...")
print("SBERT →", [corpus_texts[i][:80].replace("\n"," ") for i in s_idx[q0]])
print("TF-IDF →", [corpus_texts[i][:80].replace("\n"," ") for i in t_idx[q0]])
print("Hybrid →", [corpus_texts[i][:80].replace("\n"," ") for i in h_idx[q0]])


[TIME] 검색 실행 (top-5): 0.03s
[Query] Venezuelans Vote Early in Referendum on Chavez Rule (Reuters) Reuters - Venezuelans turned out early\and in large number ...
SBERT → ['Venezuelans Vote Early in Referendum on Chavez Rule (Reuters) Reuters - Venezuel', 'Venezuelans vote on Chavez rule A referendum is under way in Venezuela to decide', 'Venezuelans Rush to Vote in Referendum on Chavez  CARACAS, Venezuela (Reuters) -', "Venezuela Holds Referendum on President CARACAS, Venezuela - The opposition's lo", 'Chavez Wins Venezuela Referendum-Preliminary Result  CARACAS, Venezuela (Reuters']
TF-IDF → ['Venezuelans Vote Early in Referendum on Chavez Rule (Reuters) Reuters - Venezuel', 'Venezuelans Rush to Vote in Referendum on Chavez  CARACAS, Venezuela (Reuters) -', 'Venezuelans Throng to Polls in Chavez Referendum  CARACAS, Venezuela (Reuters) -', 'Venezuela Voters Crowd Polls in Chavez Referendum  CARACAS, Venezuela (Reuters) ', 'Venezuelans vote on Chavez rule A referendum is under way in Ve

In [8]:
# 결과 예시 출력 (첫 쿼리)
q0 = "what is the impact of interest rate hikes on the stock market?"
print("[Query]", query_texts[q0][:120].replace("\n"," "), "...")
print("SBERT →", [corpus_texts[i][:80].replace("\n"," ") for i in s_idx[q0]])
print("TF-IDF →", [corpus_texts[i][:80].replace("\n"," ") for i in t_idx[q0]])
print("Hybrid →", [corpus_texts[i][:80].replace("\n"," ") for i in h_idx[q0]])

TypeError: list indices must be integers or slices, not str

### 4. 검색 결과 성능 및 속도 분석
- 지표: Recall@k, MRR
- 속도: 쿼리 임베딩 계산 포함/제외를 분리 측정 (SBERT), TF‑IDF 변환 포함/제외


In [5]:
# =========================
# 4. 평가: Recall@k, MRR + 속도
# =========================
from collections import defaultdict


def recall_at_k(topk_idx: np.ndarray, true_labels: List[int], corpus_labels: List[int], k: int) -> float:
    Q = len(true_labels)
    hit = 0
    corpus_labels_arr = np.array(corpus_labels)
    for q in range(Q):
        labs = corpus_labels_arr[topk_idx[q, :k]]
        if true_labels[q] in labs:
            hit += 1
    return hit / Q


def mrr(topk_idx: np.ndarray, true_labels: List[int], corpus_labels: List[int]) -> float:
    Q = len(true_labels)
    rr_sum = 0.0
    corpus_labels_arr = np.array(corpus_labels)
    for q in range(Q):
        labs = corpus_labels_arr[topk_idx[q]]
        target = true_labels[q]
        pos = np.where(labs == target)[0]
        if pos.size > 0:
            rr_sum += 1.0 / (pos[0] + 1)
    return rr_sum / Q


# 지표 계산
for k in [1, 3, 5, 10]:
    r_s = recall_at_k(s_idx, query_labels, corpus_labels, k)
    r_t = recall_at_k(t_idx, query_labels, corpus_labels, k)
    r_h = recall_at_k(h_idx, query_labels, corpus_labels, k)
    print(f"Recall@{k} | SBERT={r_s:.3f} TF-IDF={r_t:.3f} Hybrid={r_h:.3f}")

print("MRR | SBERT={:.3f} TF-IDF={:.3f} Hybrid={:.3f}".format(
    mrr(s_idx, query_labels, corpus_labels),
    mrr(t_idx, query_labels, corpus_labels),
    mrr(h_idx, query_labels, corpus_labels),
))

# 속도 벤치마크 (소규모)
def bench_speed(nq=64):
    samp_q = query_texts[:nq]
    # SBERT: 임베딩 포함 시간
    t0 = time.perf_counter()
    q_emb = sbert_encode(samp_q)
    _ = search_sbert(q_emb, corpus_sbert, 5)
    t_sbert_total = time.perf_counter() - t0

    # SBERT: 임베딩 제외 시간 (순수 검색)
    t0 = time.perf_counter()
    _ = search_sbert(q_emb, corpus_sbert, 5)
    t_sbert_search = time.perf_counter() - t0

    # TF-IDF: 변환 포함 시간
    t0 = time.perf_counter()
    Xq = vectorizer.transform(samp_q)
    _ = search_tfidf(Xq, X_corpus, 5)
    t_tfidf_total = time.perf_counter() - t0

    # TF-IDF: 변환 제외 (순수 검색)
    t0 = time.perf_counter()
    _ = search_tfidf(Xq, X_corpus, 5)
    t_tfidf_search = time.perf_counter() - t0

    print({
        'nq': nq,
        'sbert_total_s': round(t_sbert_total, 3),
        'sbert_search_s': round(t_sbert_search, 3),
        'tfidf_total_s': round(t_tfidf_total, 3),
        'tfidf_search_s': round(t_tfidf_search, 3),
    })

bench_speed(64)
bench_speed(128)


Recall@1 | SBERT=1.000 TF-IDF=1.000 Hybrid=1.000
Recall@3 | SBERT=1.000 TF-IDF=1.000 Hybrid=1.000
Recall@5 | SBERT=1.000 TF-IDF=1.000 Hybrid=1.000
Recall@10 | SBERT=1.000 TF-IDF=1.000 Hybrid=1.000
MRR | SBERT=1.000 TF-IDF=1.000 Hybrid=1.000
{'nq': 64, 'sbert_total_s': 3.526, 'sbert_search_s': 0.002, 'tfidf_total_s': 0.015, 'tfidf_search_s': 0.005}
{'nq': 128, 'sbert_total_s': 4.461, 'sbert_search_s': 0.005, 'tfidf_total_s': 0.027, 'tfidf_search_s': 0.012}


### 5. 추가: 하이브리드 알파 스윕 및 설정 파라미터화
- α ∈ {0.0, 0.25, 0.5, 0.75, 1.0}에 대해 Recall@k, MRR 비교
- 환경변수로 주요 파라미터 제어(N_PER_CLASS, VEC_MAX_FEATURES, SBERT_MODEL)


In [6]:
# =========================
# 5. 하이브리드 알파 스윕
# =========================
ALPHAS = [0.0, 0.25, 0.5, 0.75, 1.0]
K = 5

rows = []
for a in ALPHAS:
    idx, _ = search_hybrid(query_sbert, corpus_sbert, X_query, X_corpus, alpha=a, top_k=K)
    rows.append({
        'alpha': a,
        'Recall@1': recall_at_k(idx, query_labels, corpus_labels, 1),
        'Recall@5': recall_at_k(idx, query_labels, corpus_labels, 5),
        'MRR': mrr(idx, query_labels, corpus_labels)
    })

pd.DataFrame(rows)


Unnamed: 0,alpha,Recall@1,Recall@5,MRR
0,0.0,1.0,1.0,1.0
1,0.25,1.0,1.0,1.0
2,0.5,1.0,1.0,1.0
3,0.75,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0


### 6. 자유 질의 검색 및 간단 평가
- 사용자가 입력한 자연어 질의에 대해 SBERT/TF‑IDF/Hybrid 결과를 비교
- 정답 라벨이 없는 경우 키워드 기반 약식 qrels로 Precision@k/Recall@k/nDCG@k 산출


In [7]:
# =========================
# 6. 자유 질의 검색 + 약식 평가
# =========================
from typing import Iterable


def tokenize_simple(s: str) -> List[str]:
    return [w.lower() for w in str(s).split() if w.strip()]


def keyword_qrels(query: str, docs: List[str], top_k: int = 10) -> np.ndarray:
    """
    아주 단순한 키워드 일치 기반 relevance 벡터 생성(0/1/2 점수).
    - query 토큰이 문서에 포함되면 가산
    - 점수 0/1/2로 제한하여 nDCG에서 사용
    """
    qtok = set(tokenize_simple(query))
    rel = np.zeros(len(docs), dtype=np.int32)
    for i, d in enumerate(docs):
        dtok = set(tokenize_simple(d))
        inter = len(qtok.intersection(dtok))
        if inter >= 3:
            rel[i] = 2
        elif inter >= 1:
            rel[i] = 1
    return rel


def precision_at_k(rels: np.ndarray, top_idx: np.ndarray, k: int) -> float:
    hits = 0
    for i in top_idx[:k]:
        hits += 1 if rels[i] > 0 else 0
    return hits / k


def recall_at_k_from_rels(rels: np.ndarray, top_idx: np.ndarray, k: int) -> float:
    total_rel = int(np.sum(rels > 0))
    if total_rel == 0:
        return 0.0
    hits = 0
    for i in top_idx[:k]:
        hits += 1 if rels[i] > 0 else 0
    return hits / total_rel


def ndcg_at_k(rels: np.ndarray, top_idx: np.ndarray, k: int) -> float:
    gains = [(2**int(rels[i]) - 1) for i in top_idx[:k]]
    dcg = 0.0
    for r, g in enumerate(gains, start=1):
        dcg += g / np.log2(r + 1)
    # Ideal DCG
    ideal = sorted([(2**int(x) - 1) for x in rels], reverse=True)[:k]
    idcg = 0.0
    for r, g in enumerate(ideal, start=1):
        idcg += g / np.log2(r + 1)
    return (dcg / idcg) if idcg > 0 else 0.0


# 사용자 질의 예시
q = 'What is the impact of interest rate hikes on the stock market?'

# 검색 실행
q_emb = sbert_encode([q])
Xq = vectorizer.transform([q])

s_idx1, s_sc1 = search_sbert(q_emb, corpus_sbert, top_k=10)
t_idx1, t_sc1 = search_tfidf(Xq, X_corpus, top_k=10)
h_idx1, h_sc1 = search_hybrid(q_emb, corpus_sbert, Xq, X_corpus, alpha=0.5, top_k=10)

# 약식 qrels 생성 (전체 코퍼스 기준 relevance)
rels = keyword_qrels(q, corpus_texts)

# 지표 계산 (top-10)
for name, idxs in [("SBERT", s_idx1[0]), ("TF-IDF", t_idx1[0]), ("Hybrid", h_idx1[0])]:
    p5 = precision_at_k(rels, idxs, 5)
    r5 = recall_at_k_from_rels(rels, idxs, 5)
    n5 = ndcg_at_k(rels, idxs, 5)
    print(f"{name} | P@5={p5:.3f} R@5={r5:.3f} nDCG@5={n5:.3f}")

# 상위 결과 미리보기
def preview(name: str, idxs: Iterable[int], k: int = 5):
    print(f"\n[{name} Top-{k}] {q}")
    for i in list(idxs)[:k]:
        print("-", corpus_texts[i][:140].replace("\n"," "))

preview("SBERT", s_idx1[0])
preview("TF-IDF", t_idx1[0])
preview("Hybrid", h_idx1[0])


SBERT | P@5=1.000 R@5=0.004 nDCG@5=0.476
TF-IDF | P@5=1.000 R@5=0.004 nDCG@5=1.000
Hybrid | P@5=1.000 R@5=0.004 nDCG@5=0.476

[SBERT Top-5] What is the impact of interest rate hikes on the stock market?
- Treasuries Up, Rate Hike Still in Offing (Reuters) Reuters - U.S. Treasury debt made moderate gains\on Tuesday after a key reading of U.S. i
- Election-Year Rate Hike Puzzles Some WASHINGTON - Going against conventional wisdom, the Federal Reserve is raising interest rates in an ele
- South Korea lowers interest rates South Korea's central bank cuts interest rates by a quarter percentage point to 3.5 in a bid to drive grow
- Stocks Higher on Oil Price Relief (Reuters) Reuters - U.S. stocks gained on Monday, getting\a boost from lower oil prices after news the Ven
- Stocks Rise on Drop in Consumer Prices A drop in consumer prices and a decline in crude oil futures Tuesday allowed investors to put aside w

[TF-IDF Top-5] What is the impact of interest rate hikes on the stock market?
- D