## Chapter 2-6, 2강 NLP 임베딩 기반 검색 — TF‑IDF vs SBERT vs Hybrid (CPU)

- 목표: 의미/키워드 검색 방식을 비교하고, 하이브리드로 직관적 성능 차이를 체감
- 제약: CPU 전용 환경(훈련/추론 모두 CPU)
- 데이터: AG News 코퍼스(소규모 서브셋)


### 구성 (Overview)
- 0. 환경 설정 및 라이브러리
- 1. 데이터 로드 및 corpus 생성 (AG News)
- 2. 임베딩 2가지: SBERT / TF‑IDF
- 3. 단일 질문 검색: TF‑IDF / SBERT / Hybrid
- 4. 결과 비교: 직관적 성능 분석(토픽/키워드/문맥)
- 5. 추가: 하이브리드 가중치/속도/stopwords 영향


### 0. 환경 설정 및 라이브러리


In [9]:
# =========================
# 0. 환경 설정 및 라이브러리
# =========================

# 표준 라이브러리
import os, time, random
from typing import List, Tuple
from contextlib import contextmanager

# 수치/데이터/시각화
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 사이킷런
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import text as sk_text

# datasets / sentence-transformers (CPU 고정)
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

import warnings
warnings.filterwarnings("ignore", message=".*matmul.*")

# -----------------------------------------
# Matplotlib: 한글 폰트 및 마이너스 기호 설정
# -----------------------------------------
plt.rcParams["font.family"] = "AppleGothic"
plt.rcParams["axes.unicode_minus"] = False

# ------------------------
# 재현성(시드) 고정
# ------------------------
def set_seed(seed: int = 42) -> None:
    random.seed(seed)
    np.random.seed(seed)

set_seed(42)

# ------------------------
# 간단 타이머
# ------------------------
@contextmanager
def timer(msg: str):
    t0 = time.perf_counter()
    yield
    print(f"[TIME] {msg}: {time.perf_counter() - t0:.2f}s")


### 1. 데이터 로드 및 corpus 생성 (AG News)


In [10]:
# ------------------------
# AG News 서브셋 로더
# ------------------------
label_names: List[str] = ["World", "Sports", "Business", "Sci/Tech"]


def load_ag_news_subset(n_per_class: int = 200) -> Tuple[List[str], List[int]]:
    """
    AG News에서 클래스별 동일 개수(n_per_class) 샘플을 뽑아 텍스트/라벨을 반환
    - CPU 환경 기준, 전체 120k 중 소규모만 사용
    """
    ds = load_dataset("ag_news", split="train")
    xs, ys = [], []
    for lab in range(4):
        sub = ds.filter(lambda ex: ex["label"] == lab).select(range(n_per_class))
        xs += [r["text"] for r in sub]
        ys += [int(r["label"]) for r in sub]
    return xs, ys


with timer("AG News 로드"):
    corpus_texts, corpus_labels = load_ag_news_subset(n_per_class=200)

print(len(corpus_texts), "samples")
print("label_names:", label_names)

# 클래스별 샘플 확인(2개씩)
print("\n[클래스별 샘플 예시]")
shown = {i: 0 for i in range(4)}
for t, lab in zip(corpus_texts, corpus_labels):
    if shown[lab] < 2:
        print(f"[{label_names[lab]}] {str(t)[:120]}...")
        shown[lab] += 1


[TIME] AG News 로드: 2.91s
800 samples
label_names: ['World', 'Sports', 'Business', 'Sci/Tech']

[클래스별 샘플 예시]
[World] Venezuelans Vote Early in Referendum on Chavez Rule (Reuters) Reuters - Venezuelans turned out early\and in large number...
[World] S.Koreans Clash with Police on Iraq Troop Dispatch (Reuters) Reuters - South Korean police used water cannon in\central ...
[Sports] Phelps, Thorpe Advance in 200 Freestyle (AP) AP - Michael Phelps took care of qualifying for the Olympic 200-meter frees...
[Sports] Reds Knock Padres Out of Wild-Card Lead (AP) AP - Wily Mo Pena homered twice and drove in four runs, helping the Cincinn...
[Business] Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics,...
[Business] Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\which has a reputat...
[Sci/Tech] 'Madden,' 'ESPN' Football Score in Different Ways (Reuters) Reuters - Was absente

### 2. 임베딩 2가지: SBERT / TF‑IDF


In [None]:
# ------------------------
# TF-IDF 인덱스 구축
# ------------------------
# stopwords: 영어 불용어 사용(키워드 매칭 과다 방지)
stop_words = list(sk_text.ENGLISH_STOP_WORDS) 

In [None]:
vectorizer = TfidfVectorizer(
    max_features=30000,
    ngram_range=(1, 2),
    stop_words=stop_words
)

with timer("TF-IDF fit_transform(corpus)"):
    X_tfidf = vectorizer.fit_transform(corpus_texts)

print("TF-IDF shape:", X_tfidf.shape)

In [None]:
# ------------------------
# SBERT 임베딩 구축 (CPU)
# ------------------------

def sbert_encode(texts: List[str], batch_size: int = 64,
                 model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> np.ndarray:
    model = SentenceTransformer(model_name, device="cpu")
    arrs = []
    for i in range(0, len(texts), batch_size):
        arr = model.encode(
            texts[i:i+batch_size],
            batch_size=batch_size,
            show_progress_bar=False,
            convert_to_numpy=True,
            normalize_embeddings=False
        ).astype(np.float64)
        # L2 정규화 + NaN/Inf 방지
        norms = np.linalg.norm(arr, axis=1, keepdims=True)
        arr = arr / np.clip(norms, 1e-12, None)
        arr = np.nan_to_num(arr, nan=0.0, posinf=0.0, neginf=0.0)
        arrs.append(arr)
    return np.vstack(arrs)

with timer("SBERT encode(corpus)"):
    X_sbert = sbert_encode(corpus_texts)

print("SBERT shape:", X_sbert.shape)


[TIME] TF-IDF fit_transform(corpus): 0.21s
TF-IDF shape: (800, 21987)
[TIME] SBERT encode(corpus): 27.35s
SBERT shape: (800, 384)


### 3. 단일 질문 검색: TF‑IDF / SBERT / Hybrid


In [16]:
# ------------------------
# 검색 함수 3종
# ------------------------

def search_tfidf(query: str, topk: int = 5):
    qv = vectorizer.transform([query])
    sims = cosine_similarity(qv, X_tfidf)[0]
    idx = np.argsort(-sims)[:topk]
    return [(int(i), float(sims[i])) for i in idx]


def search_sbert(query: str, topk: int = 5):
    qv = sbert_encode([query])  # (1, d)
    sims = (X_sbert @ qv.T).ravel()  # 정규화된 코사인 유사도(내적)
    idx = np.argsort(-sims)[:topk]
    return [(int(i), float(sims[i])) for i in idx]


def search_hybrid(query: str, topk: int = 5, alpha: float = 0.5):
    # alpha: SBERT 가중치(0~1). 1이면 SBERT만, 0이면 TF-IDF만
    qv_tfidf = vectorizer.transform([query])
    sims_tfidf = cosine_similarity(qv_tfidf, X_tfidf)[0]
    qv_sbert = sbert_encode([query])
    sims_sbert = (X_sbert @ qv_sbert.T).ravel()
    sims = alpha * sims_sbert + (1 - alpha) * sims_tfidf
    idx = np.argsort(-sims)[:topk]
    return [(int(i), float(sims[i])) for i in idx]


In [21]:
# ------------------------
# 단일 고정 질의(지시 사항대로)
# ------------------------
# 결과 수집 리스트
rows_tfidf, rows_sbert, rows_hybrid = [], [], []
# TF-IDF 루프 안에 추가
for k, (i, s) in enumerate(search_tfidf(query, topk=5), start=1):
    print(f"- ({s:.3f}) [{label_names[corpus_labels[i]]}] {corpus_texts[i]}")
    rows_tfidf.append({"rank": k, "score": float(s), "doc_id": int(i),
                       "label": label_names[corpus_labels[i]], "text": corpus_texts[i]})

# SBERT 루프 안에 추가
for k, (i, s) in enumerate(search_sbert(query, topk=5), start=1):
    print(f"- ({s:.3f}) [{label_names[corpus_labels[i]]}] {corpus_texts[i]}")
    rows_sbert.append({"rank": k, "score": float(s), "doc_id": int(i),
                       "label": label_names[corpus_labels[i]], "text": corpus_texts[i]})

# Hybrid 루프 안에 추가
for k, (i, s) in enumerate(search_hybrid(query, topk=5, alpha=0.6), start=1):
    print(f"- ({s:.3f}) [{label_names[corpus_labels[i]]}] {corpus_texts[i]}")
    rows_hybrid.append({"rank": k, "score": float(s), "doc_id": int(i),
                        "label": label_names[corpus_labels[i]], "text": corpus_texts[i]})

# CSV 저장 (각각)
out = os.path.join(os.path.abspath("."), "outputs"); os.makedirs(out, exist_ok=True)
pd.DataFrame(rows_tfidf).to_csv(os.path.join(out, "tfidf_top5.csv"), index=False, encoding="utf-8")
pd.DataFrame(rows_sbert).to_csv(os.path.join(out, "sbert_top5.csv"), index=False, encoding="utf-8")
pd.DataFrame(rows_hybrid).to_csv(os.path.join(out, "hybrid_top5.csv"), index=False, encoding="utf-8")

- (0.188) [Business] Veteran inventor in market float Trevor Baylis, the veteran inventor famous for creating the Freeplay clockwork radio, is planning to float his company on the stock market.
- (0.180) [Business] In a Down Market, Head Toward Value Funds There is little cause for celebration in the stock market these days, but investors in value-focused mutual funds have reason to feel a bit smug -- if only because they've lost less than the folks who stuck with growth.
- (0.172) [Business] Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.
- (0.172) [Business] Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market this week during the depth of the\summer doldrums.
- (0.170) [Busi

In [None]:

# query = "What is the impact of interest rate hikes on the stock market?"

# print("\n[TF-IDF 검색]")
# for i, s in search_tfidf(query, topk=5):
#     print(f"- ({s:.3f}) [{label_names[corpus_labels[i]]}] {corpus_texts[i]}")

# print("\n[SBERT 검색]")
# for i, s in search_sbert(query, topk=5):
#     print(f"- ({s:.3f}) [{label_names[corpus_labels[i]]}] {corpus_texts[i]}")

# print("\n[Hybrid 검색, alpha=0.6]")
# for i, s in search_hybrid(query, topk=5, alpha=0.6):
#     print(f"- ({s:.3f}) [{label_names[corpus_labels[i]]}] {corpus_texts[i]}")



[TF-IDF 검색]
- (0.188) [Business] Veteran inventor in market float Trevor Baylis, the veteran inventor famous for creating the Freeplay clockwork radio, is planning to float his company on the stock market.
- (0.180) [Business] In a Down Market, Head Toward Value Funds There is little cause for celebration in the stock market these days, but investors in value-focused mutual funds have reason to feel a bit smug -- if only because they've lost less than the folks who stuck with growth.
- (0.172) [Business] Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.
- (0.172) [Business] Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market this week during the depth of the\summer doldrums.
- 

### 4. 결과 비교: 직관적 성능 분석(토픽/키워드/문맥)


In [22]:
# ------------------------
# 비교용 테이블/프린트
# ------------------------

def show_results(title: str, pairs):
    print(f"\n[{title}]")
    for rank, (i, s) in enumerate(pairs, start=1):
        txt = corpus_texts[i].replace("\n", " ")
        print(f"{rank:>2}. ({s:.3f}) [{label_names[corpus_labels[i]]}] {txt}")

r_tfidf = search_tfidf(query, topk=5)
r_sbert = search_sbert(query, topk=5)
r_hybrid = search_hybrid(query, topk=5, alpha=0.6)

show_results("TF-IDF Top-5", r_tfidf)
show_results("SBERT Top-5", r_sbert)
show_results("Hybrid(0.6) Top-5", r_hybrid)

# 간단한 직관 비교 포인트 카운팅(레이블 기준)
from collections import Counter

def label_counts(pairs):
    return Counter([label_names[corpus_labels[i]] for i, _ in pairs])

print("\n[레이블 빈도 비교]")
for name, pairs in [("TF-IDF", r_tfidf), ("SBERT", r_sbert), ("Hybrid", r_hybrid)]:
    print(name, dict(label_counts(pairs)))

# r_*에서 바로 CSV 저장
rows_tfidf  = [{"rank": k, "score": float(s), "doc_id": int(i),
                "label": label_names[corpus_labels[i]], "text": corpus_texts[i]}
               for k, (i, s) in enumerate(r_tfidf, start=1)]
rows_sbert  = [{"rank": k, "score": float(s), "doc_id": int(i),
                "label": label_names[corpus_labels[i]], "text": corpus_texts[i]}
               for k, (i, s) in enumerate(r_sbert, start=1)]
rows_hybrid = [{"rank": k, "score": float(s), "doc_id": int(i),
                "label": label_names[corpus_labels[i]], "text": corpus_texts[i]}
               for k, (i, s) in enumerate(r_hybrid, start=1)]

out = os.path.join(os.path.abspath("."), "outputs"); os.makedirs(out, exist_ok=True)
pd.DataFrame(rows_tfidf).to_csv(os.path.join(out, "tfidf_top5_cell14.csv"), index=False, encoding="utf-8")
pd.DataFrame(rows_sbert).to_csv(os.path.join(out, "sbert_top5_cell14.csv"), index=False, encoding="utf-8")
pd.DataFrame(rows_hybrid).to_csv(os.path.join(out, "hybrid_top5_cell14.csv"), index=False, encoding="utf-8")



[TF-IDF Top-5]
 1. (0.188) [Business] Veteran inventor in market float Trevor Baylis, the veteran inventor famous for creating the Freeplay clockwork radio, is planning to float his company on the stock market.
 2. (0.180) [Business] In a Down Market, Head Toward Value Funds There is little cause for celebration in the stock market these days, but investors in value-focused mutual funds have reason to feel a bit smug -- if only because they've lost less than the folks who stuck with growth.
 3. (0.172) [Business] Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.
 4. (0.172) [Business] Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market this week during the depth of the\summer d

### 5. 추가: 하이브리드 가중치/속도/stopwords 영향


In [15]:
# ------------------------
# 하이브리드 가중치 변화 예시
# ------------------------
for a in [0.0, 0.3, 0.6, 0.9, 1.0]:
    pairs = search_hybrid(query, topk=5, alpha=a)
    top_labels = [label_names[corpus_labels[i]] for i, _ in pairs]
    print(f"alpha={a:.1f} -> {top_labels}")

# ------------------------
# 속도 비교(간단)
# ------------------------
with timer("TF-IDF 질의 시간"):
    _ = search_tfidf(query, topk=10)
with timer("SBERT 질의 시간"):
    _ = search_sbert(query, topk=10)
with timer("Hybrid(alpha=0.6) 질의 시간"):
    _ = search_hybrid(query, topk=10, alpha=0.6)

# ------------------------
# stopwords 영향: 불용어 제거 on/off 비교
# ------------------------
alt_vectorizer = TfidfVectorizer(max_features=30000, ngram_range=(1, 2), stop_words=None)
with timer("TF-IDF(no stopwords) fit_transform"):
    X_tfidf_raw = alt_vectorizer.fit_transform(corpus_texts)

def search_tfidf_raw(q: str, topk: int = 5):
    qv = alt_vectorizer.transform([q])
    sims = cosine_similarity(qv, X_tfidf_raw)[0]
    idx = np.argsort(-sims)[:topk]
    return [(int(i), float(sims[i])) for i in idx]

print("\n[TF-IDF(stopwords) vs TF-IDF(raw) 비교]")
print("- with stopwords:")
for i, s in search_tfidf(query, topk=3):
    print(f"  ({s:.3f}) [{label_names[corpus_labels[i]]}] {corpus_texts[i][:100]}...")
print("- no stopwords:")
for i, s in search_tfidf_raw(query, topk=3):
    print(f"  ({s:.3f}) [{label_names[corpus_labels[i]]}] {corpus_texts[i][:100]}...")


alpha=0.0 -> ['Business', 'Business', 'Business', 'Business', 'Business']
alpha=0.3 -> ['Business', 'Business', 'Business', 'Business', 'Business']
alpha=0.6 -> ['World', 'Business', 'Business', 'Business', 'Business']
alpha=0.9 -> ['World', 'Business', 'Business', 'Business', 'Business']
alpha=1.0 -> ['World', 'Business', 'Business', 'Business', 'World']
[TIME] TF-IDF 질의 시간: 0.00s
[TIME] SBERT 질의 시간: 2.20s
[TIME] Hybrid(alpha=0.6) 질의 시간: 2.57s
[TIME] TF-IDF(no stopwords) fit_transform: 0.25s

[TF-IDF(stopwords) vs TF-IDF(raw) 비교]
- with stopwords:
  (0.188) [Business] Veteran inventor in market float Trevor Baylis, the veteran inventor famous for creating the Freepla...
  (0.180) [Business] In a Down Market, Head Toward Value Funds There is little cause for celebration in the stock market ...
  (0.172) [Business] Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about th...
- no stopwords:
  (0.179) [Business] Hungarian central bank cuts key i

#### 환경 주의사항
- CPU 전용: `SentenceTransformer(..., device="cpu")` 고정
- 시드 고정: `random.seed`, `np.random.seed`
- 한글 폰트: `AppleGothic` (환경에 따라 변경 가능)
- 재사용 빠른 실습을 위해 코퍼스 크기를 작게 유지(각 클래스 200)
