# 2. 검색 엔진 (BIM & BM25 Search) - 샘플링 없음, 후처리 없음
- 쿼리 로드 및 검색 수행
- BIM vs BM25 점수 계산 및 결과 저장

## 2.1 검색 함수 정의
검색에 필요한 핵심 로직(IDF, Score 계산)

In [1]:
import sqlite3
import json
import pickle
import math
import os
from collections import defaultdict
from tqdm.notebook import tqdm
from kiwipiepy import Kiwi
from pathlib import Path

# All paths hang off the parent of the current working directory.
PROJECT_ROOT = Path('..').resolve()
DATA_DIR = PROJECT_ROOT.joinpath('data')
DB_PATH = PROJECT_ROOT.joinpath('database', 'inverted_index.db')
RESULTS_DIR = PROJECT_ROOT.joinpath('results')

# Make sure the output directory exists before any result file is written.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# BM25 parameters
K1 = 1.2      # weight applied to the term-frequency component
B = 0.75      # strength of the document-length normalisation
TOP_K = 100   # number of ranked documents kept per query

In [2]:
# Initialise the shared Kiwi morphological analyser used by tokenize().
# NOTE(review): num_workers=-1 presumably lets Kiwi use every available
# core — confirm against the kiwipiepy documentation.
kiwi = Kiwi(num_workers=-1)

def get_db_connection():
    """Open and return a new SQLite connection to the database at DB_PATH.

    The caller owns the connection and is responsible for closing it.
    """
    db_file = str(DB_PATH)
    connection = sqlite3.connect(db_file)
    return connection

def tokenize(text):
    """Turn raw text into a list of search tokens using the shared Kiwi analyser.

    Only morphemes whose POS tag is one of NNG, NNP, VV, VA or MAG and whose
    surface form is longer than one character are kept.

    Args:
        text: Raw query or document text. Falsy values (None, "") yield [].

    Returns:
        list[str]: Surface forms of the retained morphemes, in analyser
        order. An empty list is returned when analysis fails, so a single
        bad input never aborts a batch run.
    """
    if not text:
        return []

    # NUL characters are stripped before the text is handed to the analyser.
    clean_text = text.replace('\x00', '')
    useful_tags = {'NNG', 'NNP', 'VV', 'VA', 'MAG'}
    try:
        tokens = kiwi.tokenize(clean_text)
        return [t.form for t in tokens if t.tag in useful_tags and len(t.form) > 1]
    except Exception:
        # Best-effort by design: analysis errors produce "no tokens".
        # `Exception` (instead of a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long batch can still be interrupted.
        return []

def calculate_scores(cursor, query_tokens, N, avgdl, *, k1=None, b=None):
    """Score every document matching the query with both BIM and BM25.

    For each query term the postings are read from ``inverted_index`` together
    with the document length from ``documents`` in a single query (instead of
    one extra lookup per posting).

    Args:
        cursor: SQLite cursor on a database holding the tables
            ``inverted_index(term, doc_id, tf)`` and ``documents(doc_id, length)``.
        query_tokens: Iterable of query terms. A term that occurs several
            times contributes once per occurrence.
        N: Total number of documents in the collection.
        avgdl: Average document length of the collection.
        k1: Optional BM25 k1 override; defaults to the module-level ``K1``.
        b: Optional BM25 b override; defaults to the module-level ``B``.

    Returns:
        tuple: ``(bim_scores, bm25_scores, doc_term_freqs)`` where the first
        two map doc_id -> accumulated score and the third maps
        doc_id -> {term: tf}.
    """
    k1 = K1 if k1 is None else k1
    b = B if b is None else b

    bim_scores = defaultdict(float)
    bm25_scores = defaultdict(float)
    doc_term_freqs = defaultdict(lambda: defaultdict(int))

    for term in query_tokens:
        # LEFT JOIN keeps postings whose document row is missing, so the
        # document frequency below still counts every posting of the term.
        # NOTE(review): assumes documents.doc_id is unique (e.g. primary key).
        cursor.execute(
            "SELECT p.doc_id, p.tf, d.doc_id, d.length "
            "FROM inverted_index AS p "
            "LEFT JOIN documents AS d ON d.doc_id = p.doc_id "
            "WHERE p.term = ?",
            (term,),
        )
        postings = cursor.fetchall()
        if not postings:
            continue

        df = len(postings)
        # Negative IDF (term in more than half of the documents) is clamped to 0.
        idf = max(math.log((N - df + 0.5) / (df + 0.5)), 0)

        for doc_id, tf, matched_doc_id, doc_len in postings:
            if matched_doc_id is None:
                # Posting points at a document that is not in `documents`.
                continue

            # BIM: presence of the term adds its IDF weight, regardless of tf.
            bim_scores[doc_id] += idf

            # BM25: IDF scaled by the saturated, length-normalised tf.
            numerator = tf * (k1 + 1)
            denominator = tf + k1 * (1 - b + b * (doc_len / avgdl))
            bm25_scores[doc_id] += idf * (numerator / denominator)

            doc_term_freqs[doc_id][term] = tf

    return bim_scores, bm25_scores, doc_term_freqs

## 2.2 검색 실행 (Batch Search)

In [3]:
# Batch search: score every query with BIM and BM25, keep the BM25 top-K
# documents per query, and dump everything to results/search_results.json.

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load queries.pkl when it comes from a trusted pipeline step.
with open(DATA_DIR / 'queries.pkl', 'rb') as f:
    queries = pickle.load(f)

all_results = []
conn = get_db_connection()
try:
    cursor = conn.cursor()

    # Load collection statistics
    cursor.execute("SELECT value FROM statistics WHERE key='N'")
    N = cursor.fetchone()[0]
    cursor.execute("SELECT value FROM statistics WHERE key='avgdl'")
    avgdl = cursor.fetchone()[0]

    print(f"검색 시작: 쿼리 {len(queries):,}개")

    for query in tqdm(queries, desc="Searching"):
        q_tokens = tokenize(query['text'])
        if not q_tokens:
            # Queries without usable tokens are left out of the output.
            continue

        bim_s, bm25_s, term_freqs = calculate_scores(cursor, q_tokens, N, avgdl)
        doc_ids = set(bim_s.keys()) | set(bm25_s.keys())

        # Rank on the rounded BM25 score, i.e. the value that is stored in
        # the output. The sort is stable, so ties keep their set order.
        ranked_ids = sorted(
            doc_ids,
            key=lambda d: round(bm25_s.get(d, 0), 4),
            reverse=True,
        )

        # Fetch title/length only for documents that make it into the top-K
        # rather than for every candidate document.
        top_results = []
        for doc_id in ranked_ids:
            if len(top_results) >= TOP_K:
                break

            cursor.execute("SELECT title, length FROM documents WHERE doc_id = ?", (doc_id,))
            row = cursor.fetchone()
            if not row:
                # No metadata: skip, the next-best document takes the slot.
                continue

            bim_value = bim_s.get(doc_id, 0)
            bm25_value = bm25_s.get(doc_id, 0)
            top_results.append({
                'doc_id': doc_id,
                'doc_title': row[0],
                'doc_length': row[1],
                'bim_score': round(bim_value, 4),
                'bm25_score': round(bm25_value, 4),
                'score_difference': round(bm25_value - bim_value, 4),
                'term_frequencies': term_freqs.get(doc_id, {}),
                'rank': len(top_results) + 1,
            })

        all_results.append({
            'query_id': query['_id'],
            'query_text': query['text'],
            'results': top_results
        })
finally:
    # Release the database handle even when a query fails midway.
    conn.close()

with open(RESULTS_DIR / 'search_results.json', 'w', encoding='utf-8') as f:
    json.dump(all_results, f, ensure_ascii=False, indent=2)

print(f"검색 완료. 저장 경로: {RESULTS_DIR / 'search_results.json'}")

검색 시작: 쿼리 1,454개


Searching:   0%|          | 0/1454 [00:00<?, ?it/s]

검색 완료. 저장 경로: C:\Users\cse\Desktop\xeoxaxeo\NLP\results\search_results.json


## 2.3 결과 샘플 확인

In [4]:
# Sanity check: show the first query and its five best-ranked documents.
first_entry = all_results[0]
print(f"첫 번째 쿼리: {first_entry['query_text']}")
for hit in first_entry['results'][:5]:
    rank, title = hit['rank'], hit['doc_title']
    bm25, bim = hit['bm25_score'], hit['bim_score']
    print(f"Rank {rank}: {title} (BM25: {bm25}, BIM: {bim})")

첫 번째 쿼리: 얘들아 폴아웃 뉴베가스에서 부머들 세력이 지하벙커에서 폭발 나서 튀어나온 다음 군 기지 차지했다던데, 물이랑 전기 어떻게 구해서 사는 거임? 야포로 이방인 쏴버린다던데 진짜로? 구체적인 과정 궁금함 ㅠㅠ
Rank 1: 부머(폴아웃: 뉴 베가스) (BM25: 48.0462, BIM: 32.5041)
Rank 2: 넬리스 공군 기지 (BM25: 46.5677, BIM: 26.397)
Rank 3: 뉴 캘리포니아 공화국 (BM25: 44.8472, BIM: 30.9406)
Rank 4: 볼트 34 (BM25: 43.6899, BIM: 29.3525)
Rank 5: 카이사르의 군단 (BM25: 38.3114, BIM: 37.6594)
