# 2. 검색 엔진 (BIM & BM25 Search) - 샘플링O 후처리X

In [1]:
import sqlite3
import json
import pickle
import math
import os
from collections import defaultdict
from tqdm.notebook import tqdm
from kiwipiepy import Kiwi
from pathlib import Path

# Filesystem layout: everything hangs off the parent of the current working
# directory.
PROJECT_ROOT = Path('..').resolve()
DATA_DIR = PROJECT_ROOT.joinpath('data')
DB_PATH = PROJECT_ROOT.joinpath('database', 'inverted_index_sample.db')
RESULTS_DIR = PROJECT_ROOT.joinpath('results')
OUTPUT_PATH = RESULTS_DIR.joinpath('search_results_sample.json')

# Make sure the output directory exists before any result file is written.
os.makedirs(RESULTS_DIR, exist_ok=True)

# BM25 parameters used by calculate_scores, and the per-query result cut-off.
K1, B = 1.2, 0.75
TOP_K = 100

# Single tokenizer instance shared by every tokenize_dirty call.
kiwi = Kiwi(num_workers=-1)

def get_db_connection():
    """Open and return a new SQLite connection to the database at DB_PATH.

    The caller is responsible for closing the returned connection.
    """
    db_file = str(DB_PATH)
    return sqlite3.connect(db_file)

def tokenize_dirty(text):
    """Tokenize raw text with the module-level Kiwi tokenizer.

    Only morphemes whose POS tag is NNG, NNP, VV, VA or MAG and whose
    surface form is longer than one character are kept.

    Args:
        text: The raw string to tokenize. Falsy values (``None``, ``""``)
            yield an empty list.

    Returns:
        A list of surface-form strings. The list is empty when ``text`` is
        falsy or when tokenization fails.
    """
    if not text:
        return []

    # NUL characters are removed before tokenizing -- presumably because the
    # tokenizer cannot handle them; confirm before relaxing this.
    text = text.replace('\x00', '')

    useful_tags = {'NNG', 'NNP', 'VV', 'VA', 'MAG'}
    try:
        tokens = kiwi.tokenize(text)
        return [
            t.form
            for t in tokens
            if t.tag in useful_tags and len(t.form) > 1
        ]
    except Exception:
        # Best effort: a text that cannot be tokenized contributes no tokens.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be interrupted.
        return []

def calculate_scores(cursor, query_tokens, N, avgdl):
    """Score documents against a tokenized query with BIM and BM25.

    For every query term the postings are read from ``inverted_index``
    (columns ``term``, ``doc_id``, ``tf``) together with the document length
    from ``documents`` (columns ``doc_id``, ``length``). The term's IDF is
    ``log((N - df + 0.5) / (df + 0.5))`` clamped at zero, where ``df`` is the
    number of postings for the term.

    Args:
        cursor: A DB-API cursor on a database holding the two tables above.
        query_tokens: Iterable of query terms. A term that occurs several
            times is scored once per occurrence.
        N: Collection size used in the IDF formula.
        avgdl: Average document length used for BM25 length normalization.

    Returns:
        A ``(bim_scores, bm25_scores)`` tuple of ``defaultdict(float)``
        objects keyed by ``doc_id``. The BIM score is the sum of the IDFs of
        the matching terms; the BM25 score additionally weights each IDF by
        the saturated, length-normalized term frequency.
    """
    bim_scores = defaultdict(float)
    bm25_scores = defaultdict(float)

    for term in query_tokens:
        # Fetch the document length together with each posting instead of
        # issuing one extra SELECT per posting. The scalar subquery yields
        # exactly one value per posting (NULL when no document row matches),
        # so the number of rows -- and therefore df -- is unaffected.
        cursor.execute(
            "SELECT i.doc_id, i.tf, "
            "(SELECT d.length FROM documents AS d WHERE d.doc_id = i.doc_id) "
            "FROM inverted_index AS i WHERE i.term = ?",
            (term,),
        )
        postings = cursor.fetchall()
        if not postings:
            continue

        df = len(postings)
        # Terms present in more than half of the collection would get a
        # negative IDF; clamp so they never lower a document's score.
        idf = max(math.log((N - df + 0.5) / (df + 0.5)), 0)

        for doc_id, tf, doc_len in postings:
            # Skip postings whose document has no usable length.
            if doc_len is None:
                continue

            bim_scores[doc_id] += idf
            numerator = tf * (K1 + 1)
            denominator = tf + K1 * (1 - B + B * (doc_len / avgdl))
            bm25_scores[doc_id] += idf * (numerator / denominator)

    return bim_scores, bm25_scores

# Load the query set. NOTE: pickle.load can execute arbitrary code, so
# queries.pkl must come from a trusted source (e.g. produced by this project).
with open(DATA_DIR / 'queries.pkl', 'rb') as f:
    queries = pickle.load(f)

all_results = []

conn = get_db_connection()
try:
    cursor = conn.cursor()

    # Collection-level statistics needed by the scoring formulas.
    cursor.execute("SELECT value FROM statistics WHERE key='N'")
    N = cursor.fetchone()[0]
    cursor.execute("SELECT value FROM statistics WHERE key='avgdl'")
    avgdl = cursor.fetchone()[0]

    print(f"검색 시작 (Target: {DB_PATH.name})")

    for query in tqdm(queries, desc="Searching"):
        q_tokens = tokenize_dirty(query['text'])
        # Queries that yield no usable tokens are left out of the output.
        if not q_tokens:
            continue

        bim_s, bm25_s = calculate_scores(cursor, q_tokens, N, avgdl)

        # Rank on the unrounded BM25 score so that rounding cannot reorder
        # near-equal documents around the TOP_K cut-off. sorted() is stable,
        # so exact ties keep the order in which documents were scored.
        ranked = sorted(
            bm25_s.items(),
            key=lambda item: item[1],
            reverse=True,
        )[:TOP_K]

        # Scores are rounded only for presentation in the output file.
        top_results = [
            {
                'doc_id': doc_id,
                'bim_score': round(bim_s.get(doc_id, 0), 4),
                'bm25_score': round(score, 4),
            }
            for doc_id, score in ranked
        ]

        all_results.append({
            'query_id': query['_id'],
            'results': top_results
        })
finally:
    # Release the database even when a query fails part-way through.
    conn.close()

with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(all_results, f, ensure_ascii=False, indent=2)

print(f"검색 완료: {OUTPUT_PATH}")

검색 시작 (Target: inverted_index_sample.db)


Searching:   0%|          | 0/1454 [00:00<?, ?it/s]

검색 완료: C:\Users\cse\Desktop\xeoxaxeo\NLP\results\search_results_sample.json
