# 2. 검색 엔진 (BIM & BM25 Search) - 샘플링X 후처리O
- 쿼리 로드 및 검색 수행
- BIM vs BM25 점수 계산 및 결과 저장

In [1]:
import sqlite3
import json
import pickle
import math
import os
import re
from collections import defaultdict
from tqdm.notebook import tqdm
from kiwipiepy import Kiwi
from pathlib import Path

# Project root is the parent of the current working directory, resolved to an
# absolute path, so every path below depends on where the notebook is run from.
PROJECT_ROOT = Path('..').resolve()

# Inputs: the data directory (queries.pkl is read from here) and the SQLite
# database that the search code queries.
DATA_DIR = PROJECT_ROOT / 'data'
DB_PATH = PROJECT_ROOT / 'database' / 'inverted_index_full_clean.db'

# Output: the JSON file that receives the per-query search results.
RESULTS_DIR = PROJECT_ROOT / 'results'
OUTPUT_PATH = RESULTS_DIR / 'search_results_full_clean.json'

# Create the results directory up front; exist_ok=True makes re-runs harmless.
os.makedirs(RESULTS_DIR, exist_ok=True)

# BM25 parameters consumed by calculate_scores(): K1 scales the term-frequency
# component and B weights the document-length normalisation (doc_len / avgdl).
K1 = 1.2
B = 0.75
# Maximum number of ranked documents kept per query.
TOP_K = 100

# Shared Kiwi analyzer instance used by tokenize().
# NOTE(review): num_workers=-1 presumably means "use all available cores" -
# confirm against the kiwipiepy documentation.
kiwi = Kiwi(num_workers=-1)

# Token forms that tokenize() drops even when their part-of-speech tag is kept.
# Entries are compared against Kiwi's `form` strings as-is, which is why verb
# and adjective entries appear as bare stems.
# NOTE(review): presumably a hand-picked list of high-frequency, low-information
# words for this corpus - confirm how the list was derived.
STOPWORDS = {
    '나오', '경우', '보이', '이후', '사람', '정도', '자신', '사용', '가능', '대하',
    '위하', '사실', '만들', '등장', '문제', '모습', '시작', '가지', '생각', '따르',
    '이상', '함께', '당시', '상대', '시간', '다시', '상황', '이름', '가장', '또한',
    '모두', '결국', '다만', '많이', '달리', '다르', '물론', '당하', '처음', '현재',
    '같이', '이유', '통하', '역시', '자체', '거의', '매우', '없이', '상태', '바로',
    '인하', '특히', '존재', '들어가', '사이', '다음', '모르', '참고', '부분', '이것',
    '해당', '그냥', '마지막', '필요', '부르', '관련', '떨어지', '대부분', '때문', '아니'
}

def get_db_connection():
    """Open and return a new SQLite connection to the database at ``DB_PATH``.

    The caller owns the returned connection and is responsible for closing it.
    """
    database_file = str(DB_PATH)
    connection = sqlite3.connect(database_file)
    return connection

# Pre-compiled cleanup patterns applied, in this order, before morphological analysis.
_STRIKETHROUGH_RE = re.compile(r'~~.*?~~')            # text wrapped in ~~...~~
_LONE_JAMO_RE = re.compile(r'[ㄱ-ㅎㅏ-ㅣ]+')            # runs of standalone Hangul jamo
_REPEATED_PUNCT_RE = re.compile(r'[\.\?\!~\-]{2,}')   # 2+ consecutive punctuation marks

# Part-of-speech tags whose tokens are kept. In Kiwi's tag set these are
# general nouns (NNG), proper nouns (NNP), verbs (VV), adjectives (VA) and
# general adverbs (MAG).
_USEFUL_TAGS = frozenset(('NNG', 'NNP', 'VV', 'VA', 'MAG'))


def tokenize(text):
    """Turn raw text into a list of index/search terms.

    The text is cleaned (NUL bytes removed, ``~~...~~`` spans removed,
    standalone jamo removed, repeated punctuation collapsed to a single
    ``.``), analysed with the module-level ``kiwi`` analyzer, and filtered
    down to token forms that

    * carry one of the tags in ``_USEFUL_TAGS``,
    * are longer than one character, and
    * are not listed in ``STOPWORDS``.

    Args:
        text: The string to tokenize. Falsy input (``None``, ``""``) yields
            an empty list.

    Returns:
        list[str]: The surviving token forms, in analysis order. An empty
        list is also returned (after emitting a ``RuntimeWarning``) when the
        analyzer itself fails, so a single bad input does not abort a long
        batch run.
    """
    if not text:
        return []

    text = text.replace('\x00', '')
    text = _STRIKETHROUGH_RE.sub('', text)
    text = _LONE_JAMO_RE.sub('', text)
    text = _REPEATED_PUNCT_RE.sub('.', text)

    try:
        tokens = kiwi.tokenize(text)
    except Exception as exc:
        # Best-effort by design, but not silent: `except Exception` lets
        # KeyboardInterrupt/SystemExit propagate (a bare `except:` would
        # swallow them), and the warning makes dropped inputs visible.
        import warnings

        warnings.warn(
            f"tokenize: analyzer failed, returning no tokens ({exc!r})",
            RuntimeWarning,
            stacklevel=2,
        )
        return []

    return [
        token.form
        for token in tokens
        if token.tag in _USEFUL_TAGS
        and len(token.form) > 1
        and token.form not in STOPWORDS
    ]

def calculate_scores(cursor, query_tokens, N, avgdl, *, k1=None, b=None):
    """Score every document matching at least one query term with BIM and BM25.

    For each term the postings are read from ``inverted_index`` together with
    the document length from ``documents`` in a single joined query (instead
    of one extra lookup per posting).

    Args:
        cursor: A DB-API cursor on a database that has the tables
            ``inverted_index(term, doc_id, tf)`` and
            ``documents(doc_id, length, ...)``.
        query_tokens: Iterable of query terms. A term that occurs more than
            once contributes once per occurrence.
        N: Number of documents in the collection.
        avgdl: Average document length of the collection.
        k1: Optional override for the BM25 term-frequency parameter;
            defaults to the module-level ``K1``.
        b: Optional override for the BM25 length-normalisation parameter;
            defaults to the module-level ``B``.

    Returns:
        tuple: ``(bim_scores, bm25_scores, doc_term_freqs)`` where the first
        two map ``doc_id -> float`` score and the third maps
        ``doc_id -> {term: tf}`` for the query terms found in that document.
    """
    # Explicit arguments win; the module constants are only read as fallback.
    k1 = K1 if k1 is None else k1
    b = B if b is None else b

    bim_scores = defaultdict(float)
    bm25_scores = defaultdict(float)
    doc_term_freqs = defaultdict(lambda: defaultdict(int))

    for term in query_tokens:
        # LEFT JOIN keeps postings whose document row is missing, so the
        # document frequency below still counts every posting.
        # NOTE(review): assumes documents.doc_id is unique - confirm schema.
        cursor.execute(
            "SELECT i.doc_id, i.tf, d.length "
            "FROM inverted_index AS i "
            "LEFT JOIN documents AS d ON d.doc_id = i.doc_id "
            "WHERE i.term = ?",
            (term,),
        )
        postings = cursor.fetchall()
        if not postings:
            continue

        df = len(postings)
        # IDF floored at 0. Testing the ratio instead of the logarithm gives
        # the same value for every valid ratio and avoids a math domain error
        # when the stored N is smaller than df.
        ratio = (N - df + 0.5) / (df + 0.5)
        idf = math.log(ratio) if ratio > 1 else 0.0

        for doc_id, tf, doc_len in postings:
            # No usable length (missing document row or NULL): skip.
            if doc_len is None:
                continue

            # BIM only cares about presence of the term.
            bim_scores[doc_id] += idf

            # BM25 adds term-frequency saturation and length normalisation.
            numerator = tf * (k1 + 1)
            denominator = tf + k1 * (1 - b + b * (doc_len / avgdl))
            bm25_scores[doc_id] += idf * (numerator / denominator)

            doc_term_freqs[doc_id][term] = tf

    return bim_scores, bm25_scores, doc_term_freqs

# Load the queries. Each query is accessed below via its '_id' and 'text' keys.
# NOTE: unpickling can execute arbitrary code - only load a queries.pkl that
# was produced by a trusted step of this project.
with open(DATA_DIR / 'queries.pkl', 'rb') as f:
    queries = pickle.load(f)

conn = get_db_connection()
all_results = []
try:
    cursor = conn.cursor()

    # Collection-level statistics stored in the database: N and avgdl are
    # passed straight through to calculate_scores().
    cursor.execute("SELECT value FROM statistics WHERE key='N'")
    N = cursor.fetchone()[0]
    cursor.execute("SELECT value FROM statistics WHERE key='avgdl'")
    avgdl = cursor.fetchone()[0]

    print(f"검색 시작 (Target DB: {DB_PATH.name})")

    for query in tqdm(queries, desc="Searching"):
        q_tokens = tokenize(query['text'])
        if not q_tokens:
            # Nothing searchable is left after tokenization; skip the query.
            continue

        bim_s, bm25_s, term_freqs = calculate_scores(cursor, q_tokens, N, avgdl)
        doc_ids = set(bim_s.keys()) | set(bm25_s.keys())

        # Rank on the rounded BM25 score first, then fetch title/length only
        # for the documents that are actually kept. The sort is stable, so
        # ties keep the same relative order as sorting fully built records.
        ranked_ids = sorted(
            doc_ids,
            key=lambda d: round(bm25_s.get(d, 0), 4),
            reverse=True,
        )

        top_results = []
        for doc_id in ranked_ids:
            if len(top_results) >= TOP_K:
                break

            cursor.execute("SELECT title, length FROM documents WHERE doc_id = ?", (doc_id,))
            row = cursor.fetchone()
            if not row:
                continue

            bim = bim_s.get(doc_id, 0)
            bm25 = bm25_s.get(doc_id, 0)
            top_results.append({
                'doc_id': doc_id,
                'doc_title': row[0],
                'doc_length': row[1],
                'bim_score': round(bim, 4),
                'bm25_score': round(bm25, 4),
                'score_difference': round(bm25 - bim, 4),
                'term_frequencies': term_freqs.get(doc_id, {}),
                'rank': len(top_results) + 1,  # 1-based position in the BM25 ranking
            })

        all_results.append({
            'query_id': query['_id'],
            'query_text': query['text'],
            'results': top_results
        })
finally:
    # Release the database handle even if a query fails midway.
    conn.close()

with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    json.dump(all_results, f, ensure_ascii=False, indent=2)

print(f"검색 완료: {OUTPUT_PATH}")

검색 시작 (Target DB: inverted_index_full_clean.db)


Searching:   0%|          | 0/1454 [00:00<?, ?it/s]

검색 완료: C:\Users\cse\Desktop\xeoxaxeo\NLP\results\search_results_full_clean.json
