In [1]:
!pip install statsmodels pandas numpy tqdm kiwipiepy




[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import sqlite3
import pickle
import math
import pandas as pd
import numpy as np
from kiwipiepy import Kiwi
from tqdm import tqdm
import warnings

warnings.filterwarnings('ignore')

BASE_DIR = os.path.join('..')
DATA_DIR = os.path.join(BASE_DIR, 'final_data')
DB_DIR = os.path.join(BASE_DIR, 'final_database')
FULL_DATA_PATH = os.path.join(DATA_DIR, 'full_data.pkl')
QUERIES_PATH = os.path.join(BASE_DIR, 'data', 'queries.pkl')
QRELS_PATH = os.path.join(BASE_DIR, 'data', 'qrels.pkl')

os.makedirs(DB_DIR, exist_ok=True)
DB_PATH = os.path.join(DB_DIR, 'search_index_full.db')

print("데이터 로드")
df = pd.read_pickle(FULL_DATA_PATH)

if '_id' in df.columns:
    df.set_index('_id', inplace=True)
elif 'doc_id' in df.columns:
    df.set_index('doc_id', inplace=True)
df.index = df.index.astype(str)

with open(QUERIES_PATH, 'rb') as f:
    queries_data = pickle.load(f)
with open(QRELS_PATH, 'rb') as f:
    qrels_data = pickle.load(f)

데이터 로드


In [4]:
if os.path.exists(DB_PATH):
    os.remove(DB_PATH)

conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute('CREATE TABLE inverted_index (term TEXT, doc_id TEXT, tf INTEGER)')
cursor.execute('CREATE INDEX idx_term ON inverted_index(term)')

print("역색인 구축 (패딩 'O' 제외)")
data_to_insert = []
BATCH_SIZE = 50000

doc_freq = {}
total_docs = len(df)
avg_dl = df['doc_length'].mean()

for doc_id, row in tqdm(df.iterrows(), total=total_docs, desc="Indexing"):
    tokens = row['tokens_padded']
    term_counts = {}

    for t in tokens:
        if set(t) == {'O'}: continue
        term_counts[t] = term_counts.get(t, 0) + 1

    for term, tf in term_counts.items():
        data_to_insert.append((term, str(doc_id), tf))
        doc_freq[term] = doc_freq.get(term, 0) + 1

    if len(data_to_insert) >= BATCH_SIZE:
        cursor.executemany('INSERT INTO inverted_index VALUES (?, ?, ?)', data_to_insert)
        data_to_insert = []

if data_to_insert:
    cursor.executemany('INSERT INTO inverted_index VALUES (?, ?, ?)', data_to_insert)
conn.commit()
print("인덱싱 완료")

PermissionError: [WinError 32] 다른 프로세스가 파일을 사용 중이기 때문에 프로세스가 액세스 할 수 없습니다: '..\\final_database\\search_index_full.db'

In [None]:
kiwi = Kiwi(num_workers=0)
idf_cache = {t: math.log((total_docs - df + 0.5) / (df + 0.5) + 1) for t, df in doc_freq.items()}

def tokenize_query(text):
    try:
        return [t.form for t in kiwi.tokenize(text) if t.tag in ['NNG', 'NNP', 'VV', 'VA', 'MAG']]
    except: return []

def calculate_scores(query_tokens, model_type='BM25', k1=1.2, b=0.75):
    scores = {}
    for term in query_tokens:
        if term not in idf_cache: continue
        idf = idf_cache[term]

        cursor.execute('SELECT doc_id, tf FROM inverted_index WHERE term = ?', (term,))
        rows = cursor.fetchall()

        for doc_id, tf in rows:
            if model_type == 'BIM':
                scores[doc_id] = scores.get(doc_id, 0.0) + idf
            else: # BM25
                doc_len = df.at[doc_id, 'doc_length']
                num = tf * (k1 + 1)
                den = tf + k1 * (1 - b + b * (doc_len / avg_dl))
                scores[doc_id] = scores.get(doc_id, 0.0) + idf * (num / den)
    return scores

def calculate_metrics(ranked_docs, relevant_docs, k=10):
    top_k = ranked_docs[:k]
    relevant_retrieved_k = len(set(top_k) & relevant_docs)
    p_at_k = relevant_retrieved_k / k

    total_relevant = len(relevant_docs)
    r_at_k = relevant_retrieved_k / total_relevant if total_relevant > 0 else 0

    relevant_count = 0
    precision_sum = 0.0
    for i, doc_id in enumerate(ranked_docs):
        if doc_id in relevant_docs:
            relevant_count += 1
            precision_sum += relevant_count / (i + 1)

    ap = precision_sum / total_relevant if total_relevant > 0 else 0
    return p_at_k, r_at_k, ap

In [None]:
qrels_dict = {}
for item in qrels_data:
    qid, doc_id = str(item.get('query-id')), str(item.get('corpus-id'))
    if qid and doc_id:
        qrels_dict.setdefault(qid, set()).add(doc_id)

queries_map = {str(q['_id']): q['text'] for q in queries_data if '_id' in q}

print("쿼리 전처리")
processed_queries = {}
for qid in qrels_dict:
    if qid in queries_map:
        tokens = tokenize_query(queries_map[qid])
        if tokens:
            processed_queries[qid] = tokens

print(f"실험 대상 쿼리 수: {len(processed_queries)}")

In [None]:
print("\n[BM25 하이퍼파라미터 튜닝 (Full Dataset: 정밀 탐색)]")

# 샘플 데이터 최적점(k1=4.0, b=0.98)을 중심으로 집중 탐색
k1_values = [3.8, 4.0, 4.2]
b_values = [0.95, 0.98, 0.99, 1.0]

tuning_results = []
best_map = -1
# 초기값은 샘플 데이터의 Best 값으로 설정
best_params = {'k1': 4.0, 'b': 0.98}

print(f"튜닝 후보: k1={k1_values}, b={b_values} (총 {len(k1_values)*len(b_values)}회 수행)")

for k1_val in k1_values:
    for b_val in b_values:
        total_map = 0
        count = 0

        for qid, q_tokens in tqdm(processed_queries.items(), desc=f"k1={k1_val}, b={b_val}", leave=False):
            if qid not in qrels_dict: continue

            scores = calculate_scores(q_tokens, model_type='BM25', k1=k1_val, b=b_val)

            ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            ranked_ids = [d for d, s in ranked[:100]]

            _, _, ap = calculate_metrics(ranked_ids, qrels_dict[qid], k=10)
            total_map += ap
            count += 1

        avg_map = total_map / count if count > 0 else 0
        print(f"  - Params(k1={k1_val}, b={b_val}) -> MAP: {avg_map:.4f}")

        tuning_results.append({'k1': k1_val, 'b': b_val, 'MAP': avg_map})

        if avg_map > best_map:
            best_map = avg_map
            best_params = {'k1': k1_val, 'b': b_val}

print(f"\n최적 파라미터(Full Data): {best_params} (MAP: {best_map:.4f})")

pd.DataFrame(tuning_results).to_csv(os.path.join(DATA_DIR, 'tuning_results_full.csv'), index=False)

In [None]:
print("\n[최종 평가 및 회귀 데이터셋 생성]")
regression_data = []
metrics_data = []

models_to_run = [
    {'name': 'BIM', 'type': 'BIM', 'k1': 0, 'b': 0},
    {'name': 'BM25_Best', 'type': 'BM25', 'k1': best_params['k1'], 'b': best_params['b']}
]

for model_info in models_to_run:
    m_name = model_info['name']
    print(f"  - Running {m_name}")

    for qid, q_tokens in tqdm(processed_queries.items(), desc=m_name):
        target_docs = qrels_dict[qid]

        scores = calculate_scores(q_tokens, model_type=model_info['type'],
                                  k1=model_info['k1'], b=model_info['b'])

        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        ranked_ids = [d for d, s in ranked[:100]]

        p10, r10, ap = calculate_metrics(ranked_ids, target_docs, k=10)
        metrics_data.append({
            'qid': qid,
            'model': m_name,
            'P@10': p10, 'R@10': r10, 'AP': ap
        })

        top_10_ids = set(ranked_ids[:10])

        for target_doc in target_docs:
            if target_doc not in df.index: continue

            is_success = 1 if target_doc in top_10_ids else 0
            doc_len = df.at[target_doc, 'doc_length']
            topic_probs = df.at[target_doc, 'topic_probs']

            row = {
                'qid': qid,
                'doc_id': target_doc,
                'model': m_name,
                'success': is_success,
                'doc_length': doc_len,
                'query_length': len(q_tokens),
            }
            for idx, prob in enumerate(topic_probs):
                row[f'topic_{idx}'] = prob

            regression_data.append(row)

conn.close()

In [None]:
df_reg = pd.DataFrame(regression_data)
df_met = pd.DataFrame(metrics_data)

reg_path = os.path.join(DATA_DIR, 'regression_dataset.csv')
met_path = os.path.join(DATA_DIR, 'performance_metrics.csv')

df_reg.to_csv(reg_path, index=False)
df_met.to_csv(met_path, index=False)

print("\n[작업 완료]")
print(f"회귀 데이터셋: {reg_path} ({len(df_reg)}건)")
print(f"성능 지표: {met_path}")
print("\n[최종 성능 요약]")
print(df_met.groupby('model')[['P@10', 'R@10', 'AP']].mean())