In [1]:
!pip install statsmodels pandas numpy tqdm kiwipiepy




[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import sqlite3
import pickle
import math
import pandas as pd
import numpy as np
from kiwipiepy import Kiwi
from tqdm import tqdm
import warnings

warnings.filterwarnings('ignore')

BASE_DIR = os.path.join('..')
DATA_DIR = os.path.join(BASE_DIR, 'final_data')
DB_DIR = os.path.join(BASE_DIR, 'final_database')
SAMPLE_PATH = os.path.join(DATA_DIR, 'sample.pkl')
QUERIES_PATH = os.path.join(BASE_DIR, 'data', 'queries.pkl')
QRELS_PATH = os.path.join(BASE_DIR, 'data', 'qrels.pkl')

os.makedirs(DB_DIR, exist_ok=True)
DB_PATH = os.path.join(DB_DIR, 'search_index_sample.db')

print("샘플 데이터 로드")
df = pd.read_pickle(SAMPLE_PATH)

if '_id' in df.columns:
    df.set_index('_id', inplace=True)
elif 'doc_id' in df.columns:
    df.set_index('doc_id', inplace=True)
df.index = df.index.astype(str)

with open(QUERIES_PATH, 'rb') as f:
    queries_data = pickle.load(f)
with open(QRELS_PATH, 'rb') as f:
    qrels_data = pickle.load(f)

if os.path.exists(DB_PATH):
    os.remove(DB_PATH)

conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute('CREATE TABLE inverted_index (term TEXT, doc_id TEXT, tf INTEGER)')
cursor.execute('CREATE INDEX idx_term ON inverted_index(term)')

샘플 데이터 로드


<sqlite3.Cursor at 0x238461bc940>

In [3]:
print("역색인 구축 (패딩 제외)")
data_to_insert = []
BATCH_SIZE = 10000

doc_freq = {}
total_docs = len(df)
avg_dl = df['doc_length'].mean()

for doc_id, row in tqdm(df.iterrows(), total=total_docs, desc="Indexing"):
    tokens = row['tokens_padded']
    term_counts = {}

    for t in tokens:
        if set(t) == {'O'}: continue
        term_counts[t] = term_counts.get(t, 0) + 1

    for term, tf in term_counts.items():
        data_to_insert.append((term, str(doc_id), tf))
        doc_freq[term] = doc_freq.get(term, 0) + 1

    if len(data_to_insert) >= BATCH_SIZE:
        cursor.executemany('INSERT INTO inverted_index VALUES (?, ?, ?)', data_to_insert)
        data_to_insert = []

if data_to_insert:
    cursor.executemany('INSERT INTO inverted_index VALUES (?, ?, ?)', data_to_insert)
conn.commit()
print("인덱싱 완료")

역색인 구축 (패딩 제외)


Indexing: 100%|██████████| 5000/5000 [00:40<00:00, 124.76it/s]


인덱싱 완료


In [4]:
kiwi = Kiwi(num_workers=0)
idf_cache = {t: math.log((total_docs - df + 0.5) / (df + 0.5) + 1) for t, df in doc_freq.items()}

def tokenize_query(text):
    try:
        return [t.form for t in kiwi.tokenize(text) if t.tag in ['NNG', 'NNP', 'VV', 'VA', 'MAG']]
    except: return []

def calculate_scores(query_tokens, model_type='BM25', k1=1.2, b=0.75):
    scores = {}
    for term in query_tokens:
        if term not in idf_cache: continue
        idf = idf_cache[term]

        cursor.execute('SELECT doc_id, tf FROM inverted_index WHERE term = ?', (term,))
        rows = cursor.fetchall()

        for doc_id, tf in rows:
            if model_type == 'BIM':
                scores[doc_id] = scores.get(doc_id, 0.0) + idf
            else:
                doc_len = df.at[doc_id, 'doc_length']
                num = tf * (k1 + 1)
                den = tf + k1 * (1 - b + b * (doc_len / avg_dl))
                scores[doc_id] = scores.get(doc_id, 0.0) + idf * (num / den)
    return scores

def calculate_metrics(ranked_docs, relevant_docs, k=10):
    top_k = ranked_docs[:k]
    relevant_retrieved_k = len(set(top_k) & relevant_docs)
    p_at_k = relevant_retrieved_k / k

    total_relevant = len(relevant_docs)
    r_at_k = relevant_retrieved_k / total_relevant if total_relevant > 0 else 0

    relevant_count = 0
    precision_sum = 0.0
    for i, doc_id in enumerate(ranked_docs):
        if doc_id in relevant_docs:
            relevant_count += 1
            precision_sum += relevant_count / (i + 1)

    ap = precision_sum / total_relevant if total_relevant > 0 else 0
    return p_at_k, r_at_k, ap

In [5]:
qrels_dict = {}
for item in qrels_data:
    qid, doc_id = str(item.get('query-id')), str(item.get('corpus-id'))
    if qid and doc_id:
        qrels_dict.setdefault(qid, set()).add(doc_id)

queries_map = {str(q['_id']): q['text'] for q in queries_data if '_id' in q}

print("쿼리 전처리")
processed_queries = {}
for qid in qrels_dict:
    if qid in queries_map:
        tokens = tokenize_query(queries_map[qid])
        if tokens:
            processed_queries[qid] = tokens

쿼리 전처리


In [9]:
print("\n[BM25 하이퍼파라미터 튜닝 (Sample)]")
k1_values = [1.0, 1.2, 1.4]
b_values = [0.6, 0.75, 0.9]

tuning_results = []
best_map = -1
best_params = {'k1': 1.2, 'b': 0.75}

for k1_val in k1_values:
    for b_val in b_values:
        total_map = 0
        count = 0
        for qid, q_tokens in processed_queries.items():
            if not any(d in df.index for d in qrels_dict[qid]):
                continue

            scores = calculate_scores(q_tokens, model_type='BM25', k1=k1_val, b=b_val)
            ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            ranked_ids = [d for d, s in ranked[:100]]
            _, _, ap = calculate_metrics(ranked_ids, qrels_dict[qid], k=10)
            total_map += ap
            count += 1

        avg_map = total_map / count if count > 0 else 0
        print(f"  - Params(k1={k1_val}, b={b_val}) -> MAP: {avg_map:.4f}")
        tuning_results.append({'k1': k1_val, 'b': b_val, 'MAP': avg_map})

        if avg_map > best_map:
            best_map = avg_map
            best_params = {'k1': k1_val, 'b': b_val}

print(f"Best: {best_params} (MAP: {best_map:.4f})")
pd.DataFrame(tuning_results).to_csv(os.path.join(DATA_DIR, 'tuning_results_sample.csv'), index=False)



[BM25 하이퍼파라미터 튜닝 (Sample)]
  - Params(k1=1.0, b=0.6) -> MAP: 0.5772
  - Params(k1=1.0, b=0.75) -> MAP: 0.5938
  - Params(k1=1.0, b=0.9) -> MAP: 0.6070
  - Params(k1=1.2, b=0.6) -> MAP: 0.5839
  - Params(k1=1.2, b=0.75) -> MAP: 0.6004
  - Params(k1=1.2, b=0.9) -> MAP: 0.6145
  - Params(k1=1.4, b=0.6) -> MAP: 0.5884
  - Params(k1=1.4, b=0.75) -> MAP: 0.6052
  - Params(k1=1.4, b=0.9) -> MAP: 0.6180
Best: {'k1': 1.4, 'b': 0.9} (MAP: 0.6180)


In [10]:
print("\n[BM25 하이퍼파라미터 튜닝 (2차: 범위 확장)]")

k1_values = [1.4, 1.5, 1.6, 1.8, 2.0]
b_values = [0.8, 0.9, 0.95, 1.0]

tuning_results = []
best_map = -1
best_params = {'k1': 1.4, 'b': 0.9} # 1차 튜닝 최고점

for k1_val in k1_values:
    for b_val in b_values:
        total_map = 0
        count = 0
        for qid, q_tokens in processed_queries.items():
            if not any(d in df.index for d in qrels_dict[qid]):
                continue

            scores = calculate_scores(q_tokens, model_type='BM25', k1=k1_val, b=b_val)

            ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            ranked_ids = [d for d, s in ranked[:100]]

            _, _, ap = calculate_metrics(ranked_ids, qrels_dict[qid], k=10)
            total_map += ap
            count += 1

        avg_map = total_map / count if count > 0 else 0
        print(f"  - Params(k1={k1_val}, b={b_val}) -> MAP: {avg_map:.4f}")
        tuning_results.append({'k1': k1_val, 'b': b_val, 'MAP': avg_map})

        if avg_map > best_map:
            best_map = avg_map
            best_params = {'k1': k1_val, 'b': b_val}

print(f"\nBest: {best_params} (MAP: {best_map:.4f})")
pd.DataFrame(tuning_results).to_csv(os.path.join(DATA_DIR, 'tuning_results_sample.csv'), index=False)


[BM25 하이퍼파라미터 튜닝 (2차: 범위 확장)]
  - Params(k1=1.4, b=0.8) -> MAP: 0.6102
  - Params(k1=1.4, b=0.9) -> MAP: 0.6180
  - Params(k1=1.4, b=0.95) -> MAP: 0.6217
  - Params(k1=1.4, b=1.0) -> MAP: 0.6230
  - Params(k1=1.5, b=0.8) -> MAP: 0.6121
  - Params(k1=1.5, b=0.9) -> MAP: 0.6206
  - Params(k1=1.5, b=0.95) -> MAP: 0.6235
  - Params(k1=1.5, b=1.0) -> MAP: 0.6249
  - Params(k1=1.6, b=0.8) -> MAP: 0.6133
  - Params(k1=1.6, b=0.9) -> MAP: 0.6221
  - Params(k1=1.6, b=0.95) -> MAP: 0.6249
  - Params(k1=1.6, b=1.0) -> MAP: 0.6262
  - Params(k1=1.8, b=0.8) -> MAP: 0.6160
  - Params(k1=1.8, b=0.9) -> MAP: 0.6254
  - Params(k1=1.8, b=0.95) -> MAP: 0.6280
  - Params(k1=1.8, b=1.0) -> MAP: 0.6293
  - Params(k1=2.0, b=0.8) -> MAP: 0.6181
  - Params(k1=2.0, b=0.9) -> MAP: 0.6276
  - Params(k1=2.0, b=0.95) -> MAP: 0.6300
  - Params(k1=2.0, b=1.0) -> MAP: 0.6325

Best: {'k1': 2.0, 'b': 1.0} (MAP: 0.6325)


In [11]:
print("\n[BM25 하이퍼파라미터 튜닝 (3차)]")

k1_values = [2.0, 4.0, 6.0]
b_values = [1.0, 2.0, 3.0]

tuning_results = []
best_map = -1
best_params = {'k1': 2.0, 'b': 1.0} # 2차 튜닝 최고점

for k1_val in k1_values:
    for b_val in b_values:
        total_map = 0
        count = 0
        for qid, q_tokens in processed_queries.items():
            if not any(d in df.index for d in qrels_dict[qid]):
                continue

            scores = calculate_scores(q_tokens, model_type='BM25', k1=k1_val, b=b_val)

            ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            ranked_ids = [d for d, s in ranked[:100]]

            _, _, ap = calculate_metrics(ranked_ids, qrels_dict[qid], k=10)
            total_map += ap
            count += 1

        avg_map = total_map / count if count > 0 else 0
        print(f"  - Params(k1={k1_val}, b={b_val}) -> MAP: {avg_map:.4f}")
        tuning_results.append({'k1': k1_val, 'b': b_val, 'MAP': avg_map})

        if avg_map > best_map:
            best_map = avg_map
            best_params = {'k1': k1_val, 'b': b_val}

print(f"\nBest: {best_params} (MAP: {best_map:.4f})")
pd.DataFrame(tuning_results).to_csv(os.path.join(DATA_DIR, 'tuning_results_sample.csv'), index=False)


[BM25 하이퍼파라미터 튜닝 (3차)]
  - Params(k1=2.0, b=1.0) -> MAP: 0.6325
  - Params(k1=2.0, b=2.0) -> MAP: 0.0419
  - Params(k1=2.0, b=3.0) -> MAP: 0.0282
  - Params(k1=4.0, b=1.0) -> MAP: 0.6378
  - Params(k1=4.0, b=2.0) -> MAP: 0.0377
  - Params(k1=4.0, b=3.0) -> MAP: 0.0326
  - Params(k1=6.0, b=1.0) -> MAP: 0.6354
  - Params(k1=6.0, b=2.0) -> MAP: 0.0337
  - Params(k1=6.0, b=3.0) -> MAP: 0.0348

Best: {'k1': 4.0, 'b': 1.0} (MAP: 0.6378)


In [12]:
print("\n[BM25 하이퍼파라미터 튜닝 (4차)]")

k1_values = [3.0, 3.5, 3.8, 4.0, 4.2, 4.5, 5.0]
b_values = [0.95, 0.98, 0.99, 1.0]

tuning_results = []
best_map = -1
best_params = {'k1': 4.0, 'b': 1.0}

for k1_val in k1_values:
    for b_val in b_values:
        total_map = 0
        count = 0
        for qid, q_tokens in processed_queries.items():
            if not any(d in df.index for d in qrels_dict[qid]):
                continue

            scores = calculate_scores(q_tokens, model_type='BM25', k1=k1_val, b=b_val)

            ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            ranked_ids = [d for d, s in ranked[:100]]

            _, _, ap = calculate_metrics(ranked_ids, qrels_dict[qid], k=10)
            total_map += ap
            count += 1

        avg_map = total_map / count if count > 0 else 0
        print(f"  - Params(k1={k1_val}, b={b_val}) -> MAP: {avg_map:.4f}")
        tuning_results.append({'k1': k1_val, 'b': b_val, 'MAP': avg_map})

        if avg_map > best_map:
            best_map = avg_map
            best_params = {'k1': k1_val, 'b': b_val}

print(f"\nFinal Best: {best_params} (MAP: {best_map:.4f})")
pd.DataFrame(tuning_results).to_csv(os.path.join(DATA_DIR, 'tuning_results_sample.csv'), index=False)


[BM25 하이퍼파라미터 튜닝 (4차)]
  - Params(k1=3.0, b=0.95) -> MAP: 0.6365
  - Params(k1=3.0, b=0.98) -> MAP: 0.6375
  - Params(k1=3.0, b=0.99) -> MAP: 0.6374
  - Params(k1=3.0, b=1.0) -> MAP: 0.6374
  - Params(k1=3.5, b=0.95) -> MAP: 0.6377
  - Params(k1=3.5, b=0.98) -> MAP: 0.6377
  - Params(k1=3.5, b=0.99) -> MAP: 0.6382
  - Params(k1=3.5, b=1.0) -> MAP: 0.6375
  - Params(k1=3.8, b=0.95) -> MAP: 0.6379
  - Params(k1=3.8, b=0.98) -> MAP: 0.6379
  - Params(k1=3.8, b=0.99) -> MAP: 0.6385
  - Params(k1=3.8, b=1.0) -> MAP: 0.6379
  - Params(k1=4.0, b=0.95) -> MAP: 0.6381
  - Params(k1=4.0, b=0.98) -> MAP: 0.6389
  - Params(k1=4.0, b=0.99) -> MAP: 0.6386
  - Params(k1=4.0, b=1.0) -> MAP: 0.6378
  - Params(k1=4.2, b=0.95) -> MAP: 0.6388
  - Params(k1=4.2, b=0.98) -> MAP: 0.6388
  - Params(k1=4.2, b=0.99) -> MAP: 0.6388
  - Params(k1=4.2, b=1.0) -> MAP: 0.6377
  - Params(k1=4.5, b=0.95) -> MAP: 0.6384
  - Params(k1=4.5, b=0.98) -> MAP: 0.6385
  - Params(k1=4.5, b=0.99) -> MAP: 0.6381
  - Params(k1=4

In [13]:
print("\n[회귀 데이터 생성 (Sample)]")
regression_data = []
metrics_data = []

models_to_run = [
    {'name': 'BIM', 'type': 'BIM', 'k1': 0, 'b': 0},
    {'name': 'BM25_Best', 'type': 'BM25', 'k1': best_params['k1'], 'b': best_params['b']}
]

for model_info in models_to_run:
    m_name = model_info['name']
    for qid, q_tokens in tqdm(processed_queries.items(), desc=m_name):
        target_docs = qrels_dict[qid]
        valid_targets = [d for d in target_docs if d in df.index]
        if not valid_targets: continue

        scores = calculate_scores(q_tokens, model_type=model_info['type'],
                                  k1=model_info['k1'], b=model_info['b'])

        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        ranked_ids = [d for d, s in ranked[:100]]

        p10, r10, ap = calculate_metrics(ranked_ids, target_docs, k=10)
        metrics_data.append({'qid': qid, 'model': m_name, 'P@10': p10, 'R@10': r10, 'AP': ap})

        top_10_ids = set(ranked_ids[:10])

        for target_doc in valid_targets:
            is_success = 1 if target_doc in top_10_ids else 0
            doc_len = df.at[target_doc, 'doc_length']
            topic_probs = df.at[target_doc, 'topic_probs']

            row = {
                'qid': qid, 'doc_id': target_doc, 'model': m_name,
                'success': is_success, 'doc_length': doc_len,
                'query_length': len(q_tokens),
            }
            for idx, prob in enumerate(topic_probs):
                row[f'topic_{idx}'] = prob
            regression_data.append(row)

conn.close()

df_reg = pd.DataFrame(regression_data)
df_met = pd.DataFrame(metrics_data)

reg_path = os.path.join(DATA_DIR, 'regression_dataset_sample.csv')
met_path = os.path.join(DATA_DIR, 'performance_metrics_sample.csv')

df_reg.to_csv(reg_path, index=False)
df_met.to_csv(met_path, index=False)

print("\n[샘플링 작업 완료]")
print(f"회귀 데이터: {reg_path}")
print(f"성능 지표: {met_path}")


[회귀 데이터 생성 (Sample)]


BIM: 100%|██████████| 1454/1454 [01:03<00:00, 22.80it/s]
BM25_Best: 100%|██████████| 1454/1454 [01:34<00:00, 15.34it/s]



[샘플링 작업 완료]
회귀 데이터: ..\final_data\regression_dataset_sample.csv
성능 지표: ..\final_data\performance_metrics_sample.csv
