In [1]:
import os
import sqlite3
import pickle
import math
import pandas as pd
import numpy as np
from kiwipiepy import Kiwi
from tqdm import tqdm
import warnings

# Notebook setup: resolve paths, load the sampled corpus plus queries/qrels,
# and create a fresh SQLite database holding an (initially empty) inverted index.

# Suppresses every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# All inputs and outputs live under the parent directory of the notebook's
# working directory.
BASE_DIR = os.path.join('..')
DATA_DIR = os.path.join(BASE_DIR, 'data_final')
DB_DIR = os.path.join(BASE_DIR, 'database_final')
SAMPLED_DATA_PATH = os.path.join(DATA_DIR, 'sampled_data_final.pkl')
QUERIES_PATH = os.path.join(BASE_DIR, 'data', 'queries.pkl')
QRELS_PATH = os.path.join(BASE_DIR, 'data', 'qrels.pkl')

os.makedirs(DB_DIR, exist_ok=True)
DB_PATH = os.path.join(DB_DIR, 'search_index_final.db')

print("샘플 데이터 로드")
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that were produced by a trusted step of this project.
df = pd.read_pickle(SAMPLED_DATA_PATH)

# Index the corpus by document id ('_id' preferred, 'doc_id' as fallback) and
# normalise the ids to strings. If neither column exists the existing index is
# kept as-is and merely cast to str.
if '_id' in df.columns:
    df.set_index('_id', inplace=True)
elif 'doc_id' in df.columns:
    df.set_index('doc_id', inplace=True)
df.index = df.index.astype(str)

print(f"로드된 문서 수: {len(df)}개")

# NOTE(review): same pickle caveat as above applies to both files.
with open(QUERIES_PATH, 'rb') as f:
    queries_data = pickle.load(f)
with open(QRELS_PATH, 'rb') as f:
    qrels_data = pickle.load(f)

# Always rebuild the index database from scratch: any previous file is deleted.
if os.path.exists(DB_PATH):
    os.remove(DB_PATH)

# `conn` and `cursor` are module-level handles reused by the following cells.
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# One row per (term, document) pair with the term frequency in that document.
cursor.execute('CREATE TABLE inverted_index (term TEXT, doc_id TEXT, tf INTEGER)')
# Index on `term` so per-term lookups avoid a full table scan. Being the
# last expression of the cell, this call's return value (the cursor) is what
# the notebook displays as the cell output.
cursor.execute('CREATE INDEX idx_term ON inverted_index(term)')

샘플 데이터 로드
로드된 문서 수: 5000개


<sqlite3.Cursor at 0x164dce546c0>

In [2]:
# Build the inverted index: one (term, doc_id, tf) row per distinct term of
# each document, plus the corpus statistics needed for scoring
# (doc_freq, total_docs, avg_dl).
print("\n역색인 구축 (패딩 O 제외)")
BATCH_SIZE = 50000
INSERT_SQL = 'INSERT INTO inverted_index VALUES (?, ?, ?)'

# Make the cell idempotent: without this, re-running it appends a second copy
# of every posting, so each term lookup returns duplicate rows and document
# scores are inflated. On a freshly created table this is a no-op.
cursor.execute('DELETE FROM inverted_index')

data_to_insert = []
doc_freq = {}                      # term -> number of documents containing it
total_docs = len(df)
avg_dl = df['doc_length'].mean()   # average document length for BM25 normalisation

# Iterate over (index label, token list) directly; iterrows() would build a
# full Series for every row only to read a single column from it.
for doc_id, tokens in tqdm(zip(df.index, df['tokens_padded']), total=total_docs, desc="인덱싱"):
    term_counts = {}

    for t in tokens:
        # Skip padding tokens, i.e. tokens made up solely of the character 'O'.
        if set(t) == {'O'}:
            continue
        term_counts[t] = term_counts.get(t, 0) + 1

    for term, tf in term_counts.items():
        data_to_insert.append((term, str(doc_id), tf))
        doc_freq[term] = doc_freq.get(term, 0) + 1

    # Flush in batches to bound the size of the pending-row buffer.
    if len(data_to_insert) >= BATCH_SIZE:
        cursor.executemany(INSERT_SQL, data_to_insert)
        data_to_insert = []

# Flush the final partial batch and make everything durable.
if data_to_insert:
    cursor.executemany(INSERT_SQL, data_to_insert)
conn.commit()
print("인덱싱 완료")


역색인 구축 (패딩 O 제외)


인덱싱: 100%|██████████| 5000/5000 [00:35<00:00, 140.15it/s]


인덱싱 완료


In [3]:
# Morphological analyser instance shared by every query tokenisation call.
kiwi = Kiwi(num_workers=0)

# Inverse document frequency of every indexed term, computed once up front:
#   idf(t) = ln((N - n_t + 0.5) / (n_t + 0.5) + 1)
# where N is the corpus size and n_t the number of documents containing t.
idf_cache = {
    term: math.log((total_docs - n_containing + 0.5) / (n_containing + 0.5) + 1)
    for term, n_containing in doc_freq.items()
}

# Part-of-speech tags kept as query terms.
_QUERY_POS_TAGS = frozenset({'NNG', 'NNP', 'VV', 'VA', 'MAG'})


def tokenize_query(text):
    """Tokenise a query string into the surface forms used as search terms.

    Args:
        text: Raw query text handed to ``kiwi.tokenize``.

    Returns:
        list[str]: ``form`` of every token whose ``tag`` is one of
        ``_QUERY_POS_TAGS``, in original order. An empty list is returned when
        tokenisation fails, so a single bad query does not abort a batch run.
    """
    try:
        return [tok.form for tok in kiwi.tokenize(text) if tok.tag in _QUERY_POS_TAGS]
    except Exception:
        # Deliberately best-effort for ordinary errors only. A bare `except:`
        # would also swallow KeyboardInterrupt/SystemExit, making the long
        # evaluation loops impossible to interrupt.
        return []

def calculate_scores(query_tokens, model_type='BM25', k1=1.2, b=0.75):
    """Score every document that shares at least one term with the query.

    Postings are read from the ``inverted_index`` table through the
    module-level ``cursor``; term weights come from ``idf_cache`` and document
    lengths from ``df['doc_length']`` / ``avg_dl``.

    Args:
        query_tokens: Iterable of query terms. Terms missing from
            ``idf_cache`` are ignored; a repeated term contributes once per
            occurrence.
        model_type: ``'BIM'`` adds the plain IDF for each matching term; any
            other value applies the BM25 term-frequency/length weighting.
        k1: BM25 term-frequency saturation parameter.
        b: BM25 length-normalisation parameter.

    Returns:
        dict: doc_id -> accumulated score (only documents with a match).
    """
    binary_model = model_type == 'BIM'
    totals = {}

    for token in query_tokens:
        weight = idf_cache.get(token)
        if weight is None:
            # Term never seen at indexing time: nothing to look up.
            continue

        postings = cursor.execute(
            'SELECT doc_id, tf FROM inverted_index WHERE term = ?', (token,)
        ).fetchall()

        for matched_doc, freq in postings:
            if binary_model:
                contribution = weight
            else:
                length = df.at[matched_doc, 'doc_length']
                numerator = freq * (k1 + 1)
                denominator = freq + k1 * (1 - b + b * (length / avg_dl))
                contribution = weight * (numerator / denominator)
            totals[matched_doc] = totals.get(matched_doc, 0.0) + contribution

    return totals

def calculate_metrics(ranked_docs, relevant_docs, k=10):
    """Compute P@k, R@k and average precision for one ranked result list.

    Args:
        ranked_docs: Sequence of document ids, best first.
        relevant_docs: Collection of relevant document ids. Any iterable is
            accepted; it is converted to a set when it is not one already.
        k: Cut-off for precision/recall; must be positive.

    Returns:
        tuple: ``(p_at_k, r_at_k, ap)``. Recall and AP are 0 when there are
        no relevant documents. AP is taken over the whole of ``ranked_docs``
        and normalised by the total number of relevant documents.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # `set & list` raises TypeError, so normalise the judgments first.
    if isinstance(relevant_docs, (set, frozenset)):
        relevant = relevant_docs
    else:
        relevant = set(relevant_docs)
    total_relevant = len(relevant)

    relevant_retrieved_k = len(set(ranked_docs[:k]) & relevant)
    p_at_k = relevant_retrieved_k / k
    r_at_k = relevant_retrieved_k / total_relevant if total_relevant > 0 else 0

    # Credit each relevant document only at its first occurrence; counting a
    # repeated id again would let AP exceed 1 (and disagree with P@k, which is
    # already computed on a de-duplicated top-k).
    credited = set()
    precision_sum = 0.0
    for rank, doc_id in enumerate(ranked_docs, start=1):
        if doc_id in relevant and doc_id not in credited:
            credited.add(doc_id)
            precision_sum += len(credited) / rank

    ap = precision_sum / total_relevant if total_relevant > 0 else 0
    return p_at_k, r_at_k, ap

# Relevance judgments: query id -> set of relevant document ids (all as str).
# NOTE(review): any relevance grade stored alongside the ids is ignored here,
# so every listed pair is treated as relevant -- confirm that is intended.
qrels_dict = {}
for item in qrels_data:
    raw_qid = item.get('query-id')
    raw_doc_id = item.get('corpus-id')
    # Test for missing ids BEFORE converting: str(None) is the truthy string
    # 'None', which would otherwise slip through the emptiness check below and
    # create a bogus 'None' query / document entry.
    if raw_qid is None or raw_doc_id is None:
        continue
    qid, doc_id = str(raw_qid), str(raw_doc_id)
    if qid and doc_id:
        qrels_dict.setdefault(qid, set()).add(doc_id)

# Query id -> raw query text, skipping records without an '_id'.
queries_map = {str(q['_id']): q['text'] for q in queries_data if '_id' in q}

print("\n쿼리 전처리 중")
# Tokenise only queries that have judgments; queries that yield no usable
# tokens are dropped from the experiment.
processed_queries = {}
for qid in qrels_dict:
    if qid in queries_map:
        tokens = tokenize_query(queries_map[qid])
        if tokens:
            processed_queries[qid] = tokens

print(f"실험 대상 쿼리 수: {len(processed_queries)}개")


쿼리 전처리 중
실험 대상 쿼리 수: 1454개


In [4]:
# Grid search over BM25 (k1, b): each pair is scored by mean average precision
# over the top-100 results of every eligible query.
print("\n[BM25 하이퍼파라미터 튜닝 시작]")

k1_values = [2.5, 2.75, 3.0, 3.25, 3.5, 3.75, 4.0, 4.25, 4.5]
b_values = [0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.98, 0.99]

tuning_results = []
# best_map starts below any reachable MAP, so the placeholder parameters are
# replaced by the first evaluated pair.
best_map = -1
best_params = {'k1': 4.0, 'b': 1.0}

print(f"튜닝 후보: k1={k1_values}, b={b_values} (총 {len(k1_values)*len(b_values)}회)")

# Only queries with at least one judged-relevant document present in the
# sampled corpus are evaluated. The filter does not depend on (k1, b), so it is
# computed once instead of once per parameter pair.
eligible_queries = [
    (qid, q_tokens)
    for qid, q_tokens in processed_queries.items()
    if any(d in df.index for d in qrels_dict[qid])
]

for k1_val in k1_values:
    for b_val in b_values:
        total_map = 0
        count = 0

        for qid, q_tokens in tqdm(eligible_queries, desc=f"k1={k1_val}, b={b_val}", leave=False):
            scores = calculate_scores(q_tokens, model_type='BM25', k1=k1_val, b=b_val)

            # Keep the 100 highest-scoring documents for AP.
            ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            ranked_ids = [d for d, s in ranked[:100]]

            _, _, ap = calculate_metrics(ranked_ids, qrels_dict[qid], k=10)
            total_map += ap
            count += 1

        avg_map = total_map / count if count > 0 else 0
        print(f"  - k1={k1_val}, b={b_val} -> MAP: {avg_map:.4f}")

        tuning_results.append({'k1': k1_val, 'b': b_val, 'MAP': avg_map})

        # Strict '>' keeps the earliest pair on ties.
        if avg_map > best_map:
            best_map = avg_map
            best_params = {'k1': k1_val, 'b': b_val}

print(f"\n최적 파라미터: {best_params} (MAP: {best_map:.4f})")

pd.DataFrame(tuning_results).to_csv(os.path.join(DATA_DIR, 'tuning_results_final.csv'), index=False)


[BM25 하이퍼파라미터 튜닝 시작]
튜닝 후보: k1=[2.5, 2.75, 3.0, 3.25, 3.5, 3.75, 4.0, 4.25, 4.5], b=[0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.98, 0.99] (총 81회)


                                                                   

  - k1=2.5, b=0.65 -> MAP: 0.6142


                                                                  

  - k1=2.5, b=0.7 -> MAP: 0.6219


                                                                   

  - k1=2.5, b=0.75 -> MAP: 0.6290


                                                                  

  - k1=2.5, b=0.8 -> MAP: 0.6368


                                                                   

  - k1=2.5, b=0.85 -> MAP: 0.6424


                                                                  

  - k1=2.5, b=0.9 -> MAP: 0.6482


                                                                   

  - k1=2.5, b=0.95 -> MAP: 0.6526


                                                                   

  - k1=2.5, b=0.98 -> MAP: 0.6537


                                                                   

  - k1=2.5, b=0.99 -> MAP: 0.6539


                                                                    

  - k1=2.75, b=0.65 -> MAP: 0.6157


                                                                   

  - k1=2.75, b=0.7 -> MAP: 0.6238


                                                                    

  - k1=2.75, b=0.75 -> MAP: 0.6303


                                                                   

  - k1=2.75, b=0.8 -> MAP: 0.6380


                                                                    

  - k1=2.75, b=0.85 -> MAP: 0.6441


                                                                   

  - k1=2.75, b=0.9 -> MAP: 0.6499


                                                                    

  - k1=2.75, b=0.95 -> MAP: 0.6539


                                                                    

  - k1=2.75, b=0.98 -> MAP: 0.6549


                                                                    

  - k1=2.75, b=0.99 -> MAP: 0.6553


                                                                   

  - k1=3.0, b=0.65 -> MAP: 0.6166


                                                                  

  - k1=3.0, b=0.7 -> MAP: 0.6248


                                                                   

  - k1=3.0, b=0.75 -> MAP: 0.6312


                                                                  

  - k1=3.0, b=0.8 -> MAP: 0.6388


                                                                   

  - k1=3.0, b=0.85 -> MAP: 0.6456


                                                                  

  - k1=3.0, b=0.9 -> MAP: 0.6512


                                                                   

  - k1=3.0, b=0.95 -> MAP: 0.6548


                                                                   

  - k1=3.0, b=0.98 -> MAP: 0.6565


                                                                   

  - k1=3.0, b=0.99 -> MAP: 0.6573


                                                                    

  - k1=3.25, b=0.65 -> MAP: 0.6167


                                                                   

  - k1=3.25, b=0.7 -> MAP: 0.6241


                                                                    

  - k1=3.25, b=0.75 -> MAP: 0.6322


                                                                   

  - k1=3.25, b=0.8 -> MAP: 0.6393


                                                                    

  - k1=3.25, b=0.85 -> MAP: 0.6472


                                                                   

  - k1=3.25, b=0.9 -> MAP: 0.6527


                                                                    

  - k1=3.25, b=0.95 -> MAP: 0.6564


                                                                    

  - k1=3.25, b=0.98 -> MAP: 0.6573


                                                                    

  - k1=3.25, b=0.99 -> MAP: 0.6577


                                                                   

  - k1=3.5, b=0.65 -> MAP: 0.6173


                                                                  

  - k1=3.5, b=0.7 -> MAP: 0.6243


                                                                   

  - k1=3.5, b=0.75 -> MAP: 0.6327


                                                                  

  - k1=3.5, b=0.8 -> MAP: 0.6397


                                                                   

  - k1=3.5, b=0.85 -> MAP: 0.6485


                                                                  

  - k1=3.5, b=0.9 -> MAP: 0.6548


                                                                   

  - k1=3.5, b=0.95 -> MAP: 0.6573


                                                                   

  - k1=3.5, b=0.98 -> MAP: 0.6584


                                                                   

  - k1=3.5, b=0.99 -> MAP: 0.6588


                                                                    

  - k1=3.75, b=0.65 -> MAP: 0.6169


                                                                   

  - k1=3.75, b=0.7 -> MAP: 0.6241


                                                                    

  - k1=3.75, b=0.75 -> MAP: 0.6326


                                                                   

  - k1=3.75, b=0.8 -> MAP: 0.6404


                                                                    

  - k1=3.75, b=0.85 -> MAP: 0.6483


                                                                   

  - k1=3.75, b=0.9 -> MAP: 0.6554


                                                                    

  - k1=3.75, b=0.95 -> MAP: 0.6580


                                                                    

  - k1=3.75, b=0.98 -> MAP: 0.6590


                                                                    

  - k1=3.75, b=0.99 -> MAP: 0.6590


                                                                   

  - k1=4.0, b=0.65 -> MAP: 0.6170


                                                                  

  - k1=4.0, b=0.7 -> MAP: 0.6245


                                                                   

  - k1=4.0, b=0.75 -> MAP: 0.6332


                                                                  

  - k1=4.0, b=0.8 -> MAP: 0.6405


                                                                   

  - k1=4.0, b=0.85 -> MAP: 0.6490


                                                                  

  - k1=4.0, b=0.9 -> MAP: 0.6552


                                                                   

  - k1=4.0, b=0.95 -> MAP: 0.6585


                                                                   

  - k1=4.0, b=0.98 -> MAP: 0.6596


                                                                   

  - k1=4.0, b=0.99 -> MAP: 0.6596


                                                                    

  - k1=4.25, b=0.65 -> MAP: 0.6169


                                                                   

  - k1=4.25, b=0.7 -> MAP: 0.6243


                                                                    

  - k1=4.25, b=0.75 -> MAP: 0.6336


                                                                   

  - k1=4.25, b=0.8 -> MAP: 0.6412


                                                                    

  - k1=4.25, b=0.85 -> MAP: 0.6497


                                                                   

  - k1=4.25, b=0.9 -> MAP: 0.6554


                                                                    

  - k1=4.25, b=0.95 -> MAP: 0.6592


                                                                    

  - k1=4.25, b=0.98 -> MAP: 0.6596


                                                                    

  - k1=4.25, b=0.99 -> MAP: 0.6596


                                                                   

  - k1=4.5, b=0.65 -> MAP: 0.6168


                                                                  

  - k1=4.5, b=0.7 -> MAP: 0.6251


                                                                   

  - k1=4.5, b=0.75 -> MAP: 0.6334


                                                                  

  - k1=4.5, b=0.8 -> MAP: 0.6417


                                                                   

  - k1=4.5, b=0.85 -> MAP: 0.6496


                                                                  

  - k1=4.5, b=0.9 -> MAP: 0.6561


                                                                   

  - k1=4.5, b=0.95 -> MAP: 0.6595


                                                                   

  - k1=4.5, b=0.98 -> MAP: 0.6604


                                                                   

  - k1=4.5, b=0.99 -> MAP: 0.6602

최적 파라미터: {'k1': 4.5, 'b': 0.98} (MAP: 0.6604)




In [13]:
# Second tuning pass: probe larger k1 values and keep the overall best
# (k1, b) across both passes in `best_params` / `best_map`.
print("[BM25 하이퍼파라미터 추가 탐색 - k1 고구간]")

k1_values = [5.0, 5.25, 5.5]
b_values = [0.95, 0.98, 0.99]

candidate_params = [
    {'k1': k1_val, 'b': b_val}
    for k1_val in k1_values
    for b_val in b_values
]

print(f"튜닝 후보: k1={k1_values}, b={b_values} (총 {len(k1_values)*len(b_values)}회)")

# Baseline to beat. Prefer the exact best row saved by the first tuning pass;
# the literals below were copied from rounded printed output and are kept only
# as a fallback when that file is unavailable.
best_map = 0.6604
best_params = {'k1': 4.5, 'b': 0.98}
first_pass_csv = os.path.join(DATA_DIR, 'tuning_results_final.csv')
if os.path.exists(first_pass_csv):
    first_pass = pd.read_csv(first_pass_csv)
    if not first_pass.empty:
        # idxmax returns the first maximum, matching the strict '>' rule used
        # when the first pass selected its winner.
        top_row = first_pass.loc[first_pass['MAP'].idxmax()]
        best_map = float(top_row['MAP'])
        best_params = {'k1': float(top_row['k1']), 'b': float(top_row['b'])}

tuning_results = []

# calculate_scores reads the module-level `cursor`, so rebind it to a fresh
# connection and guarantee the connection is released even if a run fails.
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
try:
    for params in candidate_params:
        k1_val = params['k1']
        b_val = params['b']

        total_map = 0
        count = 0

        for qid, q_tokens in tqdm(processed_queries.items(), desc=f"k1={k1_val}, b={b_val}", leave=False):
            # Skip queries none of whose relevant documents are in the corpus.
            if not any(d in df.index for d in qrels_dict[qid]):
                continue

            scores = calculate_scores(q_tokens, model_type='BM25', k1=k1_val, b=b_val)
            ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            ranked_ids = [d for d, s in ranked[:100]]
            _, _, ap = calculate_metrics(ranked_ids, qrels_dict[qid], k=10)
            total_map += ap
            count += 1

        avg_map = total_map / count if count > 0 else 0

        tuning_results.append({
            'k1': k1_val,
            'b': b_val,
            'MAP': avg_map,
            'queries': count
        })

        print(f"  - k1={k1_val}, b={b_val} -> MAP: {avg_map:.4f}")

        if avg_map > best_map:
            best_map = avg_map
            best_params = {'k1': k1_val, 'b': b_val}
finally:
    conn.close()

print(f"\n최종 최적 파라미터: {best_params} (MAP: {best_map:.4f})")

df_extended = pd.DataFrame(tuning_results)
df_extended.to_csv(os.path.join(DATA_DIR, 'tuning_extended_final.csv'), index=False)
print("저장 완료: tuning_extended_final.csv")

[BM25 하이퍼파라미터 추가 탐색 - k1 고구간]
튜닝 후보: k1=[5.0, 5.25, 5.5], b=[0.95, 0.98, 0.99] (총 9회)


                                                                   

  - k1=5.0, b=0.95 -> MAP: 0.6596


                                                                   

  - k1=5.0, b=0.98 -> MAP: 0.6604


                                                                   

  - k1=5.0, b=0.99 -> MAP: 0.6609


                                                                    

  - k1=5.25, b=0.95 -> MAP: 0.6599


                                                                    

  - k1=5.25, b=0.98 -> MAP: 0.6608


                                                                    

  - k1=5.25, b=0.99 -> MAP: 0.6612


                                                                   

  - k1=5.5, b=0.95 -> MAP: 0.6603


                                                                   

  - k1=5.5, b=0.98 -> MAP: 0.6610


                                                                   

  - k1=5.5, b=0.99 -> MAP: 0.6620

최종 최적 파라미터: {'k1': 5.5, 'b': 0.99} (MAP: 0.6620)
저장 완료: tuning_extended_final.csv




In [14]:
# Final evaluation: run BIM and the tuned BM25 over every processed query,
# record per-query metrics, and emit one regression row per (query, document)
# pair for the top-10 results plus any relevant corpus document outside them.
print("\n[최종 평가 및 회귀 데이터셋 생성]")


def _build_regression_row(qid, doc_id, m_name, relevance, q_tokens, scores,
                          query_avg_token_len, query_unique_ratio):
    """Assemble the feature row for one (query, document) pair.

    Document features are read from ``df``; ``scores`` is the doc_id -> score
    mapping of the current query (0.0 is used when the document was not
    retrieved at all).
    """
    # A set gives O(1) membership tests; padding tokens (made only of 'O')
    # are excluded, as they were at indexing time.
    doc_terms = {t for t in df.at[doc_id, 'tokens_padded'] if set(t) != {'O'}}
    query_match_count = sum(1 for qt in q_tokens if qt in doc_terms)
    query_match_ratio = query_match_count / len(q_tokens) if q_tokens else 0

    return {
        'qid': qid,
        'doc_id': doc_id,
        'model': m_name,
        'relevance': relevance,
        'doc_length': df.at[doc_id, 'doc_length'],
        'query_length': len(q_tokens),
        'query_avg_token_len': query_avg_token_len,
        'query_unique_ratio': query_unique_ratio,
        'query_match_count': query_match_count,
        'query_match_ratio': query_match_ratio,
        'dominant_topic': df.at[doc_id, 'dominant_topic'],
        'dominant_prob': df.at[doc_id, 'dominant_prob'],
        'search_score': scores.get(doc_id, 0.0)
    }


regression_data = []
metrics_data = []

models_to_run = [
    # k1/b are not used by the BIM branch of calculate_scores.
    {'name': 'BIM', 'type': 'BIM', 'k1': 0, 'b': 0},
    {'name': 'BM25_Best', 'type': 'BM25', 'k1': best_params['k1'], 'b': best_params['b']}
]

# calculate_scores reads the module-level `cursor`; rebind it to a fresh
# connection and make sure the connection is closed even if a run fails.
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
try:
    for model_info in models_to_run:
        m_name = model_info['name']
        print(f"  - 모델 실행: {m_name}")

        # NOTE(review): unlike the tuning cells, queries whose relevant
        # documents are all outside the sampled corpus are NOT skipped here --
        # confirm this difference is intended.
        for qid, q_tokens in tqdm(processed_queries.items(), desc=m_name):
            target_docs = qrels_dict[qid]

            scores = calculate_scores(q_tokens, model_type=model_info['type'],
                                      k1=model_info['k1'], b=model_info['b'])

            ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            ranked_ids = [d for d, s in ranked[:100]]

            p10, r10, ap = calculate_metrics(ranked_ids, target_docs, k=10)
            metrics_data.append({
                'qid': qid,
                'model': m_name,
                'P@10': p10, 'R@10': r10, 'AP': ap
            })

            top_10_ids = set(ranked_ids[:10])

            # Query-level features, shared by every row of this query.
            query_avg_token_len = np.mean([len(t) for t in q_tokens]) if q_tokens else 0
            query_unique_ratio = len(set(q_tokens)) / len(q_tokens) if q_tokens else 0

            # Rows for the ten best-ranked documents, labelled by the qrels.
            for doc_id in ranked_ids[:10]:
                if doc_id not in df.index:
                    continue
                is_relevant = 1 if doc_id in target_docs else 0
                regression_data.append(_build_regression_row(
                    qid, doc_id, m_name, is_relevant, q_tokens, scores,
                    query_avg_token_len, query_unique_ratio))

            # Rows for relevant corpus documents that missed the top 10, so
            # the dataset also contains the positives the model failed to rank.
            for target_doc in target_docs:
                if target_doc in top_10_ids:
                    continue
                if target_doc not in df.index:
                    continue
                regression_data.append(_build_regression_row(
                    qid, target_doc, m_name, 1, q_tokens, scores,
                    query_avg_token_len, query_unique_ratio))
finally:
    conn.close()


[최종 평가 및 회귀 데이터셋 생성]
  - 모델 실행: BIM


BIM: 100%|██████████| 1454/1454 [02:41<00:00,  8.99it/s]


  - 모델 실행: BM25_Best


BM25_Best: 100%|██████████| 1454/1454 [03:15<00:00,  7.45it/s]


In [15]:
# Persist both result tables as CSV files under DATA_DIR and report their sizes.
print("\n데이터셋 저장")
regression_csv_path = os.path.join(DATA_DIR, 'regression_dataset_final.csv')
metrics_csv_path = os.path.join(DATA_DIR, 'performance_metrics_final.csv')

regression_frame = pd.DataFrame(regression_data)
regression_frame.to_csv(regression_csv_path, index=False)

metrics_frame = pd.DataFrame(metrics_data)
metrics_frame.to_csv(metrics_csv_path, index=False)

print("\n작업 완료")
n_regression_rows = len(regression_data)
n_metric_rows = len(metrics_data)
print(f"회귀 데이터셋: {n_regression_rows}개 샘플")
print(f"성능 메트릭: {n_metric_rows}개 레코드")


데이터셋 저장

작업 완료
회귀 데이터셋: 33016개 샘플
성능 메트릭: 2908개 레코드
