In [1]:
import os
import sqlite3
import pickle
import math
import pandas as pd
import numpy as np
from kiwipiepy import Kiwi
from tqdm import tqdm
import warnings

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# ---- Filesystem layout (all paths are relative to the notebook's parent dir) ----
BASE_DIR = os.path.join('..')
DATA_DIR = os.path.join(BASE_DIR, 'data_final')
DB_DIR = os.path.join(BASE_DIR, 'database_final')
SAMPLED_DATA_PATH = os.path.join(DATA_DIR, 'sampled_data_v2.pkl')
QUERIES_PATH = os.path.join(BASE_DIR, 'data', 'queries.pkl')
QRELS_PATH = os.path.join(BASE_DIR, 'data', 'qrels.pkl')

os.makedirs(DB_DIR, exist_ok=True)
DB_PATH = os.path.join(DB_DIR, 'search_index_v2.db')

# ---- Sampled corpus ----
print("샘플 데이터 로드")
df = pd.read_pickle(SAMPLED_DATA_PATH)

# Use the first available id column ('_id' takes priority over 'doc_id') as the
# index; if neither exists the current index is kept. Ids are always strings so
# they compare equal to the TEXT doc_id values stored in SQLite.
id_column = next((col for col in ('_id', 'doc_id') if col in df.columns), None)
if id_column is not None:
    df = df.set_index(id_column)
df.index = df.index.astype(str)

print(f"로드된 문서 수: {len(df)}개")

# ---- Queries and relevance judgements ----
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
with open(QUERIES_PATH, 'rb') as queries_file:
    queries_data = pickle.load(queries_file)
with open(QRELS_PATH, 'rb') as qrels_file:
    qrels_data = pickle.load(qrels_file)

# ---- Fresh SQLite inverted index ----
# Any database file left over from a previous run is discarded so the index is
# always rebuilt from scratch.
if os.path.exists(DB_PATH):
    os.remove(DB_PATH)

conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
# One row per (term, document) posting; the index on `term` serves the
# per-term lookups issued at query time.
cursor.execute('CREATE TABLE inverted_index (term TEXT, doc_id TEXT, tf INTEGER)')
cursor.execute('CREATE INDEX idx_term ON inverted_index(term)')

샘플 데이터 로드
로드된 문서 수: 5000개


<sqlite3.Cursor at 0x2132d644840>

In [2]:
print("\n역색인 구축 (패딩 O 제외)")

# Make the cell idempotent: without this, re-running the cell appends a second
# copy of every posting, and each duplicate row would contribute again to the
# document scores computed at query time.
cursor.execute('DELETE FROM inverted_index')

INSERT_SQL = 'INSERT INTO inverted_index VALUES (?, ?, ?)'
BATCH_SIZE = 50000       # flush to SQLite once at least this many postings are buffered
data_to_insert = []      # pending (term, doc_id, tf) rows

doc_freq = {}            # term -> number of documents containing the term
total_docs = len(df)
avg_dl = df['doc_length'].mean()   # average document length used for length normalization

# Only the token column is needed, so iterate over that Series directly instead
# of materializing a full row object per document with iterrows().
for doc_id, tokens in tqdm(df['tokens_padded'].items(), total=total_docs, desc="인덱싱"):
    term_counts = {}

    for t in tokens:
        # Skip padding tokens, i.e. tokens made up exclusively of the character 'O'.
        if set(t) == {'O'}:
            continue
        term_counts[t] = term_counts.get(t, 0) + 1

    for term, tf in term_counts.items():
        data_to_insert.append((term, str(doc_id), tf))
        doc_freq[term] = doc_freq.get(term, 0) + 1

    # Flush between documents so a document's postings are never split mid-way.
    if len(data_to_insert) >= BATCH_SIZE:
        cursor.executemany(INSERT_SQL, data_to_insert)
        data_to_insert = []

# Flush whatever is left in the buffer, then persist everything in one commit.
if data_to_insert:
    cursor.executemany(INSERT_SQL, data_to_insert)
conn.commit()
print("인덱싱 완료")


역색인 구축 (패딩 O 제외)


인덱싱: 100%|██████████| 5000/5000 [00:24<00:00, 201.76it/s]


인덱싱 완료


In [3]:
# Analyzer instance that tokenize_query() uses to split query text.
kiwi = Kiwi(num_workers=0)

# Pre-compute one IDF weight per indexed term:
#     idf(t) = ln((N - n_t + 0.5) / (n_t + 0.5) + 1)
# where N is the corpus size and n_t the number of documents containing t.
# The "+ 1" inside the logarithm keeps the weight positive for every term.
idf_cache = {}
for term, n_containing in doc_freq.items():
    idf_cache[term] = math.log((total_docs - n_containing + 0.5) / (n_containing + 0.5) + 1)

def tokenize_query(text, tokenizer=None):
    """Tokenize a query string and keep only content-bearing morphemes.

    Args:
        text: Raw query text.
        tokenizer: Optional object exposing ``tokenize(text)`` that yields items
            with ``.form`` and ``.tag`` attributes. Defaults to the module-level
            ``kiwi`` analyzer, so existing one-argument calls are unaffected.

    Returns:
        List of surface forms whose tag is one of NNG, NNP, VV, VA or MAG
        (presumably common noun, proper noun, verb, adjective and general
        adverb in Kiwi's tag set - confirm against the Kiwi documentation).
        An empty list is returned when the analyzer raises an ordinary exception.
    """
    content_tags = {'NNG', 'NNP', 'VV', 'VA', 'MAG'}
    # Resolved outside the try block so that a missing analyzer (e.g. the cell
    # defining `kiwi` was never run) surfaces instead of silently producing [].
    analyzer = kiwi if tokenizer is None else tokenizer
    try:
        return [tok.form for tok in analyzer.tokenize(text) if tok.tag in content_tags]
    except Exception:
        # Best-effort: a query the analyzer cannot handle is treated as empty.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so the notebook can still be interrupted.
        return []

def calculate_scores(query_tokens, model_type='BM25', k1=1.2, b=0.75):
    """Score every document that shares at least one term with the query.

    Args:
        query_tokens: Iterable of query terms; a term that occurs several
            times contributes once per occurrence.
        model_type: 'BIM' adds the bare IDF for each matching term; any other
            value applies the BM25 term-frequency/length-normalized weight.
        k1: BM25 term-frequency saturation parameter (unused for 'BIM').
        b: BM25 length-normalization parameter (unused for 'BIM').

    Returns:
        Dict mapping doc_id to its accumulated score. Documents that match
        none of the query terms are absent.

    Relies on the module-level ``idf_cache``, ``cursor``, ``df`` and ``avg_dl``.
    """
    use_bim = model_type == 'BIM'
    accumulated = {}

    for token in query_tokens:
        token_idf = idf_cache.get(token)
        if token_idf is None:
            # Term never seen at indexing time: no postings to look up.
            continue

        cursor.execute('SELECT doc_id, tf FROM inverted_index WHERE term = ?', (token,))
        for posting_doc, term_freq in cursor.fetchall():
            if use_bim:
                weight = token_idf
            else:
                length = df.at[posting_doc, 'doc_length']
                numerator = term_freq * (k1 + 1)
                denominator = term_freq + k1 * (1 - b + b * (length / avg_dl))
                weight = token_idf * (numerator / denominator)
            accumulated[posting_doc] = accumulated.get(posting_doc, 0.0) + weight

    return accumulated

def calculate_metrics(ranked_docs, relevant_docs, k=10):
    """Compute precision@k, recall@k and average precision for one ranking.

    Args:
        ranked_docs: Document ids ordered from best to worst.
        relevant_docs: Ids judged relevant. Any iterable is accepted; it is
            converted to a set when it is not one already.
        k: Cut-off for precision/recall. A non-positive ``k`` yields a
            precision and recall of zero instead of a ZeroDivisionError.

    Returns:
        Tuple ``(p_at_k, r_at_k, ap)``. Recall and AP are 0 when there are no
        relevant documents. AP is accumulated over the whole of ``ranked_docs``
        (not just the top k) and normalized by the total number of relevant
        documents, so relevant documents missing from the ranking lower it.
    """
    if isinstance(relevant_docs, (set, frozenset)):
        relevant = relevant_docs
    else:
        # Lists/tuples previously failed on `set & list`; a set also keeps the
        # membership tests in the AP loop constant-time.
        relevant = set(relevant_docs)
    total_relevant = len(relevant)

    top_k = ranked_docs[:k] if k > 0 else []
    relevant_retrieved_k = len(relevant.intersection(top_k))
    p_at_k = relevant_retrieved_k / k if k > 0 else 0.0
    r_at_k = relevant_retrieved_k / total_relevant if total_relevant > 0 else 0

    # Sum precision at each rank where a relevant document appears.
    relevant_count = 0
    precision_sum = 0.0
    for rank, doc_id in enumerate(ranked_docs, start=1):
        if doc_id in relevant:
            relevant_count += 1
            precision_sum += relevant_count / rank

    ap = precision_sum / total_relevant if total_relevant > 0 else 0
    return p_at_k, r_at_k, ap

# Build query-id -> set of relevant document ids from the relevance judgements.
# NOTE(review): every listed (query, document) pair is treated as relevant; if
# the qrels records also carry a graded score field, zero-score rows may need to
# be excluded - confirm against the dataset.
qrels_dict = {}
for item in qrels_data:
    raw_qid, raw_doc_id = item.get('query-id'), item.get('corpus-id')
    # Validate BEFORE stringifying: str(None) is the truthy string 'None', so
    # checking after conversion would let records with a missing id through.
    if raw_qid is None or raw_doc_id is None:
        continue
    qid, doc_id = str(raw_qid), str(raw_doc_id)
    if qid and doc_id:
        qrels_dict.setdefault(qid, set()).add(doc_id)

# Query-id -> raw query text; records without an '_id' key are ignored.
queries_map = {str(q['_id']): q['text'] for q in queries_data if '_id' in q}

print("\n쿼리 전처리")
# Keep only judged queries that have text and produce at least one token.
processed_queries = {}
for qid in qrels_dict:
    if qid in queries_map:
        tokens = tokenize_query(queries_map[qid])
        if tokens:
            processed_queries[qid] = tokens

print(f"실험 대상 쿼리 수: {len(processed_queries)}개")


쿼리 전처리
실험 대상 쿼리 수: 1454개


In [4]:
print("\n[1차 하이퍼파라미터 탐색 - Coarse Grid]")
k1_coarse = [2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]
b_coarse = [0.6, 0.7, 0.8, 0.9, 0.99]

tuning_coarse = []                             # one record per (k1, b) combination
best_map_coarse = -1                           # below any achievable MAP, so the first result wins
best_params_coarse = {'k1': 3.0, 'b': 0.75}    # fallback if the grid is empty

print(f"탐색 범위: k1={k1_coarse}, b={b_coarse} (총 {len(k1_coarse)*len(b_coarse)}회)")

# The set of evaluable queries does not depend on (k1, b): keep only queries
# with at least one judged-relevant document present in the sampled corpus.
# Computing it once avoids repeating the same index lookups for every grid point.
eval_queries = [
    (qid, q_tokens)
    for qid, q_tokens in processed_queries.items()
    if any(d in df.index for d in qrels_dict[qid])
]

for k1_val in k1_coarse:
    for b_val in b_coarse:
        total_map = 0

        for qid, q_tokens in tqdm(eval_queries, desc=f"k1={k1_val}, b={b_val}", leave=False):
            scores = calculate_scores(q_tokens, model_type='BM25', k1=k1_val, b=b_val)
            ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            # AP is evaluated on the top 100 results only.
            ranked_ids = [d for d, s in ranked[:100]]

            _, _, ap = calculate_metrics(ranked_ids, qrels_dict[qid], k=10)
            total_map += ap

        count = len(eval_queries)
        avg_map = total_map / count if count > 0 else 0
        print(f"  k1={k1_val}, b={b_val} -> MAP: {avg_map:.4f}")

        tuning_coarse.append({'k1': k1_val, 'b': b_val, 'MAP': avg_map})

        # Strict '>' keeps the earliest grid point on ties.
        if avg_map > best_map_coarse:
            best_map_coarse = avg_map
            best_params_coarse = {'k1': k1_val, 'b': b_val}

print(f"\n1차 최적 파라미터: {best_params_coarse} (MAP: {best_map_coarse:.4f})")
pd.DataFrame(tuning_coarse).to_csv(os.path.join(DATA_DIR, 'tuning_coarse_v2.csv'), index=False)


[1차 하이퍼파라미터 탐색 - Coarse Grid]
탐색 범위: k1=[2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0], b=[0.6, 0.7, 0.8, 0.9, 0.99] (총 35회)


                                                                  

  k1=2.0, b=0.6 -> MAP: 0.6045


                                                                  

  k1=2.0, b=0.7 -> MAP: 0.6116


                                                                  

  k1=2.0, b=0.8 -> MAP: 0.6194


                                                                  

  k1=2.0, b=0.9 -> MAP: 0.6256


                                                                   

  k1=2.0, b=0.99 -> MAP: 0.6277


                                                                  

  k1=2.5, b=0.6 -> MAP: 0.6066


                                                                  

  k1=2.5, b=0.7 -> MAP: 0.6146


                                                                  

  k1=2.5, b=0.8 -> MAP: 0.6204


                                                                  

  k1=2.5, b=0.9 -> MAP: 0.6263


                                                                   

  k1=2.5, b=0.99 -> MAP: 0.6293


                                                                  

  k1=3.0, b=0.6 -> MAP: 0.6066


                                                                  

  k1=3.0, b=0.7 -> MAP: 0.6154


                                                                  

  k1=3.0, b=0.8 -> MAP: 0.6214


                                                                  

  k1=3.0, b=0.9 -> MAP: 0.6287


                                                                   

  k1=3.0, b=0.99 -> MAP: 0.6305


                                                                  

  k1=3.5, b=0.6 -> MAP: 0.6046


                                                                  

  k1=3.5, b=0.7 -> MAP: 0.6152


                                                                  

  k1=3.5, b=0.8 -> MAP: 0.6217


                                                                  

  k1=3.5, b=0.9 -> MAP: 0.6290


                                                                   

  k1=3.5, b=0.99 -> MAP: 0.6304


                                                                  

  k1=4.0, b=0.6 -> MAP: 0.6030


                                                                  

  k1=4.0, b=0.7 -> MAP: 0.6134


                                                                  

  k1=4.0, b=0.8 -> MAP: 0.6225


                                                                  

  k1=4.0, b=0.9 -> MAP: 0.6278


                                                                   

  k1=4.0, b=0.99 -> MAP: 0.6294


                                                                  

  k1=4.5, b=0.6 -> MAP: 0.6011


                                                                  

  k1=4.5, b=0.7 -> MAP: 0.6125


                                                                  

  k1=4.5, b=0.8 -> MAP: 0.6224


                                                                  

  k1=4.5, b=0.9 -> MAP: 0.6272


                                                                   

  k1=4.5, b=0.99 -> MAP: 0.6280


                                                                  

  k1=5.0, b=0.6 -> MAP: 0.5990


                                                                  

  k1=5.0, b=0.7 -> MAP: 0.6113


                                                                  

  k1=5.0, b=0.8 -> MAP: 0.6219


                                                                  

  k1=5.0, b=0.9 -> MAP: 0.6265


                                                                   

  k1=5.0, b=0.99 -> MAP: 0.6278

1차 최적 파라미터: {'k1': 3.0, 'b': 0.99} (MAP: 0.6305)




In [5]:
print("\n[2차 하이퍼파라미터 탐색 - Fine Grid]")
k1_center = best_params_coarse['k1']
b_center = best_params_coarse['b']

# Candidate values around the coarse optimum, clamped to fixed bounds
# (k1 in [2.0, 5.0], b in [0.6, 0.99]). When the coarse optimum lies on a bound,
# only the neighbours inside the bounds are explored.
k1_fine = [k1_center - 0.5, k1_center - 0.25, k1_center, k1_center + 0.25, k1_center + 0.5]
k1_fine = [k for k in k1_fine if 2.0 <= k <= 5.0]

b_fine = [b_center - 0.1, b_center - 0.05, b_center, b_center + 0.05, b_center + 0.1]
b_fine = [b for b in b_fine if 0.6 <= b <= 0.99]

tuning_fine = []                                # one record per (k1, b) combination
# Start from the coarse result so the fine search can only improve on it.
best_map_fine = best_map_coarse
best_params_fine = best_params_coarse.copy()

print(f"탐색 범위: k1={k1_fine}, b={b_fine} (총 {len(k1_fine)*len(b_fine)}회)")

# The set of evaluable queries does not depend on (k1, b): keep only queries
# with at least one judged-relevant document present in the sampled corpus.
# Computing it once avoids repeating the same index lookups for every grid point.
eval_queries = [
    (qid, q_tokens)
    for qid, q_tokens in processed_queries.items()
    if any(d in df.index for d in qrels_dict[qid])
]

for k1_val in k1_fine:
    for b_val in b_fine:
        total_map = 0

        for qid, q_tokens in tqdm(eval_queries, desc=f"k1={k1_val:.2f}, b={b_val:.2f}", leave=False):
            scores = calculate_scores(q_tokens, model_type='BM25', k1=k1_val, b=b_val)
            ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            # AP is evaluated on the top 100 results only.
            ranked_ids = [d for d, s in ranked[:100]]

            _, _, ap = calculate_metrics(ranked_ids, qrels_dict[qid], k=10)
            total_map += ap

        count = len(eval_queries)
        avg_map = total_map / count if count > 0 else 0
        print(f"  k1={k1_val:.2f}, b={b_val:.2f} -> MAP: {avg_map:.4f}")

        tuning_fine.append({'k1': k1_val, 'b': b_val, 'MAP': avg_map})

        # Strict '>' keeps the coarse optimum (or the earliest grid point) on ties.
        if avg_map > best_map_fine:
            best_map_fine = avg_map
            best_params_fine = {'k1': k1_val, 'b': b_val}

print(f"\n최종 최적 파라미터: {best_params_fine} (MAP: {best_map_fine:.4f})")
pd.DataFrame(tuning_fine).to_csv(os.path.join(DATA_DIR, 'tuning_fine_v2.csv'), index=False)


[2차 하이퍼파라미터 탐색 - Fine Grid]
탐색 범위: k1=[2.5, 2.75, 3.0, 3.25, 3.5], b=[0.89, 0.94, 0.99] (총 15회)


                                                                    

  k1=2.50, b=0.89 -> MAP: 0.6265


                                                                    

  k1=2.50, b=0.94 -> MAP: 0.6287


                                                                    

  k1=2.50, b=0.99 -> MAP: 0.6293


                                                                    

  k1=2.75, b=0.89 -> MAP: 0.6271


                                                                    

  k1=2.75, b=0.94 -> MAP: 0.6301


                                                                    

  k1=2.75, b=0.99 -> MAP: 0.6298


                                                                    

  k1=3.00, b=0.89 -> MAP: 0.6277


                                                                    

  k1=3.00, b=0.94 -> MAP: 0.6303


                                                                    

  k1=3.00, b=0.99 -> MAP: 0.6305


                                                                    

  k1=3.25, b=0.89 -> MAP: 0.6283


                                                                    

  k1=3.25, b=0.94 -> MAP: 0.6307


                                                                    

  k1=3.25, b=0.99 -> MAP: 0.6309


                                                                    

  k1=3.50, b=0.89 -> MAP: 0.6288


                                                                    

  k1=3.50, b=0.94 -> MAP: 0.6301


                                                                    

  k1=3.50, b=0.99 -> MAP: 0.6304

최종 최적 파라미터: {'k1': 3.25, 'b': 0.99} (MAP: 0.6309)




In [9]:
print("\n[최종 평가 및 회귀 데이터셋 생성]")

# Re-open the index so this cell can be re-run after the connection was closed.
# Release any connection left over from an earlier cell first, otherwise the
# old handle is leaked when `conn` is rebound (closing an already-closed
# sqlite3 connection is harmless).
if 'conn' in globals():
    conn.close()
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()   # calculate_scores() reads this module-level cursor

regression_data = []     # one row per (model, query, top-10 document)
metrics_data = []        # one row per (model, query)

models_to_run = [
    # k1/b are passed through but not used by the BIM scoring branch.
    {'name': 'BIM', 'type': 'BIM', 'k1': 0, 'b': 0},
    {'name': 'BM25_Best', 'type': 'BM25', 'k1': best_params_fine['k1'], 'b': best_params_fine['b']}
]

# doc_id -> set of its non-padding tokens. Built lazily and shared across
# queries and models so each document's token list is filtered only once and
# query-term membership tests are constant-time.
doc_token_sets = {}

try:
    for model_info in models_to_run:
        m_name = model_info['name']
        print(f"  모델 실행: {m_name}")

        for qid, q_tokens in tqdm(processed_queries.items(), desc=m_name):
            target_docs = qrels_dict[qid]

            # Skip queries none of whose relevant documents are in the sampled corpus.
            if not any(d in df.index for d in target_docs):
                continue

            scores = calculate_scores(q_tokens, model_type=model_info['type'],
                                      k1=model_info['k1'], b=model_info['b'])

            ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            ranked_ids = [d for d, s in ranked[:100]]

            p10, r10, ap = calculate_metrics(ranked_ids, target_docs, k=10)
            metrics_data.append({
                'qid': qid,
                'model': m_name,
                'P@10': p10, 'R@10': r10, 'AP': ap
            })

            # Query-level features (identical for every document of this query).
            query_avg_token_len = np.mean([len(t) for t in q_tokens]) if q_tokens else 0
            query_unique_ratio = len(set(q_tokens)) / len(q_tokens) if q_tokens else 0

            for doc_id in ranked_ids[:10]:
                if doc_id not in df.index:
                    continue

                is_relevant = 1 if doc_id in target_docs else 0
                doc_len = df.at[doc_id, 'doc_length']
                dominant_topic = df.at[doc_id, 'dominant_topic']
                dominant_prob = df.at[doc_id, 'dominant_prob']

                doc_terms = doc_token_sets.get(doc_id)
                if doc_terms is None:
                    # Padding tokens (made only of the character 'O') are excluded.
                    doc_terms = {t for t in df.at[doc_id, 'tokens_padded'] if set(t) != {'O'}}
                    doc_token_sets[doc_id] = doc_terms

                # Counts query token occurrences (duplicates included) found in the document.
                query_match_count = sum(1 for qt in q_tokens if qt in doc_terms)
                query_match_ratio = query_match_count / len(q_tokens) if q_tokens else 0

                doc_score = scores.get(doc_id, 0.0)

                row = {
                    'qid': qid,
                    'doc_id': doc_id,
                    'model': m_name,
                    'relevance': is_relevant,
                    'doc_length': doc_len,
                    'query_length': len(q_tokens),
                    'query_avg_token_len': query_avg_token_len,
                    'query_unique_ratio': query_unique_ratio,
                    'query_match_count': query_match_count,
                    'query_match_ratio': query_match_ratio,
                    'dominant_topic': dominant_topic,
                    'dominant_prob': dominant_prob,
                    'search_score': doc_score
                }
                regression_data.append(row)
finally:
    # Always release the database handle, even if scoring fails part-way.
    conn.close()


[최종 평가 및 회귀 데이터셋 생성]
  모델 실행: BIM


BIM: 100%|██████████| 1454/1454 [00:48<00:00, 29.87it/s]


  모델 실행: BM25_Best


BM25_Best: 100%|██████████| 1454/1454 [00:35<00:00, 41.40it/s]


In [10]:
print("\n데이터셋 저장")
# Materialize each result table once and reuse it for both export and summary.
df_regression = pd.DataFrame(regression_data)
df_metrics = pd.DataFrame(metrics_data)
df_regression.to_csv(os.path.join(DATA_DIR, 'regression_dataset_v2.csv'), index=False)
df_metrics.to_csv(os.path.join(DATA_DIR, 'performance_metrics_v2.csv'), index=False)

print("\n작업 완료")
print(f"회귀 데이터셋: {len(regression_data)}개 샘플")
print(f"성능 메트릭: {len(metrics_data)}개 레코드")

# Per-model means of the per-query metrics; sort=False keeps the models in the
# order in which they were evaluated.
print("\n[모델별 성능 요약]")
for model, model_data in df_metrics.groupby('model', sort=False):
    print(f"\n{model}:")
    for label, column in (('MAP', 'AP'), ('P@10', 'P@10'), ('R@10', 'R@10')):
        print(f"  {label}: {model_data[column].mean():.4f}")


데이터셋 저장

작업 완료
회귀 데이터셋: 27298개 샘플
성능 메트릭: 2744개 레코드

[모델별 성능 요약]

BIM:
  MAP: 0.4165
  P@10: 0.1926
  R@10: 0.5205

BM25_Best:
  MAP: 0.6309
  P@10: 0.2698
  R@10: 0.6810
