In [1]:
import os
import sqlite3
import pickle
import math
import pandas as pd
import numpy as np
from kiwipiepy import Kiwi
from tqdm import tqdm
import warnings

warnings.filterwarnings('ignore')

BASE_DIR = os.path.join('..')
DATA_DIR = os.path.join(BASE_DIR, 'final_data')
DB_DIR = os.path.join(BASE_DIR, 'final_database')
FULL_DATA_PATH = os.path.join(DATA_DIR, 'full_data_final.pkl')
QUERIES_PATH = os.path.join(BASE_DIR, 'data', 'queries.pkl')
QRELS_PATH = os.path.join(BASE_DIR, 'data', 'qrels.pkl')

os.makedirs(DB_DIR, exist_ok=True)
DB_PATH = os.path.join(DB_DIR, 'search_index_final.db')

print("데이터 로드 중")
df = pd.read_pickle(FULL_DATA_PATH)

if '_id' in df.columns:
    df.set_index('_id', inplace=True)
elif 'doc_id' in df.columns:
    df.set_index('doc_id', inplace=True)
df.index = df.index.astype(str)

df['dominant_topic'] = df['topic_probs'].apply(lambda x: np.argmax(x))
df['dominant_prob'] = df['topic_probs'].apply(lambda x: np.max(x))

with open(QUERIES_PATH, 'rb') as f:
    queries_data = pickle.load(f)
with open(QRELS_PATH, 'rb') as f:
    qrels_data = pickle.load(f)

if os.path.exists(DB_PATH):
    os.remove(DB_PATH)

conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute('CREATE TABLE inverted_index (term TEXT, doc_id TEXT, tf INTEGER)')
cursor.execute('CREATE INDEX idx_term ON inverted_index(term)')

데이터 로드 중


<sqlite3.Cursor at 0x20ba6fa7340>

In [2]:
print("역색인 구축 중 (패딩 O 제외)")
data_to_insert = []
BATCH_SIZE = 50000

doc_freq = {}
total_docs = len(df)
avg_dl = df['doc_length'].mean()

for doc_id, row in tqdm(df.iterrows(), total=total_docs, desc="인덱싱"):
    tokens = row['tokens_padded']
    term_counts = {}

    for t in tokens:
        if set(t) == {'O'}: continue
        term_counts[t] = term_counts.get(t, 0) + 1

    for term, tf in term_counts.items():
        data_to_insert.append((term, str(doc_id), tf))
        doc_freq[term] = doc_freq.get(term, 0) + 1

    if len(data_to_insert) >= BATCH_SIZE:
        cursor.executemany('INSERT INTO inverted_index VALUES (?, ?, ?)', data_to_insert)
        data_to_insert = []

if data_to_insert:
    cursor.executemany('INSERT INTO inverted_index VALUES (?, ?, ?)', data_to_insert)
conn.commit()
print("인덱싱 완료")

역색인 구축 중 (패딩 O 제외)


인덱싱: 100%|██████████| 50222/50222 [07:27<00:00, 112.29it/s]


인덱싱 완료


In [3]:
kiwi = Kiwi(num_workers=0)
idf_cache = {t: math.log((total_docs - df + 0.5) / (df + 0.5) + 1) for t, df in doc_freq.items()}

def tokenize_query(text):
    try:
        return [t.form for t in kiwi.tokenize(text) if t.tag in ['NNG', 'NNP', 'VV', 'VA', 'MAG']]
    except: return []

def calculate_scores(query_tokens, model_type='BM25', k1=1.2, b=0.75):
    scores = {}
    for term in query_tokens:
        if term not in idf_cache: continue
        idf = idf_cache[term]

        cursor.execute('SELECT doc_id, tf FROM inverted_index WHERE term = ?', (term,))
        rows = cursor.fetchall()

        for doc_id, tf in rows:
            if model_type == 'BIM':
                scores[doc_id] = scores.get(doc_id, 0.0) + idf
            else:
                doc_len = df.at[doc_id, 'doc_length']
                num = tf * (k1 + 1)
                den = tf + k1 * (1 - b + b * (doc_len / avg_dl))
                scores[doc_id] = scores.get(doc_id, 0.0) + idf * (num / den)
    return scores

def calculate_metrics(ranked_docs, relevant_docs, k=10):
    top_k = ranked_docs[:k]
    relevant_retrieved_k = len(set(top_k) & relevant_docs)
    p_at_k = relevant_retrieved_k / k

    total_relevant = len(relevant_docs)
    r_at_k = relevant_retrieved_k / total_relevant if total_relevant > 0 else 0

    relevant_count = 0
    precision_sum = 0.0
    for i, doc_id in enumerate(ranked_docs):
        if doc_id in relevant_docs:
            relevant_count += 1
            precision_sum += relevant_count / (i + 1)

    ap = precision_sum / total_relevant if total_relevant > 0 else 0
    return p_at_k, r_at_k, ap

qrels_dict = {}
for item in qrels_data:
    qid, doc_id = str(item.get('query-id')), str(item.get('corpus-id'))
    if qid and doc_id:
        qrels_dict.setdefault(qid, set()).add(doc_id)

queries_map = {str(q['_id']): q['text'] for q in queries_data if '_id' in q}

In [4]:
print("쿼리 전처리 중")
processed_queries = {}
for qid in qrels_dict:
    if qid in queries_map:
        tokens = tokenize_query(queries_map[qid])
        if tokens:
            processed_queries[qid] = tokens

print(f"실험 대상 쿼리 수: {len(processed_queries)}")

쿼리 전처리 중
실험 대상 쿼리 수: 1454


In [7]:
import random

print("\n[BM25 하이퍼파라미터 튜닝 (+ Query Sampling)]")

valid_qids = [qid for qid in processed_queries.keys() if qid in qrels_dict]

random.seed(42)
if len(valid_qids) > 100:
    sampled_qids = random.sample(valid_qids, 100)
    print(f"전체 {len(valid_qids)}개 쿼리 중 100개만 샘플링하여 튜닝")
else:
    sampled_qids = valid_qids
    print(f"전체 {len(valid_qids)}개 쿼리 사용")

k1_values = [3.8, 4.0, 4.2]
b_values = [0.95, 0.98, 0.99, 1.0]

tuning_results = []
best_map = -1
best_params = {'k1': 4.0, 'b': 0.98}

print(f"튜닝 후보: k1={k1_values}, b={b_values} (총 {len(k1_values)*len(b_values)}회)")

for k1_val in k1_values:
    for b_val in b_values:
        total_map = 0
        count = 0

        for qid in tqdm(sampled_qids, desc=f"k1={k1_val}, b={b_val}", leave=False):
            q_tokens = processed_queries[qid]

            scores = calculate_scores(q_tokens, model_type='BM25', k1=k1_val, b=b_val)

            ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            ranked_ids = [d for d, s in ranked[:100]]

            _, _, ap = calculate_metrics(ranked_ids, qrels_dict[qid], k=10)
            total_map += ap
            count += 1

        avg_map = total_map / count if count > 0 else 0
        print(f"  - Params(k1={k1_val}, b={b_val}) -> MAP: {avg_map:.4f}")

        tuning_results.append({'k1': k1_val, 'b': b_val, 'MAP': avg_map})

        if avg_map > best_map:
            best_map = avg_map
            best_params = {'k1': k1_val, 'b': b_val}

print(f"\n최적 파라미터(Fast Tuning): {best_params} (MAP: {best_map:.4f})")

pd.DataFrame(tuning_results).to_csv(os.path.join(DATA_DIR, 'tuning_results_final.csv'), index=False)


[BM25 하이퍼파라미터 튜닝 (+ Query Sampling)]
전체 1454개 쿼리 중 100개만 샘플링하여 튜닝
튜닝 후보: k1=[3.8, 4.0, 4.2], b=[0.95, 0.98, 0.99, 1.0] (총 12회)


                                                                 

  - Params(k1=3.8, b=0.95) -> MAP: 0.5269


                                                                 

  - Params(k1=3.8, b=0.98) -> MAP: 0.5217


                                                                 

  - Params(k1=3.8, b=0.99) -> MAP: 0.5236


                                                                

  - Params(k1=3.8, b=1.0) -> MAP: 0.5223


                                                                 

  - Params(k1=4.0, b=0.95) -> MAP: 0.5260


                                                                 

  - Params(k1=4.0, b=0.98) -> MAP: 0.5231


                                                                 

  - Params(k1=4.0, b=0.99) -> MAP: 0.5235


                                                                

  - Params(k1=4.0, b=1.0) -> MAP: 0.5221


                                                                 

  - Params(k1=4.2, b=0.95) -> MAP: 0.5237


                                                                 

  - Params(k1=4.2, b=0.98) -> MAP: 0.5224


                                                                 

  - Params(k1=4.2, b=0.99) -> MAP: 0.5231


                                                                

  - Params(k1=4.2, b=1.0) -> MAP: 0.5228

최적 파라미터(Fast Tuning): {'k1': 3.8, 'b': 0.95} (MAP: 0.5269)




In [13]:
import random

print("\n[BM25 하이퍼파라미터 튜닝 - 1차]")

valid_qids = [qid for qid in processed_queries.keys() if qid in qrels_dict]

random.seed(42)
if len(valid_qids) > 100:
    sampled_qids = random.sample(valid_qids, 100)
    print(f"전체 {len(valid_qids)}개 쿼리 중 100개만 샘플링하여 튜닝")
else:
    sampled_qids = valid_qids
    print(f"전체 {len(valid_qids)}개 쿼리 사용")

k1_values = [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0]
b_values = [0.5, 0,6, 0.7, 0.8, 0.9, 0.99]

tuning_results = []
best_map = -1
best_params = {'k1': 1.2, 'b': 0.7}

print(f"튜닝 후보: k1={k1_values}, b={b_values} (총 {len(k1_values)*len(b_values)}회)")

for k1_val in k1_values:
    for b_val in b_values:
        total_map = 0
        count = 0

        for qid in tqdm(sampled_qids, desc=f"k1={k1_val}, b={b_val}", leave=False):
            q_tokens = processed_queries[qid]

            scores = calculate_scores(q_tokens, model_type='BM25', k1=k1_val, b=b_val)

            ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            ranked_ids = [d for d, s in ranked[:100]]

            _, _, ap = calculate_metrics(ranked_ids, qrels_dict[qid], k=10)
            total_map += ap
            count += 1

        avg_map = total_map / count if count > 0 else 0
        print(f"  - Params(k1={k1_val}, b={b_val}) -> MAP: {avg_map:.4f}")

        tuning_results.append({'k1': k1_val, 'b': b_val, 'MAP': avg_map})

        if avg_map > best_map:
            best_map = avg_map
            best_params = {'k1': k1_val, 'b': b_val}

print(f"\n최적 파라미터(Fast Tuning): {best_params} (MAP: {best_map:.4f})")

pd.DataFrame(tuning_results).to_csv(os.path.join(DATA_DIR, 'tuning_results_final.csv'), index=False)


[BM25 하이퍼파라미터 튜닝 - 1차]
전체 1454개 쿼리 중 100개만 샘플링하여 튜닝
튜닝 후보: k1=[1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0], b=[0.5, 0, 6, 0.7, 0.8, 0.9, 0.99] (총 49회)


                                                                

  - Params(k1=1.0, b=0.5) -> MAP: 0.5108


                                                              

  - Params(k1=1.0, b=0) -> MAP: 0.3425


                                                              

  - Params(k1=1.0, b=6) -> MAP: 0.0014


                                                                

  - Params(k1=1.0, b=0.7) -> MAP: 0.5345


                                                                

  - Params(k1=1.0, b=0.8) -> MAP: 0.5360


                                                                

  - Params(k1=1.0, b=0.9) -> MAP: 0.5354


                                                                 

  - Params(k1=1.0, b=0.99) -> MAP: 0.5307


                                                                

  - Params(k1=1.5, b=0.5) -> MAP: 0.5223


                                                              

  - Params(k1=1.5, b=0) -> MAP: 0.3540


                                                              

  - Params(k1=1.5, b=6) -> MAP: 0.0020


                                                                

  - Params(k1=1.5, b=0.7) -> MAP: 0.5479


                                                                

  - Params(k1=1.5, b=0.8) -> MAP: 0.5510


                                                                

  - Params(k1=1.5, b=0.9) -> MAP: 0.5493


                                                                 

  - Params(k1=1.5, b=0.99) -> MAP: 0.5397


                                                                

  - Params(k1=2.0, b=0.5) -> MAP: 0.5282


                                                              

  - Params(k1=2.0, b=0) -> MAP: 0.3523


                                                              

  - Params(k1=2.0, b=6) -> MAP: 0.0024


                                                                

  - Params(k1=2.0, b=0.7) -> MAP: 0.5520


                                                                

  - Params(k1=2.0, b=0.8) -> MAP: 0.5459


                                                                

  - Params(k1=2.0, b=0.9) -> MAP: 0.5432


                                                                 

  - Params(k1=2.0, b=0.99) -> MAP: 0.5351


                                                                

  - Params(k1=2.5, b=0.5) -> MAP: 0.5261


                                                              

  - Params(k1=2.5, b=0) -> MAP: 0.3504


                                                              

  - Params(k1=2.5, b=6) -> MAP: 0.0027


                                                                

  - Params(k1=2.5, b=0.7) -> MAP: 0.5537


                                                                

  - Params(k1=2.5, b=0.8) -> MAP: 0.5433


                                                                

  - Params(k1=2.5, b=0.9) -> MAP: 0.5402


                                                                 

  - Params(k1=2.5, b=0.99) -> MAP: 0.5297


                                                                

  - Params(k1=3.0, b=0.5) -> MAP: 0.5183


                                                              

  - Params(k1=3.0, b=0) -> MAP: 0.3439


                                                              

  - Params(k1=3.0, b=6) -> MAP: 0.0018


                                                                

  - Params(k1=3.0, b=0.7) -> MAP: 0.5546


                                                                

  - Params(k1=3.0, b=0.8) -> MAP: 0.5388


                                                                

  - Params(k1=3.0, b=0.9) -> MAP: 0.5359


                                                                 

  - Params(k1=3.0, b=0.99) -> MAP: 0.5259


                                                                

  - Params(k1=3.5, b=0.5) -> MAP: 0.5180


                                                              

  - Params(k1=3.5, b=0) -> MAP: 0.3428


                                                              

  - Params(k1=3.5, b=6) -> MAP: 0.0021


                                                                

  - Params(k1=3.5, b=0.7) -> MAP: 0.5423


                                                                

  - Params(k1=3.5, b=0.8) -> MAP: 0.5370


                                                                

  - Params(k1=3.5, b=0.9) -> MAP: 0.5345


                                                                 

  - Params(k1=3.5, b=0.99) -> MAP: 0.5237


                                                                

  - Params(k1=4.0, b=0.5) -> MAP: 0.5144


                                                              

  - Params(k1=4.0, b=0) -> MAP: 0.3413


                                                              

  - Params(k1=4.0, b=6) -> MAP: 0.0024


                                                                

  - Params(k1=4.0, b=0.7) -> MAP: 0.5415


                                                                

  - Params(k1=4.0, b=0.8) -> MAP: 0.5353


                                                                

  - Params(k1=4.0, b=0.9) -> MAP: 0.5350


                                                                 

  - Params(k1=4.0, b=0.99) -> MAP: 0.5235

최적 파라미터(Fast Tuning): {'k1': 3.0, 'b': 0.7} (MAP: 0.5546)




In [14]:
import random
import os
import pandas as pd
from tqdm import tqdm

print("\n[BM25 하이퍼파라미터 튜닝 - 2차]")

valid_qids = [qid for qid in processed_queries.keys() if qid in qrels_dict]
random.seed(42)

if len(valid_qids) > 100:
    sampled_qids = random.sample(valid_qids, 100)
    print(f"전체 {len(valid_qids)}개 쿼리 중 100개만 샘플링하여 튜닝")
else:
    sampled_qids = valid_qids
    print(f"전체 {len(valid_qids)}개 쿼리 사용")


k1_values = [2.4, 2.6, 2.8, 3.0, 3.2]
b_values = [0.65, 0.7, 0.72, 0.75, 0.78, 0.8]

tuning_results = []
best_map = -1
# 1차 튜닝의 최고점
best_params = {'k1': 3.0, 'b': 0.7}

print(f"튜닝 후보: k1={k1_values}, b={b_values} (총 {len(k1_values)*len(b_values)}회)")

for k1_val in k1_values:
    for b_val in b_values:
        total_map = 0
        count = 0

        for qid in tqdm(sampled_qids, desc=f"k1={k1_val}, b={b_val}", leave=False):
            q_tokens = processed_queries[qid]
            scores = calculate_scores(q_tokens, model_type='BM25', k1=k1_val, b=b_val)

            ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            ranked_ids = [d for d, s in ranked[:100]]

            _, _, ap = calculate_metrics(ranked_ids, qrels_dict[qid], k=10)
            total_map += ap
            count += 1

        avg_map = total_map / count if count > 0 else 0
        print(f"  - Params(k1={k1_val}, b={b_val}) -> MAP: {avg_map:.4f}")

        tuning_results.append({'k1': k1_val, 'b': b_val, 'MAP': avg_map})

        if avg_map > best_map:
            best_map = avg_map
            best_params = {'k1': k1_val, 'b': b_val}

print(f"\n최적 파라미터(2차): {best_params} (MAP: {best_map:.4f})")

# 결과 저장
pd.DataFrame(tuning_results).to_csv(os.path.join(DATA_DIR, 'tuning_results_final.csv'), index=False)


[BM25 하이퍼파라미터 튜닝 - 2차]
전체 1454개 쿼리 중 100개만 샘플링하여 튜닝
튜닝 후보: k1=[2.4, 2.6, 2.8, 3.0, 3.2], b=[0.65, 0.7, 0.72, 0.75, 0.78, 0.8] (총 30회)


                                                                 

  - Params(k1=2.4, b=0.65) -> MAP: 0.5466


                                                                

  - Params(k1=2.4, b=0.7) -> MAP: 0.5530


                                                                 

  - Params(k1=2.4, b=0.72) -> MAP: 0.5569


                                                                 

  - Params(k1=2.4, b=0.75) -> MAP: 0.5600


                                                                 

  - Params(k1=2.4, b=0.78) -> MAP: 0.5487


                                                                

  - Params(k1=2.4, b=0.8) -> MAP: 0.5422


                                                                 

  - Params(k1=2.6, b=0.65) -> MAP: 0.5456


                                                                

  - Params(k1=2.6, b=0.7) -> MAP: 0.5544


                                                                 

  - Params(k1=2.6, b=0.72) -> MAP: 0.5591


                                                                 

  - Params(k1=2.6, b=0.75) -> MAP: 0.5592


                                                                 

  - Params(k1=2.6, b=0.78) -> MAP: 0.5414


                                                                

  - Params(k1=2.6, b=0.8) -> MAP: 0.5378


                                                                 

  - Params(k1=2.8, b=0.65) -> MAP: 0.5476


                                                                

  - Params(k1=2.8, b=0.7) -> MAP: 0.5542


                                                                 

  - Params(k1=2.8, b=0.72) -> MAP: 0.5604


                                                                 

  - Params(k1=2.8, b=0.75) -> MAP: 0.5571


                                                                 

  - Params(k1=2.8, b=0.78) -> MAP: 0.5379


                                                                

  - Params(k1=2.8, b=0.8) -> MAP: 0.5389


                                                                 

  - Params(k1=3.0, b=0.65) -> MAP: 0.5501


                                                                

  - Params(k1=3.0, b=0.7) -> MAP: 0.5546


                                                                 

  - Params(k1=3.0, b=0.72) -> MAP: 0.5608


                                                                 

  - Params(k1=3.0, b=0.75) -> MAP: 0.5494


                                                                 

  - Params(k1=3.0, b=0.78) -> MAP: 0.5390


                                                                

  - Params(k1=3.0, b=0.8) -> MAP: 0.5388


                                                                 

  - Params(k1=3.2, b=0.65) -> MAP: 0.5435


                                                                

  - Params(k1=3.2, b=0.7) -> MAP: 0.5523


                                                                 

  - Params(k1=3.2, b=0.72) -> MAP: 0.5499


                                                                 

  - Params(k1=3.2, b=0.75) -> MAP: 0.5479


                                                                 

  - Params(k1=3.2, b=0.78) -> MAP: 0.5390


                                                                

  - Params(k1=3.2, b=0.8) -> MAP: 0.5379

최적 파라미터(2차): {'k1': 3.0, 'b': 0.72} (MAP: 0.5608)




In [15]:
print("\n[최종 하이퍼파라미터 검증 - 전체 쿼리]")
print(f"샘플링 결과 기반 상위 3개 후보 검증")

candidate_params = [
    {'k1': 3.0, 'b': 0.72},
    {'k1': 2.8, 'b': 0.72},
    {'k1': 2.4, 'b': 0.75},
]

print(f"검증 후보: 총 {len(candidate_params)}개")

final_validation_results = []
best_map = -1
best_params = None

for params in candidate_params:
    k1_val = params['k1']
    b_val = params['b']

    total_map = 0
    count = 0

    for qid, q_tokens in tqdm(processed_queries.items(), desc=f"k1={k1_val}, b={b_val}", leave=False):
        if not any(d in df.index for d in qrels_dict[qid]):
            continue

        scores = calculate_scores(q_tokens, model_type='BM25', k1=k1_val, b=b_val)
        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        ranked_ids = [d for d, s in ranked[:100]]
        _, _, ap = calculate_metrics(ranked_ids, qrels_dict[qid], k=10)
        total_map += ap
        count += 1

    avg_map = total_map / count if count > 0 else 0

    final_validation_results.append({
        'k1': k1_val,
        'b': b_val,
        'MAP': avg_map,
        'queries': count
    })

    print(f"  - Params(k1={k1_val}, b={b_val}) -> MAP: {avg_map:.4f}")

    if avg_map > best_map:
        best_map = avg_map
        best_params = {'k1': k1_val, 'b': b_val}

print(f"\n최적 파라미터(최종): {best_params} (MAP: {best_map:.4f})")

df_final_validation = pd.DataFrame(final_validation_results)
df_final_validation.to_csv(os.path.join(DATA_DIR, 'final_validation_results.csv'), index=False)


[최종 하이퍼파라미터 검증 - 전체 쿼리]
샘플링 결과 기반 상위 3개 후보 검증
검증 후보: 총 3개


                                                                   

  - Params(k1=3.0, b=0.72) -> MAP: 0.5169


                                                                    

  - Params(k1=2.8, b=0.72) -> MAP: 0.5178


                                                                  

KeyboardInterrupt: 

In [None]:
print("\n[BM25 하이퍼파라미터 튜닝 - 전체 쿼리]")

k1_values = [2.6, 2.8, 3.0, 3.2, 3.5]
b_values = [0.65, 0.72, 0.78, 0.75, 0.85]

candidate_params = []
for k1_val in k1_values:
    for b_val in b_values:
        candidate_params.append({'k1': k1_val, 'b': b_val})

print(f"튜닝 후보: k1={k1_values}, b={b_values}")
print(f"총 조합: {len(candidate_params)}개 (k1 {len(k1_values)}개 × b {len(b_values)}개)")
print(f"전체 쿼리: {len(processed_queries)}개\n")

tuning_results = []
best_map = -1
best_params = None

for idx, params in enumerate(candidate_params, 1):
    k1_val = params['k1']
    b_val = params['b']

    print(f"[{idx}/{len(candidate_params)}] k1={k1_val}, b={b_val} 평가 중")

    total_map = 0
    count = 0

    for qid, q_tokens in tqdm(processed_queries.items(), desc=f"k1={k1_val}, b={b_val}", leave=False):
        if not any(d in df.index for d in qrels_dict[qid]):
            continue

        scores = calculate_scores(q_tokens, model_type='BM25', k1=k1_val, b=b_val)
        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        ranked_ids = [d for d, s in ranked[:100]]
        _, _, ap = calculate_metrics(ranked_ids, qrels_dict[qid], k=10)
        total_map += ap
        count += 1

    avg_map = total_map / count if count > 0 else 0

    tuning_results.append({
        'k1': k1_val,
        'b': b_val,
        'MAP': avg_map,
        'queries': count
    })

    print(f"  - Params(k1={k1_val}, b={b_val}) -> MAP: {avg_map:.4f}")

    if avg_map > best_map:
        best_map = avg_map
        best_params = {'k1': k1_val, 'b': b_val}

print(f"\n최적 파라미터(최종): {best_params} (MAP: {best_map:.4f})")

df_tuning_final = pd.DataFrame(tuning_results)
df_tuning_final = df_tuning_final.sort_values('MAP', ascending=False)
df_tuning_final.to_csv(os.path.join(DATA_DIR, 'tuning_results_final.csv'), index=False)

print("\n저장 완료: tuning_results_final.csv")


[BM25 하이퍼파라미터 튜닝 - 전체 쿼리]
튜닝 후보: k1=[2.6, 2.8, 3.0, 3.2, 3.5], b=[0.65, 0.72, 0.78, 0.75, 0.85]
총 조합: 25개 (k1 5개 × b 5개)
전체 쿼리: 1454개

[1/25] k1=2.6, b=0.65 평가 중


k1=2.6, b=0.65:   2%|▏         | 33/1454 [00:58<34:11,  1.44s/it] 

In [8]:
print("\n[최종 평가 및 회귀 데이터셋 생성]")
regression_data = []
metrics_data = []

models_to_run = [
    {'name': 'BIM', 'type': 'BIM', 'k1': 0, 'b': 0},
    {'name': 'BM25_Best', 'type': 'BM25', 'k1': best_params['k1'], 'b': best_params['b']}
]

for model_info in models_to_run:
    m_name = model_info['name']
    print(f"  - 모델 실행 중: {m_name}")

    for qid, q_tokens in tqdm(processed_queries.items(), desc=m_name):
        target_docs = qrels_dict[qid]

        scores = calculate_scores(q_tokens, model_type=model_info['type'],
                                  k1=model_info['k1'], b=model_info['b'])

        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        ranked_ids = [d for d, s in ranked[:100]]

        p10, r10, ap = calculate_metrics(ranked_ids, target_docs, k=10)
        metrics_data.append({
            'qid': qid,
            'model': m_name,
            'P@10': p10, 'R@10': r10, 'AP': ap
        })

        top_10_ids = set(ranked_ids[:10])

        query_avg_token_len = np.mean([len(t) for t in q_tokens]) if q_tokens else 0
        query_unique_ratio = len(set(q_tokens)) / len(q_tokens) if q_tokens else 0

        for doc_id in ranked_ids[:10]:
            if doc_id not in df.index: continue

            is_relevant = 1 if doc_id in target_docs else 0
            doc_len = df.at[doc_id, 'doc_length']
            dominant_topic = df.at[doc_id, 'dominant_topic']
            dominant_prob = df.at[doc_id, 'dominant_prob']

            doc_tokens_clean = [t for t in df.at[doc_id, 'tokens_padded'] if set(t) != {'O'}]
            query_match_count = sum(1 for qt in q_tokens if qt in doc_tokens_clean)
            query_match_ratio = query_match_count / len(q_tokens) if q_tokens else 0

            doc_score = scores.get(doc_id, 0.0)

            row = {
                'qid': qid,
                'doc_id': doc_id,
                'model': m_name,
                'relevance': is_relevant,
                'doc_length': doc_len,
                'query_length': len(q_tokens),
                'query_avg_token_len': query_avg_token_len,
                'query_unique_ratio': query_unique_ratio,
                'query_match_count': query_match_count,
                'query_match_ratio': query_match_ratio,
                'dominant_topic': dominant_topic,
                'dominant_prob': dominant_prob,
                'search_score': doc_score
            }
            regression_data.append(row)

        for target_doc in target_docs:
            if target_doc in top_10_ids: continue
            if target_doc not in df.index: continue

            doc_len = df.at[target_doc, 'doc_length']
            dominant_topic = df.at[target_doc, 'dominant_topic']
            dominant_prob = df.at[target_doc, 'dominant_prob']

            doc_tokens_clean = [t for t in df.at[target_doc, 'tokens_padded'] if set(t) != {'O'}]
            query_match_count = sum(1 for qt in q_tokens if qt in doc_tokens_clean)
            query_match_ratio = query_match_count / len(q_tokens) if q_tokens else 0

            doc_score = scores.get(target_doc, 0.0)

            row = {
                'qid': qid,
                'doc_id': target_doc,
                'model': m_name,
                'relevance': 1,
                'doc_length': doc_len,
                'query_length': len(q_tokens),
                'query_avg_token_len': query_avg_token_len,
                'query_unique_ratio': query_unique_ratio,
                'query_match_count': query_match_count,
                'query_match_ratio': query_match_ratio,
                'dominant_topic': dominant_topic,
                'dominant_prob': dominant_prob,
                'search_score': doc_score
            }
            regression_data.append(row)

conn.close()


[최종 평가 및 회귀 데이터셋 생성]
  - 모델 실행 중: BIM


BIM:  13%|█▎        | 188/1454 [02:39<17:56,  1.18it/s]


KeyboardInterrupt: 

In [None]:
print("\n데이터셋 저장 중")
pd.DataFrame(regression_data).to_csv(os.path.join(DATA_DIR, 'regression_dataset_final.csv'), index=False)
pd.DataFrame(metrics_data).to_csv(os.path.join(DATA_DIR, 'performance_metrics_final.csv'), index=False)

print("작업 완료")
print(f"회귀 데이터셋: {len(regression_data)}개 샘플")
print(f"성능 메트릭: {len(metrics_data)}개 레코드")