In [5]:
!pip install statsmodels




[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import os
import sqlite3
import pickle
import math
import pandas as pd
import numpy as np
from kiwipiepy import Kiwi
from tqdm import tqdm
import warnings

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Directory layout, rooted one level above the current working directory.
BASE_DIR = os.path.join('..')
DATA_DIR = os.path.join(BASE_DIR, 'final_data')
DB_DIR = os.path.join(BASE_DIR, 'final_database')
DB_PATH = os.path.join(DB_DIR, 'search_index.db')
SAMPLE_PATH = os.path.join(DATA_DIR, 'sample.pkl')
QUERIES_PATH = os.path.join(BASE_DIR, 'data', 'queries.pkl')
QRELS_PATH = os.path.join(BASE_DIR, 'data', 'qrels.pkl')

# The database directory may not exist yet on a fresh checkout.
os.makedirs(DB_DIR, exist_ok=True)

print("데이터 로드 중")
df_sample = pd.read_pickle(SAMPLE_PATH)

# Index the sample by the first identifier column that is present
# ('_id' takes precedence over 'doc_id'); leave the index alone otherwise.
for id_column in ('_id', 'doc_id'):
    if id_column in df_sample.columns:
        df_sample.set_index(id_column, inplace=True)
        break

# Document ids are compared as strings everywhere downstream.
df_sample.index = df_sample.index.astype(str)


def _load_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    # NOTE: unpickling can run arbitrary code; only load files you trust.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


queries_data = _load_pickle(QUERIES_PATH)
qrels_data = _load_pickle(QRELS_PATH)

print("SQLite DB 초기화")
# Always rebuild from scratch so rows from an earlier run cannot linger.
if os.path.exists(DB_PATH):
    os.remove(DB_PATH)

conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

# One row per (term, document) pair; tf is the term's count in that document.
cursor.execute('''
    CREATE TABLE IF NOT EXISTS inverted_index (
        term TEXT,
        doc_id TEXT,
        tf INTEGER
    )
''')

print("역색인 구축 (패딩 토큰 제외)")
INSERT_SQL = 'INSERT INTO inverted_index VALUES (?, ?, ?)'
data_to_insert = []
BATCH_SIZE = 10000

# Corpus statistics consumed later by the IDF and BM25 computations.
doc_freq = {}  # term -> number of documents containing the term
total_docs = len(df_sample)
avg_dl = df_sample['doc_length'].mean()

# Only the index and the token column are needed, so iterate over those two
# directly instead of materialising a full row object per document.
doc_tokens = zip(df_sample.index, df_sample['tokens_padded'])
for doc_id, tokens in tqdm(doc_tokens, total=total_docs, desc="Indexing"):
    term_counts = {}
    for t in tokens:
        # Tokens made up solely of the character 'O' are padding: skip them.
        if set(t) == {'O'}:
            continue
        term_counts[t] = term_counts.get(t, 0) + 1

    for term, tf in term_counts.items():
        data_to_insert.append((term, str(doc_id), tf))
        doc_freq[term] = doc_freq.get(term, 0) + 1

    # Flush in batches to bound the memory held by pending rows.
    if len(data_to_insert) >= BATCH_SIZE:
        cursor.executemany(INSERT_SQL, data_to_insert)
        data_to_insert = []

# Flush whatever is left after the final document.
if data_to_insert:
    cursor.executemany(INSERT_SQL, data_to_insert)

# Create the lookup indexes only after the bulk load, so each index is built
# once over the finished table rather than updated on every inserted row.
cursor.execute('CREATE INDEX IF NOT EXISTS idx_term ON inverted_index(term)')
cursor.execute('CREATE INDEX IF NOT EXISTS idx_doc_id ON inverted_index(doc_id)')

conn.commit()
print("역색인 구축 완료")

# Shared Kiwi analyzer instance; tokenize_query() calls kiwi.tokenize() on it.
# NOTE(review): the meaning of num_workers=0 is defined by kiwipiepy — confirm
# against its documentation.
kiwi = Kiwi(num_workers=0)

# Part-of-speech tags whose surface forms are kept as query terms.
_QUERY_POS_TAGS = frozenset({'NNG', 'NNP', 'VV', 'VA', 'MAG'})


def tokenize_query(text):
    """Return the query terms extracted from *text*.

    The text is analysed with the module-level ``kiwi`` analyzer and only the
    forms of tokens whose tag is in ``_QUERY_POS_TAGS`` are kept.

    Args:
        text: Raw query text. Any non-string value yields an empty list.

    Returns:
        A list of token forms, in analysis order. An empty list is returned
        for non-string input or when the analyzer raises.
    """
    if not isinstance(text, str):
        return []
    try:
        tokens = kiwi.tokenize(text)
    except Exception:
        # Best effort: a query the analyzer cannot handle is treated as
        # having no terms. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate so a long run can still
        # be interrupted.
        return []
    return [t.form for t in tokens if t.tag in _QUERY_POS_TAGS]

# Precomputed inverse document frequency for every indexed term. The "+ 1"
# inside the logarithm keeps the value positive even for terms that occur in
# most documents.
idf_cache = {
    term: math.log((total_docs - doc_count + 0.5) / (doc_count + 0.5) + 1)
    for term, doc_count in doc_freq.items()
}

# BM25 parameters read by get_bm25_score(): k1 scales the term-frequency
# component and b weights the document-length normalisation.
k1, b = 1.2, 0.75

def get_bim_score(query_tokens, doc_ids_in_sample):
    """Score documents by summing the IDF of each query term they contain.

    Args:
        query_tokens: Iterable of query terms. Terms absent from
            ``idf_cache`` are ignored.
        doc_ids_in_sample: Iterable of candidate document ids; each is
            converted to ``str`` to form the result keys.

    Returns:
        Dict mapping every candidate document id (as ``str``) to its score.
        Documents matching no query term keep a score of 0.0.
    """
    scores = dict.fromkeys(map(str, doc_ids_in_sample), 0.0)
    for term in query_tokens:
        idf = idf_cache.get(term)
        if idf is None:
            continue
        postings = cursor.execute(
            'SELECT doc_id FROM inverted_index WHERE term = ?', (term,)
        ).fetchall()
        for (doc_id,) in postings:
            # Postings for documents outside the candidate set are skipped.
            if doc_id in scores:
                scores[doc_id] += idf
    return scores

def get_bm25_score(query_tokens, doc_ids_in_sample):
    """Score documents against the query terms with the BM25 formula.

    For each (term, document) posting the contribution is
    ``idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_dl))``,
    using the module-level ``k1``, ``b``, ``avg_dl`` and ``idf_cache``.

    Args:
        query_tokens: Iterable of query terms. Terms absent from
            ``idf_cache`` are ignored.
        doc_ids_in_sample: Iterable of candidate document ids; each is
            converted to ``str`` to form the result keys.

    Returns:
        Dict mapping every candidate document id (as ``str``) to its score.
        Documents matching no query term keep a score of 0.0.
    """
    scores = dict.fromkeys(map(str, doc_ids_in_sample), 0.0)
    for term in query_tokens:
        idf = idf_cache.get(term)
        if idf is None:
            continue
        postings = cursor.execute(
            'SELECT doc_id, tf FROM inverted_index WHERE term = ?', (term,)
        ).fetchall()
        for doc_id, tf in postings:
            # Postings for documents outside the candidate set are skipped.
            if doc_id not in scores:
                continue
            doc_len = df_sample.at[doc_id, 'doc_length']
            # Documents longer than average are penalised in proportion to b.
            length_norm = 1 - b + b * (doc_len / avg_dl)
            tf_component = (tf * (k1 + 1)) / (tf + k1 * length_norm)
            scores[doc_id] += idf * tf_component
    return scores

print("검색 실험 및 데이터셋 생성")

# One dict per (query, document, model) observation; becomes the CSV rows.
regression_data = []

# query id -> list of relevant document ids, all as strings.
qrels_dict = {}
for item in qrels_data:
    raw_qid = item.get('query-id')
    raw_doc_id = item.get('corpus-id')
    # Check presence BEFORE stringifying: str(None) is the truthy string
    # 'None', which would otherwise slip through as a bogus identifier.
    if raw_qid is None or raw_doc_id is None:
        continue
    qid = str(raw_qid)
    doc_id = str(raw_doc_id)
    if qid and doc_id:
        qrels_dict.setdefault(qid, []).append(doc_id)

# query id (as string) -> query text; entries without an '_id' are ignored.
queries_map = {}
for q in queries_data:
    if '_id' in q:
        queries_map[str(q['_id'])] = q['text']

print(f"매핑 확인 - Qrels: {len(qrels_dict)}, Queries: {len(queries_map)}")

# Counters reported once the evaluation loop has finished.
match_count = 0
skip_count_no_query = 0
skip_count_no_doc = 0

# For every judged query, rank the whole sample with both models and record,
# per relevant document, whether each model placed it in its top 10.
for qid, target_doc_ids in tqdm(qrels_dict.items(), desc="Processing"):

    if qid not in queries_map:
        skip_count_no_query += 1
        continue

    q_tokens = tokenize_query(queries_map[qid])
    if not q_tokens:
        # Nothing to search with; such queries are not counted anywhere.
        continue

    # Keep only the relevant documents that are part of the sample.
    valid_targets = [d for d in target_doc_ids if d in df_sample.index]
    if not valid_targets:
        skip_count_no_doc += 1
        continue

    match_count += 1

    # Ten best-scoring document ids per model, BIM first and then BM25.
    top10_by_model = []
    for model_name, scorer in (('BIM', get_bim_score), ('BM25', get_bm25_score)):
        model_scores = scorer(q_tokens, df_sample.index)
        ranked = sorted(
            model_scores.items(), key=lambda pair: pair[1], reverse=True
        )
        top10_by_model.append((model_name, [doc for doc, _ in ranked[:10]]))

    q_len = len(q_tokens)
    for target_doc in valid_targets:
        # Features shared by the BIM and BM25 rows of this (query, doc) pair.
        features = {
            'qid': qid,
            'doc_id': target_doc,
            'doc_length': df_sample.at[target_doc, 'doc_length'],
            'query_length': q_len,
        }
        topic_probs = df_sample.at[target_doc, 'topic_probs']
        features.update(
            (f'topic_{idx}', prob) for idx, prob in enumerate(topic_probs)
        )

        for model_name, top10 in top10_by_model:
            regression_data.append({
                **features,
                'model': model_name,
                'success': int(target_doc in top10),
            })

# Summary of how many queries were evaluated and why the rest were skipped.
print("\n[작업 결과]")
print(f"매칭 성공 쿼리: {match_count}")
print(f"텍스트 없는 쿼리: {skip_count_no_query}")
print(f"정답 문서 없는 쿼리: {skip_count_no_doc}")

# All index lookups are done; release the database connection.
conn.close()

if not regression_data:
    print("데이터 생성 실패")
else:
    save_path = os.path.join(DATA_DIR, 'regression_dataset.csv')
    df_regression = pd.DataFrame(regression_data)
    df_regression.to_csv(save_path, index=False)

    print(f"저장 완료: {save_path}")
    print(f"총 데이터: {len(df_regression)}건")
    print("\n[모델별 성공률]")
    # Mean of the 0/1 success flag = share of targets found in the top 10.
    success_rate_by_model = df_regression.groupby('model')['success'].mean()
    print(success_rate_by_model)

데이터 로드 중
SQLite DB 초기화
역색인 구축 (패딩 토큰 제외)


Indexing: 100%|██████████| 5000/5000 [00:34<00:00, 143.55it/s]


역색인 구축 완료
검색 실험 및 데이터셋 생성
매핑 확인 - Qrels: 1454, Queries: 1454


Processing: 100%|██████████| 1454/1454 [02:33<00:00,  9.47it/s]


[작업 결과]
매칭 성공 쿼리: 1372
텍스트 없는 쿼리: 0
정답 문서 없는 쿼리: 82
저장 완료: ..\final_data\regression_dataset.csv
총 데이터: 10160건

[모델별 성공률]
model
BIM     0.482087
BM25    0.685236
Name: success, dtype: float64



