# 1. 데이터 준비 및 인덱싱 (Data Setup & Indexing) - 샘플링X 후처리X
- KomuRetrieval 전체 데이터셋 다운로드
- Kiwi 형태소 분석기를 이용한 전처리
- Inverted Index 구축 및 SQLite 저장

## 1.1 데이터셋 로드 (전체 데이터)
HuggingFace에서 데이터를 다운로드하고 pickle로 저장.

In [1]:
import json
import logging
import os
import pickle
import sqlite3
from collections import defaultdict, Counter
from pathlib import Path
from typing import List

from datasets import load_dataset
from kiwipiepy import Kiwi
from tqdm.notebook import tqdm

# Project layout, resolved relative to the current working directory's parent.
PROJECT_ROOT = Path('..').resolve()
DATA_DIR = PROJECT_ROOT.joinpath('data')
DB_DIR = PROJECT_ROOT.joinpath('database')
DB_PATH = DB_DIR.joinpath('inverted_index.db')

# Make sure both output directories exist before anything is written to them.
for d in (DATA_DIR, DB_DIR):
    d.mkdir(parents=True, exist_ok=True)

def load_and_save_dataset():
    """Download the full KomuRetrieval dataset and cache each part as a pickle.

    Loads the queries, corpus and qrels splits with ``load_dataset``,
    materializes each one as a list, and writes them to ``corpus.pkl``,
    ``queries.pkl`` and ``qrels.pkl`` under ``DATA_DIR``.

    Returns:
        The corpus records as a list.
    """
    repo = "junyoungson/KomuRetrieval"

    # Fetch the three parts of the dataset (full data, no sampling).
    queries = load_dataset(repo, "queries", split="queries")
    corpus = load_dataset(repo, "corpus", split="corpus")
    qrels = load_dataset(repo, split="test")

    # Materialize each split as a plain list, with a progress bar per split.
    materialized = {}
    for filename, label, split in (
        ('corpus.pkl', "Corpus Loading", corpus),
        ('queries.pkl', "Queries Loading", queries),
        ('qrels.pkl', "Qrels Loading", qrels),
    ):
        materialized[filename] = list(tqdm(split, desc=label))

    # Persist every list so later runs can skip the download.
    for filename, records in materialized.items():
        with open(DATA_DIR / filename, 'wb') as f:
            pickle.dump(records, f)

    return materialized['corpus.pkl']

In [2]:
# Reuse the cached corpus pickle when present; otherwise download and cache the dataset first.
corpus_pickle = DATA_DIR / 'corpus.pkl'
if corpus_pickle.exists():
    with corpus_pickle.open('rb') as f:
        corpus = pickle.load(f)
else:
    corpus = load_and_save_dataset()

## 1.2 형태소 분석 및 토큰화
Kiwi를 사용하여 일반명사(NNG), 고유명사(NNP), 동사(VV), 형용사(VA), 일반부사(MAG) 태그의 형태소 중 2글자 이상인 것만 추출.

In [3]:
# Initialize Kiwi, the morphological analyzer shared by tokenize() below.
# NOTE(review): num_workers=-1 presumably lets Kiwi choose the worker count
# itself -- confirm against the kiwipiepy documentation.
kiwi = Kiwi(num_workers=-1)

def tokenize(text: str) -> List[str]:
    """Extract index terms from *text* using the module-level ``kiwi`` analyzer.

    Only tokens tagged NNG, NNP, VV, VA or MAG whose surface form is longer
    than one character are kept.

    Args:
        text: Raw text to analyze. ``None`` or an empty string yields ``[]``.

    Returns:
        The surface forms of the kept tokens, in the order the analyzer
        produced them. If analysis raises, a warning is logged and an empty
        list is returned so one bad document does not abort a long run.
    """
    if not text:
        return []

    # Only NUL characters are removed here; all other characters are kept.
    clean_text = text.replace('\x00', '')
    useful_tags = {'NNG', 'NNP', 'VV', 'VA', 'MAG'}
    try:
        return [
            t.form
            for t in kiwi.tokenize(clean_text)
            if t.tag in useful_tags and len(t.form) > 1
        ]
    except Exception:
        # Deliberately best-effort, but `except Exception` (not a bare except)
        # lets KeyboardInterrupt/SystemExit propagate so the indexing loop can
        # still be interrupted, and the failure is reported instead of hidden.
        logging.getLogger(__name__).warning(
            "Tokenization failed; returning no tokens (text length=%d)",
            len(clean_text),
            exc_info=True,
        )
        return []

## 1.3 Inverted Index 구축 및 DB 저장
시간이 다소 소요될 수 있음.

In [4]:
def build_index_db(corpus):
    """Tokenize *corpus* and rebuild the SQLite inverted index at ``DB_PATH``.

    Drops and recreates four tables:

    * ``documents``      -- doc_id, title, token count, JSON-encoded token list
    * ``inverted_index`` -- one (term, doc_id, tf) row per posting
    * ``term_stats``     -- number of documents containing each term
    * ``statistics``     -- ``N`` (document count), ``avgdl`` (mean token
      count per document) and ``total_terms`` (number of distinct terms)

    The connection is closed even when indexing fails part-way, so a failed
    run does not leave an open handle (and its pending transaction) behind;
    changes not yet committed at that point are discarded.

    Args:
        corpus: Sized iterable of mappings providing ``'_id'``, ``'title'``
            and ``'text'``.
    """
    doc_batch_size = 1000
    posting_batch_size = 10000
    insert_document = "INSERT INTO documents VALUES (?, ?, ?, ?)"
    insert_posting = "INSERT INTO inverted_index VALUES (?, ?, ?)"

    conn = sqlite3.connect(str(DB_PATH))
    try:
        cursor = conn.cursor()

        # Start from empty tables so a re-run never mixes old and new rows.
        for table in ('documents', 'inverted_index', 'statistics', 'term_stats'):
            cursor.execute(f"DROP TABLE IF EXISTS {table}")

        cursor.execute("CREATE TABLE documents (doc_id TEXT PRIMARY KEY, title TEXT, length INTEGER, tokens TEXT)")
        cursor.execute("CREATE TABLE inverted_index (term TEXT, doc_id TEXT, tf INTEGER, PRIMARY KEY (term, doc_id))")
        cursor.execute("CREATE TABLE statistics (key TEXT PRIMARY KEY, value REAL)")
        cursor.execute("CREATE TABLE term_stats (term TEXT PRIMARY KEY, df INTEGER)")

        # term -> {doc_id: tf}; kept in memory until every document is seen.
        inverted_index = defaultdict(dict)
        total_tokens = 0
        doc_rows = []

        # Pass 1: tokenize each document and write `documents` rows in batches.
        print("Index 구축 시작")
        for doc in tqdm(corpus, desc="Indexing"):
            doc_id = doc['_id']
            tokens = tokenize(f"{doc['title']} {doc['text']}")

            total_tokens += len(tokens)
            doc_rows.append(
                (doc_id, doc['title'], len(tokens), json.dumps(tokens, ensure_ascii=False))
            )

            for term, tf in Counter(tokens).items():
                inverted_index[term][doc_id] = tf

            # Commit per batch to bound the size of the pending transaction.
            if len(doc_rows) >= doc_batch_size:
                cursor.executemany(insert_document, doc_rows)
                conn.commit()
                doc_rows = []

        if doc_rows:
            cursor.executemany(insert_document, doc_rows)
            conn.commit()

        # Pass 2: flush postings in batches and collect per-term document counts.
        posting_rows = []
        term_stat_rows = []
        for term, postings in tqdm(inverted_index.items(), desc="Saving Inverted Index"):
            term_stat_rows.append((term, len(postings)))
            posting_rows.extend((term, doc_id, tf) for doc_id, tf in postings.items())

            if len(posting_rows) >= posting_batch_size:
                cursor.executemany(insert_posting, posting_rows)
                conn.commit()
                posting_rows = []

        if posting_rows:
            cursor.executemany(insert_posting, posting_rows)

        cursor.executemany("INSERT INTO term_stats VALUES (?, ?)", term_stat_rows)

        # Corpus-level statistics; avgdl falls back to 0 for an empty corpus.
        n_docs = len(corpus)
        avgdl = total_tokens / n_docs if n_docs > 0 else 0
        cursor.executemany("INSERT INTO statistics VALUES (?, ?)", [
            ('N', n_docs), ('avgdl', avgdl), ('total_terms', len(inverted_index))
        ])

        # Secondary indexes for lookups by term and by document.
        # NOTE(review): the (term, doc_id) primary key already serves lookups
        # by term, so idx_term may be redundant -- kept to leave the schema
        # unchanged.
        cursor.execute("CREATE INDEX idx_term ON inverted_index(term)")
        cursor.execute("CREATE INDEX idx_doc_id ON inverted_index(doc_id)")

        conn.commit()
    finally:
        conn.close()
    print("DB 구축 완료.")

# Build the index for the loaded corpus; this drops and recreates the tables in DB_PATH.
build_index_db(corpus)

Index 구축 시작


Indexing:   0%|          | 0/50222 [00:00<?, ?it/s]

Saving Inverted Index:   0%|          | 0/475032 [00:00<?, ?it/s]

DB 구축 완료.
