In [1]:
import os
import sqlite3
import json
import pickle
import re
from collections import defaultdict, Counter
from typing import List
from tqdm.notebook import tqdm
from kiwipiepy import Kiwi
from datasets import load_dataset
from pathlib import Path

# Project layout, resolved relative to the notebook's working directory.
PROJECT_ROOT = Path('..').resolve()
DATA_DIR = PROJECT_ROOT / 'data'
DB_DIR = PROJECT_ROOT / 'database'

DB_PATH = DB_DIR / 'inverted_index_full_clean.db'

# Both directories must exist before the pickles or the SQLite file are touched.
for directory in (DATA_DIR, DB_DIR):
    os.makedirs(directory, exist_ok=True)

# The presence of corpus.pkl alone decides between "reuse cache" and "download".
if (DATA_DIR / 'corpus.pkl').exists():
    print("기존 데이터 로드 중")
    # Only the corpus is needed below, so queries/qrels are not reloaded here.
    with open(DATA_DIR / 'corpus.pkl', 'rb') as fh:
        full_corpus = pickle.load(fh)
else:
    print("데이터셋 다운로드 중")
    queries = load_dataset("junyoungson/KomuRetrieval", "queries", split="queries")
    corpus = load_dataset("junyoungson/KomuRetrieval", "corpus", split="corpus")
    qrels = load_dataset("junyoungson/KomuRetrieval", split="test")

    # Materialize each dataset into a plain list of rows so it can be pickled.
    full_corpus = list(tqdm(corpus, desc="Corpus Loading"))
    full_queries = list(tqdm(queries, desc="Queries Loading"))
    full_qrels = list(tqdm(qrels, desc="Qrels Loading"))

    # Cache all three splits next to each other under DATA_DIR.
    for filename, payload in (
        ('corpus.pkl', full_corpus),
        ('queries.pkl', full_queries),
        ('qrels.pkl', full_qrels),
    ):
        with open(DATA_DIR / filename, 'wb') as fh:
            pickle.dump(payload, fh)

# From here on `corpus` always refers to the in-memory list of documents.
corpus = full_corpus
print(f"전체 데이터 사용: {len(corpus):,}개")

# Shared morphological analyzer instance used by tokenize().
kiwi = Kiwi(num_workers=-1)

# Token forms that tokenize() discards even when their POS tag is accepted.
# Entries are compared against the analyzer's token forms (e.g. '나오', '만들'),
# not against surface words, so they must stay in that form.
STOPWORDS = {
    '나오', '경우', '보이', '이후', '사람', '정도', '자신', '사용', '가능', '대하',
    '위하', '사실', '만들', '등장', '문제', '모습', '시작', '가지', '생각', '따르',
    '이상', '함께', '당시', '상대', '시간', '다시', '상황', '이름', '가장', '또한',
    '모두', '결국', '다만', '많이', '달리', '다르', '물론', '당하', '처음', '현재',
    '같이', '이유', '통하', '역시', '자체', '거의', '매우', '없이', '상태', '바로',
    '인하', '특히', '존재', '들어가', '사이', '다음', '모르', '참고', '부분', '이것',
    '해당', '그냥', '마지막', '필요', '부르', '관련', '떨어지', '대부분', '때문', '아니'
}

# POS tags whose tokens are kept for indexing.
# NOTE(review): presumably nouns (NNG/NNP), verbs (VV), adjectives (VA) and
# general adverbs (MAG) in Kiwi's tag set; suffixed variants such as 'VV-I'
# would not match this exact-tag check — confirm against the kiwipiepy tag table.
_USEFUL_TAGS = frozenset({'NNG', 'NNP', 'VV', 'VA', 'MAG'})

# Text wrapped in ~~...~~ (no DOTALL, so a span must sit on a single line).
_STRIKETHROUGH_RE = re.compile(r'~~.*?~~')
# Runs of standalone Hangul jamo in the ranges ㄱ-ㅎ and ㅏ-ㅣ.
_JAMO_RUN_RE = re.compile(r'[ㄱ-ㅎㅏ-ㅣ]+')
# Runs of two or more of the characters . ? ! ~ -
_PUNCT_RUN_RE = re.compile(r'[\.\?\!~\-]{2,}')


def tokenize(text: str) -> List[str]:
    """Clean ``text`` and return the index terms extracted from it.

    The text is stripped of NUL characters, ``~~...~~`` spans and standalone
    jamo runs, repeated punctuation is collapsed to a single ``'.'``, and the
    result is analyzed with the module-level ``kiwi`` instance.  A token is
    kept when its tag is in ``_USEFUL_TAGS``, its form is longer than one
    character, and its form is not in ``STOPWORDS``.

    Args:
        text: Raw document or query text.  Falsy values (``''``, ``None``)
            yield an empty list without invoking the analyzer.

    Returns:
        The kept token forms, in analyzer order.  An empty list is also
        returned when the analyzer raises an ordinary exception
        (best-effort behavior); the failure is logged as a warning.
    """
    if not text:
        return []

    text = text.replace('\x00', '')
    # Strikethrough spans must be removed before punctuation is collapsed,
    # otherwise the '~~' delimiters themselves would be rewritten to '.'.
    text = _STRIKETHROUGH_RE.sub('', text)
    text = _JAMO_RUN_RE.sub('', text)
    text = _PUNCT_RUN_RE.sub('.', text)

    try:
        tokens = kiwi.tokenize(text)
    except Exception as exc:
        # Stay best-effort for analyzer failures, but do not swallow
        # KeyboardInterrupt/SystemExit (a bare ``except:`` made a long
        # indexing run impossible to interrupt) and do not fail silently.
        import logging
        logging.getLogger(__name__).warning(
            "Kiwi tokenization failed (%r); returning no tokens", exc
        )
        return []

    return [
        t.form
        for t in tokens
        if t.tag in _USEFUL_TAGS and len(t.form) > 1 and t.form not in STOPWORDS
    ]

def build_index_db(corpus):
    """Tokenize every document in ``corpus`` and write an inverted index to SQLite.

    The database at ``DB_PATH`` is rebuilt on every call: the tables
    ``documents``, ``inverted_index``, ``statistics`` and ``term_stats`` are
    dropped and recreated before any row is inserted.

    Args:
        corpus: A sized iterable (``len()`` is used) of mappings that each
            provide the keys ``'_id'``, ``'title'`` and ``'text'``.

    Tables written:
        documents: one row per document — id, title, token count and the
            token list serialized as JSON.
        inverted_index: one row per (term, doc_id) pair with its term
            frequency in that document.
        term_stats: one row per term with the number of documents containing it.
        statistics: ``N`` (``len(corpus)``), ``avgdl`` (mean token count per
            document, 0 for an empty corpus) and ``total_terms`` (number of
            distinct terms).

    Raises:
        sqlite3.Error: propagated from any failing statement.  The connection
            is closed in that case as well; batches committed before the
            failure remain in the database file.
    """
    conn = sqlite3.connect(str(DB_PATH))
    # try/finally guarantees the connection (and its lock on the database
    # file) is released even when tokenization or an INSERT raises midway.
    try:
        cursor = conn.cursor()

        # Table names come from this fixed tuple, so the f-string is not
        # exposed to external input.
        for table in ('documents', 'inverted_index', 'statistics', 'term_stats'):
            cursor.execute(f"DROP TABLE IF EXISTS {table}")

        cursor.execute("CREATE TABLE documents (doc_id TEXT PRIMARY KEY, title TEXT, length INTEGER, tokens TEXT)")
        cursor.execute("CREATE TABLE inverted_index (term TEXT, doc_id TEXT, tf INTEGER, PRIMARY KEY (term, doc_id))")
        cursor.execute("CREATE TABLE statistics (key TEXT PRIMARY KEY, value REAL)")
        cursor.execute("CREATE TABLE term_stats (term TEXT PRIMARY KEY, df INTEGER)")

        # term -> {doc_id -> tf}; held fully in memory until all documents
        # have been tokenized.
        inverted_index = defaultdict(lambda: defaultdict(int))
        doc_lengths = {}
        doc_data = []

        print(f"Index 구축 시작 (Target: {DB_PATH.name})")
        for doc in tqdm(corpus, desc="Indexing"):
            doc_id = doc['_id']
            # Title and body are indexed together as one text.
            full_text = f"{doc['title']} {doc['text']}"
            tokens = tokenize(full_text)

            doc_lengths[doc_id] = len(tokens)
            doc_data.append((doc_id, doc['title'], len(tokens), json.dumps(tokens, ensure_ascii=False)))

            for term, tf in Counter(tokens).items():
                inverted_index[term][doc_id] = tf

            # Flush document rows in batches of 1000 to bound the pending list.
            if len(doc_data) >= 1000:
                cursor.executemany("INSERT INTO documents VALUES (?, ?, ?, ?)", doc_data)
                conn.commit()
                doc_data = []

        if doc_data:
            cursor.executemany("INSERT INTO documents VALUES (?, ?, ?, ?)", doc_data)
            conn.commit()

        index_data = []
        term_stats_data = []

        print("역색인 저장")
        for term, postings in tqdm(inverted_index.items(), desc="Saving Index"):
            # Document frequency is the number of documents in the posting list.
            term_stats_data.append((term, len(postings)))
            for doc_id, tf in postings.items():
                index_data.append((term, doc_id, tf))

            # The size check runs once per term, so a batch may exceed 10000
            # rows by the length of the last term's posting list.
            if len(index_data) >= 10000:
                cursor.executemany("INSERT INTO inverted_index VALUES (?, ?, ?)", index_data)
                conn.commit()
                index_data = []

        if index_data:
            cursor.executemany("INSERT INTO inverted_index VALUES (?, ?, ?)", index_data)

        cursor.executemany("INSERT INTO term_stats VALUES (?, ?)", term_stats_data)

        N = len(corpus)
        avgdl = sum(doc_lengths.values()) / N if N > 0 else 0
        cursor.executemany("INSERT INTO statistics VALUES (?, ?)", [
            ('N', N), ('avgdl', avgdl), ('total_terms', len(inverted_index))
        ])

        print("인덱스 생성")
        # NOTE(review): the (term, doc_id) primary key already provides an
        # index whose leading column is `term`, so idx_term looks redundant;
        # kept so the resulting schema is unchanged.
        cursor.execute("CREATE INDEX idx_term ON inverted_index(term)")
        cursor.execute("CREATE INDEX idx_doc_id ON inverted_index(doc_id)")

        # Final commit covers the last posting batch, term_stats, statistics
        # and the two secondary indexes.
        conn.commit()
    finally:
        conn.close()
    print(f"DB 구축 완료: {DB_PATH}")

# Rebuild the SQLite inverted index from the full in-memory corpus.
build_index_db(corpus)

기존 데이터 로드 중
전체 데이터 사용: 50,222개
Index 구축 시작 (Target: inverted_index_full_clean.db)


Indexing:   0%|          | 0/50222 [00:00<?, ?it/s]

역색인 저장


Saving Index:   0%|          | 0/474939 [00:00<?, ?it/s]

인덱스 생성
DB 구축 완료: C:\Users\cse\Desktop\xeoxaxeo\NLP\database\inverted_index_full_clean.db
