# 1. 데이터 준비 및 인덱싱 - 샘플링O 후처리X
- 데이터 후처리: X (null값 제거만)
- 샘플링: 앞에서부터 5000개

In [1]:
import os
import sqlite3
import json
import pickle
from collections import defaultdict, Counter
from tqdm.notebook import tqdm
from kiwipiepy import Kiwi
from pathlib import Path

# Resolve the project layout relative to the parent of the working directory.
PROJECT_ROOT = Path('..').resolve()
DATA_DIR = PROJECT_ROOT / 'data'
DB_DIR = PROJECT_ROOT / 'database'
DB_PATH = DB_DIR / 'inverted_index_sample.db'

# Make sure the data and database directories exist before touching them.
for directory in (DATA_DIR, DB_DIR):
    directory.mkdir(parents=True, exist_ok=True)

# Load the pickled corpus, failing early when the file is missing.
# NOTE(review): pickle.load executes arbitrary code from the file; only
# load corpus.pkl from a trusted source.
corpus_path = DATA_DIR / 'corpus.pkl'
if not corpus_path.exists():
    raise FileNotFoundError("corpus.pkl 없음")

with corpus_path.open('rb') as f:
    full_corpus = pickle.load(f)

# Sample the first SAMPLE_SIZE documents (no shuffling).
SAMPLE_SIZE = 5000
corpus = full_corpus[:SAMPLE_SIZE]
print(f"Dirty Indexing 시작: {len(corpus)}개 문서 (후처리 미적용)")

# Shared Kiwi analyzer instance used by tokenize_dirty.
kiwi = Kiwi(num_workers=-1)

# POS tags kept for indexing (per Kiwi's tag set: general/proper nouns,
# verbs, adjectives, general adverbs). Built once as a frozenset so the
# membership test is O(1) and not re-created on every call.
_USEFUL_TAGS = frozenset({'NNG', 'NNP', 'VV', 'VA', 'MAG'})


def tokenize_dirty(text: str) -> list:
    """Tokenize *text* with Kiwi, applying no cleanup beyond NUL removal.

    Args:
        text: Raw document text. Falsy values (``None``, ``""``) yield ``[]``.

    Returns:
        The surface forms of tokens whose tag is in ``_USEFUL_TAGS`` and whose
        form is longer than one character. Returns ``[]`` if tokenization
        fails (best-effort: one bad document must not abort the whole run).
    """
    if not text:
        return []
    # Only NUL characters are removed; everything else is left "dirty".
    text = text.replace('\x00', '')
    try:
        tokens = kiwi.tokenize(text)
        return [
            t.form
            for t in tokens
            if t.tag in _USEFUL_TAGS and len(t.form) > 1
        ]
    except Exception:
        # Deliberately swallow tokenizer errors, but unlike a bare `except:`
        # this lets KeyboardInterrupt / SystemExit propagate so a long
        # indexing run can still be interrupted.
        return []

def build_index_db(corpus):
    """Build the SQLite inverted-index database at ``DB_PATH`` from *corpus*.

    Drops and recreates the ``documents``, ``inverted_index``, ``statistics``
    and ``term_stats`` tables, tokenizes every document with
    ``tokenize_dirty`` (title and text joined by a space), and stores:

    * ``documents``: doc id, title, token count, JSON-encoded token list
    * ``inverted_index``: (term, doc_id, term frequency)
    * ``term_stats``: (term, document frequency)
    * ``statistics``: ``N`` (document count), ``avgdl`` (mean token count),
      ``total_terms`` (number of distinct terms)

    Args:
        corpus: A sized iterable of mappings, each providing the keys
            ``'_id'``, ``'title'`` and ``'text'``.

    Raises:
        sqlite3.Error: On database failures (e.g. a duplicate ``_id``).
        KeyError: If a document lacks one of the required keys.

    The connection is always closed, even when an error occurs part-way;
    batches committed before the error remain in the database.
    """
    insert_document = "INSERT INTO documents VALUES (?, ?, ?, ?)"
    insert_posting = "INSERT INTO inverted_index VALUES (?, ?, ?)"

    conn = sqlite3.connect(str(DB_PATH))
    try:
        cursor = conn.cursor()

        # Start from a clean schema. Table names come from this fixed list,
        # so the f-string is not exposed to external input.
        for table in ['documents', 'inverted_index', 'statistics', 'term_stats']:
            cursor.execute(f"DROP TABLE IF EXISTS {table}")

        cursor.execute("CREATE TABLE documents (doc_id TEXT PRIMARY KEY, title TEXT, length INTEGER, tokens TEXT)")
        cursor.execute("CREATE TABLE inverted_index (term TEXT, doc_id TEXT, tf INTEGER, PRIMARY KEY (term, doc_id))")
        cursor.execute("CREATE TABLE statistics (key TEXT PRIMARY KEY, value REAL)")
        cursor.execute("CREATE TABLE term_stats (term TEXT PRIMARY KEY, df INTEGER)")

        # term -> {doc_id -> term frequency}, held in memory until all
        # documents have been tokenized.
        inverted_index = defaultdict(lambda: defaultdict(int))
        doc_lengths = {}
        doc_data = []

        for doc in tqdm(corpus, desc="Indexing (Dirty)"):
            doc_id = doc['_id']
            full_text = f"{doc['title']} {doc['text']}"
            tokens = tokenize_dirty(full_text)

            doc_lengths[doc_id] = len(tokens)
            doc_data.append((doc_id, doc['title'], len(tokens), json.dumps(tokens, ensure_ascii=False)))

            for term, tf in Counter(tokens).items():
                inverted_index[term][doc_id] = tf

            # Flush document rows in batches of 1000.
            if len(doc_data) >= 1000:
                cursor.executemany(insert_document, doc_data)
                conn.commit()
                doc_data = []

        if doc_data:
            cursor.executemany(insert_document, doc_data)
            conn.commit()

        index_data = []
        term_stats_data = []

        for term, postings in tqdm(inverted_index.items(), desc="Saving Index"):
            # Document frequency = number of documents containing the term.
            term_stats_data.append((term, len(postings)))
            index_data.extend((term, doc_id, tf) for doc_id, tf in postings.items())

            # Flush postings once at least 10000 rows are pending.
            if len(index_data) >= 10000:
                cursor.executemany(insert_posting, index_data)
                conn.commit()
                index_data = []

        if index_data:
            cursor.executemany(insert_posting, index_data)

        cursor.executemany("INSERT INTO term_stats VALUES (?, ?)", term_stats_data)

        N = len(corpus)
        # Guard against an empty corpus to avoid division by zero.
        avgdl = sum(doc_lengths.values()) / N if N > 0 else 0
        cursor.executemany("INSERT INTO statistics VALUES (?, ?)", [
            ('N', N), ('avgdl', avgdl), ('total_terms', len(inverted_index))
        ])

        # Secondary indexes are created after the bulk load.
        cursor.execute("CREATE INDEX idx_term ON inverted_index(term)")
        cursor.execute("CREATE INDEX idx_doc_id ON inverted_index(doc_id)")

        conn.commit()
    finally:
        # Always release the database handle, even if indexing failed.
        conn.close()
    print(f"DB 구축 완료: {DB_PATH}")

# Build the index database from the sampled corpus (recreates all tables).
build_index_db(corpus)

Dirty Indexing 시작: 5000개 문서 (후처리 미적용)


Indexing (Dirty):   0%|          | 0/5000 [00:00<?, ?it/s]

Saving Index:   0%|          | 0/153410 [00:00<?, ?it/s]

DB 구축 완료: C:\Users\cse\Desktop\xeoxaxeo\NLP\database\inverted_index_sample.db
