In [1]:
!pip install gensim kiwipiepy pandas numpy tqdm




[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import pickle
import pandas as pd
import numpy as np
from kiwipiepy import Kiwi
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from tqdm import tqdm
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Path configuration: inputs live under ../data, outputs go to ../final_data.
BASE_DIR = '..'
_INPUT_DIR = os.path.join(BASE_DIR, 'data')
DATA_DIR = os.path.join(BASE_DIR, 'final_data')
CORPUS_PATH = os.path.join(_INPUT_DIR, 'corpus.pkl')
QRELS_PATH = os.path.join(_INPUT_DIR, 'qrels.pkl')

os.makedirs(DATA_DIR, exist_ok=True)


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


print("데이터 로드")
corpus_data = _load_pickle(CORPUS_PATH)
qrels_data = _load_pickle(QRELS_PATH)

df = pd.DataFrame(corpus_data)
# Normalise the text column name: fall back to 'body' when 'text' is absent.
if 'body' in df.columns and 'text' not in df.columns:
    df = df.rename(columns={'body': 'text'})

# Name of the document-ID column: '_id' when present, otherwise 'doc_id'.
id_col = 'doc_id' if '_id' not in df.columns else '_id'

print(f"전체 문서 수: {len(df)}개")
print("스마트 샘플링 수행")

데이터 로드
전체 문서 수: 50222개
스마트 샘플링 수행


In [3]:
# Collect the IDs of every document referenced by a relevance judgment.
relevant_ids = {item['corpus-id'] for item in qrels_data if 'corpus-id' in item}

# Split the corpus into judged (relevant) documents and all remaining ones.
is_relevant = df[id_col].isin(relevant_ids)
df_relevant = df[is_relevant]
df_others = df[~is_relevant]

print(f" - 필수 포함(정답) 문서: {len(df_relevant)}개")

TARGET_N = 5000
needed_n = TARGET_N - len(df_relevant)

if needed_n > 0:
    # Top up with random non-relevant documents. Clamp the request so a corpus
    # with fewer than `needed_n` remaining documents does not raise ValueError.
    fill_n = min(needed_n, len(df_others))
    df_random = df_others.sample(n=fill_n, random_state=42)
    df_sample = pd.concat([df_relevant, df_random])
else:
    # There are at least TARGET_N relevant documents: keep a random subset of
    # them, so not every relevant document ends up in the sample.
    df_sample = df_relevant.sample(n=TARGET_N, random_state=42)

# Shuffle and re-index.
# NOTE: this overwrites `df`, so re-running this cell alone samples from the
# already-sampled frame; re-run the loading cell first.
df = df_sample.sample(frac=1, random_state=42).reset_index(drop=True)
print(f" - 최종 샘플링 완료: {len(df)}개")


 - 필수 포함(정답) 문서: 6194개
 - 최종 샘플링 완료: 5000개


In [4]:
# Preprocessing and LDA

# Shared Kiwi tokenizer instance; preprocess_dual calls kiwi.tokenize on it.
# NOTE(review): num_workers=0 presumably lets Kiwi choose the thread count
# itself — confirm against the kiwipiepy documentation.
kiwi = Kiwi(num_workers=0)

def preprocess_dual(text, tokenizer=None):
    """Tokenize `text` and return a padded view and an LDA view of its tokens.

    A token is kept when its tag is one of NNG, NNP, VV, VA or MAG and its
    surface form is longer than one character. Every other token is replaced
    in the padded view by a run of 'O' characters of the same length and is
    left out of the LDA view.

    Args:
        text: Raw document text. Any non-string value yields two empty lists.
        tokenizer: Optional object exposing ``tokenize(text)`` that returns
            tokens with ``.tag`` and ``.form`` attributes. When omitted, the
            module-level ``kiwi`` instance is used.

    Returns:
        A ``(padded, meaningful)`` pair of lists: `padded` has one entry per
        token, `meaningful` holds only the kept forms in their original
        order. Both are empty if tokenization raises an ordinary exception.
    """
    if not isinstance(text, str):
        return [], []

    # NOTE(review): tags are matched exactly; any suffixed tag variants the
    # tokenizer may emit are treated as padding — confirm this is intended.
    target_pos = {'NNG', 'NNP', 'VV', 'VA', 'MAG'}
    padded, meaningful = [], []
    try:
        analyzer = kiwi if tokenizer is None else tokenizer
        for tok in analyzer.tokenize(text):
            form = tok.form
            if tok.tag in target_pos and len(form) > 1:
                padded.append(form)
                meaningful.append(form)
            else:
                padded.append('O' * len(form))
    except Exception:
        # Best-effort: a document that fails to tokenize is treated as empty.
        # Only Exception is caught so KeyboardInterrupt/SystemExit still stop
        # a long-running batch.
        return [], []
    return padded, meaningful

# Register the progress_apply / progress_map helpers on pandas objects.
tqdm.pandas()
print("전처리 수행 (Padding + LDA 토큰)")

# Apply the tokenizer once per document and unpack the (padded, lda) pairs
# afterwards; this avoids constructing a temporary pd.Series for every row.
token_pairs = df['text'].progress_apply(preprocess_dual)
df['tokens_padded'] = [pair[0] for pair in token_pairs]
df['tokens_lda'] = [pair[1] for pair in token_pairs]

# Document length = total number of characters across all padded tokens.
df['doc_length'] = df['tokens_padded'].apply(lambda toks: sum(len(t) for t in toks))

전처리 수행 (Padding + LDA 토큰)


100%|██████████| 5000/5000 [17:28<00:00,  4.77it/s]  


In [6]:
print("LDA 모델 학습")

# Build the vocabulary from the LDA tokens, then prune it with
# filter_extremes (no_below=10, no_above=0.5).
lda_tokens = df['tokens_lda'].tolist()
dictionary = corpora.Dictionary(lda_tokens)
dictionary.filter_extremes(no_below=10, no_above=0.5)

# Bag-of-words encoding of each document against the pruned vocabulary.
corpus = list(map(dictionary.doc2bow, lda_tokens))

# Training settings, kept together so they are easy to spot and tune.
lda_settings = dict(num_topics=10, passes=10, alpha='auto', random_state=42)
lda_model = LdaModel(corpus=corpus, id2word=dictionary, **lda_settings)

def get_topic_probs(tokens, model=None, vocab=None):
    """Return a dense topic-probability vector for a tokenized document.

    The vector length is taken from the model's ``num_topics`` rather than a
    hard-coded constant, so it stays correct if the topic count changes.

    Args:
        tokens: List of tokens for one document. An empty list yields a
            uniform distribution over the topics.
        model: Optional topic model exposing ``num_topics`` and
            ``get_document_topics(bow, minimum_probability=...)``. Defaults
            to the module-level ``lda_model``.
        vocab: Optional dictionary exposing ``doc2bow(tokens)``. Defaults to
            the module-level ``dictionary``.

    Returns:
        A list with one probability per topic, indexed by topic id.
    """
    lda = lda_model if model is None else model
    n_topics = lda.num_topics
    if not tokens:
        return [1.0 / n_topics] * n_topics

    id2word = dictionary if vocab is None else vocab
    bow = id2word.doc2bow(tokens)
    topic_vec = [0.0] * n_topics
    # minimum_probability=0.0 asks the model not to drop low-probability topics.
    for topic_id, prob in lda.get_document_topics(bow, minimum_probability=0.0):
        topic_vec[topic_id] = prob
    return topic_vec

print("토픽 확률 추출")
# One topic-probability vector per document, computed from its LDA tokens.
topic_vectors = df['tokens_lda'].progress_apply(get_topic_probs)
df['topic_probs'] = topic_vectors

# Output locations for the sampled frame and the trained model.
save_path, model_path = (
    os.path.join(DATA_DIR, filename)
    for filename in ('sample.pkl', 'lda_sample.model')
)

# Persist the frame first, then the model.
df.to_pickle(save_path)
lda_model.save(model_path)

print("작업 완료", f"샘플 데이터 저장: {save_path}", sep="\n")

LDA 모델 학습
토픽 확률 추출


100%|██████████| 5000/5000 [00:04<00:00, 1082.57it/s]


작업 완료
샘플 데이터 저장: ..\final_data\sample.pkl
