In [3]:
import os
import pickle
import pandas as pd
import numpy as np
from kiwipiepy import Kiwi
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from tqdm import tqdm
import warnings

# Silence every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Input pickles are read from ../data; generated artifacts go to ../final_data.
CORPUS_PATH = os.path.join('..', 'data', 'corpus.pkl')
QRELS_PATH = os.path.join('..', 'data', 'qrels.pkl')
DATA_DIR = os.path.join('..', 'final_data')

# Create the output directory up front so later save calls cannot fail on it.
os.makedirs(DATA_DIR, exist_ok=True)

print("데이터 로드 (Corpus + Qrels)")
# NOTE: pickle.load can execute arbitrary code; only point these paths at trusted files.
with open(CORPUS_PATH, 'rb') as corpus_file:
    corpus_data = pickle.load(corpus_file)
with open(QRELS_PATH, 'rb') as qrels_file:
    qrels_data = pickle.load(qrels_file)

# Normalise the document-body column name to 'text' when only 'body' is present.
df = pd.DataFrame(corpus_data)
if 'body' in df.columns and 'text' not in df.columns:
    df = df.rename(columns={'body': 'text'})

데이터 로드 (Corpus + Qrels)


In [5]:
# Quick sanity check of the loaded relevance judgments: type, first record, size.
print(f"데이터 타입: {type(qrels_data)}")
if not isinstance(qrels_data, list) or len(qrels_data) == 0:
    # Anything other than a non-empty list is unexpected for the later cells.
    print("오류")
else:
    print(f"첫 번째 데이터 샘플: {qrels_data[0]}")
    print(f"데이터 길이: {len(qrels_data)}")

데이터 타입: <class 'list'>
첫 번째 데이터 샘플: {'query-id': 'query_000001', 'corpus-id': '부머(폴아웃: 뉴 베가스)', 'score': 1}
데이터 길이: 6289


In [7]:
print("Smart Sampling 수행 (정답 문서 보존)")
# Every corpus id that appears in at least one relevance judgment.
relevant_doc_ids = {item['corpus-id'] for item in qrels_data if 'corpus-id' in item}

# Choose the id column explicitly and fail with a clear message when neither
# candidate exists (previously this surfaced later as an opaque KeyError).
id_col = next((col for col in ('_id', 'doc_id') if col in df.columns), None)
if id_col is None:
    raise KeyError("corpus DataFrame has neither an '_id' nor a 'doc_id' column")
print(f"매칭 기준 컬럼: {id_col}")

# Split the corpus into judged (relevant) documents and everything else.
is_relevant = df[id_col].isin(relevant_doc_ids)
df_relevant = df[is_relevant]
df_others = df[~is_relevant]

print(f"  - 정답 문서 수: {len(df_relevant)}개 (필수 포함)")

target_n = 5000
needed_n = target_n - len(df_relevant)

if needed_n > 0:
    # Top up with random non-relevant documents. The request is clamped to the
    # number available so a small corpus yields a smaller sample instead of
    # DataFrame.sample raising ValueError.
    df_random = df_others.sample(n=min(needed_n, len(df_others)), random_state=42)
    df_sample = pd.concat([df_relevant, df_random])
else:
    # NOTE(review): when there are at least target_n relevant documents this
    # branch down-samples them, so some judged documents are dropped even though
    # the message above says they are always included. Confirm whether target_n
    # or full preservation of relevant documents should win.
    df_sample = df_relevant.sample(n=target_n, random_state=42)

# Shuffle so relevant and random documents are interleaved, then renumber rows.
df_sample = df_sample.sample(frac=1, random_state=42).reset_index(drop=True)
print(f"  - 최종 샘플링 완료: {len(df_sample)}개")

Smart Sampling 수행 (정답 문서 보존)
매칭 기준 컬럼: _id
  - 정답 문서 수: 6194개 (필수 포함)
  - 최종 샘플링 완료: 5000개


In [10]:
# Shared Kiwi tokenizer instance; preprocess_dual calls kiwi.tokenize on it.
# NOTE(review): num_workers=0 presumably lets Kiwi choose the worker count
# itself — confirm against the kiwipiepy documentation.
kiwi = Kiwi(num_workers=0)

def preprocess_dual(text, target_pos=('NNG', 'NNP', 'VV', 'VA', 'MAG')):
    """Tokenize ``text`` and build two parallel token views.

    Args:
        text: Raw document text. Anything that is not a ``str`` yields two
            empty lists.
        target_pos: Part-of-speech tags whose tokens are kept verbatim.
            Defaults to the tag set used by the original implementation.

    Returns:
        A ``(padded, meaningful)`` tuple of lists.

        * ``padded`` has one entry per token: the surface form when the token's
          tag is in ``target_pos`` and the form is longer than one character,
          otherwise a run of ``'O'`` of the same length as the form.
        * ``meaningful`` contains only the kept surface forms, in order.

        If tokenization or token processing raises an ordinary exception,
        two empty lists are returned (best effort).
    """
    if not isinstance(text, str):
        return [], []

    try:
        padded = []
        meaningful = []
        for token in kiwi.tokenize(text):
            form = token.form
            if token.tag in target_pos and len(form) > 1:
                padded.append(form)
                meaningful.append(form)
            else:
                # Mask the token but keep its character length.
                padded.append('O' * len(form))
        return padded, meaningful
    except Exception:
        # Deliberately narrower than a bare ``except``: KeyboardInterrupt and
        # SystemExit must propagate so a long-running apply can be interrupted
        # instead of silently producing empty rows.
        return [], []

# Register progress_apply on pandas objects.
tqdm.pandas()
print("데이터 전처리 (Length Preserving Padding + LDA)")

# Run the tokenizer once per document; each row expands to (padded, meaningful).
token_columns = df_sample['text'].progress_apply(
    lambda doc: pd.Series(preprocess_dual(doc))
)
df_sample[['tokens_padded', 'tokens_lda']] = token_columns

# Document length = total number of characters across the padded tokens.
df_sample['doc_length'] = df_sample['tokens_padded'].apply(
    lambda toks: sum(map(len, toks))
)

데이터 전처리 (Length Preserving Padding + LDA)


100%|██████████| 5000/5000 [14:36<00:00,  5.71it/s]  


In [11]:
print("LDA 토픽 모델 학습")
lda_tokens = df_sample['tokens_lda'].tolist()

# Build the vocabulary, then drop terms seen in fewer than 10 documents or in
# more than half of them before converting documents to bag-of-words.
dictionary = corpora.Dictionary(lda_tokens)
dictionary.filter_extremes(no_below=10, no_above=0.5)
corpus = list(map(dictionary.doc2bow, lda_tokens))

# Training settings are gathered in one place; random_state fixes the seed.
lda_settings = dict(
    num_topics=10,
    random_state=42,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = LdaModel(corpus=corpus, id2word=dictionary, **lda_settings)

def get_topic_probs(tokens):
    """Return the dense topic-probability vector for one tokenized document.

    Args:
        tokens: List of tokens for a single document.

    Returns:
        A list with one entry per topic of ``lda_model``. For an empty token
        list a uniform distribution is returned; otherwise entry ``i`` holds
        the probability reported by ``lda_model.get_document_topics`` for
        topic ``i`` (topics the model does not report stay at 0.0).
    """
    # Derive the vector length from the model rather than hard-coding it, so
    # the function stays correct if the model is retrained with another
    # number of topics.
    n_topics = lda_model.num_topics
    if not tokens:
        return [1.0 / n_topics] * n_topics

    bow = dictionary.doc2bow(tokens)
    topic_vec = [0.0] * n_topics
    for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0.0):
        topic_vec[topic_id] = prob
    return topic_vec

print("토픽 확률 추론")
# One topic-probability vector per sampled document.
df_sample['topic_probs'] = df_sample['tokens_lda'].progress_apply(get_topic_probs)

# Persist the enriched sample and the trained model side by side in DATA_DIR.
sample_save_path = os.path.join(DATA_DIR, 'sample.pkl')
model_save_path = os.path.join(DATA_DIR, 'lda.model')
df_sample.to_pickle(sample_save_path)
lda_model.save(model_save_path)

print("작업 완료")
print(f"저장 경로: {DATA_DIR}")

# Show the first document next to its padded tokens as a spot check.
first_text = df_sample['text'].iloc[0]
first_padded = df_sample['tokens_padded'].iloc[0]
print("\n[Padding 결과 예시]")
print(f"Original Text snippet: {first_text[:30]}...")
print(f"Padded Tokens: {first_padded[:10]}")

# Dump every learned topic with its top-weighted terms.
print("\n[LDA 토픽 목록]")
for topic_no, topic_terms in lda_model.print_topics(-1):
    print(f"Topic {topic_no}: {topic_terms}")

LDA 토픽 모델 학습
토픽 확률 추론


100%|██████████| 5000/5000 [00:03<00:00, 1282.33it/s]


작업 완료
저장 경로: ..\final_data

[Padding 결과 예시]
Original Text snippet: 야인시대 합성물(심영물)에 등장하는 인물과 그 인물의 ...
Padded Tokens: ['야인시대', '합성', 'O', 'O', 'O', '영물', 'O', 'O', '등장', 'O']

[LDA 토픽 목록]
Topic 0: 0.006*"지역" + 0.005*"대통령" + 0.005*"사건" + 0.005*"정부" + 0.004*"서울" + 0.004*"국가" + 0.004*"학교" + 0.004*"대학" + 0.003*"선거" + 0.003*"의원"
Topic 1: 0.011*"방송" + 0.007*"멤버" + 0.006*"영상" + 0.006*"출연" + 0.005*"활동" + 0.005*"노래" + 0.005*"코너" + 0.005*"부르" + 0.004*"영화" + 0.004*"앨범"
Topic 2: 0.012*"조선" + 0.006*"고려" + 0.005*"일본" + 0.005*"기록" + 0.004*"황제" + 0.004*"신라" + 0.004*"시대" + 0.004*"인물" + 0.004*"고구려" + 0.004*"조조"
Topic 3: 0.015*"공격" + 0.011*"스킬" + 0.009*"레벨" + 0.008*"효과" + 0.007*"카드" + 0.006*"증가" + 0.006*"마법" + 0.005*"추가" + 0.005*"데미지" + 0.005*"상대"
Topic 4: 0.023*"경기" + 0.017*"시즌" + 0.015*"선수" + 0.011*"리그" + 0.010*"우승" + 0.008*"기록" + 0.007*"감독" + 0.005*"상대" + 0.005*"승리" + 0.005*"진출"
Topic 5: 0.004*"인간" + 0.004*"캐릭터" + 0.003*"주인공" + 0.003*"죽이" + 0.003*"장면" + 0.003*"모르" + 0.003*"능력" + 0.003*"당하" + 0.003*"아버지"