In [3]:
import os
import pickle
import pandas as pd
import numpy as np
from kiwipiepy import Kiwi
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from tqdm import tqdm
import warnings

# Suppress every warning emitted by the libraries used in this notebook.
warnings.filterwarnings('ignore')

# Directory layout: raw pickles are read from ../data, results go to ../data_final.
BASE_DIR = os.pardir
DATA_DIR = os.path.join(BASE_DIR, 'data_final')
CORPUS_PATH = os.path.join(BASE_DIR, 'data', 'corpus.pkl')
QRELS_PATH = os.path.join(BASE_DIR, 'data', 'qrels.pkl')

# Create the output directory up front so the later save steps cannot fail on it.
os.makedirs(DATA_DIR, exist_ok=True)

print("전체 데이터 로드")
# NOTE: pickle.load can execute arbitrary code; only point CORPUS_PATH at trusted files.
with open(CORPUS_PATH, 'rb') as f:
    corpus_data = pickle.load(f)

df = pd.DataFrame(corpus_data)

# The rest of the notebook reads the 'text' column; use 'body' for it when
# 'text' is missing but 'body' is present.
if 'body' in df.columns and 'text' not in df.columns:
    df = df.rename(columns={'body': 'text'})

print(f"전체 문서 수: {len(df)}개")

print("\n[스마트 샘플링 시작]")
# NOTE: pickle.load can execute arbitrary code; only point QRELS_PATH at trusted files.
with open(QRELS_PATH, 'rb') as f:
    qrels_data = pickle.load(f)

# Choose the document-id column: '_id' first, then 'doc_id', and as a last
# resort materialise the row index as an 'index' column.
id_col = '_id' if '_id' in df.columns else 'doc_id'
if id_col not in df.columns:
    df = df.reset_index()
    id_col = 'index'

# Collect the ids of every document referenced by a relevance judgement.
relevant_ids = set()
for item in qrels_data:
    corpus_id = item.get('corpus-id')
    # Test for "missing" explicitly: a plain truthiness check would also
    # discard a legitimate id of 0 (e.g. the first row of the index fallback).
    if corpus_id is None or corpus_id == '':
        continue
    relevant_ids.add(str(corpus_id))

# Compare ids as strings on both sides so int/str mismatches cannot hide matches.
df[id_col] = df[id_col].astype(str)

is_relevant = df[id_col].isin(relevant_ids)
df_relevant = df[is_relevant]
df_others = df[~is_relevant]

print(f"필수 포함 (정답 문서): {len(df_relevant)}개")

TARGET_N = 5000
needed_n = TARGET_N - len(df_relevant)

if needed_n <= 0:
    # There are at least TARGET_N relevant documents: the sample is capped at
    # TARGET_N, so some relevant documents are intentionally left out.
    df_sample = df_relevant.sample(n=TARGET_N, random_state=42)
elif len(df_others) >= needed_n:
    # Top up the relevant documents with random non-relevant ones.
    df_random = df_others.sample(n=needed_n, random_state=42)
    df_sample = pd.concat([df_relevant, df_random])
else:
    # Not enough filler documents: take everything that is available.
    df_sample = pd.concat([df_relevant, df_others])

# Shuffle so relevant and filler documents are interleaved, then renumber rows.
df = df_sample.sample(frac=1, random_state=42).reset_index(drop=True)
print(f"최종 샘플링 완료: {len(df)}개")

전체 데이터 로드
전체 문서 수: 50222개

[스마트 샘플링 시작]
필수 포함 (정답 문서): 6194개
최종 샘플링 완료: 5000개


In [4]:
# Shared Kiwi tokenizer instance; preprocess_dual calls kiwi.tokenize on it.
# NOTE(review): num_workers=0 presumably lets Kiwi use all available cores — confirm against the kiwipiepy docs.
kiwi = Kiwi(num_workers=0)

def preprocess_dual(text, tokenizer=None):
    """Tokenize ``text`` and return two parallel views of it.

    Args:
        text: The document text. Anything that is not a ``str`` yields two
            empty lists.
        tokenizer: Optional object exposing ``tokenize(text)`` that returns
            tokens with ``tag`` and ``form`` attributes. Defaults to the
            notebook-level ``kiwi`` instance.

    Returns:
        A ``(padded, meaningful)`` tuple of lists:
        ``padded`` has one entry per token, where tokens whose tag is not in
        the target set are replaced by ``'O'`` repeated to the token's length;
        ``meaningful`` contains only the forms of tokens in the target set.
        Both lists are empty when tokenization fails.
    """
    if not isinstance(text, str):
        return [], []

    # Tags whose surface form is kept; every other token is masked in `padded`.
    target_pos = {'NNG', 'NNP', 'VV', 'VA', 'MAG'}

    try:
        analyzer = kiwi if tokenizer is None else tokenizer
        padded = []
        meaningful = []

        for t in analyzer.tokenize(text):
            if t.tag in target_pos:
                padded.append(t.form)
                meaningful.append(t.form)
            else:
                # Mask with same-length filler so the padded length is unchanged.
                padded.append('O' * len(t.form))
        return padded, meaningful
    except Exception:
        # Best effort: one failing document must not abort the whole run.
        # `Exception` (not a bare except) lets Ctrl-C / SystemExit propagate.
        return [], []

# Register progress_apply on pandas objects so the long tokenization shows a bar.
tqdm.pandas()
print("\n샘플 데이터 전처리 (패딩 토큰 및 LDA 토큰 추출)")

# Tokenize each document once; every result is a (padded, meaningful) pair.
token_pairs = df['text'].progress_apply(preprocess_dual)
df['tokens_padded'] = token_pairs.map(lambda pair: pair[0])
df['tokens_lda'] = token_pairs.map(lambda pair: pair[1])

# Document length = total number of characters across all padded tokens.
df['doc_length'] = df['tokens_padded'].apply(lambda toks: sum(map(len, toks)))


샘플 데이터 전처리 (패딩 토큰 및 LDA 토큰 추출)


100%|██████████| 5000/5000 [16:03<00:00,  5.19it/s]


In [5]:
print("\nLDA 토픽 모델 학습 (샘플 데이터)")

# One token list per document (only the tokens kept for topic modelling).
lda_tokens = df['tokens_lda'].tolist()

# Build the vocabulary, then drop terms that appear in fewer than 5 documents
# or in more than half of them.
dictionary = corpora.Dictionary(lda_tokens)
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Bag-of-words representation of every document under the filtered vocabulary.
corpus = list(map(dictionary.doc2bow, lda_tokens))

# Fixed random_state keeps the fitted topics reproducible between runs.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    passes=5,
    alpha='auto',
    random_state=42,
)

def get_topic_probs(tokens, model=None, id2word=None):
    """Return the per-topic probability vector for one tokenized document.

    Args:
        tokens: List of tokens for the document. An empty (or otherwise falsy)
            value yields a uniform distribution over the topics.
        model: Optional topic model exposing ``num_topics`` and
            ``get_document_topics``. Defaults to the notebook-level
            ``lda_model``.
        id2word: Optional dictionary exposing ``doc2bow``. Defaults to the
            notebook-level ``dictionary``.

    Returns:
        A list of length ``model.num_topics`` where position ``i`` holds the
        probability reported for topic ``i`` (0.0 for topics not reported).
    """
    model = lda_model if model is None else model
    id2word = dictionary if id2word is None else id2word

    # Take the vector length from the model instead of hard-coding 10, so the
    # function stays correct if the model is retrained with another topic count.
    n_topics = model.num_topics

    if not tokens:
        return [1.0 / n_topics] * n_topics

    bow = id2word.doc2bow(tokens)
    topic_vec = [0.0] * n_topics
    # minimum_probability=0.0 asks the model to report every topic.
    for topic_id, prob in model.get_document_topics(bow, minimum_probability=0.0):
        topic_vec[topic_id] = prob
    return topic_vec


LDA 토픽 모델 학습 (샘플 데이터)


In [6]:
print("토픽 확률 추론")
# Topic distribution for every document in the sample.
df['topic_probs'] = df['tokens_lda'].progress_apply(get_topic_probs)

# Strongest topic per document and the probability it received.
df['dominant_topic'] = df['topic_probs'].apply(np.argmax)
df['dominant_prob'] = df['topic_probs'].apply(np.max)

# Persist the enriched sample and the fitted model side by side in DATA_DIR.
save_path = os.path.join(DATA_DIR, 'sampled_data_final.pkl')
model_path = os.path.join(DATA_DIR, 'lda_model_final.model')

lda_model.save(model_path)
df.to_pickle(save_path)

토픽 확률 추론


100%|██████████| 5000/5000 [00:07<00:00, 703.80it/s]


In [7]:
# Final report: output location, sample size and document-length statistics.
doc_len = df['doc_length']
total_docs = len(df)

print("\n작업 완료")
print(f"저장 경로: {save_path}")
print(f"최종 문서 수: {total_docs}개")
print(f"평균 문서 길이: {doc_len.mean():.1f}")

print("\n문서 길이 분포:")
length_lines = [
    f"  - 최소: {doc_len.min()}",
    f"  - 25%: {doc_len.quantile(0.25):.0f}",
    f"  - 중앙값: {doc_len.median():.0f}",
    f"  - 75%: {doc_len.quantile(0.75):.0f}",
    f"  - 최대: {doc_len.max()}",
]
print("\n".join(length_lines))

# Show three of the learned topics as a sanity check.
print("\n[LDA 토픽 예시]")
for topic_no, terms in lda_model.print_topics(3):
    print(f"Topic {topic_no}: {terms}")

# Number (and share) of documents whose dominant topic is each topic id.
print("\n[토픽 분포]")
topic_dist = df['dominant_topic'].value_counts().sort_index()
for topic_id, count in topic_dist.items():
    share = count / total_docs * 100
    print(f"Topic {topic_id}: {count}개 ({share:.1f}%)")


작업 완료
저장 경로: ..\data_final\sampled_data_final.pkl
최종 문서 수: 5000개
평균 문서 길이: 7877.5

문서 길이 분포:
  - 최소: 272
  - 25%: 1527
  - 중앙값: 4366
  - 75%: 10843
  - 최대: 102419

[LDA 토픽 예시]
Topic 6: 0.015*"버스" + 0.013*"지역" + 0.008*"노선" + 0.007*"도시" + 0.007*"서울" + 0.006*"차량" + 0.006*"역" + 0.005*"동" + 0.005*"층" + 0.005*"차"
Topic 5: 0.008*"곡" + 0.006*"방송" + 0.006*"멤버" + 0.004*"출연" + 0.004*"코너" + 0.004*"노래" + 0.004*"활동" + 0.004*"부르" + 0.004*"영상" + 0.003*"음악"
Topic 3: 0.004*"죽" + 0.003*"인간" + 0.003*"죽이" + 0.002*"아버지" + 0.002*"주인공" + 0.002*"살" + 0.002*"당하" + 0.002*"장면" + 0.002*"능력" + 0.002*"인물"

[토픽 분포]
Topic 0: 527개 (10.5%)
Topic 1: 338개 (6.8%)
Topic 2: 631개 (12.6%)
Topic 3: 1118개 (22.4%)
Topic 4: 519개 (10.4%)
Topic 5: 679개 (13.6%)
Topic 6: 216개 (4.3%)
Topic 7: 421개 (8.4%)
Topic 8: 406개 (8.1%)
Topic 9: 145개 (2.9%)
