In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import re
import math
from kiwipiepy import Kiwi
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from tqdm import tqdm
from collections import Counter
import warnings

warnings.filterwarnings('ignore')

BASE_DIR = os.path.join('..')
DATA_DIR = os.path.join(BASE_DIR, 'data_final')
CORPUS_PATH = os.path.join(BASE_DIR, 'data', 'corpus.pkl')
QRELS_PATH = os.path.join(BASE_DIR, 'data', 'qrels.pkl')

os.makedirs(DATA_DIR, exist_ok=True)

print("전체 데이터 로드")
with open(CORPUS_PATH, 'rb') as f:
    corpus_data = pickle.load(f)

df = pd.DataFrame(corpus_data)
if 'text' not in df.columns and 'body' in df.columns:
    df.rename(columns={'body': 'text'}, inplace=True)

print(f"전체 문서 수: {len(df)}개")

print("\n[스마트 샘플링 시작]")
with open(QRELS_PATH, 'rb') as f:
    qrels_data = pickle.load(f)

id_col = '_id' if '_id' in df.columns else 'doc_id'
if id_col not in df.columns:
    df.reset_index(inplace=True)
    id_col = 'index'

relevant_ids = set()
for item in qrels_data:
    corpus_id = item.get('corpus-id')
    if corpus_id:
        relevant_ids.add(str(corpus_id))

df[id_col] = df[id_col].astype(str)

df_relevant = df[df[id_col].isin(relevant_ids)]
df_others = df[~df[id_col].isin(relevant_ids)]

print(f"필수 포함 (정답 문서): {len(df_relevant)}개")

TARGET_N = 5000
needed_n = TARGET_N - len(df_relevant)

if needed_n > 0 and len(df_others) >= needed_n:
    df_random = df_others.sample(n=needed_n, random_state=42)
    df_sample = pd.concat([df_relevant, df_random])
elif needed_n > 0:
    df_sample = pd.concat([df_relevant, df_others])
else:
    df_sample = df_relevant.sample(n=TARGET_N, random_state=42)

df = df_sample.sample(frac=1, random_state=42).reset_index(drop=True)
print(f"최종 샘플링 완료: {len(df)}개")

전체 데이터 로드
전체 문서 수: 50222개

[스마트 샘플링 시작]
필수 포함 (정답 문서): 6194개
최종 샘플링 완료: 5000개


In [2]:
kiwi = Kiwi(num_workers=0)

def is_repeated_special(text):
    return bool(re.search(r'([!?.…~\-])\1+', text))

def is_jamo(text):
    return bool(re.search(r'[ㅋㅎㅠㅜㅡㅗㅓㅏㅣ]', text))

print("\n[불용어 추출: IDF 기반 분석]")
print("Step 1: 전체 5000개 문서에서 DF 계산")

target_pos = ['NNG', 'NNP', 'VV', 'VA', 'MAG']
doc_freq = Counter()
N = len(df)

for text in tqdm(df['text'], desc="DF 계산"):
    if not isinstance(text, str):
        continue

    try:
        tokens = kiwi.tokenize(text)
        unique_tokens = set()

        for t in tokens:
            if is_repeated_special(t.form) or is_jamo(t.form):
                continue
            if t.tag in target_pos and len(t.form) > 1:
                unique_tokens.add(t.form)

        for token in unique_tokens:
            doc_freq[token] += 1
    except:
        continue

print(f"총 고유 토큰 수: {len(doc_freq):,}개")

print("\nStep 2: IDF 계산 및 불용어 선정 (IDF < 1.5)")
idf_scores = []
for word, df_count in doc_freq.items():
    idf = math.log(N / df_count)
    idf_scores.append((word, df_count, idf))

idf_scores.sort(key=lambda x: x[2])

stopwords = set()
IDF_THRESHOLD = 1.5

for word, df_count, idf in idf_scores:
    if idf < IDF_THRESHOLD:
        stopwords.add(word)

print(f"IDF < {IDF_THRESHOLD} 불용어: {len(stopwords)}개")
print(f"\n[불용어 예시 (상위 20개)]")
for i, (word, df_count, idf) in enumerate(idf_scores[:20], 1):
    print(f"{i:2}. {word:<10} | DF: {df_count:4} | IDF: {idf:.4f}")

stopwords_path = os.path.join(DATA_DIR, 'stopwords_v2.txt')
with open(stopwords_path, 'w', encoding='utf-8') as f:
    for word in sorted(stopwords):
        f.write(f"{word}\n")
print(f"\n불용어 목록 저장: {stopwords_path}")


[불용어 추출: IDF 기반 분석]
Step 1: 전체 5000개 문서에서 DF 계산


DF 계산: 100%|██████████| 5000/5000 [22:51<00:00,  3.65it/s]  

총 고유 토큰 수: 157,022개

Step 2: IDF 계산 및 불용어 선정 (IDF < 1.5)
IDF < 1.5 불용어: 346개

[불용어 예시 (상위 20개)]
 1. 나오         | DF: 3845 | IDF: 0.2627
 2. 보이         | DF: 3680 | IDF: 0.3065
 3. 이후         | DF: 3678 | IDF: 0.3071
 4. 정도         | DF: 3496 | IDF: 0.3578
 5. 위하         | DF: 3414 | IDF: 0.3816
 6. 경우         | DF: 3318 | IDF: 0.4101
 7. 사실         | DF: 3304 | IDF: 0.4143
 8. 사람         | DF: 3223 | IDF: 0.4391
 9. 만들         | DF: 3204 | IDF: 0.4450
10. 시작         | DF: 3120 | IDF: 0.4716
11. 따르         | DF: 3091 | IDF: 0.4809
12. 가지         | DF: 3085 | IDF: 0.4829
13. 대하         | DF: 3073 | IDF: 0.4868
14. 가능         | DF: 3069 | IDF: 0.4881
15. 모두         | DF: 3046 | IDF: 0.4956
16. 이상         | DF: 3029 | IDF: 0.5012
17. 자신         | DF: 3003 | IDF: 0.5098
18. 함께         | DF: 2993 | IDF: 0.5132
19. 가장         | DF: 2947 | IDF: 0.5287
20. 다시         | DF: 2907 | IDF: 0.5423

불용어 목록 저장: ..\data_final\stopwords_v2.txt





In [3]:
def preprocess_with_stopwords(text, stopwords):
    if not isinstance(text, str):
        return [], []

    try:
        tokens = kiwi.tokenize(text)
        padded = []
        meaningful = []

        for t in tokens:
            if is_repeated_special(t.form) or is_jamo(t.form):
                padded.append('O' * len(t.form))
            elif t.tag in target_pos and len(t.form) > 1:
                if t.form in stopwords:
                    padded.append('O' * len(t.form))
                else:
                    padded.append(t.form)
                    meaningful.append(t.form)
            else:
                padded.append('O' * len(t.form))

        return padded, meaningful
    except:
        return [], []

def extract_document_features(text):
    if not isinstance(text, str):
        return 0, 0, 0, 0.0

    try:
        tokens = kiwi.tokenize(text)
        morph_count = len(tokens)

        syllable_count = len(re.sub(r'\s+', '', text))

        sentences = kiwi.split_into_sents(text)
        sent_count = len(sentences)

        avg_sent_len = syllable_count / sent_count if sent_count > 0 else 0.0

        return morph_count, syllable_count, sent_count, avg_sent_len
    except:
        return 0, 0, 0, 0.0

tqdm.pandas()
print("\n샘플 데이터 전처리 (불용어 패딩 포함)")
df[['tokens_padded', 'tokens_lda']] = df['text'].progress_apply(
    lambda x: pd.Series(preprocess_with_stopwords(x, stopwords))
)

print("\n문서 특성 추출")
df[['morph_count', 'syllable_count', 'sent_count', 'avg_sent_len']] = df['text'].progress_apply(
    lambda x: pd.Series(extract_document_features(x))
)

df['doc_length'] = df['tokens_padded'].apply(lambda x: sum(len(t) for t in x))


샘플 데이터 전처리 (불용어 패딩 포함)


100%|██████████| 5000/5000 [14:38<00:00,  5.69it/s] 



문서 특성 추출


100%|██████████| 5000/5000 [26:30<00:00,  3.14it/s]  


In [4]:
print("\nLDA 토픽 모델 학습")
lda_tokens = df['tokens_lda'].tolist()
dictionary = corpora.Dictionary(lda_tokens)
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = [dictionary.doc2bow(text) for text in lda_tokens]

lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    random_state=42,
    passes=5,
    alpha='auto'
)

def get_topic_probs(tokens):
    if not tokens:
        return [0.1] * 10
    bow = dictionary.doc2bow(tokens)
    topics = lda_model.get_document_topics(bow, minimum_probability=0.0)
    topic_vec = [0.0] * 10
    for topic_id, prob in topics:
        topic_vec[topic_id] = prob
    return topic_vec

print("토픽 확률 추론")
df['topic_probs'] = df['tokens_lda'].progress_apply(get_topic_probs)

df['dominant_topic'] = df['topic_probs'].apply(lambda x: np.argmax(x))
df['dominant_prob'] = df['topic_probs'].apply(lambda x: np.max(x))

save_path = os.path.join(DATA_DIR, 'sampled_data_v2.pkl')
model_path = os.path.join(DATA_DIR, 'lda_model_v2.model')

df.to_pickle(save_path)
lda_model.save(model_path)


LDA 토픽 모델 학습
토픽 확률 추론


100%|██████████| 5000/5000 [00:03<00:00, 1360.59it/s]


In [5]:
print("\n작업 완료")
print(f"저장 경로: {save_path}")
print(f"최종 문서 수: {len(df)}개")
print(f"불용어 개수: {len(stopwords)}개")

print(f"\n[문서 길이 통계 (doc_length)]")
print(f"  - 평균: {df['doc_length'].mean():.1f}")
print(f"  - 최소: {df['doc_length'].min()}")
print(f"  - 중앙값: {df['doc_length'].median():.0f}")
print(f"  - 최대: {df['doc_length'].max()}")

print(f"\n[형태소 개수 (morph_count)]")
print(f"  - 평균: {df['morph_count'].mean():.1f}")
print(f"  - 중앙값: {df['morph_count'].median():.0f}")

print(f"\n[음절 수 (syllable_count)]")
print(f"  - 평균: {df['syllable_count'].mean():.1f}")
print(f"  - 중앙값: {df['syllable_count'].median():.0f}")

print(f"\n[문장 개수 (sent_count)]")
print(f"  - 평균: {df['sent_count'].mean():.1f}")
print(f"  - 중앙값: {df['sent_count'].median():.0f}")

print(f"\n[평균 문장 길이 (avg_sent_len)]")
print(f"  - 평균: {df['avg_sent_len'].mean():.1f}")
print(f"  - 중앙값: {df['avg_sent_len'].median():.0f}")

print("\n[LDA 토픽 예시]")
for idx, topic in lda_model.print_topics(3):
    print(f"Topic {idx}: {topic}")

print("\n[토픽 분포]")
topic_dist = df['dominant_topic'].value_counts().sort_index()
for topic_id, count in topic_dist.items():
    print(f"Topic {topic_id}: {count}개 ({count/len(df)*100:.1f}%)")


작업 완료
저장 경로: ..\data_final\sampled_data_v2.pkl
최종 문서 수: 5000개
불용어 개수: 346개

[문서 길이 통계 (doc_length)]
  - 평균: 7873.0
  - 최소: 0
  - 중앙값: 4366
  - 최대: 102419

[형태소 개수 (morph_count)]
  - 평균: 5053.2
  - 중앙값: 2777

[음절 수 (syllable_count)]
  - 평균: 7514.7
  - 중앙값: 4174

[문장 개수 (sent_count)]
  - 평균: 154.0
  - 중앙값: 86

[평균 문장 길이 (avg_sent_len)]
  - 평균: 55.7
  - 중앙값: 47

[LDA 토픽 예시]
Topic 2: 0.024*"시즌" + 0.021*"경기" + 0.015*"선수" + 0.011*"우승" + 0.009*"투수" + 0.008*"리그" + 0.008*"감독" + 0.007*"야구" + 0.007*"삼성" + 0.006*"홈런"
Topic 0: 0.007*"영화" + 0.005*"포켓몬" + 0.004*"코너" + 0.004*"소닉" + 0.003*"출연" + 0.003*"디자인" + 0.003*"애니메이션" + 0.003*"방영" + 0.003*"세대" + 0.003*"요리"
Topic 5: 0.010*"스킬" + 0.009*"레벨" + 0.008*"무기" + 0.006*"증가" + 0.006*"데미지" + 0.005*"카드" + 0.005*"마법" + 0.005*"유저" + 0.005*"보스" + 0.005*"공격력"

[토픽 분포]
Topic 0: 679개 (13.6%)
Topic 1: 822개 (16.4%)
Topic 2: 295개 (5.9%)
Topic 3: 391개 (7.8%)
Topic 4: 445개 (8.9%)
Topic 5: 847개 (16.9%)
Topic 6: 503개 (10.1%)
Topic 7: 204개 (4.1%)
Topic 8: 538개 (10.8%)
Topi

In [9]:
print("\n[LDA 토픽 목록]")
for idx, topic in lda_model.print_topics(9):
    print(f"Topic {idx}: {topic}")


[LDA 토픽 목록]
Topic 2: 0.024*"시즌" + 0.021*"경기" + 0.015*"선수" + 0.011*"우승" + 0.009*"투수" + 0.008*"리그" + 0.008*"감독" + 0.007*"야구" + 0.007*"삼성" + 0.006*"홈런"
Topic 7: 0.029*"경기" + 0.019*"선수" + 0.016*"리그" + 0.016*"시즌" + 0.011*"우승" + 0.008*"월드컵" + 0.007*"축구" + 0.007*"대회" + 0.007*"감독" + 0.007*"진출"
Topic 9: 0.014*"버스" + 0.008*"서울" + 0.008*"노선" + 0.007*"도시" + 0.007*"대학" + 0.006*"학교" + 0.005*"후보" + 0.005*"차량" + 0.004*"운행" + 0.004*"학생"
Topic 8: 0.010*"방송" + 0.006*"멤버" + 0.004*"대통령" + 0.004*"뉴스" + 0.004*"정부" + 0.004*"앨범" + 0.004*"세대" + 0.004*"그룹" + 0.003*"철학" + 0.003*"프로그램"
Topic 3: 0.003*"경찰" + 0.003*"대통령" + 0.003*"정치" + 0.003*"사회" + 0.003*"갤러리" + 0.002*"여성" + 0.002*"정부" + 0.002*"판사" + 0.002*"조사" + 0.002*"법원"
Topic 4: 0.003*"중국" + 0.003*"대만" + 0.003*"영웅" + 0.003*"전쟁" + 0.002*"파괴" + 0.002*"신화" + 0.002*"신라" + 0.002*"고구려" + 0.002*"백제" + 0.002*"마법"
Topic 1: 0.004*"아버지" + 0.004*"사랑" + 0.003*"친구" + 0.002*"어머니" + 0.002*"기억" + 0.002*"작중" + 0.002*"대사" + 0.002*"여자" + 0.002*"좋아하" + 0.002*"남자"
Topic 0: 0.007*"영화