In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from kiwipiepy import Kiwi
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from tqdm import tqdm
import warnings

# Silence every warning category for the rest of the kernel session.
warnings.filterwarnings('ignore')

# All paths are relative to the notebook's working directory.
BASE_DIR = os.path.join('..')
DATA_DIR = os.path.join(BASE_DIR, 'final_data')             # outputs are written here
CORPUS_PATH = os.path.join(BASE_DIR, 'data', 'corpus.pkl')  # pickled input corpus

os.makedirs(DATA_DIR, exist_ok=True)

print("전체 데이터 로드 중")
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load corpus.pkl when it comes from a trusted source.
with open(CORPUS_PATH, 'rb') as f:
    corpus_data = pickle.load(f)

df = pd.DataFrame(corpus_data)
# Accept corpora that store the document text under 'body' instead of 'text'.
if 'text' not in df.columns and 'body' in df.columns:
    df = df.rename(columns={'body': 'text'})

# Fail here, with a readable message, instead of in the preprocessing cell
# that indexes df['text'].
if 'text' not in df.columns:
    raise KeyError(
        f"corpus has neither a 'text' nor a 'body' column; found: {list(df.columns)}"
    )

print(f"전체 문서 수: {len(df)}개")

전체 데이터 로드 중
전체 문서 수: 50222개


In [2]:
# Module-level Kiwi analyzer instance; preprocess_dual reads it as a global.
# NOTE(review): num_workers=0 presumably lets kiwipiepy choose the thread
# count itself — confirm against the kiwipiepy documentation.
kiwi = Kiwi(num_workers=0)

# POS tags whose surface forms are kept verbatim (Sejong-style tags as used by
# Kiwi: general noun, proper noun, verb, adjective, general adverb).
# NOTE(review): some kiwipiepy versions appear to emit suffixed tags such as
# 'VV-I' / 'VA-I' for irregular predicates; those would NOT match this
# exact-tag filter — confirm against the installed version.
_TARGET_POS = frozenset(('NNG', 'NNP', 'VV', 'VA', 'MAG'))


def preprocess_dual(text):
    """Tokenize ``text`` with the global ``kiwi`` analyzer into two views.

    Args:
        text: Document text. Anything that is not a ``str`` yields empty results.

    Returns:
        A ``(padded, meaningful)`` tuple of lists:

        * ``padded`` has one entry per token. Tokens whose tag is in
          ``_TARGET_POS`` keep their form; every other token is replaced by
          ``'O'`` repeated to the same length as its form.
        * ``meaningful`` contains only the forms of the tokens whose tag is in
          ``_TARGET_POS``, in order.

        If tokenization fails with an ordinary exception, ``([], [])`` is
        returned (best effort: one bad document must not abort the batch).
    """
    if not isinstance(text, str):
        return [], []

    try:
        padded = []
        meaningful = []
        for t in kiwi.tokenize(text):
            if t.tag in _TARGET_POS:
                padded.append(t.form)
                meaningful.append(t.form)
            else:
                # Mask the token but preserve its character length.
                padded.append('O' * len(t.form))
        return padded, meaningful
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt / SystemExit
        # must propagate so a long-running batch can actually be interrupted.
        return [], []

# Registers Series.progress_apply, which the topic-inference cell uses later.
tqdm.pandas()
print("전체 데이터 전처리 중 (패딩 토큰 및 LDA 토큰 추출)")

# Tokenize every document exactly once and collect the (padded, meaningful)
# tuples. This avoids building a throwaway pd.Series per row and does not
# depend on index alignment when the two columns are assigned.
token_pairs = [preprocess_dual(text) for text in tqdm(df['text'], total=len(df))]

df['tokens_padded'] = pd.Series(
    [padded for padded, _ in token_pairs], index=df.index, dtype=object
)
df['tokens_lda'] = pd.Series(
    [meaningful for _, meaningful in token_pairs], index=df.index, dtype=object
)

# Document length = total number of characters across all padded tokens.
df['doc_length'] = df['tokens_padded'].apply(lambda x: sum(len(t) for t in x))

전체 데이터 전처리 중 (패딩 토큰 및 LDA 토큰 추출)


100%|██████████| 50222/50222 [1:48:42<00:00,  7.70it/s]   


In [3]:
print("LDA 토픽 모델 학습 중 (전체 데이터)")

# Vocabulary is built from the LDA token lists, pruned with filter_extremes
# (no_below=20, no_above=0.5), and then every document becomes a bag-of-words.
lda_tokens = df['tokens_lda'].tolist()
dictionary = corpora.Dictionary(lda_tokens)
dictionary.filter_extremes(no_above=0.5, no_below=20)
corpus = list(map(dictionary.doc2bow, lda_tokens))

# Fixed random_state so repeated runs use the same seed.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    passes=5,
    alpha='auto',
    random_state=42,
)

def get_topic_probs(tokens):
    """Return a dense topic-probability vector for one tokenized document.

    Uses the module-level ``dictionary`` and ``lda_model``. The vector length
    is taken from ``lda_model.num_topics`` so it stays correct if the model is
    retrained with a different number of topics.

    Args:
        tokens: List of LDA tokens for the document (may be empty).

    Returns:
        A list with one probability per topic, indexed by topic id. An empty
        token list yields the uniform distribution; topics that the model does
        not report for a document are left at ``0.0``.
    """
    num_topics = lda_model.num_topics
    if not tokens:
        return [1.0 / num_topics] * num_topics

    bow = dictionary.doc2bow(tokens)
    # minimum_probability=0.0 asks gensim for (ideally) every topic.
    topics = lda_model.get_document_topics(bow, minimum_probability=0.0)

    topic_vec = [0.0] * num_topics
    for topic_id, prob in topics:
        topic_vec[topic_id] = prob
    return topic_vec

LDA 토픽 모델 학습 중 (전체 데이터)


In [4]:
# Output locations for the enriched DataFrame and the trained LDA model.
save_path = os.path.join(DATA_DIR, 'full_data_final.pkl')
model_path = os.path.join(DATA_DIR, 'lda_model_final.model')

print("토픽 확률 추론 중")
# progress_apply is available because tqdm.pandas() was called in an earlier cell.
topic_probs = df['tokens_lda'].progress_apply(get_topic_probs)
df['topic_probs'] = topic_probs

# Persist the DataFrame first, then the model.
df.to_pickle(save_path)
lda_model.save(model_path)

print("작업 완료")
print(f"저장 경로: {save_path}")
print("\n[LDA 토픽 예시]")
# Show three topics as a quick sanity check.
for idx, topic in lda_model.print_topics(num_topics=3):
    print(f"Topic {idx}: {topic}")

토픽 확률 추론 중


100%|██████████| 50222/50222 [00:46<00:00, 1084.17it/s]


작업 완료
저장 경로: ..\final_data\full_data_final.pkl

[LDA 토픽 예시]
Topic 0: 0.017*"시즌" + 0.014*"경기" + 0.014*"팀" + 0.011*"선수" + 0.009*"기록" + 0.007*"홈런" + 0.007*"감독" + 0.006*"투수" + 0.006*"야구" + 0.006*"시리즈"
Topic 5: 0.005*"힘" + 0.004*"인간" + 0.004*"모습" + 0.004*"죽" + 0.004*"신" + 0.004*"검" + 0.003*"악마" + 0.003*"세계" + 0.003*"드" + 0.003*"엘"
Topic 1: 0.005*"모습" + 0.004*"먹" + 0.003*"집" + 0.003*"나" + 0.003*"친구" + 0.003*"모르" + 0.003*"맞" + 0.003*"살" + 0.002*"만나" + 0.002*"죽"
