In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from kiwipiepy import Kiwi
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from tqdm import tqdm
import warnings

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Path configuration: everything is resolved relative to the parent directory.
BASE_DIR = os.path.join('..')
DATA_DIR = os.path.join(BASE_DIR, 'final_data')
CORPUS_PATH = os.path.join(BASE_DIR, 'data', 'corpus.pkl')

# Make sure the output directory exists before anything is written to it.
os.makedirs(DATA_DIR, exist_ok=True)

print("전체 데이터 로드")
# NOTE: pickle.load executes arbitrary code embedded in the file; only use it
# on corpus files you produced yourself or otherwise trust.
with open(CORPUS_PATH, 'rb') as corpus_file:
    corpus_data = pickle.load(corpus_file)

df = pd.DataFrame(corpus_data)

# Normalise the column name: when there is no 'text' column but a 'body'
# column exists, expose 'body' under the name 'text'.
body_needs_alias = 'text' not in df.columns and 'body' in df.columns
if body_needs_alias:
    df = df.rename(columns={'body': 'text'})

# Use the full dataset (no sampling).
print(f"전체 문서 수: {len(df)}개")

# Morphological analyzer shared by the preprocessing cell below.
kiwi = Kiwi(num_workers=0)

전체 데이터 로드
전체 문서 수: 50222개


In [2]:
def preprocess_dual(text):
    """Tokenize ``text`` once and build two parallel token views.

    Each token returned by the module-level ``kiwi`` tokenizer contributes
    exactly one entry to the padded view:

    * tokens tagged NNG, NNP, VV, VA or MAG whose surface form is longer than
      one character are kept verbatim;
    * every other token (other tags, or one-character forms) is replaced by
      ``'O'`` repeated to the length of its form, so the per-token character
      length is preserved.

    The second view contains only the kept tokens, in order.

    Args:
        text: Raw document text. Any non-``str`` value is treated as an
            empty document.

    Returns:
        A ``(padded, meaningful)`` tuple of two lists of strings. Both lists
        are empty when ``text`` is not a string or when tokenization raises
        an ordinary exception.
    """
    if not isinstance(text, str):
        return [], []

    # Set membership instead of a list scan for every token.
    target_pos = {'NNG', 'NNP', 'VV', 'VA', 'MAG'}

    try:
        padded = []
        meaningful = []
        for t in kiwi.tokenize(text):
            form = t.form
            if t.tag in target_pos and len(form) > 1:
                padded.append(form)
                meaningful.append(form)
            else:
                # One-character content words and all non-target tags are
                # masked, keeping only their length.
                padded.append('O' * len(form))
        return padded, meaningful
    except Exception:
        # Best-effort: a document that cannot be tokenized becomes an empty
        # document. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate, so a long-running apply
        # over the corpus can still be interrupted instead of silently
        # producing empty rows.
        return [], []

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()
print("전체 데이터 전처리 (Padding + LDA 토큰 추출)")

# Each document yields a (padded, meaningful) pair; wrapping the pair in a
# Series makes apply expand it into two columns, assigned positionally below.
# NOTE(review): documents are tokenized one call at a time here; kiwipiepy's
# tokenize reportedly also accepts an iterable of strings for batched
# processing — confirm against its documentation before switching.
token_columns = ['tokens_padded', 'tokens_lda']
df[token_columns] = df['text'].progress_apply(
    lambda raw_text: pd.Series(preprocess_dual(raw_text))
)

전체 데이터 전처리 (Padding + LDA 토큰 추출)


100%|██████████| 50222/50222 [2:07:37<00:00,  6.56it/s]   


In [3]:
# 문서 길이 계산 (패딩 포함, x1 변수)
# Document length (feature x1): total number of characters over the padded
# tokens, so masked tokens are counted as well.
df['doc_length'] = df['tokens_padded'].apply(lambda padded: sum(map(len, padded)))

print("LDA 토픽 모델 학습 (전체 데이터)")
lda_tokens = df['tokens_lda'].tolist()

# Build the vocabulary, then prune it with no_below=20 / no_above=0.5 before
# converting every document to bag-of-words.
dictionary = corpora.Dictionary(lda_tokens)
dictionary.filter_extremes(no_below=20, no_above=0.5)
corpus = list(map(dictionary.doc2bow, lda_tokens))

# Ten topics (feature x3, used for domain classification).
lda_params = dict(
    num_topics=10,
    random_state=42,
    passes=5,
    alpha='auto',
)
lda_model = LdaModel(corpus=corpus, id2word=dictionary, **lda_params)

def get_topic_probs(tokens):
    """Return the dense topic distribution of one tokenized document.

    The vector length is taken from ``lda_model.num_topics`` rather than a
    hard-coded constant, so the function stays correct if the model is
    retrained with a different number of topics.

    Args:
        tokens: List of tokens for the document (the LDA token view).

    Returns:
        A list with one entry per topic. For an empty ``tokens`` the uniform
        distribution ``1 / num_topics`` is returned without querying the
        model; otherwise each position holds the probability reported by the
        module-level ``lda_model`` for that topic id.
    """
    num_topics = lda_model.num_topics

    if not tokens:
        # No evidence at all: fall back to a uniform distribution.
        return [1.0 / num_topics] * num_topics

    # NOTE(review): a non-empty token list whose tokens are all missing from
    # ``dictionary`` produces an empty bag-of-words and is still sent to the
    # model, so it does not receive the uniform fallback above — confirm this
    # is intended.
    bow = dictionary.doc2bow(tokens)

    # minimum_probability=0.0 asks the model not to drop low-probability
    # topics; the vector is still pre-filled with zeros for any topic id that
    # is not reported.
    topic_vec = [0.0] * num_topics
    for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0.0):
        topic_vec[topic_id] = prob
    return topic_vec

# Output locations for the enriched DataFrame and the trained model.
save_path = os.path.join(DATA_DIR, 'full_data.pkl')
model_path = os.path.join(DATA_DIR, 'lda_full.model')

print("토픽 확률 추론")
# One topic-probability vector per document, computed from its LDA tokens.
df['topic_probs'] = df['tokens_lda'].progress_apply(get_topic_probs)

# Persist both artifacts.
df.to_pickle(save_path)
lda_model.save(model_path)

print("작업 완료")
print(f"저장 경로: {save_path}")

# Show three topics as a quick sanity check of the trained model.
print("\n[LDA 토픽 예시]")
for topic_id, top_terms in lda_model.print_topics(3):
    print(f"Topic {topic_id}: {top_terms}")

LDA 토픽 모델 학습 (전체 데이터)
토픽 확률 추론


100%|██████████| 50222/50222 [00:48<00:00, 1030.79it/s]


작업 완료
저장 경로: ..\final_data\full_data.pkl

[LDA 토픽 예시]
Topic 8: 0.007*"대통령" + 0.006*"사건" + 0.005*"정부" + 0.005*"경찰" + 0.005*"미국" + 0.004*"당시" + 0.004*"한국" + 0.004*"주장" + 0.004*"문제" + 0.003*"박근혜"
Topic 1: 0.009*"시리즈" + 0.007*"캐릭터" + 0.006*"작품" + 0.005*"건담" + 0.005*"게임" + 0.005*"성우" + 0.004*"발매" + 0.004*"세계" + 0.004*"애니메이션" + 0.004*"설정"
Topic 4: 0.008*"모습" + 0.004*"인간" + 0.004*"결국" + 0.004*"모르" + 0.004*"주인공" + 0.004*"만나" + 0.004*"죽이" + 0.003*"당하" + 0.003*"이야기" + 0.003*"친구"
