In [6]:
from konlpy.tag import Okt
import re
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from itertools import zip_longest
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import networkx as nx
from collections import defaultdict
from tqdm import tqdm
import pickle
import glob
from datetime import datetime, timedelta
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim.models.coherencemodel import CoherenceModel

pyLDAvis.enable_notebook()


In [15]:
def load_stopwords(file_path):
    """
    file_path 경로에서 불용어를 불러오는 함수
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        stopwords = f.read().splitlines()
    return stopwords


def preprocess_text(text, stopwords=None):
    """
    Okt를 사용한 텍스트 전처리 함수
    - 불용어 제거
    - 명사 추출
    """
    okt = Okt()
    # 정규 표현식을 사용한 숫자/특수문자 제거
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    nouns = okt.nouns(text)

    if stopwords:
        nouns = [word for word in nouns if word not in stopwords]
    
    # 길이가 1인 단어들 제거
    nouns = [word for word in nouns if len(word) > 1]

    processed_text = ' '.join(nouns)

    return processed_text


def load_and_merge_section_data(section_number, hours_back=8):
    """
    섹션 번호에 해당하는 모든 pkl 파일을 불러와서 기사 본문을 개별 문서로 병합하는 함수
    현재 시간에서 -hours_back 시간 전까지의 파일만 병합
    """
    current_time = datetime.now()
    start_time = current_time - timedelta(hours=hours_back)

    valid_hours = []
    for hour in range(hours_back + 1):  # 0부터 hours_back까지의 시간을 계산
        valid_hour = (start_time + timedelta(hours=hour)).strftime('%H')
        valid_hours.append(valid_hour)

    file_patterns = [f"./data/{section_number}/*_{hour}.pkl" for hour in valid_hours]

    merged_content = []
    for file_pattern in tqdm(file_patterns, desc=f"섹션 번호: {section_number} 파일"):
        file_list = glob.glob(file_pattern)

        for file_path in file_list:
            print(file_path)
            with open(file_path, 'rb') as file:
                data = pickle.load(file)
                for article in data:
                    merged_content.append(article['content'])

    print("병합한 기사 개수: " + str(len(merged_content)))

    return merged_content  # 각각의 기사를 리스트 형태로 반환

def visualize_lda_model(lda_model, corpus, dictionary):
    vis = gensimvis.prepare(lda_model, corpus, dictionary)
    return vis



def process_documents(section_numbers, stopwords_file_path, hours_back=8):

    stopwords = load_stopwords(stopwords_file_path)

    processed_documents = []
    
    for section_number in section_numbers:
        merged_content_list = load_and_merge_section_data(section_number, hours_back)
        for content in merged_content_list:
            processed_text = preprocess_text(content, stopwords)
            processed_documents.append(processed_text.split())

    return processed_documents


def optimalize_lda_model(corpus, dictionary, processed_documents, start=2, end=6, step=1):
    coherence_values = []
    lda_model_list = []
    for num_topics in range(start, end+1, step):
        lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
        lda_model_list.append(lda_model)
        coherence_model = CoherenceModel(model=lda_model, texts=processed_documents, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherence_model.get_coherence())
    optimal_model = lda_model_list[coherence_values.index(max(coherence_values))]
    return optimal_model

In [16]:
section_numbers = [100, 101, 102, 103, 104, 105]
stopwords_file_path = './data/korean_stopwords.txt'

processed_documents = process_documents(section_numbers, stopwords_file_path)

dictionary = corpora.Dictionary(processed_documents)

corpus = [dictionary.doc2bow(text) for text in processed_documents]

optimal_lda_model = optimalize_lda_model(corpus, dictionary, processed_documents)

topics = optimal_lda_model.print_topics(num_words=30)

for topic in topics:
    print(topic)



섹션 번호: 100 파일: 100%|██████████| 10/10 [00:00<00:00, 1830.85it/s]


./data/100\2024-09-20_09.pkl
./data/100\2024-09-20_10.pkl
./data/100\2024-09-24_11.pkl
병합한 기사 개수: 224


섹션 번호: 101 파일: 100%|██████████| 10/10 [00:00<00:00, 1250.13it/s]


./data/101\2024-09-20_09.pkl
./data/101\2024-09-20_10.pkl
./data/101\2024-09-24_11.pkl
병합한 기사 개수: 916


섹션 번호: 102 파일: 100%|██████████| 10/10 [00:00<00:00, 1034.43it/s]


./data/102\2024-09-20_09.pkl
./data/102\2024-09-20_10.pkl
./data/102\2024-09-24_11.pkl
병합한 기사 개수: 953


섹션 번호: 103 파일: 100%|██████████| 10/10 [00:00<?, ?it/s]


./data/103\2024-09-20_09.pkl
./data/103\2024-09-20_10.pkl
./data/103\2024-09-24_11.pkl
병합한 기사 개수: 142


섹션 번호: 104 파일: 100%|██████████| 10/10 [00:00<?, ?it/s]


./data/104\2024-09-20_09.pkl
./data/104\2024-09-20_10.pkl
./data/104\2024-09-24_11.pkl
병합한 기사 개수: 213


섹션 번호: 105 파일: 100%|██████████| 10/10 [00:00<?, ?it/s]


./data/105\2024-09-20_09.pkl
./data/105\2024-09-20_10.pkl
./data/105\2024-09-24_11.pkl
병합한 기사 개수: 114
(0, '0.007*"미국" + 0.004*"사건" + 0.004*"중국" + 0.003*"경찰" + 0.003*"발생" + 0.003*"국내" + 0.003*"위해" + 0.003*"대한" + 0.003*"사진" + 0.003*"시인" + 0.003*"대표" + 0.003*"경우" + 0.003*"기자" + 0.003*"서울" + 0.003*"때문" + 0.003*"이스라엘" + 0.003*"사업" + 0.003*"기업" + 0.002*"확인" + 0.002*"의원" + 0.002*"기초" + 0.002*"레바논" + 0.002*"관련" + 0.002*"매출" + 0.002*"대해" + 0.002*"사실" + 0.002*"아이폰" + 0.002*"이후" + 0.002*"여성" + 0.002*"사람"')
(1, '0.011*"대통령" + 0.006*"후보" + 0.005*"해리스" + 0.005*"대해" + 0.005*"사람" + 0.004*"트럼프" + 0.004*"미국" + 0.004*"대선" + 0.004*"생각" + 0.003*"때문" + 0.003*"대한" + 0.003*"민주당" + 0.003*"대표" + 0.003*"문제" + 0.003*"관련" + 0.003*"지금" + 0.003*"부통령" + 0.003*"우크라이나" + 0.003*"위해" + 0.003*"게임" + 0.003*"경우" + 0.003*"국가" + 0.003*"계획" + 0.002*"정부" + 0.002*"상황" + 0.002*"최근" + 0.002*"의원" + 0.002*"주장" + 0.002*"통해" + 0.002*"진행"')
(2, '0.006*"미국" + 0.005*"서울" + 0.003*"지역" + 0.003*"달러" + 0.003*"기자" + 0.003*"대한" + 0.003*"시장" + 

In [17]:
# 시각화 실행
visualization = visualize_lda_model(optimal_lda_model, corpus, dictionary)
pyLDAvis.save_html(visualization, 'optimal_lda_visualization.html')