In [None]:
from konlpy.tag import Okt
import re
from gensim import corpora
from gensim.models.ldamodel import LdaModel
import networkx as nx
from itertools import zip_longest
from collections import Counter
from tqdm import tqdm
from datetime import datetime, timedelta
import pickle
import glob
import pandas as pd

def load_stopwords(file_path):
    """
    Load stopwords from a UTF-8 text file containing one stopword per line.

    Surrounding whitespace is stripped from every entry and blank lines are
    skipped, so stray spaces or empty lines in the file cannot turn into
    stopwords that never match (or into an empty-string stopword). A leading
    UTF-8 byte-order mark is ignored.

    Args:
        file_path: Path of the stopword file.

    Returns:
        list[str]: The non-empty, stripped stopwords in file order.
    """
    # 'utf-8-sig' also decodes plain UTF-8, but drops a BOM when one is
    # present so it does not end up glued to the first stopword.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        lines = f.read().splitlines()
    return [word for word in (line.strip() for line in lines) if word]

def preprocess_text(text, stopwords=None):
    """
    Extract nouns from raw text with Okt for use as LDA model input.

    Digits and every character that is neither a word character nor
    whitespace are deleted before the text is handed to ``Okt.nouns``.

    Args:
        text: Raw input string.
        stopwords: Optional collection of words to drop from the result.
            ``None`` or an empty collection disables filtering.

    Returns:
        list: The nouns returned by ``Okt.nouns``, minus any stopwords.
    """
    okt = Okt()
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters

    nouns = okt.nouns(text)

    if stopwords:
        # Membership is tested once per extracted noun; a set keeps each
        # test O(1) instead of scanning the whole stopword list every time.
        if isinstance(stopwords, (set, frozenset)):
            stopword_set = stopwords
        else:
            stopword_set = set(stopwords)
        nouns = [word for word in nouns if word not in stopword_set]

    return nouns

def preprocess_text_with_ngrams(text, stopwords=None, n=2):
    """
    Tokenize text with preprocess_text and return its word n-grams for TextRank.

    Each n-gram is a run of ``n`` consecutive tokens joined by single spaces.

    Args:
        text: Raw input string.
        stopwords: Optional stopwords, forwarded to preprocess_text.
        n: Number of consecutive tokens per n-gram.

    Returns:
        list[str]: The n-grams in order of appearance.
    """
    tokens = preprocess_text(text, stopwords)

    # Zipping n progressively shifted views of the token list lines up every
    # run of n consecutive tokens; zip stops at the shortest view, so no
    # partial n-gram is produced at the end.
    shifted_views = [tokens[offset:] for offset in range(n)]

    joined_ngrams = []
    for gram in zip(*shifted_views):
        joined_ngrams.append(' '.join(gram))
    return joined_ngrams


def load_and_merge_section_data(section_number, hours_back=9):
    """
    Load and merge article content from the pickle files of one section.

    Files are looked up as ``./data/<section_number>/*_<HH>.pkl`` where
    ``HH`` is the two-digit hour of day for every hour from ``hours_back``
    hours ago up to the current hour. Matching is by hour of day only; the
    date is not part of the pattern.

    Each pickle is read as an iterable of articles, and the ``'content'``
    entry of every article is collected.

    Args:
        section_number: Section identifier used as the directory name.
        hours_back: How many hours before now the window starts.

    Returns:
        str: All collected article contents joined with single spaces
        (an empty string when no file matched).
    """
    start_time = datetime.now() - timedelta(hours=hours_back)

    # With hours_back >= 24 the same hour of day comes up more than once;
    # dict.fromkeys drops the repeats (keeping order) so that no file is
    # loaded and merged twice.
    valid_hours = list(dict.fromkeys(
        (start_time + timedelta(hours=offset)).strftime('%H')
        for offset in range(hours_back + 1)
    ))

    file_patterns = [f"./data/{section_number}/*_{hour}.pkl" for hour in valid_hours]

    merged_content = []
    for file_pattern in tqdm(file_patterns, desc=f"Loading section {section_number} files"):
        # glob returns matches in arbitrary order; sorting keeps the merged
        # document (and everything derived from it) reproducible.
        for file_path in sorted(glob.glob(file_pattern)):
            with open(file_path, 'rb') as file:
                # NOTE(security): pickle.load can execute arbitrary code;
                # these files must come from a trusted producer.
                data = pickle.load(file)
            merged_content.extend(article['content'] for article in data)

    return ' '.join(merged_content)


def process_sections(section_numbers, stopwords_file_path, hours_back=9):
    """
    Build the word-level token list for every requested section.

    For each section number the recent article data is merged into one
    string and then preprocessed into single words.

    Args:
        section_numbers: Iterable of section numbers to process.
        stopwords_file_path: Path of the stopword file to load.
        hours_back: Time window forwarded to load_and_merge_section_data.

    Returns:
        dict: Maps each section number to its preprocessed word list.
    """
    stopwords = load_stopwords(stopwords_file_path)

    def _tokenize_section(section_number):
        # Merge the section's raw articles, then preprocess at word level.
        raw_text = load_and_merge_section_data(section_number, hours_back)
        return preprocess_text(raw_text, stopwords)

    return {
        section_number: _tokenize_section(section_number)
        for section_number in tqdm(section_numbers, desc="Processing sections")
    }


def process_sections_with_ngrams(section_numbers, stopwords_file_path, hours_back=9, n=2):
    """
    Build the n-gram list for every requested section.

    For each section number the recent article data is merged into one
    string and then preprocessed into word n-grams.

    Args:
        section_numbers: Iterable of section numbers to process.
        stopwords_file_path: Path of the stopword file to load.
        hours_back: Time window forwarded to load_and_merge_section_data.
        n: Number of words per n-gram.

    Returns:
        dict: Maps each section number to its list of n-gram strings.
    """
    stopwords = load_stopwords(stopwords_file_path)
    progress = tqdm(section_numbers, desc="Processing sections")

    return {
        section: preprocess_text_with_ngrams(
            load_and_merge_section_data(section, hours_back),
            stopwords,
            n,
        )
        for section in progress
    }

def prepare_data_for_lda(doc):
    """
    Build the gensim dictionary and bag-of-words corpus for one token list.

    The token list is treated as a corpus consisting of a single document.

    Args:
        doc: List of tokens for the document.

    Returns:
        tuple: ``(dictionary, corpus)`` where ``corpus`` is a one-element
        list holding the bag-of-words form of ``doc``.
    """
    documents = [doc]
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(tokens) for tokens in documents]
    return dictionary, corpus

def train_lda_model(dictionary, corpus, num_topics=5, passes=15):
    """
    Fit a gensim LdaModel on the given corpus.

    Args:
        dictionary: Mapping from token id to token, passed as ``id2word``.
        corpus: Bag-of-words corpus to train on.
        num_topics: Number of topics to fit.
        passes: Number of training passes over the corpus.

    Returns:
        LdaModel: The trained model.
    """
    # Everything except the four arguments above is fixed for every model
    # trained through this function.
    settings = {
        'corpus': corpus,
        'id2word': dictionary,
        'num_topics': num_topics,
        'passes': passes,
        'random_state': 100,
        'update_every': 1,
        'alpha': 'auto',
        'per_word_topics': True,
    }
    return LdaModel(**settings)

def extract_topics_for_section(processed_docs, num_topics=1, num_words=10, passes=15):
    """
    Train one LDA model per section and collect its topics.

    Args:
        processed_docs: Dict mapping section to its token list.
        num_topics: Number of topics to fit and to report per section.
        num_words: Number of words to report per topic.
        passes: Number of LDA training passes.

    Returns:
        tuple: ``(section_topics, lda_models)``; both are dicts keyed by
        section, holding the unformatted ``show_topics`` output and the
        trained model respectively.
    """
    section_topics, lda_models = {}, {}
    progress = tqdm(processed_docs.items(), desc="Extracting topics for sections")

    for section, doc in progress:
        model = train_lda_model(
            *prepare_data_for_lda(doc),
            num_topics=num_topics,
            passes=passes,
        )
        section_topics[section] = model.show_topics(
            num_topics=num_topics,
            num_words=num_words,
            formatted=False,
        )
        lda_models[section] = model

    return section_topics, lda_models

def compute_relevance_with_ngrams(lda_model, topic_num, dictionary, lambda_=0.6, top_n=20):
    """
    Compute relevance scores for the top terms of a specific topic.

    For every term returned by ``lda_model.get_topic_terms`` the score is

        lambda_ * p(term | topic) + (1 - lambda_) * p(term | topic) / p(term)

    where ``p(term)`` is the term's share of all counts in the dictionary.

    Args:
        lda_model: Model providing ``get_topic_terms(topic_num, topn=...)``.
        topic_num: Index of the topic to score.
        dictionary: Token-id -> token mapping that also carries frequency
            counts (``cfs`` if available, otherwise ``dfs``).
        lambda_: Weight of the raw topic probability versus the lift term.
        top_n: Number of topic terms to score.

    Returns:
        list[tuple]: ``(term, relevance)`` pairs sorted by descending
        relevance.
    """
    topic_terms = lda_model.get_topic_terms(topic_num, topn=top_n)

    # Prefer collection frequencies (how often a term occurs). Document
    # frequencies only count the documents containing a term, so with a
    # single-document corpus they are all 1 and the lift term carries no
    # information; they are kept purely as a fallback.
    term_frequency = getattr(dictionary, 'cfs', None) or dictionary.dfs
    total_term_count = sum(term_frequency.values())

    relevance_scores = []
    for word_id, prob in topic_terms:
        term_freq = term_frequency.get(word_id, 0)
        if not term_freq:
            # No recorded count means p(term) would be 0; skip the term
            # rather than divide by zero.
            continue
        term_prob = term_freq / total_term_count
        relevance = lambda_ * prob + (1 - lambda_) * (prob / term_prob)
        relevance_scores.append((dictionary[word_id], relevance))

    relevance_scores.sort(key=lambda item: item[1], reverse=True)
    return relevance_scores

def textrank_with_ngrams(relevance_scores, top_n=10):
    """
    Rank keywords by running PageRank on a keyword graph.

    Every distinct keyword becomes a node and every pair of distinct
    keywords is connected by an edge of weight 1. The score component of
    each input pair is not used.

    NOTE(review): because all pairs are connected with the same weight, the
    PageRank scores are expected to come out (near-)uniform, so the result
    effectively follows input order. Confirm whether a co-occurrence window
    or score-based weights were intended.

    Args:
        relevance_scores: Sequence of ``(keyword, score)`` pairs.
        top_n: Maximum number of ranked keywords to return.

    Returns:
        list[tuple]: Up to ``top_n`` ``(keyword, pagerank_score)`` pairs in
        descending score order; an empty list for empty input.
    """
    from itertools import combinations

    # Nothing to rank: bail out before building any graph.
    if not relevance_scores:
        print("Relevance scores are empty. No TextRank will be applied.")
        return []

    # Repeated keywords map to the same node, so pairing them up again only
    # repeats edge insertions. Deduplicating first (keeping first-seen
    # order) yields the same graph with far fewer iterations.
    keywords = list(dict.fromkeys(word for word, _ in relevance_scores))

    word_graph = nx.Graph()
    word_graph.add_nodes_from(keywords)
    word_graph.add_edges_from(combinations(keywords, 2), weight=1)

    scores = nx.pagerank(word_graph)
    ranked_keywords = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:top_n]

    if not ranked_keywords:
        print("TextRank resulted in no ranked keywords.")

    return ranked_keywords

def generate_phrases_from_ngrams(ranked_keywords, relevance_scores, n=2, top_n=None):
    """
    Generate phrases of ``n`` words from relevance-ordered words.

    Walks ``relevance_scores`` in order. Each word not yet used starts a
    phrase, which is extended with later words that are unused and also
    appear in ``ranked_keywords``. Only phrases that reach exactly ``n``
    words are kept, and a word used to extend a phrase is never reused.

    Args:
        ranked_keywords: Sequence of ``(keyword, score)`` pairs; only the
            keywords are used, as the set of allowed extension words.
        relevance_scores: Sequence of ``(word, score)`` pairs giving the
            order in which words are considered.
        n: Number of words per phrase.
        top_n: Maximum number of phrases to return; ``None`` returns all.

    Returns:
        list[str]: Space-joined phrases in generation order.
    """
    if not ranked_keywords:
        print("Ranked keywords are empty. No phrases will be generated.")
        return []

    # Built once up front: the membership test below runs for every pair of
    # positions in relevance_scores.
    ranked_words = {word for word, _ in ranked_keywords}

    phrases = []
    used_keywords = set()

    for i, (word1, _) in enumerate(relevance_scores):
        if word1 in used_keywords:
            continue
        phrase = [word1]
        for word2, _ in relevance_scores[i + 1:]:
            if len(phrase) >= n:
                # Phrase is complete; nothing later can change it.
                break
            if word2 not in used_keywords and word2 in ranked_words:
                phrase.append(word2)
                used_keywords.add(word2)
        used_keywords.add(word1)
        if len(phrase) == n:
            phrases.append(' '.join(phrase))

    if not phrases:
        print("No phrases were generated from n-grams.")

    # top_n is an explicit parameter here; the cap no longer depends on a
    # module-level variable of that name being defined.
    return phrases if top_n is None else phrases[:top_n]


def finalize_labels_with_details(lda_models, section_topics, processed_docs, top_n=5, top_topic_words=5):
    """
    Finalize labels for each section based on LDA and TextRank results.

    Produces one record per (section, topic) with the topic's top LDA
    words, its top relevance-ranked words, and the TextRank-based labels.

    Args:
        lda_models: Dict mapping section to its trained LDA model.
        section_topics: Dict mapping section to its list of topics.
        processed_docs: Dict mapping section to its preprocessed word list.
        top_n: Number of keywords requested from TextRank.
        top_topic_words: Number of top words reported per topic.

    Returns:
        list[dict]: Records with the keys 'Section', 'Topic Number',
        'LDA Top Words', 'Relevance Top Words' and 'TextRank Labels'.
    """
    final_details = []

    for section, topics in section_topics.items():
        lda_model = lda_models[section]
        dictionary = lda_model.id2word

        if not topics:
            # No topics means no records; skip the n-gram work entirely.
            continue

        # The n-grams and their TextRank ranking depend only on the
        # section's text, not on the topic, so they are computed once per
        # section instead of once per topic.
        ngrams = preprocess_text_with_ngrams(' '.join(processed_docs[section]), stopwords=None, n=2)
        ranked_keywords = textrank_with_ngrams([(ngram, 1.0) for ngram in ngrams], top_n)

        for topic_num, _ in enumerate(topics):
            # Top words straight from the LDA topic distribution.
            topic_words_lda = [
                dictionary[word_id]
                for word_id, _ in lda_model.get_topic_terms(topic_num, topn=top_topic_words)
            ]

            # Top words after re-ranking by relevance.
            relevance_scores = compute_relevance_with_ngrams(lda_model, topic_num, dictionary)
            topic_words_relevance = [word for word, _ in relevance_scores[:top_topic_words]]

            # Labels built from the TextRank keywords.
            # NOTE(review): ranked_keywords holds space-joined bigrams while
            # relevance_scores holds dictionary tokens, and the phrase
            # generator matches them by string equality. Presumably the
            # tokens are single nouns, in which case nothing can match and
            # this column stays empty. Confirm the intended inputs.
            labels_textrank = generate_phrases_from_ngrams(ranked_keywords, relevance_scores, n=2)

            # Store the LDA, relevance and TextRank results for this topic.
            final_details.append({
                'Section': section,
                'Topic Number': topic_num,
                'LDA Top Words': ', '.join(topic_words_lda),
                'Relevance Top Words': ', '.join(topic_words_relevance),
                'TextRank Labels': ', '.join(labels_textrank)
            })

    return final_details


def display_final_table(final_details):
    """
    Turn the per-topic result records into a pandas DataFrame.

    Args:
        final_details: Records to tabulate, e.g. a list of dicts.

    Returns:
        pandas.DataFrame: The table built from ``final_details``. Nothing is
        printed here; showing the table is left to the caller.
    """
    table = pd.DataFrame(data=final_details)
    return table






In [None]:
# Usage example

# 1. Set the path to the stopword file
stopwords_file_path = './data/korean_stopwords.txt'

# 2. Choose the sections to process
section_numbers = [100, 101, 102]  # List of section numbers to process
hours_back = 9  # Use only the most recent 9 hours of data

# 3. Merge and preprocess each section's data (word level)
processed_docs = process_sections(section_numbers, stopwords_file_path, hours_back)

# 4. Extract topics for each section with the LDA model
num_topics = 3  # Number of topics to extract per section
num_words = 10  # Number of top words to extract per topic
section_topics, lda_models = extract_topics_for_section(processed_docs, num_topics, num_words)

# 5. Build the final topic labels and details
top_n = 5  # Number of labels to generate per topic
top_topic_words = 5  # Number of top words to extract per topic
final_details = finalize_labels_with_details(lda_models, section_topics, processed_docs, top_n, top_topic_words)

# 6. Print the results as a table
df = display_final_table(final_details)
print(df)