## LDA-BP

In [5]:
import pandas as pd
from gensim import corpora, models
import numpy as np
from gensim.models.coherencemodel import CoherenceModel
from sklearn.metrics.pairwise import cosine_similarity


class LDABPModel:
    def __init__(self, initial_num_topics=6, threshold=0.35, iterations=300, random_state=1):
        self.initial_num_topics = initial_num_topics
        self.threshold = threshold
        self.iterations = iterations
        self.random_state = random_state
        self.dictionary = None
        self.lda = None
        self.topic_vectors = None
        self.topics = None

    def prepare_dictionary(self, text_data_list):
        self.dictionary = corpora.Dictionary(text_data_list[0])
        for text_data in text_data_list[1:]:
            self.dictionary.add_documents(text_data)
        return self.dictionary

    def train_initial_lda(self, corpus):
        self.lda = models.LdaModel(
            corpus=corpus,
            id2word=self.dictionary,
            alpha='auto',
            eta='auto',
            iterations=self.iterations,
            num_topics=self.initial_num_topics,
            random_state=self.random_state
        )
        self.topic_vectors = self.lda.state.get_lambda()
        self.topics = list(range(self.initial_num_topics))
        return self.lda

    @staticmethod
    def get_doc_vector(doc, dictionary):
        vec = np.zeros(len(dictionary))
        word_counts = pd.Series(dictionary.doc2idx(doc)).value_counts().reset_index()
        for i in range(len(word_counts)):
            vec[word_counts.iloc[i, 0]] = word_counts.iloc[i, 1]
        return vec

    @staticmethod
    def doc2vec(texts, dictionary):
        vec_list = [LDABPModel.get_doc_vector(text, dictionary) for text in texts]
        return np.array(vec_list)

    @staticmethod
    def max_argmax(mat, axis=1):
        max_v = np.max(mat, axis=axis)
        max_idx = np.argmax(mat, axis=axis)
        return [[max_idx[i], max_v[i]] for i in range(len(max_v))]

    def get_prob(self, new_text):
        text_vec = self.doc2vec(new_text, self.dictionary)
        similar = cosine_similarity(text_vec, self.topic_vectors)
        return self.max_argmax(similar)

    def get_topic_count(self, doc_propabilities):
        topic_classify = dict.fromkeys(range(len(self.topics)), 0)
        doc_classify = []
        
        for idx, v in doc_propabilities:
            if v > self.threshold and idx < len(self.topics):
                topic_classify[idx] += 1
                doc_classify.append(idx)
            else: 
                topic_classify[len(self.topics)-1] += 1
                doc_classify.append(len(self.topics)-1)
        return topic_classify, doc_classify

    def get_coherence(self, new_topic_vectors, new_text):
        topic_word_num = 10
        tmp = new_topic_vectors.argsort(axis=1)[:, -topic_word_num:]
        topic_words = [[] for _ in range(tmp.shape[0])]
        
        for i in range(tmp.shape[0]):
            for j in range(tmp.shape[1]):
                try:
                    topic_words[i].append(self.dictionary[tmp[i, j]])
                except KeyError:
                    continue  
        
        coherence = CoherenceModel(
            topics=topic_words,
            corpus=[self.dictionary.doc2bow(text) for text in new_text],
            dictionary=self.dictionary,
            texts=new_text,
            coherence='c_v'
        )
        return coherence.get_coherence(), topic_words

    def forward(self, new_text, time_slice):
        current_topics = self.topics.copy()
        current_topics.append(f'slice{time_slice}')
        
        doc_threshold_low = len(new_text) // 140  # 主题保留的最小文档数
        doc_threshold_branch = len(new_text) // 4  # 分支的最小文档数
        
        doc_propabilities = self.get_prob(new_text)
        topic_classify, doc_classify = self.get_topic_count(doc_propabilities)
        
        branch_vectors = []
        delete_index = []
        
        for index in topic_classify:
            if topic_classify[index] > doc_threshold_branch:
                print(f'主题{index}进行分支')
                delete_index.append(index)
                current_topics.extend([f'{index}_{i}' for i in range(1, 3)])
                
                branch_docs = [new_text[i] for i, cls in enumerate(doc_classify) if cls == index]
                branch_corpus = [self.dictionary.doc2bow(text) for text in branch_docs]
                
                branch_lda = models.LdaModel(
                    corpus=branch_corpus,
                    id2word=self.dictionary,
                    alpha='auto',
                    eta='auto',
                    iterations=self.iterations,
                    num_topics=2,
                    random_state=self.random_state
                )
                branch_vectors.extend(branch_lda.state.get_lambda())
        
        new_topic_docs = [new_text[i] for i, cls in enumerate(doc_classify) if cls == len(self.topics)-1]
        new_topic_corpus = [self.dictionary.doc2bow(text) for text in new_topic_docs]
        
        new_topic_lda = models.LdaModel(
            corpus=new_topic_corpus,
            id2word=self.dictionary,
            alpha='auto',
            eta='auto',
            iterations=self.iterations,
            num_topics=1,
            random_state=self.random_state
        )
        new_topic_vector = new_topic_lda.state.get_lambda()[0]
        
        new_topic_vectors = []
        for key in list(topic_classify.keys())[:-1]:
            if key not in delete_index and topic_classify[key] >= doc_threshold_low:
                new_topic_vectors.append(self.topic_vectors[key])
        
        new_topic_vectors.append(new_topic_vector)
        new_topic_vectors.extend(branch_vectors)
        new_topic_vectors = np.array(new_topic_vectors)
        
        self.topics = [v for i, v in enumerate(current_topics) if i not in delete_index]
        
        coherence_score, topic_words = self.get_coherence(new_topic_vectors, new_text)
        self.topic_vectors = new_topic_vectors
        
        print(f'当前时间切片: {time_slice}')
        print(f'目前主题数: {len(self.topics)}')
        print(f'一致性分数: {coherence_score}')
        
        return len(self.topics), self.topics, topic_words, new_topic_vectors, coherence_score


if __name__ == "__main__":
    # text_col放具体的文本数据列名
    def load_text_data(file_paths, text_col='Processed_Text_No_Stop_Words'):
        """加载文本数据并转换为分词列表"""
        text_data_list = []
        for path in file_paths:
            df = pd.read_excel(path, engine='openpyxl')
            text_data = df[text_col].apply(lambda x: x.split()).tolist()
            text_data_list.append(text_data)
        return text_data_list

    # 替换为需要的文件路径
    file_paths = [
        '46-55processed_text_no_stop_words.xlsx',
        '56-65processed_text_no_stop_words.xlsx',
        '66-75processed_text_no_stop_words.xlsx',
        '76-85processed_text_no_stop_words.xlsx',
        '86-95processed_text_no_stop_words.xlsx',
        '96-05processed_text_no_stop_words.xlsx',
        '06-15processed_text_no_stop_words.xlsx',
        '16-23processed_text_no_stop_words.xlsx'
    ]
    
    all_text_data = load_text_data(file_paths)
    initial_text = all_text_data[0]  
    time_slices = all_text_data[1:]  
    
    # 训练模型
    ldabp = LDABPModel(initial_num_topics=6, threshold=0.35)
    ldabp.prepare_dictionary(all_text_data)
    
    # 构建初始语料库
    initial_corpus = [ldabp.dictionary.doc2bow(text) for text in initial_text]
    ldabp.train_initial_lda(initial_corpus)
    
    # 处理时间切片并生成一致性分数等结果
    results = []
    for i, time_slice_text in enumerate(time_slices, start=1):
        res = ldabp.forward(time_slice_text, i)
        results.append(res)

主题0进行分支


  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


当前时间切片: 1
目前主题数: 8
一致性分数: nan
主题4进行分支
主题5进行分支
主题6进行分支
当前时间切片: 2
目前主题数: 12
一致性分数: 0.37153318538848795


  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


当前时间切片: 3
目前主题数: 13
一致性分数: nan


  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


当前时间切片: 4
目前主题数: 14
一致性分数: nan
主题7进行分支
当前时间切片: 5
目前主题数: 16
一致性分数: 0.4298116902031362


KeyboardInterrupt: 

In [6]:
# 分段LDA
import gensim
from gensim.corpora import Dictionary
import pandas as pd
file_paths = [
        '46-55processed_text_no_stop_words.xlsx',
        '56-65processed_text_no_stop_words.xlsx',
        '66-75processed_text_no_stop_words.xlsx',
        '76-85processed_text_no_stop_words.xlsx',
        '86-95processed_text_no_stop_words.xlsx',
        '96-05processed_text_no_stop_words.xlsx',
        '06-15processed_text_no_stop_words.xlsx',
        '16-23processed_text_no_stop_words.xlsx'
    ]
dfs = []
texts = []
for path in file_paths:
    # 加载Excel文件（统一使用openpyxl引擎避免版本问题）
    df = pd.read_excel(path, engine='openpyxl')
    dfs.append(df)
    # 提取预处理文本并转换为单词列表
    text = df['Processed_Text_No_Stop_Words'].apply(lambda x: x.split())
    texts.append(text)
df1_text, df2_text, df3_text, df4_text, df5_text, df6_text, df7_text, df8_text = texts
for text in texts:
    dictionary = Dictionary(text)
    corpus = [dictionary.doc2bow(text) for text in list(text)]
    lda_model = gensim.models.LdaModel(corpus=corpus, num_topics=6, id2word=dictionary, passes=10)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=text, dictionary=dictionary)
    coherence_score = coherence_model_lda.get_coherence()
    coherence_scores.append(coherence_score)

print(coherence_scores)

KeyboardInterrupt: 

In [None]:
# DTM
import pandas as pd
import warnings
from gensim import corpora
from gensim.models import DtmModel
from gensim.models import CoherenceModel

warnings.filterwarnings('ignore')

file_paths = [
    '46-55processed_text_no_stop_words.xlsx',
    '56-65processed_text_no_stop_words.xlsx',
    '66-75processed_text_no_stop_words.xlsx',
    '76-85processed_text_no_stop_words.xlsx',
    '86-95processed_text_no_stop_words.xlsx',
    '96-05processed_text_no_stop_words.xlsx',
    '06-15processed_text_no_stop_words.xlsx',
    '16-23processed_text_no_stop_words.xlsx'
]

NUM_TOPICS = 10          # 主题数量
MIN_WORD_FREQ = 5       # 最小词频
MAX_WORD_RATIO = 0.5    # 最大文档占比
ITERATIONS = 200        # 迭代次数
PASSES = 10             # 语料遍历次数

# ===================== 2. 加载并预处理数据 =====================
time_slice_docs = []  # 所有文档的单词列表
time_slices = []      # 每个时间切片的文档数量

for path in file_paths:
    df = pd.read_excel(path, engine='openpyxl')
    df['Processed_Text_No_Stop_Words'] = df['Processed_Text_No_Stop_Words'].fillna('')
    
    docs = df['Processed_Text_No_Stop_Words'].apply(lambda x: x.split()).tolist()
    
    time_slice_docs.extend(docs)
    time_slices.append(len(docs))

dictionary = corpora.Dictionary(time_slice_docs)
dictionary.filter_extremes(no_below=MIN_WORD_FREQ, no_above=MAX_WORD_RATIO)
corpus = [dictionary.doc2bow(doc) for doc in time_slice_docs]

dtm_model = DtmModel(
    corpus=corpus,
    id2word=dictionary,
    time_slices=time_slices,
    num_topics=NUM_TOPICS,
    iterations=ITERATIONS,
    passes=PASSES,
    random_state=42
)

coherence_model = CoherenceModel(
    model=dtm_model,
    texts=time_slice_docs,
    dictionary=dictionary,
    coherence='c_v'
)
coherence_score = coherence_model.get_coherence()

print(f"Coherence Score: {coherence_score:.4f}")


In [9]:
#OLDA
import pandas as pd
import warnings
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel

warnings.filterwarnings('ignore')

file_paths = [
    '46-55processed_text_no_stop_words.xlsx',
    '56-65processed_text_no_stop_words.xlsx',
    '66-75processed_text_no_stop_words.xlsx',
    '76-85processed_text_no_stop_words.xlsx',
    '86-95processed_text_no_stop_words.xlsx',
    '96-05processed_text_no_stop_words.xlsx',
    '06-15processed_text_no_stop_words.xlsx',
    '16-23processed_text_no_stop_words.xlsx'
]

NUM_TOPICS = 10
CHUNKSIZE = 2000
PASSES = 1
UPDATE_EVERY = 1
ITERATIONS = 50
RANDOM_STATE = 42

docs_list = []
for path in file_paths:
    # 加载数据+处理缺失值
    df = pd.read_excel(path, engine='openpyxl')
    df['Processed_Text_No_Stop_Words'] = df['Processed_Text_No_Stop_Words'].fillna('')
    # 转换为单词列表
    docs = df['Processed_Text_No_Stop_Words'].apply(lambda x: x.split()).tolist()
    docs_list.extend(docs)

dictionary = corpora.Dictionary(docs_list)
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus_lda_c = [dictionary.doc2bow(doc) for doc in docs_list]
total_docs = len(docs_list)

olda_model = LdaModel(
    corpus=corpus_lda_c,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    chunksize=CHUNKSIZE,
    passes=PASSES,
    update_every=UPDATE_EVERY,
    iterations=ITERATIONS,
    random_state=RANDOM_STATE
)


coherence_model = CoherenceModel(
    model=olda_model,
    texts=docs_list,
    dictionary=dictionary,
    coherence='c_v'
)
coherence_score = coherence_model.get_coherence()

print(f"{coherence_score:.4f}")

0.4043


In [None]:
import pandas as pd
import warnings
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel

warnings.filterwarnings('ignore')

file_paths = [
    '46-55processed_text_no_stop_words.xlsx',
    '56-65processed_text_no_stop_words.xlsx',
    '66-75processed_text_no_stop_words.xlsx',
    '76-85processed_text_no_stop_words.xlsx',
    '86-95processed_text_no_stop_words.xlsx',
    '96-05processed_text_no_stop_words.xlsx',
    '06-15processed_text_no_stop_words.xlsx',
    '16-23processed_text_no_stop_words.xlsx'
]

# OLDA核心参数
NUM_TOPICS = 10
CHUNKSIZE = 2000
PASSES = 1
UPDATE_EVERY = 1
ITERATIONS = 50
RANDOM_STATE = 42

docs_by_period = []  
all_docs_accum = [] 

for path in file_paths:
    df = pd.read_excel(path, engine='openpyxl')
    df['Processed_Text_No_Stop_Words'] = df['Processed_Text_No_Stop_Words'].fillna('')
    period_docs = df['Processed_Text_No_Stop_Words'].apply(lambda x: x.split()).tolist()
    docs_by_period.append(period_docs)
    all_docs_accum.extend(period_docs) 

first_period_docs = docs_by_period[0]
dictionary = corpora.Dictionary(first_period_docs)
dictionary.filter_extremes(no_below=5, no_above=0.5)

first_corpus = [dictionary.doc2bow(doc) for doc in first_period_docs]

olda_model = LdaModel(
    corpus=first_corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    chunksize=CHUNKSIZE,
    passes=PASSES,
    update_every=UPDATE_EVERY,
    iterations=ITERATIONS,
    random_state=RANDOM_STATE
)

coherence_1 = CoherenceModel(
    model=olda_model,
    texts=all_docs_accum[:len(first_period_docs)], 
    dictionary=dictionary,
    coherence='c_v'
).get_coherence()
print(coherence_1)

for i in range(1, len(docs_by_period)):
    current_period_docs = docs_by_period[i]
    current_period_name = i
    current_corpus = [dictionary.doc2bow(doc) for doc in current_period_docs]
    
    olda_model.update(corpus=current_corpus, chunks_as_numpy=True)
    
    accum_docs = all_docs_accum[:sum([len(d) for d in docs_by_period[:i+1]])]
    coherence_score = CoherenceModel(
        model=olda_model,
        texts=accum_docs,
        dictionary=dictionary,
        coherence='c_v'
    ).get_coherence()
    
    print(f"{current_period_name}: {coherence_score:.4f}")

0.3514216743184268
