In [1]:
import pandas as pd
import numpy as np
from gensim.models import LdaModel
from gensim import corpora
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# 假定已经有一个预处理函数
def nltk_preprocess(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text.lower())
    return [word for word in words if word.isalpha() and word not in stop_words]

# 加载之前训练的LDA模型和词典
lda_model = LdaModel.load(r'E:\Code for Project2024\lda_model\model')
dictionary = corpora.Dictionary.load(r'E:\Code for Project2024\lda_model\dictionary')

# 读取新的日志模板文件
structured_log_df = pd.read_csv(r'E:\Code for Project2024\Data_for_Train\L1\HDFS.log_for_L1.csv')

# 预处理日志数据
structured_log_df['Preprocessed'] = structured_log_df['EventTemplate'].apply(nltk_preprocess)

# 转换为词袋模型
structured_corpus = [dictionary.doc2bow(text) for text in structured_log_df['Preprocessed']]

# 使用LDA模型进行主题预测
structured_log_df['TopicDistribution'] = [lda_model.get_document_topics(bow) for bow in structured_corpus]

# 将概率分布向量转换为易于阅读的格式
def format_distribution(distribution):
    return {f"Topic {topic}": f"{prob:.2%}" for topic, prob in distribution}

# 将主题分布转换为易于阅读的字符串形式
structured_log_df['FormattedDistribution'] = structured_log_df['TopicDistribution'].apply(format_distribution)


# 展示预测结果的前几行
pd.set_option('display.max_colwidth', None) 
structured_log_df[['EventTemplate', 'FormattedDistribution']].head

# 将主题分布转换为最可能主题的独热码
def get_one_hot(topic_dist, num_topics):
    one_hot = np.zeros(num_topics, dtype=int)
    most_probable_topic_index = max(topic_dist, key=lambda x: x[1])[0]
    one_hot[most_probable_topic_index] = 1
    return one_hot

# 获取LDA模型的主题数量
num_topics = lda_model.num_topics

# 计算每个日志模板最可能的主题的独热码
structured_log_df['OneHotTopic'] = structured_log_df['TopicDistribution'].apply(lambda x: get_one_hot(x, num_topics))

# 保存到CSV文件
output_csv_path = r'E:\Code for Project2024\Data_for_Train\L1\log_templates_one_hot.csv'
structured_log_df.to_csv(output_csv_path, columns=['EventTemplate', 'OneHotTopic'], index=False)

print(f"已将日志模板的独热码主题保存到CSV文件：{output_csv_path}")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


已将日志模板的独热码主题保存到CSV文件：E:\Code for Project2024\Data_for_Train\L1\log_templates_one_hot.csv
