# In [14]:
import re
import jieba

from collections import defaultdict


def load_corpus(corpus_file: str, encoding: str = 'utf-8') -> list[str]:
    """
    Load a corpus file and strip every character that is not kept text.

    Each line is cleaned independently: everything except CJK ideographs in
    the range U+4E00-U+9FA5 and a small set of punctuation marks (ASCII and
    full-width comma, colon and exclamation mark, the ideographic full stop
    and the enumeration comma) is removed. Line terminators are removed too,
    so a line without any kept character yields an empty string.

    :param corpus_file: path of the corpus file to read
    :param encoding: text encoding of the corpus file; defaults to UTF-8 so
        the result does not depend on the platform's locale encoding
    :return: one cleaned string per input line, in file order
    """
    # Compile once instead of handing the pattern to re.sub for every line.
    unwanted = re.compile(r'[^,，。、、:：！!\u4e00-\u9fa5]+')
    # An explicit encoding is required: the locale default is not UTF-8 on
    # every platform, which breaks or mis-decodes Chinese text.
    with open(corpus_file, 'r', encoding=encoding) as f:
        return [unwanted.sub('', line) for line in f]


def build_word_frequency(corpus, max_word_length=5):
    """
    Count how often each jieba token occurs in the corpus.

    Every line of the corpus is segmented with ``jieba.cut``. Tokens that are
    empty or longer than ``max_word_length`` characters are not counted.

    :param corpus: iterable of text lines to segment
    :param max_word_length: longest token length (in characters) to count
    :return: defaultdict mapping each counted token to its occurrence count
    """
    counts: defaultdict[str, int] = defaultdict(int)
    for text in corpus:
        for term in jieba.cut(text):
            size = len(term)
            too_long = size > max_word_length
            # Count only non-empty tokens within the length limit.
            if not too_long and size >= 1:
                counts[term] += 1
    return counts


def select_high_frequency_words(word_freq, frequency_threshold=5):
    """
    Pick the words whose count reaches the threshold.

    :param word_freq: mapping from word to its occurrence count
    :param frequency_threshold: minimum count (inclusive) a word needs
    :return: list of qualifying words, in the mapping's iteration order
    """
    return [
        term
        for term, count in word_freq.items()
        if count >= frequency_threshold
    ]


def save_dictionary(word_list, file_path, encoding='utf-8') -> None:
    """
    Write the words to a text file, one word per line.

    The file is opened in write mode, so existing content is replaced.

    :param word_list: iterable of words (strings) to write
    :param file_path: path of the dictionary file to create
    :param encoding: text encoding of the output file; defaults to UTF-8 so
        Chinese words are written identically on every platform
    """
    # An explicit encoding is required: the locale default may be unable to
    # encode Chinese characters, or may write them in a non-UTF-8 encoding.
    with open(file_path, 'w', encoding=encoding) as f:
        f.writelines(word + '\n' for word in word_list)


# Input corpus and output dictionary paths; being relative, they are resolved
# against the current working directory.
corpus_file = 'corpus.txt'
dictionary_file = 'word_dictionary.txt'

# Pipeline: clean the corpus, count the segmented tokens, keep the tokens
# counted at least 5 times, and write them to the dictionary file.
corpus = load_corpus(corpus_file)
word_freq = build_word_frequency(corpus)
# The explicit 5 equals the default frequency_threshold.
high_frequency_words = select_high_frequency_words(word_freq, 5)
save_dictionary(high_frequency_words, dictionary_file)