# 基础邮件分类器

本笔记实现了一个基础的邮件分类器，使用朴素贝叶斯算法将邮件分类为垃圾邮件或普通邮件。
- 文本预处理和分词
- 高频词特征提取
- 朴素贝叶斯分类
- 邮件分类预测


In [None]:
## 导入必要的库
import re
import os
from jieba import cut # type: ignore
from itertools import chain
from collections import Counter
import numpy as np
from sklearn.naive_bayes import MultinomialNB # type: ignore


  import pkg_resources
  from scipy.sparse import csr_matrix, issparse


In [2]:
## 文本预处理函数
def get_words(filename):
    """Tokenize a text file and return its tokens longer than one character.

    Each line is stripped, every character matched by the filter pattern
    (dots, digits and a handful of CJK/ASCII punctuation marks) is removed,
    and the remaining text is segmented with jieba's ``cut``.

    Args:
        filename: Path of a UTF-8 encoded text file.

    Returns:
        A list of tokens in file order, excluding tokens of length 0 or 1.
    """
    invalid_chars = re.compile(r'[.【】0-9、——。，！~\*]')
    tokens = []
    with open(filename, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = invalid_chars.sub('', raw_line.strip())
            # Keep only tokens longer than one character.
            tokens += [token for token in cut(cleaned) if len(token) > 1]
    return tokens


In [3]:
## 构建词库和特征提取
all_words = []

def get_top_words(top_num):
    """Build the per-email word lists and return the most frequent words.

    Reads ``邮件_files/0.txt`` through ``邮件_files/150.txt`` with
    ``get_words`` and stores one word list per email in the module-level
    ``all_words`` list.

    The contents of ``all_words`` are replaced in place rather than appended
    to, so calling this function more than once does not duplicate the
    emails (which would double every count and leave ``all_words`` with more
    entries than there are training files).

    Args:
        top_num: Number of most frequent words to return.

    Returns:
        A list of at most ``top_num`` words, ordered from most to least
        frequent across all emails.
    """
    filename_list = ['邮件_files/{}.txt'.format(i) for i in range(151)]
    # Slice assignment keeps the same list object, so other references to
    # all_words observe the refreshed contents.
    all_words[:] = [get_words(filename) for filename in filename_list]
    # Flatten the per-email lists lazily and count every word occurrence.
    freq = Counter(chain.from_iterable(all_words))
    return [word for word, _ in freq.most_common(top_num)]

# Use the 100 most frequent words as the feature vocabulary.
top_words = get_top_words(top_num=100)

# Report the vocabulary size and a short preview of it.
print(f"提取了 {len(top_words)} 个高频词作为特征")
print("前10个高频词:", top_words[0:10])


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\CMH\AppData\Local\Temp\jieba.cache
Loading model cost 1.130 seconds.
Prefix dict has been built successfully.


提取了 100 个高频词作为特征
前10个高频词: ['华为', '我们', '企业', '人工智能', '智能', '技术', '中国', '实现', '发展', '可以']


In [4]:
## 构建特征向量和训练模型
# Build the count matrix: one row per email, one column per high-frequency
# word.
vector = []
for words in all_words:
    # Count the email's words in a single pass, then look up each feature.
    # Calling words.count() per feature would rescan the whole email once
    # for every high-frequency word. Missing words count as 0 either way.
    word_counts = Counter(words)
    word_map = [word_counts[word] for word in top_words]
    vector.append(word_map)

vector = np.array(vector)
print(f"特征矩阵形状: {vector.shape}")

# Files 0.txt-126.txt are spam (label 1); files 127.txt-150.txt are normal
# mail (label 0): 127 + 24 = 151 labels in total.
labels = np.array([1]*127 + [0]*24)
print(f"标签分布: 垃圾邮件 {sum(labels)} 个, 普通邮件 {len(labels) - sum(labels)} 个")

# Train the naive Bayes model on the word counts.
model = MultinomialNB()
model.fit(vector, labels)
print("模型训练完成!")


特征矩阵形状: (151, 100)
标签分布: 垃圾邮件 127 个, 普通邮件 24 个
模型训练完成!


In [5]:
## 定义预测函数并测试
def predict(filename):
    """Classify a single email file as spam or normal mail.

    The email is tokenized with ``get_words``, turned into a vector of
    occurrence counts for each word in ``top_words``, and passed to the
    trained ``model``.

    Args:
        filename: Path of the email text file to classify.

    Returns:
        ``'垃圾邮件'`` when the model predicts label 1, otherwise
        ``'普通邮件'``.
    """
    # Count all words once instead of rescanning the email for every feature.
    word_counts = Counter(get_words(filename))
    current_vector = np.array([word_counts[word] for word in top_words])
    # The single email is presented to the model as a one-row 2-D array.
    result = model.predict(current_vector.reshape(1, -1))
    # Compare the single predicted label itself rather than relying on the
    # truthiness of a one-element array comparison.
    return '垃圾邮件' if result[0] == 1 else '普通邮件'

# Classify the test emails 151.txt through 155.txt.
test_files = ['{}.txt'.format(index) for index in range(151, 156)]
print("测试邮件分类结果:")
for file in test_files:
    try:
        result = predict(f'邮件_files/{file}')
    except FileNotFoundError:
        # A missing test file is reported and the loop continues.
        print(f'{file}: 文件不存在')
    else:
        print(f'{file}分类情况: {result}')


测试邮件分类结果:
151.txt分类情况: 垃圾邮件
152.txt分类情况: 垃圾邮件
153.txt分类情况: 垃圾邮件
154.txt分类情况: 垃圾邮件
155.txt分类情况: 普通邮件
