导入依赖

In [1]:
import re
import numpy as np
from jieba import cut
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

定义预处理函数

In [2]:
def get_words(filename):
    """Tokenize a UTF-8 text file into a single space-separated string.

    Each line is stripped, the characters matched by the noise pattern
    (dots, digits and a set of punctuation marks) are removed, and the
    remainder is segmented with jieba's ``cut``.  Tokens of length 1 are
    discarded.

    Args:
        filename: path of the text file to read.

    Returns:
        All kept tokens of the file, in order, joined by single spaces.
    """
    noise_pattern = r'[.【】0-9、——。，！~\*]'
    tokens = []
    with open(filename, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = re.sub(noise_pattern, '', raw_line.strip())
            for token in cut(cleaned):
                # Single-character tokens are dropped.
                if len(token) > 1:
                    tokens.append(token)
    return ' '.join(tokens)


定义特征提取器

In [3]:
def extract_features(corpus, feature_type, top_num):
    """Vectorize a corpus of whitespace-separated token strings.

    Both modes use the same text pipeline: each document is lowercased
    (the vectorizers' default) and then split on whitespace.  In
    ``'frequency'`` mode the vocabulary is computed with exactly that
    pipeline, so every selected word can actually be matched when the
    documents are counted.

    Args:
        corpus: iterable of documents, each a string of tokens separated
            by whitespace.
        feature_type: ``'frequency'`` for raw counts of the ``top_num`` most
            common tokens, or ``'tfidf'`` for TF-IDF weights limited to
            ``top_num`` features.
        top_num: maximum number of features to keep.

    Returns:
        A ``(matrix, vectorizer)`` tuple: the dense feature matrix of
        ``corpus`` and the fitted vectorizer, reusable via ``transform``.

    Raises:
        ValueError: if ``feature_type`` is neither ``'frequency'`` nor
            ``'tfidf'``.
    """
    # Materialize once: the corpus is traversed twice in 'frequency' mode.
    docs = list(corpus)

    # Tokens are already segmented, so split on whitespace instead of using
    # the default regex token pattern.  token_pattern=None avoids the
    # "token_pattern will not be used" warning; str.split (unlike a lambda)
    # keeps the fitted vectorizer picklable.
    tokenization = dict(tokenizer=str.split, token_pattern=None)

    if feature_type == 'frequency':
        # Count on lowercased tokens because the vectorizer lowercases each
        # document before tokenizing; a case-preserving vocabulary entry such
        # as "QQ" would otherwise never match and yield an all-zero column.
        all_terms = ' '.join(docs).lower().split()
        top_words = [w for w, _ in Counter(all_terms).most_common(top_num)]
        vectorizer = CountVectorizer(vocabulary=top_words, **tokenization)
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(max_features=top_num, **tokenization)
    else:
        raise ValueError("无效的特征类型")

    return vectorizer.fit_transform(docs).toarray(), vectorizer

参数设置

In [4]:
feature_type = 'tfidf'  # feature mode handed to extract_features: 'frequency' or 'tfidf'
top_num = 100  # feature-count limit handed to extract_features

训练模型

In [5]:
# Load the training mails.  Files are numbered from 0; the first N_SPAM files
# get label 1 and the following N_HAM files get label 0 (predict_file reports
# label 1 as spam).  The file count is derived from the two class sizes so the
# filename list and the label vector cannot drift out of sync.
N_SPAM, N_HAM = 127, 24
filenames = [f'邮件_files/{i}.txt' for i in range(N_SPAM + N_HAM)]
corpus = [get_words(f) for f in filenames]
labels = np.array([1] * N_SPAM + [0] * N_HAM)

# Turn the tokenized mails into a feature matrix; the fitted vectorizer is
# kept so new mails can be transformed the same way.
X, vectorizer = extract_features(corpus, feature_type, top_num)

# Train the multinomial naive Bayes classifier.
model = MultinomialNB()
model.fit(X, labels)

# %% [code]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\EXAMINE\AppData\Local\Temp\jieba.cache
Loading model cost 0.369 seconds.
Prefix dict has been built successfully.


定义预测函数

In [6]:
def predict_file(filename):
    """Classify one mail file with the trained model.

    The file is tokenized with ``get_words``, vectorized with the fitted
    module-level ``vectorizer`` and scored by the module-level ``model``.

    Args:
        filename: path of the mail text file.

    Returns:
        '垃圾邮件' when the predicted label is 1, otherwise '普通邮件'.
    """
    tokens = get_words(filename)
    row = vectorizer.transform([tokens]).toarray()
    predicted_label = model.predict(row)[0]
    if predicted_label == 1:
        return '垃圾邮件'
    return '普通邮件'

执行预测并显示结果

In [7]:
# Classify mails 151.txt through 155.txt and print one verdict per file.
for file_id in range(151, 156):
    filename = f'邮件_files/{file_id}.txt'
    verdict = predict_file(filename)
    print(f'{filename} 分类结果：{verdict}')

邮件_files/151.txt 分类结果：垃圾邮件
邮件_files/152.txt 分类结果：垃圾邮件
邮件_files/153.txt 分类结果：垃圾邮件
邮件_files/154.txt 分类结果：垃圾邮件
邮件_files/155.txt 分类结果：垃圾邮件
