In [3]:
import pandas as pd
import numpy as np
import os
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.special import digamma, betaln, gammaln

# 读取数据

In [4]:
# Module-level accumulators that the loading step appends to:
#   data_file -> raw text of each e-mail
#   label     -> class id of the e-mail at the same position
data_file, label = [], []

In [5]:
def read_datas(directory):
    """Load every e-mail found under ``directory/ham`` and ``directory/spam``.

    The text of each file is appended to the module-level list ``data_file``
    and its class id to the module-level list ``label`` (0 = ham, 1 = spam).
    Ham files are read before spam files.

    Parameters
    ----------
    directory : str
        Folder that contains the ``ham`` and ``spam`` sub-folders.

    Returns
    -------
    None
        The function works purely through the two module-level lists.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` / ``open`` when a folder or file
        cannot be read.
    """
    for folder, class_label in (('ham', 0), ('spam', 1)):
        folder_path = os.path.join(directory, folder)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # corpus order (and any seeded split built on it) reproducible.
        for file_name in sorted(os.listdir(folder_path)):
            path = os.path.join(folder_path, file_name)
            # Undecodable bytes are replaced instead of raising.
            with open(path, 'r', encoding='utf-8', errors='replace') as file:
                content = file.read()
            data_file.append(content)
            label.append(class_label)

In [6]:
# Use the first five Enron subsets as the whole corpus.
# The loop variable is deliberately not called `str`: rebinding that name
# would hide the builtin `str` from every later cell in the kernel.
for dataset_name in ['enron1','enron2','enron3','enron4','enron5']:
    directory='./dataset/'+dataset_name
    read_datas(directory)

# label holds 0/1, so sum(label) is the number of spam e-mails.
print(f'数据集共{len(data_file)}封邮件')
print(f'其中,垃圾邮件有{sum(label)}封,正常邮件有{len(label)-sum(label)}封')

数据集共27716封邮件
其中,垃圾邮件有12671封,正常邮件有15045封


## 数据预处理

In [7]:
stop_words = set([
            'the', 'a', 'an', 'and', 'or', 'but', 'if', 'because', 'as', 'what',
            'which', 'this', 'that', 'these', 'those', 'then', 'just', 'so', 'than',
            'such', 'both', 'through', 'about', 'for', 'is', 'of', 'while', 'during',
            'to', 'from', 'in', 'on', 'by', 'with', 'at', 'be', 'was', 'were', 'are','subject'
        ])

def clean_text(text):
    text=text.lower()
    text = re.sub(r'\S+@\S+', '', text) # 移除邮箱地址
    text = re.sub(r'[^a-zA-Z\s]', '', text) # 移除标点符号和数字
    tokens = text.split() #分词
    tokens = [token for token in tokens if token not in stop_words] #移除停用词
    return ' '.join(tokens)


In [8]:
# Clean every raw e-mail; token_file keeps the same order as data_file.
token_file=[clean_text(raw_email) for raw_email in data_file]

# Show the first e-mail before and after cleaning.
print(data_file[0])
print(token_file[0])

# Pair each cleaned text with its label in a two-column DataFrame.
datafile=dict(text=token_file,label=label)
data=pd.DataFrame(datafile)
print(data.head())

Subject: christmas tree farm pictures

christmas tree farm pictures
                                                text  label
0                       christmas tree farm pictures      0
1  vastar resources inc gary production high isla...      0
2  calpine daily gas nomination calpine daily gas...      0
3  re issue fyi see note below already done stell...      0
4  meter nov allocation fyi forwarded lauri allen...      0


## 特征提取与划分数据集

In [9]:
# Turn the cleaned texts into TF-IDF features (at most 5000 terms) and
# densify the result so the classifier below can use plain NumPy arrays.
# NOTE(review): the vectorizer is fitted on all rows before the split, so the
# vocabulary/IDF weights also see the test e-mails - confirm this is intended.
tfidf = TfidfVectorizer(max_features=5000)
tfidf_sparse = tfidf.fit_transform(data['text'])
X = tfidf_sparse.toarray()  # X: (n_samples, n_features)
y = data['label'].values

# 70/30 train/test split, stratified on the label, with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"训练集大小: {X_train.shape}")
print(f"测试集大小: {X_test.shape}")


训练集大小: (19401, 5000)
测试集大小: (8315, 5000)


# 构建变分贝叶斯模型

In [24]:
class SpamClassifier:
    """Two-class naive Bayes spam classifier with Beta/Dirichlet variational posteriors.

    Model used by the code below:
      * pi ~ Beta(alpha0, beta0) is the probability of label 1 (spam);
      * theta_c ~ Dirichlet(lambda0, ..., lambda0) are the feature weights of class c;
      * a row x with label c contributes sum_j x_j * log(theta_c[j]) to the
        log-likelihood (no multinomial coefficient).

    The variational factors are q(pi) = Beta(alpha, beta) and
    q(theta_c) = Dirichlet(lambda_c).
    """

    def __init__(self, alpha0=1, beta0=1, lambda0=1, max_iter=100, tol=1e-6):
        """Store the prior hyperparameters and the stopping settings.

        Parameters
        ----------
        alpha0, beta0 : float
            Beta prior pseudo-counts for spam / ham.
        lambda0 : float
            Symmetric Dirichlet prior concentration per feature.
        max_iter : int
            Upper bound on the number of update sweeps in ``train``.
        tol : float
            ``train`` stops once the ELBO changes by less than this.
        """
        self.alpha0 = alpha0
        self.beta0 = beta0
        self.lambda0 = lambda0
        self.max_iter = max_iter
        self.tol = tol

    @staticmethod
    def _class_elbo(feature_totals, lam, prior):
        """ELBO share of one class: E_q[log p(x|theta)] + E_q[log p(theta)] - E_q[log q(theta)]."""
        lam_sum = np.sum(lam)
        # E_q[log theta_j] under Dirichlet(lam)
        expected_log_theta = digamma(lam) - digamma(lam_sum)
        # Log normalisers log Gamma(sum) - sum log Gamma of prior and posterior.
        log_norm_prior = gammaln(np.sum(prior)) - np.sum(gammaln(prior))
        log_norm_post = gammaln(lam_sum) - np.sum(gammaln(lam))
        prior_minus_entropy = (log_norm_prior - log_norm_post
                               + np.sum((prior - lam) * expected_log_theta))
        likelihood = np.sum(feature_totals * expected_log_theta)
        return prior_minus_entropy + likelihood

    def compute_elbo(self,alpha, beta, lambda0, lambda1, alpha0, beta0, lambda0_prior, X, y):
        """Evidence lower bound of the model for the given variational parameters.

        Parameters
        ----------
        alpha, beta : float
            Parameters of q(pi).
        lambda0, lambda1 : array of shape (k,)
            Dirichlet parameters of q(theta_0) and q(theta_1).
        alpha0, beta0 : float
            Beta prior parameters.
        lambda0_prior : float or array of shape (k,)
            Dirichlet prior concentration (a scalar is broadcast to k entries).
        X : array of shape (n, k)
            Feature matrix.
        y : array of shape (n,)
            Labels, 0 (ham) or 1 (spam).

        Returns
        -------
        float
            The ELBO. It equals the log marginal likelihood when the
            variational parameters are the exact conjugate posterior.
        """
        y = np.asarray(y)
        k = X.shape[1]
        n0 = np.sum(y==0)
        n1 = np.sum(y==1)

        # E_q[log pi] and E_q[log(1 - pi)] under Beta(alpha, beta)
        e_log_pi = digamma(alpha) - digamma(alpha + beta)
        e_log_not_pi = digamma(beta) - digamma(alpha + beta)

        # E_q[log p(pi)] - E_q[log q(pi)]
        pi_term = ((alpha0 - alpha) * e_log_pi + (beta0 - beta) * e_log_not_pi
                   + betaln(alpha, beta) - betaln(alpha0, beta0))

        # E_q[log p(y | pi)]
        label_term = n1 * e_log_pi + n0 * e_log_not_pi

        # Per-class feature likelihood plus Dirichlet prior/entropy terms.
        prior = np.broadcast_to(np.asarray(lambda0_prior, dtype=float), (k,))
        totals0 = np.sum(X[y==0], axis=0)
        totals1 = np.sum(X[y==1], axis=0)
        feature_term = (self._class_elbo(totals0, np.asarray(lambda0, dtype=float), prior)
                        + self._class_elbo(totals1, np.asarray(lambda1, dtype=float), prior))

        return pi_term + label_term + feature_term

    def train(self, X_train,y_train):
        """Fit the variational posterior and return its parameters.

        The update only uses the prior hyperparameters and per-class counts,
        not the previous variational parameters, so the loop below leaves on
        its first pass; it is kept as an ELBO-based convergence check.

        Parameters
        ----------
        X_train : array of shape (n, k)
            Training features.
        y_train : array of shape (n,)
            Training labels, 0 (ham) or 1 (spam).

        Returns
        -------
        dict
            Keys ``'alpha'``, ``'beta'``, ``'lambda0'``, ``'lambda1'``.
        """
        y_train = np.asarray(y_train)
        n1 = np.sum(y_train==1)  # number of spam e-mails
        n0 = np.sum(y_train==0)  # number of ham e-mails

        # Loop-invariant sufficient statistics: per-class feature totals.
        totals0 = np.sum(X_train[y_train==0], axis=0)
        totals1 = np.sum(X_train[y_train==1], axis=0)

        # Initial variational parameters (prior pseudo-counts plus observed counts).
        alpha = self.alpha0 + n1
        beta = self.beta0 + n0
        lambda0_j = self.lambda0 + totals0
        lambda1_j = self.lambda0 + totals1

        prev_elbo = self.compute_elbo(alpha, beta, lambda0_j, lambda1_j, self.alpha0, self.beta0, self.lambda0, X_train, y_train)

        for _ in range(self.max_iter):
            alpha_new = self.alpha0 + n1
            beta_new = self.beta0 + n0
            lambda0_j_new = self.lambda0 + totals0
            lambda1_j_new = self.lambda0 + totals1
            curr_elbo = self.compute_elbo(alpha_new, beta_new, lambda0_j_new, lambda1_j_new, self.alpha0, self.beta0, self.lambda0, X_train, y_train)

            # Stop once the bound no longer moves.
            if np.abs(curr_elbo - prev_elbo) < self.tol:
                break

            alpha, beta = alpha_new, beta_new
            lambda0_j, lambda1_j = lambda0_j_new, lambda1_j_new
            prev_elbo = curr_elbo

        return {'alpha': alpha, 'beta': beta, 'lambda0': lambda0_j, 'lambda1': lambda1_j}

    def predict(self,model, X_test, S=200):
        """Classify rows of ``X_test`` using the posterior means of pi and theta.

        Parameters
        ----------
        model : dict
            Output of ``train``.
        X_test : array of shape (n_test, k)
            Features to classify.
        S : int
            Not used by this implementation; kept so existing calls still work.

        Returns
        -------
        (y_pred, y_pred_prob)
            ``y_pred`` is a 1-D int array of 0/1 labels (1 when the spam
            probability exceeds 0.5); ``y_pred_prob`` is the 1-D array of
            spam probabilities.
        """
        alpha, beta = model['alpha'], model['beta']
        lambda0, lambda1 = model['lambda0'], model['lambda1']
        sum_lambda0, sum_lambda1 = np.sum(lambda0), np.sum(lambda1)

        # Posterior means of the variational factors.
        pi_mean = alpha / (alpha + beta)
        theta0_mean = lambda0 / sum_lambda0
        theta1_mean = lambda1 / sum_lambda1

        # Small constant guards against log(0).
        log_theta0 = np.log(theta0_mean + 1e-10)
        log_theta1 = np.log(theta1_mean + 1e-10)

        # log p(x | class) = sum_j x_j * log(theta_class[j]) for every row at once.
        log_p_x0 = X_test @ log_theta0.reshape(-1, 1)  # (n_test, 1)
        log_p_x1 = X_test @ log_theta1.reshape(-1, 1)  # (n_test, 1)

        # Posterior log-odds of spam = prior log-odds + likelihood log-ratio.
        log_ratio = log_p_x1 - log_p_x0
        pi_ratio = pi_mean / (1 - pi_mean + 1e-10)  # constant avoids division by zero
        log_posterior_ratio = np.log(pi_ratio) + log_ratio

        # sigmoid(z) = exp(-log(1 + exp(-z))); logaddexp evaluates the inner
        # log without overflowing for strongly negative z.
        y_pred_prob = np.exp(-np.logaddexp(0.0, -log_posterior_ratio))
        y_pred_prob = y_pred_prob.flatten()

        y_pred = (y_pred_prob > 0.5).astype(int)
        return y_pred, y_pred_prob

    def evaluate(self,model,X_test,y_test):
        """Predict on ``X_test``, print accuracy and confusion matrix, return ``(accuracy, predictions)``."""
        pred_y,_=self.predict(model, X_test)
        accuracy=accuracy_score(y_test,pred_y)
        print(f'准确率为{accuracy:.2f}')
        cm=confusion_matrix(y_test,pred_y)
        print("混淆矩阵为")
        print(cm)

        return accuracy,pred_y

In [25]:
# Build the classifier with its default hyperparameters and fit it on the
# training split; `model` is the dict of variational parameters that
# train() returns.
classifier=SpamClassifier()
model=classifier.train(X_train,y_train) # train the model

In [26]:
# Score the fitted model on the held-out test split; evaluate() prints the
# accuracy and confusion matrix and returns the accuracy with the predictions.
accuracy,pred_y=classifier.evaluate(model,X_test,y_test) # evaluate the model

准确率为0.98
混淆矩阵为
[[4419   95]
 [  70 3731]]
