In [133]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
import os
import re
import pandas as pd
import numpy as np
from collections import Counter

# 读取邮件

In [134]:
# Assumed dataset locations, relative to the working directory.
train_dir, test_dir = "train-mails", "test-mails"

def load_data(directory):
    """Load every e-mail file in ``directory`` and derive its spam label.

    Each file's text is reduced to ASCII letters (everything else becomes a
    space) and lower-cased. A file is labelled spam (1) when its name
    contains ``'spm'`` and non-spam (0) otherwise.

    Args:
        directory: Path of the folder holding one e-mail per file.

    Returns:
        A ``(emails, labels)`` tuple of two parallel lists: the cleaned
        e-mail texts and their 0/1 labels, ordered by file name.
    """
    emails = []
    labels = []

    # os.listdir returns names in arbitrary order; sorting keeps the sample
    # order (and any vocabulary built from it) reproducible across runs.
    for file_name in sorted(os.listdir(directory)):
        path = os.path.join(directory, file_name)
        # Skip sub-directories and other non-regular entries instead of
        # crashing inside open().
        if not os.path.isfile(path):
            continue
        # Undecodable bytes become U+FFFD, which the regex below turns into a
        # space, so one badly encoded mail cannot abort the whole load.
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            raw_text = f.read()
        # Keep letters only and convert to lower case.
        emails.append(re.sub('[^a-zA-Z]', ' ', raw_text).lower())
        # The file name tells us whether the mail is spam.
        labels.append(1 if 'spm' in file_name else 0)

    return emails, labels

# 加载训练和测试数据

In [135]:
# Read both corpora; each call returns (cleaned texts, 0/1 spam labels).
train_emails, train_labels = load_data(directory=train_dir)
test_emails, test_labels = load_data(directory=test_dir)
# Optional sanity check of the first e-mail and the class balance:
#train_emails[0],train_labels.count(1), train_labels.count(0)

# 数据处理
- 去除字母以外的所有字符
- 全部转换为小写
- 基于训练集构建词典（词 → 列索引的映射）

In [136]:
def text_encoder(emails, word_index_map=None):
    """Encode e-mails as bag-of-words count vectors.

    Every e-mail is reduced to lower-case ASCII letters and split on
    whitespace. When no vocabulary is supplied, one is built from ``emails``
    with words indexed in order of first appearance. Words that are absent
    from the vocabulary are ignored.

    Args:
        emails: Sequence of e-mail texts.
        word_index_map: Optional ``{word: column_index}`` mapping. Pass the
            mapping returned for the training set when encoding test data so
            both matrices share the same columns.

    Returns:
        A ``(vectors, word_index_map)`` tuple: a float array of shape
        ``(len(emails), len(word_index_map))`` holding word counts, and the
        mapping that was used (the passed-in object when one was given).
    """
    # The character range must be A-Z. The former 'A-z' also covered the ASCII
    # characters between 'Z' and 'a' ([, \, ], ^, _ and `), so tokens such as
    # "a_b" were never split.
    tokenized = [re.sub('[^a-zA-Z]', ' ', email).lower().split() for email in emails]

    if word_index_map is None:
        # Assign the next free column to each word the first time it is seen.
        word_index_map = {}
        for words in tokenized:
            for word in words:
                word_index_map.setdefault(word, len(word_index_map))

    vectors = np.zeros((len(tokenized), len(word_index_map)))
    for row, words in enumerate(tokenized):
        for word in words:
            col = word_index_map.get(word)
            if col is not None:  # out-of-vocabulary words are skipped
                vectors[row, col] += 1
    return vectors, word_index_map
# Build the vocabulary from the training e-mails only, then pass that same
# mapping in for the test set so both matrices share one column layout.
train_features, word_index_map = text_encoder(train_emails)
test_features, _ = text_encoder(test_emails, word_index_map=word_index_map)
(train_features.shape, test_features.shape)

((867, 26137), (291, 26137))

In [137]:
# Check that the encoded rows have the expected size.
tuple(train_features[row].shape for row in (0, 1))

((26137,), (26137,))

# 手写朴素贝叶斯
#### 先验概率

In [138]:
# Class priors: index 1 is the mean of the training labels (the spam
# fraction, since labels are 0/1); index 0 is its complement.
prior = np.array([0.0, np.mean(train_labels)])
prior[0] = 1 - prior[1]
prior  # prior probabilities

array([0.83391003, 0.16608997])

#### 条件概率

In [139]:
# Turn the label lists into NumPy arrays so they can be used in array arithmetic.
train_labels, test_labels = np.array(train_labels), np.array(test_labels)
# One row per vocabulary word (train_features.shape[1] is the vocabulary size),
# one column per class.
cond_prob = np.zeros(shape=(train_features.shape[1], 2))

In [140]:
# Per-class word counts with add-one smoothing, computed for the whole
# vocabulary at once. features.T @ labels sums each word's counts over the
# e-mails of that class, which is what the former per-column Python loop did
# one word at a time. The counts are integers, so the results are identical.
cond_prob[:, 1] = train_features.T @ train_labels + 1
cond_prob[:, 0] = train_features.T @ (1 - train_labels) + 1

# Normalise cond_prob so that each class column sums to 1.
cond_prob_std = cond_prob / np.sum(cond_prob, axis=0)
cond_prob_std.sum(axis=0)

array([1., 1.])

#### 在测试集上进行预测（使用取对数后的后验概率）

In [141]:
# The logs do not depend on the test sample, so compute them once instead of
# once per test e-mail.
log_likelihood = np.log(cond_prob_std)  # shape: (vocabulary size, 2)
log_prior = np.log(prior)

# Row i, column k holds sum_w count(i, w) * log P(w | class k) + log P(class k),
# i.e. the unnormalised log-posterior of class k for test e-mail i.
log_posterior = test_features @ log_likelihood + log_prior

# Predict 1 only when class 1 scores strictly higher; ties go to class 0.
# Kept as floats to match the 0.0/1.0 array the explicit loop produced.
predictions = (log_posterior[:, 1] > log_posterior[:, 0]).astype(float)

# 求测试集上的正确率

In [142]:
# Accuracy: fraction of test e-mails whose prediction equals the true label.
accuracy = np.mean(predictions == test_labels)
accuracy

0.993127147766323

# 求测试集上的 F1 分数

In [143]:
# F1 score computed by hand from the confusion counts (label 1 is the positive class).
TP = np.logical_and(predictions == 1, test_labels == 1).sum()
FP = np.logical_and(predictions == 1, test_labels == 0).sum()
FN = np.logical_and(predictions == 0, test_labels == 1).sum()
precision = TP / (TP + FP)
recall = TP / (TP + FN)
# Harmonic mean of precision and recall.
f1 = 2 * precision * recall / (precision + recall)
f1

0.9791666666666666

# 打印测试集的部分label看看和我的预测是不是一样

In [144]:
# Put true labels and predictions side by side and show the last ten rows.
df_pred = pd.DataFrame({'label': test_labels, 'prediction': predictions})
df_pred.tail(10)

Unnamed: 0,label,prediction
281,1,1.0
282,1,1.0
283,1,1.0
284,1,1.0
285,1,1.0
286,1,1.0
287,1,1.0
288,1,1.0
289,1,1.0
290,1,1.0


# 用sklearn库看看他能达到怎样的效果，和我的比比

In [145]:
# Train scikit-learn's multinomial naive Bayes classifier on the same count features.
classifier = MultinomialNB()
classifier.fit(X=train_features, y=train_labels)

In [146]:
# Predict labels for the test data with the fitted scikit-learn model.
predictions = classifier.predict(X=test_features)

# Compute accuracy and F1 score for those predictions.
accuracy, f1 = accuracy_score(test_labels, predictions), f1_score(test_labels, predictions)

(accuracy, f1)

(0.993127147766323, 0.9791666666666666)

In [147]:
# Put the test labels and the predictions side by side.
# pandas is already imported as `pd` in the notebook's import cell, so the
# duplicate mid-notebook import was dropped.
test_result = pd.DataFrame({'label': test_labels, 'prediction': predictions})
test_result.iloc[-20:, :]

Unnamed: 0,label,prediction
271,1,1
272,1,1
273,1,0
274,1,1
275,1,1
276,1,1
277,1,1
278,1,1
279,1,1
280,1,1


In [148]:
# First twenty rows of the label/prediction table.
test_result.head(20)

Unnamed: 0,label,prediction
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0
