In [3]:
import os
import codecs
import jieba
import string
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import naive_bayes as bayes
from sklearn.model_selection import train_test_split

#### 导入数据集

In [4]:
email_frame = pd.read_excel("./chinesespam.xlsx", sheet_name=0)

In [18]:
with open("./stopwords.txt", "r", encoding='UTF8') as fr:
    stopwords = fr.read().split("\n")

#### 去掉停用词，将文本重新整理

In [25]:
processed_texts = []
for text in email_frame['text']:
    words = []
    seg_list = jieba.cut(text)
    for seg in seg_list:
        if (seg.isalpha())& (seg not in stopwords):
            words.append(seg)
    sentence = " ".join(words)
    processed_texts.append(sentence)

email_frame['text'] = processed_texts

In [26]:
email_frame.head(5)

Unnamed: 0,type,text
0,ham,讲 孔子 后人 故事 领导 回到 家乡 儿子 感情 贪财 孙子 孔为 和睦 领导 弟弟 魏宗...
1,ham,起诉 起诉 理由 MM 莫不是 生活 电影 中 结婚 感情 感情 何来 感情 传统 家庭 责...
2,ham,负债 不要紧 负得起 责任 欠 多钱 至少 当初 拿出 爱心 网友 交待 心 实在 能力 一...
3,ham,公司 内部 推荐 机会 视频 编解码器 pc dsp arm 优化 工作 wmv 编解码 做...
4,ham,鼓励 姐姐 解答 更好 赫赫 女生 追 男生 例子 想 请 帮帮忙 闹 分手 一个月 期间 ...


#### 文本内容转换为矩阵形式

In [64]:
# CountVectorizer 会将文本中的词语转换为词频矩阵
vectorizer = CountVectorizer(binary=False)   # 默认为 False，一个关键词在一篇文档中可能出现 n 次，如果 binary=True，非零的 n 将全部置为 1                  

# 标记并创建索引（返回的内容其实是一个字典，为每个词都标记了一个索引）
vectorizer.fit(email_frame['text'])

# 查看创建的索引
vocabulary = vectorizer.vocabulary_

# 应用索引并转化为 frame
vector = vectorizer.transform(email_frame['text'])
result = pd.DataFrame(vector.toarray())

In [65]:
result.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5972,5973,5974,5975,5976,5977,5978,5979,5980,5981
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


##### 将 frame 的标题设置为有意义的标题

In [66]:
keys = []
values = []
for key, value in vectorizer.vocabulary_.items():
    keys.append(key)
    values.append(value)
df = pd.DataFrame(data={"key": keys, "value": values})
columms = df.sort_values("value")["key"].values.tolist() 
result.columns = columms

In [68]:
textmatrix = result

In [69]:
textmatrix.head() 

Unnamed: 0,aac,aav,abc,account,actionscript,activex,address,adenoviruses,age,agreement,...,黑镜头,黑龙江省,默认,默默,黯然,鼎力支持,鼎韵,鼓励,鼠标,齐全
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


##### 提取出常用词

In [80]:
features = pd.DataFrame(textmatrix.apply(sum, axis=0))

# 只提取出总出现次数大于 5 的词作为特征
extracted_features = [features.index[i] for i in range(features.shape[0]) if features.iloc[i, 0]>5]
textmatrix = textmatrix[extracted_features]

#### 拆分训练集和测试集

In [85]:
train, test, trainlabel, testlabel = train_test_split(textmatrix, email_frame['type'], test_size=0.2)

#### 先验分布为伯努利分布的朴素贝叶斯

In [87]:
clf = bayes.BernoulliNB(alpha=1, binarize=True)
model = clf.fit(train, trainlabel)

In [89]:
model.predict(test)

array(['ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham'], dtype='<U4')

In [88]:
model.score(test, testlabel)

0.8

#### 先验分布为高斯分布的朴素贝叶斯

In [91]:
clf = bayes.GaussianNB()
model = clf.fit(train, trainlabel)

In [92]:
model.predict(test)

array(['ham', 'spam', 'spam', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'spam', 'spam', 'ham', 'ham', 'spam', 'ham',
       'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham',
       'spam', 'ham', 'spam'], dtype='<U4')

In [93]:
model.score(test, testlabel)

0.9333333333333333

#### 先验分布为多项式分布的朴素贝叶斯

In [94]:
clf = bayes.MultinomialNB(alpha=1)
model = clf.fit(train, trainlabel)

In [95]:
model.predict(test)

array(['ham', 'spam', 'spam', 'spam', 'ham', 'spam', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'spam', 'spam', 'ham', 'ham', 'ham', 'ham',
       'ham', 'ham', 'ham', 'spam', 'ham', 'ham', 'ham', 'spam', 'ham',
       'ham', 'ham', 'spam'], dtype='<U4')

In [96]:
model.score(test, testlabel)

1.0