作业内容：使用sklearn框架库中的朴素贝叶斯算法，对垃圾邮件分类任务中的数据集进行分类实现
数据集下载地址：https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
其中"ham"，表示信息不是垃圾信息，以及"spam"，表示信息是垃圾信息。

In [72]:
from sklearn.naive_bayes import BernoulliNB
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

In [73]:
# 加载数据集
df = pd.read_csv('smsspamcollection/SMSSpamCollection', sep='\t', names=['label', 'sms_message'])
df['label'] = df.label.map({'ham':0, 'spam':1})
df.head()


Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [74]:
x_train, x_test, y_train, y_test = train_test_split(df['sms_message'], 
                                                    df['label'], 
                                                    random_state=1,test_size=0.2)
print("train size:",len(x_train))
print("test size",len(x_test))

train size: 4457
test size 1115


In [75]:
# 数据预处理           
count_vector = CountVectorizer(stop_words='english')
training_data = count_vector.fit_transform(x_train)
testing_data = count_vector.transform(x_test)


In [76]:

# 伯努利朴素贝叶斯
naive_bayes = BernoulliNB()
# 拟合数据
naive_bayes.fit(training_data, y_train)
# 预测结果
predictions = naive_bayes.predict(testing_data)
                    

In [77]:
print('准确率: ', format(accuracy_score(y_test, predictions)))
print('精确率: ', format(precision_score(y_test, predictions)))
print('召回率: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

准确率:  0.9775784753363229
精确率:  0.9919354838709677
召回率:  0.8367346938775511
F1 score:  0.9077490774907749


In [78]:
# 多项分布朴素贝叶斯
naive_bayes = MultinomialNB()
# 拟合数据
naive_bayes.fit(training_data, y_train)
# 预测结果
predictions = naive_bayes.predict(testing_data)

In [79]:
print('准确率: ', format(accuracy_score(y_test, predictions)))
print('精确率: ', format(precision_score(y_test, predictions)))
print('召回率: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

准确率:  0.9910313901345291
精确率:  0.9790209790209791
召回率:  0.9523809523809523
F1 score:  0.9655172413793104


In [80]:
# 高斯朴素贝叶斯
naive_bayes = GaussianNB()

# 拟合数据
naive_bayes.fit(training_data.toarray(), y_train)
# # 预测结果
# predictions = naive_bayes.predict(testing_data)

GaussianNB(priors=None, var_smoothing=1e-09)

In [81]:
print('准确率: ', format(accuracy_score(y_test, predictions)))
print('精确率: ', format(precision_score(y_test, predictions)))
print('召回率: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

准确率:  0.9910313901345291
精确率:  0.9790209790209791
召回率:  0.9523809523809523
F1 score:  0.9655172413793104
