#### 一、数据处理

##### 1. 读取数据

In [1]:
import os
def load_data(root_path, m_type):
    content_list = []
    for i in range(1,6):
        path = os.path.join(root_path, "enron%d/%s/" % (i, m_type))

        file_list = os.listdir(path)
        for file_name in file_list:
            file_path = os.path.join(path, file_name)
            if os.path.isfile(file_path):
                with open(file_path, encoding='utf-8', errors='ignore') as f:
                    content = ''.join([line.strip() for line in f.readlines()])
                content_list.append(content)
    return content_list


In [2]:
import random
random.seed(99)
sample_frac = 0.5
ham = load_data('./data', 'ham')
ham = random.sample(ham, round(sample_frac * len(ham)))
spam = load_data('./data', 'spam')
spam = random.sample(spam, round(sample_frac * len(spam)))
print("非垃圾邮件数：", len(ham))
print("垃圾邮件数：", len(spam))

非垃圾邮件数： 7522
垃圾邮件数： 6336


##### 2. 抽取特征

In [4]:
import numpy as np
import tflearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer  

In [5]:
max_document_length = 100
def  get_features_by_tf(X, y):
    vp = tflearn.data_utils.VocabularyProcessor(
        max_document_length=max_document_length,
        min_frequency=0,
        vocabulary=None,
        tokenizer_fn=None)
    X = vp.fit_transform(X, unused_y=None)
    X = np.array(list(X))
    return X, y

In [6]:
def get_features_by_wordbag_tfidf(X, y):
    vectorizer = CountVectorizer(binary=False,
                                 decode_error='ignore',
                                 strip_accents='ascii',
                                 max_features=10000,
                                 stop_words='english',
                                 max_df=1.0,
                                 min_df=1 )
    X = vectorizer.fit_transform(X)
    X = X.toarray()
    transformer = TfidfTransformer(smooth_idf=False)
    tfidf = transformer.fit_transform(X)
    X = tfidf.toarray()
    return  X, y

In [7]:
from sklearn.model_selection import train_test_split
X = ham + spam
y=[0] * len(ham) + [1] * len(spam)
proX, proy = get_features_by_wordbag_tfidf(X, y)
Xtrain, Xtest, ytrain, ytest = train_test_split(proX, proy, test_size = 0.3, random_state = 99)
print("训练集:", len(ytrain))
print("测试集:", len(ytest))


训练集: 9700
测试集: 4158


#### 二、模型

In [8]:
from sklearn import metrics
def evaluate(ytest, ypred):
    print("accuracy:\t", metrics.accuracy_score(ytest, ypred))
    print("precision:\t", metrics.precision_score(ytest, ypred))
    print("recall:\t", metrics.recall_score(ytest, ypred))
    print("confusion matrix:\n", metrics.confusion_matrix(ytest, ypred))

##### 1. 朴素贝叶斯

In [9]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(Xtrain,ytrain)

GaussianNB()

In [10]:
print("训练集：")
evaluate(ytrain, gnb.predict(Xtrain))

训练集：
accuracy:	 0.9956701030927835
precision:	 0.9934611048478016
recall:	 0.9970581579542883
confusion matrix:
 [[5252   29]
 [  13 4406]]


In [11]:
print("测试集")
evaluate(ytest, gnb.predict(Xtest))

测试集
accuracy:	 0.9677729677729677
precision:	 0.967488201363398
recall:	 0.9624413145539906
confusion matrix:
 [[2179   62]
 [  72 1845]]


##### 2. 支持向量机

In [12]:
from sklearn import svm
svm_model = svm.SVC()
svm_model.fit(Xtrain, ytrain)

SVC()

In [13]:
print("训练集：")
evaluate(ytrain, svm_model.predict(Xtrain))

训练集：
accuracy:	 0.9989690721649485
precision:	 0.9981920903954802
recall:	 0.9995474089160443
confusion matrix:
 [[5273    8]
 [   2 4417]]


In [14]:
print("测试集")
evaluate(ytest, svm_model.predict(Xtest))

测试集
accuracy:	 0.9843674843674843
precision:	 0.9705284552845529
recall:	 0.9963484611371936
confusion matrix:
 [[2183   58]
 [   7 1910]]


##### 3. MLP

In [15]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(alpha=1e-5,
                    hidden_layer_sizes = (64, 32, 16),
                    random_state = 99)
mlp.fit(Xtrain, ytrain)

MLPClassifier(alpha=1e-05, hidden_layer_sizes=(64, 32, 16), random_state=99)

In [16]:
print("训练集：")
evaluate(ytrain, mlp.predict(Xtrain))

训练集：
accuracy:	 0.9996907216494846
precision:	 0.9993215739484396
recall:	 1.0
confusion matrix:
 [[5278    3]
 [   0 4419]]


In [17]:
print("测试集")
evaluate(ytest, mlp.predict(Xtest))

测试集
accuracy:	 0.9850889850889851
precision:	 0.9833246482542991
recall:	 0.9843505477308294
confusion matrix:
 [[2209   32]
 [  30 1887]]


#### 4. CNN

In [18]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tflearn.layers.core import input_data, fully_connected, dropout
from tflearn.layers.conv import conv_1d, global_max_pool
from tflearn.layers.merge_ops import merge
import tensorflow as tf
proX, proy = get_features_by_tf(X, y)
Xtrain, Xtest, ytrain, ytest = train_test_split(proX, proy, test_size = 0.3, random_state = 99)
# Converting labels to binary vectors
y_train = to_categorical(ytrain, num_classes=2)
y_test = to_categorical(ytest, num_classes=2)

# Building convolutional network
network = input_data(shape=[None,max_document_length], name='input')
network = tflearn.embedding(network, input_dim=1000000, output_dim=128)
branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
network = merge([branch1, branch2, branch3], mode='concat', axis=1)
network = tf.expand_dims(network, 2)
network = global_max_pool(network)
network = dropout(network, 0.8)
network = fully_connected(network, 2, activation='softmax')
network = tflearn.regression(network, optimizer='adam', learning_rate=0.001,
                        loss='categorical_crossentropy', name='target')
# Training
model = tflearn.DNN(network, tensorboard_verbose=0)
model.fit(Xtrain, y_train,
            n_epoch=5, shuffle=True, validation_set=(Xtest, y_test),
            show_metric=True, batch_size=128,run_id="spam")

Training Step: 379  | total loss: [1m[32m0.02518[0m[0m | time: 103.237s
| Adam | epoch: 005 | loss: 0.02518 - acc: 0.9976 -- iter: 9600/9700
Training Step: 380  | total loss: [1m[32m0.02415[0m[0m | time: 106.247s
| Adam | epoch: 005 | loss: 0.02415 - acc: 0.9979 | val_loss: 0.07756 - val_acc: 0.9735 -- iter: 9700/9700
--


In [19]:
print("训练集：")
evaluate(ytrain, np.argmax(model.predict(Xtrain), axis=1))

训练集：
accuracy:	 0.9997938144329896
precision:	 0.9995476136620675
recall:	 1.0
confusion matrix:
 [[5279    2]
 [   0 4419]]


In [20]:
print("测试集：")
evaluate(ytest, np.argmax(model.predict(Xtest), axis=1))

测试集：
accuracy:	 0.9735449735449735
precision:	 0.9683773976153447
recall:	 0.9744392279603548
confusion matrix:
 [[2180   61]
 [  49 1868]]
