https://www.kdnuggets.com/2017/03/email-spam-filtering-an-implementation-with-python-and-scikit-learn.html

In [1]:
import os
import numpy as np
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

def make_Dictionary(train_dir):
    emails = [os.path.join(train_dir,f) for f in os.listdir(train_dir)]    
    all_words = []       
    for mail in emails:    
        with open(mail) as m:
            for i,line in enumerate(m):
                if i == 2:
                    words = line.split()
                    all_words += words
    
    dictionary = Counter(all_words)
    
    for item in list(dictionary):
        if item.isalpha() == False: 
            del dictionary[item]
        elif len(item) == 1:
            del dictionary[item]
    dictionary = dictionary.most_common(5000)
    return dictionary
    
def extract_features(mail_dir): 
    files = [os.path.join(mail_dir,fi) for fi in os.listdir(mail_dir)]
    features_matrix = np.zeros((len(files),5000))
    docID = 0;
    for fil in files:
        with open(fil) as fi:
            for i,line in enumerate(fi):
                if i == 2:
                    words = line.split()
                    for word in words:
                        wordID = 0
                        for j,d in enumerate(dictionary):
                            if d[0] == word:
                                wordID = j
                                features_matrix[docID,wordID] = words.count(word)
            docID = docID + 1     
    return features_matrix
    
# Create a dictionary of words with its frequency

train_dir = 'train-mails'
dictionary = make_Dictionary(train_dir)

# Prepare feature vectors per training mail and its labels

train_labels = np.zeros(702)
train_labels[351:701] = 1
train_matrix = extract_features(train_dir)

# Training SVM and Naive bayes classifier and its variants

model1 = LinearSVC()
model2 = LogisticRegression()

model1.fit(train_matrix,train_labels)
model2.fit(train_matrix,train_labels)

# Test the unseen mails for Spam

test_dir = 'test-mails'
test_matrix = extract_features(test_dir)
test_labels = np.zeros(260)
test_labels[130:260] = 1

result1 = model1.predict(test_matrix)
result2 = model2.predict(test_matrix)

print (confusion_matrix(test_labels,result1))
print (confusion_matrix(test_labels,result2))

[[51 79]
 [58 72]]
[[65 65]
 [64 66]]


> training result is bad

### New method

In [2]:
import os
import re
import csv
import operator
import collections


In [3]:
# split word to ( , )

train_dir = 'train-mails'
emails = [os.path.join(train_dir,f) for f in os.listdir(train_dir)]

all_words = []
for email in emails:
    with open(email, encoding = 'ISO-8859-1') as m:
        a = []
        for i,line in enumerate(m):
            if i == 2: #Bodt of email is only 2 line of text file
                words_ = line
                pattern3 = re.compile("[^\w\d]+")
                words_ = pattern3.sub(' ',words_)
                words = words_.split()
                for j in range(len(words)):
                    a.append(words[j])
    all_words.append(a)
print ('size of letter :', len(all_words))

size of letter : 702


In [4]:
dictionary = make_Dictionary(train_dir)
print ("size of dictionary :", len(dictionary))
np.save('dictionary.npy', dictionary)

dictionary = np.load('dictionary.npy')

size of dictionary : 5000


In [5]:
# word filter

example_doc = []

for letter in all_words:
    example_line = ''
    for w in letter:
        if w in dictionary:
            example_line += w+' '
        
    example_doc.append(example_line)
print (example_doc[0])

several submit overlook oversight fine example phenomenon english though mary retire perhap most submission date perhap current work various mean english preposition case difference various oversight overlook stem largely compound over over first seem fairly limit power analysis reveal number distinction mean quite indeed addition physical location over indicate power relationship both force authority value judgement general notion move another thing over skip over analysis oversight overlook one sense over mean authority over sense mean over perhap similar argument part compound instance own interest seem carry authority mean over analysis clear cut interest someone case half opinion own knowledge engineer center machine translation cmu want opinion carnegie mellon university pittsburgh pa usa pay 


In [6]:
# converting data to vectors

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df = 0.0, analyzer="word")
vectorizer.fit(example_doc)

X_train =  vectorizer.transform(example_doc)
print ("sparse matrix :", X_train.shape)

sparse matrix : (702, 5259)


In [7]:
model1 = LinearSVC()
model2 = LogisticRegression()

model1.fit(X_train,train_labels)
model2.fit(X_train,train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
# test

test_dir = 'test-mails'
emails = [os.path.join(test_dir,f) for f in os.listdir(test_dir)]

all_words = []
for email in emails:
    with open(email, encoding = 'ISO-8859-1') as m:
        a = []
        for i,line in enumerate(m):
            if i == 2: #Bodt of email is only 2 line of text file
                words_ = line
                pattern3 = re.compile("[^\w\d]+")
                words_ = pattern3.sub(' ',words_)
                words = words_.split()
                for j in range(len(words)):
                    a.append(words[j])
    all_words.append(a)
print ('size of letter :', len(all_words))

size of letter : 260


In [9]:
# word filter

example_doc = []

for letter in all_words:
    example_line = ''
    for w in letter:
        if w in dictionary:
            example_line += w+' '
        
    example_doc.append(example_line)
print (example_doc[0])

vacation most place earth florida special online promotional vacation package is limited don wait our website information price package info sorry offer available travel agent resale offer available world wide full detail http ma trip html must internet explorer 


In [10]:
# converting data to vectors

X_test =  vectorizer.transform(example_doc)
print ("sparse matrix :", X_test.shape)

sparse matrix : (260, 5259)


In [11]:
result1 = model1.predict(X_test)
result2 = model2.predict(X_test)

print (confusion_matrix(test_labels,result1))
print (confusion_matrix(test_labels,result2))

[[70 60]
 [60 70]]
[[66 64]
 [51 79]]


> still bad...

### Enron-data-set

In [2]:
import os
import pickle
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC

N = 33716 #5172

def make_Dictionary(root_dir):
    emails_dirs = [os.path.join(root_dir,f) for f in os.listdir(root_dir)]    
    all_words = []       
    for emails_dir in emails_dirs:
        dirs = [os.path.join(emails_dir,f) for f in os.listdir(emails_dir)]
        for d in dirs:
            emails = [os.path.join(d,f) for f in os.listdir(d)]
            for mail in emails:
                with open(mail, encoding = 'ISO-8859-1') as m:
                    for line in m:
                        words = line.split()
                        all_words += words
    dictionary = Counter(all_words)
    list_to_remove = dictionary.keys()
    
    for item in list(dictionary):
        if item.isalpha() == False: 
            del dictionary[item]
        elif len(item) == 1:
            del dictionary[item]
    dictionary = dictionary.most_common(3000)
    
    np.save('dict_enron.npy',dictionary) 
    
    return dictionary
    
def extract_features(root_dir): 
    emails_dirs = [os.path.join(root_dir,f) for f in os.listdir(root_dir)]  
    docID = 0
    features_matrix = np.zeros((N,3000))
    train_labels = np.zeros(N)
    for emails_dir in emails_dirs:
        dirs = [os.path.join(emails_dir,f) for f in os.listdir(emails_dir)]
        for d in dirs:
            emails = [os.path.join(d,f) for f in os.listdir(d)]
            for mail in emails:
                with open(mail, encoding = 'ISO-8859-1') as m:
                    all_words = []
                    for line in m:
                        words = line.split()
                        all_words += words
                    for word in all_words:
                        wordID = 0
                        for i,d in enumerate(dictionary):
                            if d[0] == word:
                                wordID = i
                                features_matrix[docID,wordID] = all_words.count(word)
                train_labels[docID] = int(mail.split(".")[-2] == 'spam')
                docID = docID + 1                
    return features_matrix, train_labels


In [5]:
#Create a dictionary of words with its frequency

root_dir = 'Enron-data-set'
dictionary = make_Dictionary(root_dir)


#Prepare feature vectors per training mail and its labels

features_matrix, labels = extract_features(root_dir)
np.save('enron_features_matrix.npy',features_matrix)
np.save('enron_labels.npy',labels)

FileNotFoundError: [WinError 3] 系統找不到指定的路徑。: 'Enron-data-set'

In [14]:
#train_matrix = np.load('enron_features_matrix.npy');
#labels = np.load('enron_labels.npy');
print (features_matrix.shape)
print (labels.shape)
print (sum(labels==0),sum(labels==1))
X_train, X_test, y_train, y_test = train_test_split(features_matrix, labels, test_size=0.40)

## Training models and its variants

model1 = LinearSVC()
model2 = MultinomialNB()

model1.fit(X_train,y_train)
model2.fit(X_train,y_train)

result1 = model1.predict(X_test)
result2 = model2.predict(X_test)

print (confusion_matrix(y_test, result1))
print (confusion_matrix(y_test, result2))

(33716, 3000)
(33716,)
16545 17171
[[6448  154]
 [ 111 6774]]
[[6394  208]
 [ 152 6733]]


In [18]:
filename = 'spam_model.sav'

pickle.dump(model1, open(filename, 'wb'))

### real rowdata

In [3]:
# load the model from disk
filename = 'spam_model.sav'

loaded_model = pickle.load(open(filename, 'rb'))

In [4]:
def extract_features_realdata(root_dir):
    emails = [os.path.join(root_dir,f) for f in os.listdir(root_dir)]  
    docID = 0
    features_matrix = np.zeros((len(emails),3000))
    for mail in emails:
        print (mail)
        with open(mail, encoding = 'ISO-8859-1') as m:
            all_words = []
            for line in m:
                words = line.split()
                all_words += words
                for word in all_words:
                    wordID = 0
                    for i,d in enumerate(dictionary):
                        if d[0] == word:
                            wordID = i
                            features_matrix[docID,wordID] = all_words.count(word)
        docID = docID + 1                
    return features_matrix

In [35]:
real_dir = "test"
real_matrix = extract_features_realdata(real_dir)
print (real_matrix.shape)

test/0014.1999-12-15.farmer.ham.txt
test/Sample_000001_00000115.txt
test/Sample_000001_00000039.txt
test/0021.1999-12-15.farmer.ham.txt
test/0011.1999-12-14.farmer.ham.txt
test/0012.1999-12-14.farmer.ham.txt
test/0010.1999-12-14.farmer.ham.txt
test/Sample_000001_00000063.txt
test/Sample_000001_00000089.txt
test/0016.1999-12-15.farmer.ham.txt
test/0015.1999-12-15.farmer.ham.txt
test/0020.1999-12-15.farmer.ham.txt
test/Sample_000001_00000019.txt
test/Sample_000001_00000141.txt
test/0013.1999-12-14.farmer.ham.txt
test/0009.1999-12-14.farmer.ham.txt
test/0019.1999-12-15.farmer.ham.txt
test/0022.1999-12-16.farmer.ham.txt
test/Sample_000001_00000211.txt
test/Sample_000001_00000170.txt
test/Sample_000001_00000186.txt
test/Sample_000001_00000021.txt
test/Sample_000001_00000154.txt
test/Sample_000001_00000164.txt
test/Sample_000001_00000074.txt
test/Sample_000001_00000007.txt
test/Sample_000001_00000030.txt
test/Sample_000001_00000114.txt
test/Sample_000001_00000150.txt
(29, 3000)


In [36]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.predict(real_matrix)
print(result)

[ 0.  0.  1.  0.  0.  0.  0.  1.  1.  0.  0.  0.  1.  1.  0.  0.  0.  0.
  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.  1.]
